Using Distiluse Multilingual Pre-trained Model

In [None]:
!pip install sentence-transformers # Will install necessary modules required for our analysis

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pre-trained 'distiluse-base-multilingual-cased-v2' sentence-embedding
# model; the recorded output below shows its files being downloaded.
# NOTE: later cells rebind the name `model` to other estimators, so re-run this
# cell before re-running the encoding cells.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
import pandas as pd
# Load the dataset. The encoding cells below read the two article texts by
# position (columns 1 and 2); later cells use the "Overall" column as the target.
df = pd.read_csv('realdata.csv')

In [None]:
# Encode the 1st news article of every sample (column 1, by position).
# encode() is given a plain list of strings rather than a pandas Series so that
# it never depends on the DataFrame's index labels.
z1 = model.encode(df.iloc[:, 1].tolist())
print(z1.shape)  # (n_samples, 512): each article becomes a 512-dimensional vector
ok = pd.DataFrame(z1)
# Store the embeddings of the 1st article of each sample.
# NOTE: to_csv also writes the row index as the first column (pandas default),
# so the file holds 513 columns; readers must account for it (e.g. index_col=0).
ok.to_csv('new1.csv')

(1043, 512)


In [None]:
# Encode the 2nd news article of every sample (column 2, by position).
# encode() is given a plain list of strings rather than a pandas Series so that
# it never depends on the DataFrame's index labels.
z2 = model.encode(df.iloc[:, 2].tolist())
print(z2.shape)  # (n_samples, 512): each article becomes a 512-dimensional vector
ok = pd.DataFrame(z2)
# Store the embeddings of the 2nd article of each sample.
# NOTE: to_csv also writes the row index as the first column (pandas default),
# so the file holds 513 columns; readers must account for it (e.g. index_col=0).
ok.to_csv('new2.csv')

(1043, 512)


In [None]:
from scipy.stats import pearsonr
import numpy as np
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import pandas as pd

# Read the dataset; in this cell only its "Overall" column is used, as the
# target values for the regressor.
df = pd.read_csv('realdata.csv')
def bound(a):
    """Clamp every value of ``a`` to the interval [0, 4], in place.

    Works on arrays of any shape (the previous per-element loop only handled
    1-D input and raised on 2-D arrays with more than one column).

    Parameters
    ----------
    a : array-like supporting boolean-mask assignment (e.g. numpy.ndarray)
        Predicted values; modified in place.

    Returns
    -------
    The same object ``a``, with values above 4 set to 4 and values below 0
    set to 0. NaN entries compare False on both tests and are left untouched.
    """
    a[a > 4] = 4
    a[a < 0] = 0
    return a

# Fixed seed so the forest (and therefore the reported scores) is reproducible.
# NOTE: this rebinds `model`, which earlier held the sentence encoder.
model = RandomForestRegressor(random_state=42)

# new1.csv / new2.csv were written by DataFrame.to_csv with its default
# index=True, so their first column is the row index, not an embedding
# dimension. index_col=0 keeps it out of the features; without it the product
# below would contain a spurious "row index squared" column.
a1 = pd.read_csv('new1.csv', index_col=0).values  # Feature vectors of the 1st article
a2 = pd.read_csv('new2.csv', index_col=0).values  # Feature vectors of the 2nd article
a = a1*a2     # Element-wise product merges the two embeddings into one feature vector per sample
y_label = (df["Overall"].values)

# Splitting the data into training (70%) and test (30%) datasets
train_x,test_x,train_y,test_y = train_test_split(a,y_label,test_size = 0.3,shuffle = True,random_state = 42)

model.fit(train_x,train_y)  # Training the model
# Predictions for both splits, clamped to the valid label range [0, 4]
train_pred = bound(model.predict(train_x))
test_pred = bound(model.predict(test_x))

# NOTE: re-run this cell after this change; outputs saved earlier were computed
# with the index column included as a feature and an unseeded forest.
print("Pearson Correlation coefficient on Training Data = ",pearsonr(train_pred,train_y)[0]) # Pearson coefficient for training data
print("Pearson Correlation coefficient on Test Data = ",pearsonr(test_pred,test_y)[0]) # Pearson coefficient for test data

Pearson Correlation coefficient on Training Data =  0.9872949226822345
Pearson Correlation coefficient on Test Data =  0.7300854580792784


Using a Deep Learning Model

In [None]:
from scipy.stats import pearsonr
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn


# Read the dataset from the csv file; in this cell only its "Overall" column
# is used, as the regression target.
df = pd.read_csv('realdata.csv')



def bound(a):
    """Clamp every value of ``a`` to the interval [0, 4], in place.

    Works on inputs of any shape, for both numpy arrays and torch tensors
    (the previous per-element loop only handled 1-D or (N, 1) input).

    Parameters
    ----------
    a : numpy.ndarray or torch.Tensor
        Predicted values; modified in place.

    Returns
    -------
    The same object ``a``, with values above 4 set to 4 and values below 0
    set to 0. NaN entries compare False on both tests and are left untouched.
    """
    a[a > 4] = 4
    a[a < 0] = 0
    return a



# new1.csv / new2.csv were written by DataFrame.to_csv with its default
# index=True, so their first column is the row index, not an embedding
# dimension. index_col=0 keeps it out of the features; without it the product
# below would contain a huge, unnormalised "row index squared" column.
a1 = pd.read_csv('new1.csv', index_col=0).values  # Feature vectors of the 1st article
a2 = pd.read_csv('new2.csv', index_col=0).values  # Feature vectors of the 2nd article
a = a1*a2                            # Element-wise product merges the two embeddings into one feature vector
y_label = (df["Overall"].values)


# Splitting the data into training (70%) and test (30%) datasets
train_x,test_x,train_y,test_y = train_test_split(a,y_label,test_size = 0.3,shuffle = True,random_state = 42)


# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Converting the numpy arrays into float32 tensors on the chosen device
train_x = torch.from_numpy(train_x).to(dtype=torch.float32,device=device)
train_y = torch.from_numpy(train_y).to(dtype=torch.float32,device=device)
test_x = torch.from_numpy(test_x).to(dtype=torch.float32,device=device)
test_y = torch.from_numpy(test_y).to(dtype=torch.float32,device=device)


def training(epochs,optimizer,model,loss_fn,train_x,train_y):
    """Train ``model`` with full-batch gradient steps.

    Parameters
    ----------
    epochs : int
        Number of optimisation steps; every step uses the whole of ``train_x``.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    model : torch.nn.Module
        Network whose output has shape (n_samples, 1).
    loss_fn : callable
        Loss taking (predictions, targets), both of shape (n_samples,).
    train_x, train_y : torch.Tensor
        Training inputs and targets, on the same device as ``model``.

    Returns
    -------
    None. ``model`` is updated in place.
    """
    model.train()  # make sure BatchNorm layers use (and update) batch statistics
    for _ in range(epochs):
        y_pred = model(train_x)
        loss = loss_fn(y_pred[:,0],train_y)  # column 0 flattens (n, 1) predictions to (n,)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


torch.manual_seed(42)  # reproducible weight initialisation

# Creating a Feed Forward Neural Network.
# The input width is taken from the data instead of being hard-coded, so it
# always matches the number of feature columns actually loaded.
# NOTE: this rebinds `model`, which earlier cells used for other estimators.
model = nn.Sequential(
    nn.Linear(train_x.shape[1],400),
    nn.BatchNorm1d(400),
    nn.ReLU(),
    nn.Linear(400,200),
    nn.BatchNorm1d(200),
    nn.ReLU(),
    nn.Linear(200,100),
    nn.BatchNorm1d(100),
    nn.ReLU(),
    nn.Linear(100,10),
    nn.BatchNorm1d(10),
    nn.ReLU(),
    nn.Linear(10,1)
)

model.to(device=device)    # Moving the model to the same device as the data
optimizer = torch.optim.Adam(model.parameters())      # Adam optimizer with default settings
training(3000,optimizer,model,nn.MSELoss(),train_x,train_y) # Training with 3000 epochs and MSE loss

# Switch to evaluation mode before predicting: in training mode BatchNorm
# normalises with the statistics of the batch being predicted instead of the
# running statistics learned during training. no_grad() skips building an
# autograd graph that is never used.
model.eval()
with torch.no_grad():
    y_pred = model(test_x)       # Prediction on Test Data
    train_pred = model(train_x)  # Prediction on Training Data


# Clamp predictions to [0, 4] and move everything to the CPU for scipy
y_pred = bound(y_pred).to(device='cpu').detach()
train_pred = bound(train_pred).to(device='cpu').detach()
train_y = train_y.to(device='cpu')
test_y = test_y.to(device='cpu')

# NOTE: re-run this cell after this change; outputs saved earlier were computed
# with the index column as a feature and with BatchNorm left in training mode.
print("Pearson Correlation coefficient on Training Data = ",pearsonr(train_pred[:,0].numpy(),train_y.numpy())[0]) # Pearson coefficient for training data
print("Pearson Correlation coefficient on Test Data = ",pearsonr(y_pred[:,0].numpy(),test_y.numpy())[0]) # Pearson coefficient for test data

Pearson Correlation coefficient on Training Data =  0.24933742296966205
Pearson Correlation coefficient on Test Data =  0.009917843696135993
