# Importing Necessary Modules

In [None]:
import pandas as pd
import json
import os

# Code for Data Extraction

In [None]:
# Build textdata.csv: one row per article pair holding the text of both articles.
df = pd.read_csv('input.csv')  # input.csv is the input csv file from the Semeval website
a = df['pair_id'].tolist()  # pair_id of each sample, of the form "<id1>_<id2>"

# Split each pair_id on the underscore instead of relying on fixed character
# positions, so ids that are not exactly 10 characters long still resolve.
ids = []
for pair_id in a:
    first_id, _, second_id = pair_id.partition('_')
    ids.append((first_id, second_id))

# Articles are stored in sub-folders named after the last two characters of their id.
b = [[first_id[-2:], second_id[-2:]] for first_id, second_id in ids]
s = "/content/drive/MyDrive/training_data/output_dir/"  # This is the path to the data

x = []  # Text data for all the news article pairs
for article_ids, folders in zip(ids, b):
    paths = [
        s + folder + '/' + article_id + '.json'
        for folder, article_id in zip(folders, article_ids)
    ]
    # If either article file is missing, the whole pair is marked invalid with [-1, -1].
    if not all(os.path.exists(path) for path in paths):
        x.append([-1, -1])
        continue
    o = []  # Text data for a single news article pair
    for path in paths:
        # Explicit UTF-8 so the read does not depend on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as handle:
            o.append(json.load(handle)['text'])
    x.append(o)

df = pd.DataFrame(x)
# The row index is written as the first column; the labelling step reads the
# texts from columns 1 and 2 of this file.
df.to_csv('textdata.csv')


In [None]:
# Attach the annotation columns from input.csv to the extracted texts, then
# keep only the pairs for which both article texts were found.
df1 = pd.read_csv('input.csv')
df2 = pd.read_csv('textdata.csv')

# Rows of both files are in the same order, so labels are copied column by column.
for label in ('Geography', 'Entities', 'Time', 'Narrative', 'Overall', 'Style', 'Tone'):
    df2[label] = df1[label]

# Positions of the valid samples: a '-1' in either text column (columns 1 and 2,
# column 0 being the saved row index) marks a pair whose article file was missing.
z = [
    row
    for row in range(df2.shape[0])
    if df2.iloc[row, 1] != '-1' and df2.iloc[row, 2] != '-1'
]

# Drop the saved index column and the invalid rows before writing the final data.
df2 = df2.iloc[z, 1:]
df2.to_csv('realdata.csv')  # This csv contains our final data with valid samples

# Code for Model Implementation

In [None]:
!pip install sentence-transformers # Will install necessary modules required for our analysis

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the BERT checkpoint (from Hugging Face) used to embed the articles.
ENCODER_NAME = 'bert-base-multilingual-cased'
model = SentenceTransformer(ENCODER_NAME)

In [None]:
# Encode both articles of every valid pair and store the embeddings on disk.
df = pd.read_csv('realdata.csv')

# Column 0 of realdata.csv is the saved row index; columns 1 and 2 hold the
# texts of the 1st and 2nd article. A DataFrame cannot be indexed as df[:, k],
# so the columns are selected by position with .iloc and passed as plain lists.
z1 = model.encode(df.iloc[:, 1].tolist())  # Encode the 1st news article of all the samples
print(z1.shape)  # Each sample is encoded into a 768 feature vector
ok = pd.DataFrame(z1)
# The row index is written as the first column of the csv file.
ok.to_csv('hi1.csv')  # Encoded feature vectors for the 1st article of each sample

z2 = model.encode(df.iloc[:, 2].tolist())  # Encode the 2nd news article of all the samples
print(z2.shape)  # Each sample is encoded into a 768 feature vector
ok = pd.DataFrame(z2)
# The row index is written as the first column of the csv file.
ok.to_csv('hi2.csv')  # Encoded feature vectors for the 2nd article of each sample

In [None]:
from scipy.stats import pearsonr
import numpy as np
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import pandas as pd

# Reload the valid samples; the "Overall" column of this frame is the regression target below.
df = pd.read_csv('realdata.csv')
def length(a):
    """Return the magnitude of `a`: the square root of the sum of its squared elements."""
    squared = a * a
    total = np.sum(squared)
    return np.sqrt(total)
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + e^(-x)) of `x`, element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def bound(a):
    """Clamp every value of the array `a` to the range [0, 4].

    The array is modified in place and also returned, as before. Using
    np.clip instead of an element loop makes this work for arrays of any
    number of dimensions, including empty ones.
    """
    np.clip(a, 0, 4, out=a)
    return a

# Train a decision tree on the combined article embeddings and score it with
# the Pearson correlation against the "Overall" labels.
# NOTE: this rebinds `model`, the name that also holds the SentenceTransformer
# encoder; reload the encoder before running the encoding step again.
# random_state is fixed so that the reported scores are reproducible.
model = DecisionTreeRegressor(random_state=42)

# hi1.csv / hi2.csv were saved with the row index as their first column;
# index_col=0 keeps that index out of the feature matrix.
a1 = pd.read_csv('hi1.csv', index_col=0).values  # Feature vectors of the 1st article
a2 = pd.read_csv('hi2.csv', index_col=0).values  # Feature vectors of the 2nd article
a = a1 * a2  # Element-wise product turns the two embeddings into a single feature vector
y_label = df["Overall"].values

# Splitting the data into training and test dataset
train_x, test_x, train_y, test_y = train_test_split(
    a, y_label, test_size=0.3, shuffle=True, random_state=42
)
model.fit(train_x, train_y)  # Training the model

# Predictions are clamped to [0, 4] before scoring.
train_pred = bound(model.predict(train_x))
test_pred = bound(model.predict(test_x))
print("Pearson Correlation coefficient on Training Data = ",pearsonr(train_pred,train_y)[0]) # Pearson coefficient for training data
print("Pearson Correlation coefficient on Test Data = ",pearsonr(test_pred,test_y)[0]) # Pearson coefficient for test data

Pearson Correlation coefficient on Training Data =  0.9835387632076156
Pearson Correlation coefficient on Test Data =  0.604725053572058
