# Import Libraries

In [37]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [38]:
import pandas as pd
import numpy as np
import random
import json
import regex as re

import matplotlib.pyplot as plt
import os
import sentencepiece
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import XLMRobertaConfig, XLMRobertaModel, XLMRobertaTokenizer

torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from google.colab import drive
drive.mount('/content/drive')
LIBRARY_PATH = '/content/drive/MyDrive/My Project/'

# Seed
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preprocessing

In [39]:
original_train_path = 'Train Data/train_df.csv'
original_train = pd.read_csv(LIBRARY_PATH + original_train_path)

In [40]:
# Check the size of the original dataframe.
original_size = original_train.shape
original_num_rows = original_size[0]
original_num_columns = original_size[1]

print(f"Number of rows: {original_num_rows}")
print(f"Number of columns: {original_num_columns}")
original_train.head(2)

Number of rows: 4955
Number of columns: 16


Unnamed: 0,url1_lang,url2_lang,pair_id,link1,link2,ia_link1,ia_link2,Geography,Entities,Time,Narrative,Overall,Style,Tone,text1,text2
0,en,en,1484084337_1484110209,https://www.washingtonpost.com/local/virginia-...,https://www.washingtonpost.com/world/the_ameri...,https://web.archive.org/web/www.washingtonpost...,https://web.archive.org/web/www.washingtonpost...,4.0,4.0,1.0,4.0,4.0,1.666667,2.0,"MARTINSBURG, W.Va. — A suspected drunken drive...","PORT-AU-PRINCE, Haiti — Haitian President Jove..."
1,en,en,1484396422_1483924666,https://www.stlucianewsonline.com/guyana-three...,https://www.thestar.com/news/world/europe/2020...,https://web.archive.org/web/www.stlucianewsonl...,https://web.archive.org/web/www.thestar.com/ne...,4.0,4.0,1.0,4.0,3.666667,1.666667,1.333333,(NEWS ROOM GUYANA) — Three persons are current...,Sign In The Star Edition Change Location Sign ...


In [41]:
# Delete columns that contain NaN sentences.
processed_data = original_train[original_train['text1'].notna()]
processed_data = processed_data[processed_data['text2'].notna()]

print("After removing NA text columns, we lose {0} rows.".format(original_train.shape[0] - processed_data.shape[0]))

After removing NA text columns, we lose 113 rows.


In [42]:
# Truncation of dataset.
processed_data = processed_data.head(200)

In [43]:
# split into train and development.
train, dev = train_test_split(processed_data, test_size=0.1, random_state = 42)

# Train the Model

In [44]:
max_len = 512
batch_size = 5
lr = 5e-6
weight_decay = 1e-4
num_epochs = 50

In [45]:
def get_data_loader(data, batch_size_flg = True):
  tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
  input_ids, attention_masks, labels = [], [], []
  for idx, row in data.iterrows():
      text1, text2 = row['text1'], row['text2']
      encode_dict = tokenizer(text1,text2,
                                  max_length=max_len,
                                  padding='max_length',
                                  truncation=True,
                                  add_special_tokens=True
                                  )

      input_ids.append(encode_dict['input_ids'])
      attention_masks.append(encode_dict['attention_mask'])
      # Convert to only 1 label.
      labels.append([float(x) for x in [row['Geography'],row['Entities'],row['Time'],row['Narrative'],row['Overall'],row['Style'],row['Tone']]])

  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)
  labels = torch.tensor(labels)

  data = TensorDataset(input_ids, attention_masks, labels)
  if(batch_size_flg):
      data_loader = DataLoader(data, batch_size=batch_size, shuffle=True, drop_last=True)
  else:
      data_loader = DataLoader(data)
  return data_loader

In [46]:
train_data_loader = get_data_loader(train)
eval_data_loader = get_data_loader(dev, False)

In [47]:
class Custom_XLMRoberta(nn.Module):
    def __init__(self, model, hidden_size):
        super(Custom_XLMRoberta, self).__init__()
        self.reg_model = model
        self.fc1 = nn.Linear(hidden_size, 100)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(100, 7) # Currently processes the 7 labels that we have defined for 7 output types.
        self.activation = nn.GELU()

    def forward(self, input_ids, attention_masks):
        output1 = self.reg_model(input_ids, attention_masks)[1]
        output1 = self.dropout(output1)
        output2 = self.activation(self.fc1(output1))
        output2 = self.dropout(output2)
        # x = self.dropout(x)
        logits1 = self.fc2(output2)
        return logits1

In [48]:
def predict(model, data_loader):
  model.eval()
  overall_pred, overall_true = [], []
  with torch.no_grad():
    for idx, (ids, att_msks, y) in enumerate(data_loader):
      ids, att_msks, y = ids.to(device), att_msks.to(device), y.to(device)
      y_pred = model(ids, att_msks)
      y_pred, y = torch.squeeze(y_pred).cpu().numpy().tolist(), torch.squeeze(y).cpu().numpy().tolist()
      overall_pred.append(y_pred[4])
      overall_true.append(y[4])
  return overall_pred, overall_true

def weighted_loss( y_pred, y, criterion, loss_weights):
  loss = 0.0
  for i in range(7):
    y_pred_i, y_i = y_pred[:, i], y[:, i]
    loss += criterion(y_pred_i, y_i) * loss_weights[i]
  return loss

def train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, epochs):
  model.train()
  criterion = nn.MSELoss()
  best_pearson = 0
  for i in range(epochs):
    train_loss_sum = 0
    for idx, (ids, att_msks, y) in enumerate(train_data_loader):
      ids, att_msks, y = ids.to(device), att_msks.to(device), y.to(device)
      optimizer.zero_grad()
      y_pred = model(ids, att_msks)
      y_pred, y = torch.squeeze(y_pred), torch.squeeze(y)
      loss = weighted_loss(y_pred, y, criterion, loss_weights)
      loss.backward()
      optimizer.step()
      train_loss_sum += loss.item()

    print(f"Loss at epoch {i}: {train_loss_sum:.4f}")

    # Determine best epoch model using correlation coefficient for Overall in dev data.
    eval_pred_overall, eval_true_overall = predict(model, eval_data_loader)
    curr_pearson = np.corrcoef(eval_pred_overall, eval_true_overall)[0][1]
    print(curr_pearson)
    if curr_pearson > best_pearson:
      best_pearson = curr_pearson
      torch.save(model.state_dict(), model_path)

In [49]:
# Run model finetuning and save fine-tuned model.
# pre_trained_model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

pre_trained_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")
hidden_size = config.hidden_size
# hidden_size = 768

loss_weights = [0.5 if i == 4 else (1-0.5)/6 for i in range(7)]

model = Custom_XLMRoberta(pre_trained_model, hidden_size)
model.to(device)

model_name = f'XLM_Roberta_base.pth'
model_path = f"/content/drive/MyDrive/My Project/Models/{model_name}"

print(f"Model name for this run: {model_name}")

optimizer = AdamW(model.parameters(), lr=lr, weight_decay = weight_decay)
train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, num_epochs)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model name for this run: XLM_Roberta_base.pth




Loss at epoch 0: 271.6892
0.2169851342512059
Loss at epoch 1: 189.5406
-0.17935105780827315
Loss at epoch 2: 126.9920
0.003972531622936077
Loss at epoch 3: 96.6354
0.33973210465388415
Loss at epoch 4: 79.7731
0.13509878759993452
Loss at epoch 5: 69.0748
0.4233482571392694
Loss at epoch 6: 61.3352
0.5412994927959686
Loss at epoch 7: 54.8802
0.40261229025624246
Loss at epoch 8: 51.2585
0.29739600464637705
Loss at epoch 9: 42.0205
0.5047126341023473
Loss at epoch 10: 36.3526
0.353902243334172
Loss at epoch 11: 33.6871
0.57267180166534
Loss at epoch 12: 27.3723
0.5911380340500451
Loss at epoch 13: 24.2949
0.6077206322079248
Loss at epoch 14: 22.1952
0.5912810794618071
Loss at epoch 15: 18.9587
0.592921050445378
Loss at epoch 16: 16.7330
0.6238506310655609
Loss at epoch 17: 15.4599
0.5865773829378812
Loss at epoch 18: 14.0686
0.5738533649955431
Loss at epoch 19: 12.9447
0.5787041688882693
Loss at epoch 20: 12.4874
0.569426868791921
Loss at epoch 21: 11.4968
0.5400045898508471
Loss at epoch 

# Evaluation

In [50]:
original_test_path = 'Train Data/evaluation_df.csv'
original_test = pd.read_csv(LIBRARY_PATH + original_test_path)

In [51]:
# Check the size of the original dataframe.
original_test_size = original_test.shape
original_test_num_rows = original_test_size[0]
original_test_num_columns = original_test_size[1]

print(f"Number of rows: {original_test_num_rows}")
print(f"Number of columns: {original_test_num_columns}")

# Rename multiple columns
original_test.rename(columns={'GEO': 'Geography', 'ENT': 'Entities', 'TIME': 'Time', 'NAR': 'Narrative', 'STYLE': 'Style', 'TONE': 'Tone'}, inplace=True)

original_test.head(2)

Number of rows: 4902
Number of columns: 16


Unnamed: 0,url1_lang,url2_lang,pair_id,link1,link2,ia_link1,ia_link2,Geography,Entities,Time,Narrative,Overall,Style,Tone,text1,text2
0,en,en,1484189203_1484121193,https://wsvn.com/news/local/broward/police-2-m...,https://wsvn.com/news/local/no-swim-advisory-l...,https://web.archive.org/web/https://wsvn.com/n...,https://web.archive.org/web/https://wsvn.com/n...,1.5,4.0,2.0,4.0,3.5,1.0,1.5,"DAVIE, FLA. (WSVN) - Police need help catching...","DEERFIELD BEACH, FLA. (WSVN) - A no-swim advis..."
1,en,en,1484011097_1484011106,https://www.zdnet.com/article/autoclerk-databa...,https://securityboulevard.com/2019/10/best-wes...,https://web.archive.org/web/https://www.zdnet....,https://web.archive.org/web/https://securitybo...,1.0,2.0,1.0,1.0,1.0,3.5,2.5,Most Popular An open database exposing records...,The Home of the Security Bloggers Network Home...


In [52]:
# Delete columns that contain NaN sentences.
processed_test_data = original_test[original_test['text1'].notna()]
processed_test_data = processed_test_data[processed_test_data['text2'].notna()]

print("After removing NA text columns, we lose {0} rows.".format(original_test.shape[0] - processed_test_data.shape[0]))

After removing NA text columns, we lose 0 rows.


In [53]:
# Truncation of dataset.
processed_test_data = processed_test_data.head(200)

In [54]:
config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")
hidden_size = config.hidden_size

pre_trained_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
model = Custom_XLMRoberta(pre_trained_model, hidden_size)
model.load_state_dict(torch.load("/content/drive/MyDrive/My Project/Models/XLM_Roberta_base.pth"), strict=False)
model.to(device)

test_data_loader = get_data_loader(processed_test_data, False)
test_pred_overall, test_true_overall = predict(model, test_data_loader)
test_pearson_score = np.corrcoef(test_pred_overall, test_true_overall)[0][1]

print("Pearson score on test dataset is {:.3f}".format(test_pearson_score))

train_all = get_data_loader(processed_data, False)
train_pred_overall, train_true_overall = predict(model, train_all)
train_pearson_score = np.corrcoef(train_pred_overall, train_true_overall)[0][1]
print("Pearson score on entire train dataset is {:.3f}".format(train_pearson_score))

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pearson score on test dataset is 0.534
Pearson score on entire train dataset is 0.943
