# Setup

In [1]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [2]:
import csv
import pandas as pd
from google.colab import drive
import numpy as np

from transformers import AdamW, AutoTokenizer, XLNetTokenizer, XLNetForSequenceClassification, get_linear_schedule_with_warmup, XLNetModel, XLNetConfig


import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW

from sklearn.model_selection import train_test_split, ParameterGrid

from scipy.spatial.distance import cosine
from scipy.stats import pearsonr

In [3]:
drive.mount('/content/drive')
FOLDERNAME = 'ColabNotebooks/263/263 Final Project/Data'
%cd drive/My\ Drive
%cd $FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive
/content/drive/My Drive/ColabNotebooks/263/263 Final Project/Data


In [4]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Dataset

In [5]:
# Read the training CSV from the working directory.
# dropna() with no arguments removes every row that has a NaN in ANY column,
# not only rows with a missing 'text1' / 'text2'.
train = pd.read_csv("train_df.csv").dropna()

# The evaluation CSV is loaded as-is (no NaN filtering is applied to it here).
evl = pd.read_csv("evaluation_df.csv")

In [6]:
# Keep only the rows where both texts of the pair are present.
processed_data = train[train['text1'].notna() & train['text2'].notna()]

In [7]:
# Hold out 20% of the first 1000 processed rows as the development set;
# the fixed random_state keeps the split the same from run to run.
train, dev = train_test_split(
    processed_data.iloc[:1000],
    test_size=0.2,
    random_state=42,
)

# Encoding

In [6]:
def get_data_loader(data, batch_size_flg = True):
  """Tokenize the text pairs in `data` and wrap them in a DataLoader.

  Each row's 'text1' / 'text2' pair is encoded with the xlnet-base-cased
  tokenizer, padded or truncated to 512 tokens. The seven label columns are
  stored in the fixed order
  [Geography, Entities, Time, Narrative, Overall, Style, Tone], so index 4 of
  every label vector is the 'Overall' score.

  Args:
    data: DataFrame with 'text1', 'text2' and the seven label columns, named
      either Geography/Entities/Time/Narrative/Overall/Style/Tone or
      GEO/ENT/TIME/NAR/Overall/STYLE/TONE.
    batch_size_flg: True gives shuffled batches of 5 with the last incomplete
      batch dropped; False gives the DataLoader defaults (one sample per
      batch, original order).

  Returns:
    DataLoader yielding (input_ids, attention_mask, labels) tensors.

  Raises:
    KeyError: if a required column is missing from a row.
    ValueError: if a label value cannot be converted to float.
  """
  tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

  # Pick the label schema once from the column names. A per-row bare `except`
  # would also swallow conversion errors and re-raise them as a misleading
  # KeyError on the alternate column names.
  if 'Geography' in data.columns:
    label_cols = ['Geography', 'Entities', 'Time', 'Narrative', 'Overall', 'Style', 'Tone']
  else:
    label_cols = ['GEO', 'ENT', 'TIME', 'NAR', 'Overall', 'STYLE', 'TONE']

  input_ids, attention_masks, labels = [], [], []
  for _, row in data.iterrows():
    encode_dict = tokenizer(row['text1'], row['text2'],
                            max_length=512,
                            padding='max_length',
                            truncation=True,
                            add_special_tokens=True)
    input_ids.append(encode_dict['input_ids'])
    attention_masks.append(encode_dict['attention_mask'])
    labels.append([float(row[col]) for col in label_cols])

  dataset = TensorDataset(torch.tensor(input_ids),
                          torch.tensor(attention_masks),
                          torch.tensor(labels))
  if batch_size_flg:
    return DataLoader(dataset, batch_size=5, shuffle=True, drop_last=True)
  return DataLoader(dataset)

In [9]:
# Training data: shuffled mini-batches.
train_data_loader = get_data_loader(train, batch_size_flg=True)
# Development data: default DataLoader settings (one sample per batch, in order).
eval_data_loader = get_data_loader(dev, batch_size_flg=False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

# Model

In [7]:
class Custom_XLNet(nn.Module):
    """Regression head on top of a backbone that exposes `last_hidden_state`.

    The backbone's hidden states are averaged over the sequence dimension and
    passed through dropout -> Linear(hidden_size, 100) -> GELU -> dropout ->
    Linear(100, 7), giving seven outputs per example.

    Args:
        model: backbone module, called as `model(input_ids, attention_masks)`;
            its result must have a `last_hidden_state` attribute.
        hidden_size: size of the last dimension of `last_hidden_state`.
    """

    def __init__(self, model, hidden_size):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.reg_model = model
        self.fc1 = nn.Linear(hidden_size, 100)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(100, 7)
        self.activation = nn.GELU()

    def forward(self, input_ids, attention_masks):
        """Return the seven raw regression outputs for each input sequence."""
        hidden_states = self.reg_model(input_ids, attention_masks).last_hidden_state

        # Plain mean over dim 1: every position contributes equally; the
        # attention mask is not used to weight the average.
        pooled = self.dropout(hidden_states.mean(dim=1))

        projected = self.dropout(self.activation(self.fc1(pooled)))
        return self.fc2(projected)


In [8]:
def predict(model, data_loader):
  """Collect predicted and true 'Overall' scores for every sample in a loader.

  The model is switched to eval mode and run without gradient tracking.
  Works for any batch size: column 4 is sliced out of each (batch, 7) output
  instead of squeezing and indexing, which is only correct when every batch
  holds a single sample.

  Args:
    model: module called as `model(input_ids, attention_masks)` that returns
      a (batch, 7) tensor.
    data_loader: iterable of (input_ids, attention_masks, labels) batches,
      with labels of shape (batch, 7).

  Returns:
    (overall_pred, overall_true): two flat lists of Python floats, one entry
    per sample, in loader order.
  """
  model.eval()
  overall_pred, overall_true = [], []
  with torch.no_grad():
    for ids, att_msks, y in data_loader:
      ids, att_msks, y = ids.to(device), att_msks.to(device), y.to(device)
      y_pred = model(ids, att_msks)
      # Index 4 is the 'Overall' target in both predictions and labels.
      overall_pred.extend(y_pred[:, 4].cpu().tolist())
      overall_true.extend(y[:, 4].cpu().tolist())
  return overall_pred, overall_true

In [9]:
def weighted_loss(y_pred, y, criterion, loss_weights):
  """Weighted sum of per-target losses over the seven output columns.

  Args:
    y_pred: predictions, indexed as y_pred[:, k] for k in 0..6.
    y: targets, indexed the same way as y_pred.
    criterion: callable applied to one predicted column and its target column.
    loss_weights: sequence whose first seven entries weight the columns.

  Returns:
    The accumulated weighted loss (starting from 0.0).
  """
  total = 0.0
  for col in range(7):
    column_loss = criterion(y_pred[:, col], y[:, col])
    total = total + column_loss * loss_weights[col]
  return total

In [10]:
def train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, epochs):
  """Fine-tune `model` and keep the checkpoint with the best dev Pearson score.

  After every epoch the summed training loss and the Pearson correlation of
  the 'Overall' predictions on `eval_data_loader` are printed; whenever the
  correlation improves, the model's state_dict is written to `model_path`.

  Args:
    model: module called as `model(input_ids, attention_masks)`.
    model_path: file path the best state_dict is saved to.
    train_data_loader: iterable of (input_ids, attention_masks, labels) batches.
    eval_data_loader: loader passed to `predict` for the per-epoch evaluation.
    optimizer: optimizer already bound to the model's parameters.
    loss_weights: per-target weights forwarded to `weighted_loss`.
    epochs: number of passes over `train_data_loader`.
  """
  criterion = nn.MSELoss()
  # Start below any real correlation so the first valid epoch is always saved,
  # even when the score is not positive.
  best_pearson = float('-inf')
  for i in range(epochs):
    # predict() leaves the model in eval mode, so training mode (dropout on)
    # has to be restored at the start of every epoch, not just once.
    model.train()
    train_loss_sum = 0
    for ids, att_msks, y in train_data_loader:
      ids, att_msks, y = ids.to(device), att_msks.to(device), y.to(device)
      optimizer.zero_grad()
      # No squeeze: outputs and labels stay 2-D so column indexing in
      # weighted_loss also works for a batch of one.
      y_pred = model(ids, att_msks)
      loss = weighted_loss(y_pred, y, criterion, loss_weights)
      loss.backward()
      optimizer.step()
      train_loss_sum += loss.item()

    print(f"Loss at epoch {i}: {train_loss_sum:.4f}")

    # Determine best epoch model using correlation coefficient for Overall in dev data.
    eval_pred_overall, eval_true_overall = predict(model, eval_data_loader)
    curr_pearson = np.corrcoef(eval_pred_overall, eval_true_overall)[0][1]
    print(curr_pearson)
    # A NaN score compares False here and therefore never replaces the best one.
    if curr_pearson > best_pearson:
      best_pearson = curr_pearson
      torch.save(model.state_dict(), model_path)

In [11]:
# Optimisation settings.
lr = 5e-6
weight_decay = 1e-4
num_epochs = 40

# Data / model sizes.
batch_size = 5
hidden_size = 200

In [15]:
# Load the pre-trained XLNet backbone and wrap it with the regression head.
xlnet_model = XLNetModel.from_pretrained("xlnet-base-cased")
# The head's input width must equal the backbone's hidden width, so read it
# from the loaded model's config rather than from a hand-set hidden_size.
model = Custom_XLNet(xlnet_model, xlnet_model.config.d_model)
model = model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# AdamW over every parameter of the wrapped model (backbone and head).
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

In [17]:
# Release cached GPU memory before building the model for this run.
torch.cuda.empty_cache()

# Fresh backbone; the head's input width is read from the XLNet config.
pre_trained_model = XLNetModel.from_pretrained("xlnet-base-cased")
config = XLNetConfig.from_pretrained("xlnet-base-cased")
hidden_size = config.d_model

# Half of the loss weight goes to 'Overall' (index 4); the other half is split
# evenly across the remaining six targets.
loss_weights = [(1-0.5)/6] * 4 + [0.5] + [(1-0.5)/6] * 2

model = Custom_XLNet(pre_trained_model, hidden_size)
model = model.to(device)

# Checkpoint location on the mounted Drive.
model_name = 'fine_tune_xlnet.pth'
model_path = '/content/drive/MyDrive/ColabNotebooks/263/263 Final Project/Data/' + model_name

print(f"Model name for this run: {model_name}")

optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, num_epochs)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model name for this run: fine_tune_xlnet.pth
Loss at epoch 0: 461.4909
0.37178275961452145
Loss at epoch 1: 123.6280
0.6429607934347881
Loss at epoch 2: 79.2136
0.6607265001821282
Loss at epoch 3: 56.5625
0.6831153919327043
Loss at epoch 4: 46.0371
0.6620085203382793
Loss at epoch 5: 40.9728
0.6585884606514403
Loss at epoch 6: 36.8828
0.6733973407705407
Loss at epoch 7: 33.8500
0.664166779116083
Loss at epoch 8: 31.0417
0.6570960774991056
Loss at epoch 9: 28.8431
0.6544210339110477
Loss at epoch 10: 26.9055
0.6640825841132925
Loss at epoch 11: 24.8595
0.653430782531102
Loss at epoch 12: 22.8497
0.6466568459003637
Loss at epoch 13: 22.1889
0.6626442101154588
Loss at epoch 14: 20.2911
0.656214097712456
Loss at epoch 15: 17.3339
0.6545622627943518
Loss at epoch 16: 16.5404
0.6607005323975308
Loss at epoch 17: 14.3324
0.6674618307908453
Loss at epoch 18: 12.2916
0.6483375901578241
Loss at epoch 19: 10.7885
0.6748056915565661
Loss at epoch 20: 10.6809
0.6486648730991834
Loss at epoch 21: 9.

# Evaluate

In [12]:
# Evaluation loader for the first 500 evaluation rows. batch_size_flg=False
# gives one sample per batch in the original order (no shuffling, nothing
# dropped), the same way the dev loader is built.
test_data_loader = get_data_loader(evl[0:500], False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [None]:
# Rebuild the architecture, then restore the fine-tuned weights saved by train().
config = XLNetConfig.from_pretrained("xlnet-base-cased")
hidden_size = config.d_model

pre_trained_model = XLNetModel.from_pretrained("xlnet-base-cased")
model = Custom_XLNet(pre_trained_model, hidden_size)
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one.
# Strict loading (the default) raises on missing or unexpected keys instead of
# silently evaluating a partly-initialised model.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/ColabNotebooks/263/263 Final Project/Data/fine_tune_xlnet.pth",
               map_location=device)
)
model.to(device)

# Pearson correlation between predicted and true 'Overall' scores.
test_pred_overall, test_true_overall = predict(model, test_data_loader)
test_pearson_score = np.corrcoef(test_pred_overall, test_true_overall)[0][1]

print("Pearson score on test dataset is {:.3f}".format(test_pearson_score))


Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
