# Packages

In [1]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [2]:
import csv
import pandas as pd
from google.colab import drive
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification, BertModel, BertConfig, AdamW, AutoTokenizer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW

from sklearn.model_selection import train_test_split, ParameterGrid

from scipy.spatial.distance import cosine
from scipy.stats import pearsonr

In [3]:
drive.mount('/content/drive')
FOLDERNAME = 'ColabNotebooks/263/263 Final Project/Data'
%cd drive/My\ Drive
%cd $FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive
/content/drive/My Drive/ColabNotebooks/263/263 Final Project/Data


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dataset

In [11]:
train = pd.read_csv("train_df.csv")
evl = pd.read_csv("evaluation_df.csv")

# Drop rows with NaN values in 'text1' or 'text2'
train = train.dropna()

In [None]:
processed_data = train[train['text1'].notna()]
processed_data = processed_data[processed_data['text2'].notna()]

In [None]:
# split into train and development.
train, dev = train_test_split(processed_data[0:1000], test_size=0.2, random_state = 42)

# Encoding

In [5]:
def get_data_loader(data, batch_size_flg = True):
  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # change the tokenizer to bert
  input_ids, attention_masks, labels = [], [], []
  for idx, row in data.iterrows():
      text1, text2 = row['text1'], row['text2']
      encode_dict = tokenizer(text1,text2,
                                  max_length=512,
                                  padding='max_length',
                                  truncation=True,
                                  add_special_tokens=True)

      input_ids.append(encode_dict['input_ids'])
      attention_masks.append(encode_dict['attention_mask'])

      # Convert to only 1 label.
      try:
        labels.append([float(x) for x in [row['Geography'],row['Entities'],row['Time'],row['Narrative'],row['Overall'],row['Style'],row['Tone']]])
      except:
        labels.append([float(x) for x in [row['GEO'],row['ENT'],row['TIME'],row['NAR'],row['Overall'],row['STYLE'],row['TONE']]])

  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)
  labels = torch.tensor(labels)

  data = TensorDataset(input_ids, attention_masks, labels)
  if(batch_size_flg):
      data_loader = DataLoader(data, batch_size=5, shuffle=True, drop_last=True)
  else:
      data_loader = DataLoader(data)
  return data_loader

In [None]:
train_data_loader = get_data_loader(train)
eval_data_loader = get_data_loader(dev, False)

# Model

In [6]:
class Custom_Bert(nn.Module): # change the model name to Custom_Bert
    def __init__(self, model, hidden_size):
        super(Custom_Bert, self).__init__() # change the model name to Custom_Bert
        self.reg_model = model
        self.fc1 = nn.Linear(hidden_size, 100)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(100, 7) # Currently processes the 7 labels that we have defined for 7 output types.
        self.activation = nn.GELU()

    def forward(self, input_ids, attention_masks):
        output1 = self.reg_model(input_ids, attention_masks)[1]
        output1 = self.dropout(output1)
        output2 = self.activation(self.fc1(output1))
        output2 = self.dropout(output2)
        # x = self.dropout(x)
        logits1 = self.fc2(output2)
        return logits1

In [7]:
def predict(model, data_loader):
  model.eval()
  overall_pred, overall_true = [], []
  with torch.no_grad():
    for idx, (ids, att_msks, y) in enumerate(data_loader):
      ids, att_msks, y = ids.to(device), att_msks.to(device), y.to(device)
      y_pred = model(ids, att_msks)
      y_pred, y = torch.squeeze(y_pred).cpu().numpy().tolist(), torch.squeeze(y).cpu().numpy().tolist()
      overall_pred.append(y_pred[4])
      overall_true.append(y[4])
  return overall_pred, overall_true

In [8]:
def weighted_loss( y_pred, y, criterion, loss_weights):
  loss = 0.0
  for i in range(7):
    y_pred_i, y_i = y_pred[:, i], y[:, i]
    loss += criterion(y_pred_i, y_i) * loss_weights[i]
  return loss

In [9]:
def train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, epochs):
  model.train()
  criterion = nn.MSELoss()
  best_pearson = 0
  for i in range(epochs):
    train_loss_sum = 0
    for idx, (ids, att_msks, y) in enumerate(train_data_loader):
      ids, att_msks, y = ids.to(device), att_msks.to(device), y.to(device)
      optimizer.zero_grad()
      y_pred = model(ids, att_msks)
      y_pred, y = torch.squeeze(y_pred), torch.squeeze(y)
      loss = weighted_loss(y_pred, y, criterion, loss_weights)
      loss.backward()
      optimizer.step()
      train_loss_sum += loss.item()

    print(f"Loss at epoch {i}: {train_loss_sum:.4f}")

    # Determine best epoch model using correlation coefficient for Overall in dev data.
    eval_pred_overall, eval_true_overall = predict(model, eval_data_loader)
    curr_pearson = np.corrcoef(eval_pred_overall, eval_true_overall)[0][1]
    print(curr_pearson)
    if curr_pearson > best_pearson:
      best_pearson = curr_pearson
      torch.save(model.state_dict(), model_path)

In [12]:
batch_size = 5
lr = 5e-6
weight_decay = 1e-4
num_epochs = 40
hidden_size = 200

In [None]:
bert_model = BertModel.from_pretrained("bert-base-uncased") # instantiate a bert model
model = Custom_Bert(bert_model, hidden_size) # change the model instantiation to Custom_Bert and hidden_size to 768
model = model.to(device)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

NameError: ignored

In [None]:
# Run model finetuning and save fine-tuned model.
torch.cuda.empty_cache()

pre_trained_model = BertModel.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained("bert-base-uncased")
hidden_size = config.hidden_size

loss_weights = [0.5 if i == 4 else (1-0.5)/6 for i in range(7)]

model = Custom_Bert(pre_trained_model, hidden_size)
model.to(device)

model_name = f'fine_tune_bert.pth'
model_path = f'/content/drive/MyDrive/ColabNotebooks/263/263 Final Project/Data/{model_name}'

print(f"Model name for this run: {model_name}")

optimizer = AdamW(model.parameters(), lr=lr, weight_decay = weight_decay)
train(model, model_path, train_data_loader, eval_data_loader, optimizer, loss_weights, num_epochs)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model name for this run: fine_tune_bert.pth
Loss at epoch 0: 798.9033
0.04622499478890122
Loss at epoch 1: 354.4563
0.022973339601496952
Loss at epoch 2: 219.0531
0.6597725517638584
Loss at epoch 3: 138.3551
0.662869437520367
Loss at epoch 4: 87.6095
0.6720929193585676
Loss at epoch 5: 63.2461
0.6762691209698487
Loss at epoch 6: 51.0267
0.6826924359781372
Loss at epoch 7: 46.0931
0.675778538173513
Loss at epoch 8: 42.1566
0.682523449108444
Loss at epoch 9: 39.2777
0.6852074510580997
Loss at epoch 10: 37.2634
0.6881634255213799
Loss at epoch 11: 35.8171
0.6862303955842604
Loss at epoch 12: 35.1425
0.6883087550166948
Loss at epoch 13: 34.3026
0.6823126008776883
Loss at epoch 14: 33.7007
0.6885819698490511
Loss at epoch 15: 33.2419
0.6869325635265526
Loss at epoch 16: 31.7058
0.6960562866638933
Loss at epoch 17: 31.1189
0.6954562073232964
Loss at epoch 18: 29.9864
0.6868891257201376
Loss at epoch 19: 29.8756
0.6966719609767416
Loss at epoch 20: 29.2578
0.6852550493103785
Loss at epoch 21:

# Evaluate

In [13]:
test_data_loader = get_data_loader(evl)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
config = BertConfig.from_pretrained("bert-base-uncased")
hidden_size = config.hidden_size

pre_trained_model = BertModel.from_pretrained("bert-base-uncased")
model = Custom_Bert(pre_trained_model, hidden_size)
model.load_state_dict(torch.load("/content/drive/MyDrive/ColabNotebooks/263/263 Final Project/Data/fine_tune_bert.pth"), strict=False)
model.to(device)

test_pred_overall, test_true_overall = predict(model, test_data_loader)
test_pearson_score = np.corrcoef(test_pred_overall, test_true_overall)[0][1]

print("Pearson score on test dataset is {:.3f}".format(test_pearson_score))

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pearson score on test dataset is 0.723


  c /= stddev[:, None]
  c /= stddev[None, :]
