# Setup

## Packages

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
import csv
import pandas as pd
from google.colab import drive
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification, BertModel, BertConfig, AdamW

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW

from sklearn.model_selection import train_test_split, ParameterGrid

from scipy.spatial.distance import cosine
from scipy.stats import pearsonr

In [None]:
# Mount Google Drive into the Colab VM, then change the working directory to
# the project data folder so the relative CSV file names used below resolve.
drive.mount('/content/drive')
# Data folder, relative to the Drive root ("My Drive").
FOLDERNAME = 'ColabNotebooks/263/263 Final Project/Data'
# NOTE: this first %cd is relative to the current working directory, so the
# cell is not safe to re-run once the directory has already been changed.
%cd drive/My\ Drive
%cd $FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive
/content/drive/My Drive/ColabNotebooks/263/263 Final Project/Data


In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Dataset

In [None]:
# Training pairs: read the CSV and discard every row that has a NaN in ANY
# column (dropna() without `subset` is not limited to 'text1' / 'text2').
train = pd.read_csv("train_df.csv").dropna()

# Evaluation pairs are loaded as-is; no rows are dropped here.
dev = pd.read_csv("evaluation_df.csv")

In [None]:
# Number of training rows remaining after the NaN rows were dropped.
len(train)

4842

In [None]:
# Shift every annotated score column of the training frame down by one.
# NOTE: this mutates `train` in place, so running the cell twice shifts twice.
train_score_columns = ['Geography', 'Entities', 'Time', 'Narrative', 'Overall', 'Style', 'Tone']
train[train_score_columns] = train[train_score_columns] - 1

In [None]:
# Shift every annotated score column of the evaluation frame down by one
# (the evaluation file uses different column names than the training file).
# NOTE: this mutates `dev` in place, so running the cell twice shifts twice.
dev_score_columns = ['GEO', 'ENT', 'TIME', 'NAR', 'Overall', 'STYLE', 'TONE']
dev[dev_score_columns] = dev[dev_score_columns] - 1

In [None]:
def rescale_inverted(old_value, old_min=0, old_max=3, new_min=1, new_max=-1):
    """Linearly map ``old_value`` from [old_min, old_max] onto the flipped target range.

    The mapping is reversed with respect to the argument names: ``old_min`` is
    sent to ``new_max`` and ``old_max`` is sent to ``new_min``.

    Args:
        old_value: Value (scalar or array-like supporting arithmetic) to rescale.
        old_min: Lower end of the source range.
        old_max: Upper end of the source range; must differ from ``old_min``.
        new_min: Target value that ``old_max`` is mapped to.
        new_max: Target value that ``old_min`` is mapped to.

    Returns:
        The rescaled value.

    Note:
        With the default ``new_min=1, new_max=-1`` the result runs from -1
        (at ``old_min``) to 1 (at ``old_max``); pass ``new_min=-1, new_max=1``
        to obtain 1 at ``old_min`` and -1 at ``old_max``.
    """
    # Position of the value inside the source range, 0 at old_min and 1 at old_max.
    fraction = (old_value - old_min) / (old_max - old_min)
    # Signed width of the target range; the offset is new_max, hence the flip.
    span = new_min - new_max
    return fraction * span + new_max

In [None]:
# Map the shifted 'Overall' score onto the flipped range: 0 becomes 1 and
# 3 becomes -1. The function is pure arithmetic, so it is applied to the
# whole column at once.
overall_rescale_kwargs = dict(old_min=0, old_max=3, new_min=-1, new_max=1)

train['Rescaled_Overall'] = rescale_inverted(train['Overall'], **overall_rescale_kwargs)
dev['Rescaled_Overall'] = rescale_inverted(dev['Overall'], **overall_rescale_kwargs)

In [None]:
# Hold out a validation subset of the training pairs (default split size).
# A fixed random_state makes the split identical on every Restart & Run All.
feature_columns = ['text1', 'text2']
target_columns = ['Geography', 'Entities', 'Time', 'Narrative', 'Rescaled_Overall', 'Style', 'Tone']
X_train, X_val, y_train, y_val = train_test_split(
    train[feature_columns],
    train[target_columns],
    random_state=42,
)

In [None]:
# Training targets as a float tensor. The target frame contains the continuous
# 'Rescaled_Overall' column; an integer dtype would truncate those values
# toward zero, so a floating-point dtype is required.
labels = torch.tensor(y_train.values, dtype=torch.float32)

# BERT (Head)

## Train

### Tokenize (Head)

In [None]:
# Load the WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
# The files are downloaded on first use (see the progress output of this cell).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Pull both text columns of the training frame out as plain Python lists.
text1_sentences = list(train['text1'])
text2_sentences = list(train['text2'])

# Encode each side separately. padding=True pads every sequence to the longest
# one in its list, truncation=True cuts over-long inputs to the model limit,
# and return_tensors="pt" yields PyTorch tensors.
text1_inputs = tokenizer(
    text1_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
text2_inputs = tokenizer(
    text2_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)


In [None]:
# Wrap the (input_ids, attention_mask) tensors of each text side in its own
# TensorDataset; item i of both datasets belongs to training row i.
dataset1, dataset2 = (
    TensorDataset(encoded['input_ids'], encoded['attention_mask'])
    for encoded in (text1_inputs, text2_inputs)
)


In [None]:
# Set batch size
batch_size = 32

# Create DataLoaders.
# shuffle must stay False: the two loaders are iterated in lockstep with zip(),
# and the resulting similarities are written back to `train` positionally.
# Shuffling each loader independently would pair unrelated text1/text2 rows
# and scramble the row order of the results.
dataloader1 = DataLoader(dataset1, batch_size=batch_size, shuffle=False)
dataloader2 = DataLoader(dataset2, batch_size=batch_size, shuffle=False)

### Pretrained-Model

In [None]:
# Load the pretrained 'bert-base-uncased' encoder and move it to `device`.
# BertModel does not use the checkpoint's `cls.*` pre-training head weights,
# which is what the "weights ... were not used" message in the output reports.
model = BertModel.from_pretrained('bert-base-uncased').to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Cosine similarity between the [CLS] embeddings of text1 and text2, one value
# per row, collected in loader order.
# NOTE: the pairing is only meaningful when dataloader1 and dataloader2 yield
# rows in the same (unshuffled) order.
similarities = []

for batch1, batch2 in zip(dataloader1, dataloader2):
    # Move each batch to the target device. Distinct names are used so the
    # tokenizer outputs `text1_inputs` / `text2_inputs` are not overwritten
    # and the earlier dataset cells stay re-runnable.
    batch1_inputs = {key: val.to(device) for key, val in zip(['input_ids', 'attention_mask'], batch1)}
    batch2_inputs = {key: val.to(device) for key, val in zip(['input_ids', 'attention_mask'], batch2)}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Position 0 of the last hidden state is the [CLS] token embedding.
        cls1_embeddings = model(**batch1_inputs).last_hidden_state[:, 0, :]
        cls2_embeddings = model(**batch2_inputs).last_hidden_state[:, 0, :]

    # Row-wise cosine similarity for the whole batch, followed by a single
    # device-to-host transfer instead of two per sentence pair.
    batch_similarities = nn.functional.cosine_similarity(cls1_embeddings, cls2_embeddings, dim=1)
    similarities.extend(batch_similarities.cpu().tolist())

KeyboardInterrupt: ignored

In [None]:
# Attach the similarity scores as a new column. The list is assigned
# positionally, so it must be in the same row order as `train`.
train['similarities'] = similarities

In [None]:
# Sanity check: number of similarity scores computed (should equal len(train)).
len(similarities)

4842

### Pearson correlation

In [None]:
# Pearson correlation between the embedding similarity and the rescaled gold
# score; pearsonr returns (statistic, p-value) and only the statistic is kept.
pearson_train = pearsonr(train['similarities'], train['Rescaled_Overall'])
correlation = pearson_train[0]

# Store the (single, dataset-level) coefficient; it is broadcast to every row.
train['correlation'] = correlation

In [None]:
# `correlation` is already a single scalar, so it can be printed directly.
print(correlation)

0.01879360899159983


In [None]:
# Persist the training frame, including the added 'similarities' and
# 'correlation' columns, into the current working directory (no index column).
train.to_csv('train_similarities.csv', index=False)

## Evaluation

### Tokenize (Head)

In [None]:
# Pull both text columns of the evaluation frame out as plain Python lists.
text1_sentences_dev = list(dev['text1'])
text2_sentences_dev = list(dev['text2'])

# Encode each side separately with the same settings as the training data:
# pad to the longest sequence in the list, truncate over-long inputs, and
# return PyTorch tensors.
text1_inputs_dev = tokenizer(
    text1_sentences_dev,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
text2_inputs_dev = tokenizer(
    text2_sentences_dev,
    padding=True,
    truncation=True,
    return_tensors="pt",
)


In [None]:
# Create tensor datasets separately; item i of both datasets belongs to
# evaluation row i.
dataset1_dev = TensorDataset(text1_inputs_dev['input_ids'], text1_inputs_dev['attention_mask'])
dataset2_dev = TensorDataset(text2_inputs_dev['input_ids'], text2_inputs_dev['attention_mask'])

# Set batch size
batch_size = 32

# Create DataLoaders.
# shuffle must stay False: the two loaders are iterated in lockstep with zip(),
# and the resulting similarities are written back to `dev` positionally.
# Shuffling each loader independently would pair unrelated text1/text2 rows
# and scramble the row order of the results.
dataloader1_dev = DataLoader(dataset1_dev, batch_size=batch_size, shuffle=False)
dataloader2_dev = DataLoader(dataset2_dev, batch_size=batch_size, shuffle=False)

### Pretrained-Model

In [None]:
# Initialize the BERT model.
# NOTE: this loads the same 'bert-base-uncased' checkpoint as the model cell
# in the training section, so the evaluation section can run on its own.
model = BertModel.from_pretrained('bert-base-uncased').to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Cosine similarity between the [CLS] embeddings of text1 and text2 for the
# evaluation set, one value per row, collected in loader order.
# NOTE: the pairing is only meaningful when dataloader1_dev and
# dataloader2_dev yield rows in the same (unshuffled) order.
# NOTE: this rebinds `similarities`, replacing the training-set list.
similarities = []

for batch1, batch2 in zip(dataloader1_dev, dataloader2_dev):
    # Move each batch to the target device. Distinct names are used so the
    # tokenizer outputs `text1_inputs` / `text2_inputs` of the training
    # section are not overwritten.
    batch1_inputs = {key: val.to(device) for key, val in zip(['input_ids', 'attention_mask'], batch1)}
    batch2_inputs = {key: val.to(device) for key, val in zip(['input_ids', 'attention_mask'], batch2)}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Position 0 of the last hidden state is the [CLS] token embedding.
        cls1_embeddings = model(**batch1_inputs).last_hidden_state[:, 0, :]
        cls2_embeddings = model(**batch2_inputs).last_hidden_state[:, 0, :]

    # Row-wise cosine similarity for the whole batch, followed by a single
    # device-to-host transfer instead of two per sentence pair.
    batch_similarities = nn.functional.cosine_similarity(cls1_embeddings, cls2_embeddings, dim=1)
    similarities.extend(batch_similarities.cpu().tolist())

In [None]:
# Sanity check: number of similarity scores computed for the evaluation set.
len(similarities)

4902

In [None]:
# Number of evaluation rows; should match the number of similarity scores.
len(dev)

4902

In [None]:
# Attach the similarity scores as a new column. The list is assigned
# positionally, so it must be in the same row order as `dev`.
dev['similarities'] = similarities

### Pearson correlation

In [None]:
# Pearson correlation between the embedding similarity and the rescaled gold
# score; pearsonr returns (statistic, p-value) and only the statistic is kept.
pearson_dev = pearsonr(dev['similarities'], dev['Rescaled_Overall'])
correlation = pearson_dev[0]

# Store the (single, dataset-level) coefficient; it is broadcast to every row.
dev['Rescaled_correlation'] = correlation

In [None]:
# `correlation` is already a single scalar, so it can be printed directly.
print(correlation)

-0.010094246077528821


In [None]:
# Persist the evaluation frame, including the added 'similarities' and
# 'Rescaled_correlation' columns, into the current working directory.
dev.to_csv('dev_similarities.csv', index=False)