In [None]:
!pip install transformers
!pip install torch
import transformers
from transformers import AutoTokenizer, AutoModel
import torch
import os
import re
import torch.nn.functional as F
import nltk
import pickle
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
import string
import random
from collections import deque
from collections import defaultdict
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.metrics.pairwise import cosine_similarity

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.7 MB/s[0m eta [36m0:00:0

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# loading SCWS dataset
def load_ratings_file(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            entry = {
                'id': parts[0],
                'word1': parts[1],
                'POS_word1': parts[2],
                'word2': parts[3],
                'POS_word2': parts[4],
                'word1_in_context': parts[5],
                'word2_in_context': parts[6],
                'average_rating': float(parts[7]),
                'individual_ratings': [float(rating) for rating in parts[8].split()]
            }
            data.append(entry)
    return data
# from file
file_path = 'ratings.txt'
ratings_data = load_ratings_file(file_path)

In [None]:
# sentences and words to use
context_word1_sentences = [entry['word1_in_context'] for entry in ratings_data]
context_word2_sentences = [entry['word2_in_context'] for entry in ratings_data]
word1_list = [entry['word1'] for entry in ratings_data]
word2_list = [entry['word2'] for entry in ratings_data]

In [None]:
# Initializing the model (BERT) + tokenizer
device = torch.device('cuda')
model_name = "bert-base-uncased"

model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
model.eval()

# Define selected layers for combination
selected_layers = [8, 9, 10, 11, 12]  # Layers 8 to 11
embedding_word1_list = []  # empty list to store embeddings for each word in word1

# process each sentence and word in the lists
for sentence, word1 in zip(context_word1_sentences, word1_list):
    # tokenize the sentence and 'word1', and get token IDs and attention masks
    encoded_dict_sentence1 = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    # moving input tensors to GPU
    encoded_dict_sentence1 = {key: value.to('cuda') for key, value in encoded_dict_sentence1.items()}
    # Get embeddings from BERT model
    with torch.no_grad():
        model_outputs_sentence1 = model(**encoded_dict_sentence1, output_hidden_states=True)
        all_hidden_states_sentence1 = model_outputs_sentence1.hidden_states

    # Access tuple elements for each layer
    selected_layer_tensors = [layer for i, layer in enumerate(all_hidden_states_sentence1) if i in selected_layers]

    # Extract the target word's token IDs based on the tokenization
    target_word_tokens = tokenizer.tokenize(word1)
    target_word_token_ids = tokenizer.convert_tokens_to_ids(target_word_tokens)

    # Find the indices in the sentence tokens that correspond to the target word tokens
    target_indices = [i for i, token_id in enumerate(encoded_dict_sentence1['input_ids'][0]) if token_id in target_word_token_ids]

    # Combine selected layers' embeddings
    selected_layer_embeddings = torch.cat([layer[:, target_indices, :] for layer in selected_layer_tensors], dim=-1)

    # Calculate the mean embedding for the target word
    mean_target_embedding = torch.mean(selected_layer_embeddings, dim=1, keepdim=True)

    # Append the mean embedding to the list
    embedding_word1_list.append(mean_target_embedding)

# Converting the list of embeddings to a tensor
embedding_word1_batched = torch.cat(embedding_word1_list, dim=0)

# Checking shape to make sure it's [num_word1, 1, 768]
print("Shape of embedding_word1 tensor:", embedding_word1_batched.shape)

# Checking the number of embeddings (num_word1 words)
num_word1 = len(embedding_word1_list)
print("Number of word1 embeddings:", num_word1)

Shape of embedding_word1 tensor: torch.Size([2003, 1, 3840])
Number of word1 embeddings: 2003


In [None]:
model.eval()

# Define selected layers for combination
selected_layers = [8, 9, 10, 11, 12]  # Layers 8 to 11
embedding_word2_list = []  # empty list to store embeddings for each word in word2

# process each sentence and word in the lists
for sentence, word2 in zip(context_word2_sentences, word2_list):
    # tokenize the sentence and 'word2', and get token IDs and attention masks
    encoded_dict_sentence2 = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    # moving input tensors to GPU
    encoded_dict_sentence2 = {key: value.to('cuda') for key, value in encoded_dict_sentence2.items()}
    # Get embeddings from BERT model
    with torch.no_grad():
        model_outputs_sentence2 = model(**encoded_dict_sentence2, output_hidden_states=True)
        all_hidden_states_sentence2 = model_outputs_sentence2.hidden_states

    # Access tuple elements for each layer
    selected_layer_tensors = [layer for i, layer in enumerate(all_hidden_states_sentence2) if i in selected_layers]

    # Extract the target word's token IDs based on the tokenization
    target_word_tokens = tokenizer.tokenize(word2)
    target_word_token_ids = tokenizer.convert_tokens_to_ids(target_word_tokens)

    # Find the indices in the sentence tokens that correspond to the target word tokens
    target_indices = [i for i, token_id in enumerate(encoded_dict_sentence2['input_ids'][0]) if token_id in target_word_token_ids]

    # Combine selected layers' embeddings
    selected_layer_embeddings = torch.cat([layer[:, target_indices, :] for layer in selected_layer_tensors], dim=-1)

    # Calculate the mean embedding for the target word
    mean_target_embedding = torch.mean(selected_layer_embeddings, dim=1, keepdim=True)

    # Append the mean embedding to the list
    embedding_word2_list.append(mean_target_embedding)

# Converting the list of embeddings to a tensor
embedding_word2_batched = torch.cat(embedding_word2_list, dim=0)

# Checking shape to make sure it's [num_word2, 1, 768]
print("Shape of embedding_word2 tensor:", embedding_word2_batched.shape)

# Checking the number of embeddings (num_word2 words)
num_word2 = len(embedding_word2_list)
print("Number of word2 embeddings:", num_word2)

Shape of embedding_word2 tensor: torch.Size([2003, 1, 3840])
Number of word2 embeddings: 2003


In [None]:
# cosine similarity between each word pair
cosine_similarity_scores = []
for emb1, emb2 in zip(embedding_word1_batched, embedding_word2_batched):
    similarity_score = torch.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0))
    cosine_similarity_scores.append(similarity_score.mean().item())  # using mean()to get scalar value

for i, score in enumerate(cosine_similarity_scores):
    print("Word Pair {}: Cosine Similarity Score: {:.4f}".format(i+1, score))

Word Pair 1: Cosine Similarity Score: 0.1609
Word Pair 2: Cosine Similarity Score: 0.0750
Word Pair 3: Cosine Similarity Score: 0.1000
Word Pair 4: Cosine Similarity Score: 0.4240
Word Pair 5: Cosine Similarity Score: 0.4458
Word Pair 6: Cosine Similarity Score: 0.1422
Word Pair 7: Cosine Similarity Score: 0.5443
Word Pair 8: Cosine Similarity Score: 0.1771
Word Pair 9: Cosine Similarity Score: 0.4240
Word Pair 10: Cosine Similarity Score: 0.1391
Word Pair 11: Cosine Similarity Score: 0.2625
Word Pair 12: Cosine Similarity Score: 0.3281
Word Pair 13: Cosine Similarity Score: 0.1385
Word Pair 14: Cosine Similarity Score: 0.3771
Word Pair 15: Cosine Similarity Score: 0.1724
Word Pair 16: Cosine Similarity Score: 0.3734
Word Pair 17: Cosine Similarity Score: 0.2854
Word Pair 18: Cosine Similarity Score: 0.1667
Word Pair 19: Cosine Similarity Score: 0.4380
Word Pair 20: Cosine Similarity Score: 0.3698
Word Pair 21: Cosine Similarity Score: 0.2635
Word Pair 22: Cosine Similarity Score: 0.27

In [None]:
# calculating spearman rho between avg. human rating and cosine scores

average_human_ratings = [entry['average_rating'] for entry in ratings_data]

spearman_rho, p_value = stats.spearmanr(average_human_ratings, cosine_similarity_scores)

print("Spearman's Rank Correlation Coefficient (rho): {:.4f}".format(spearman_rho))
print("p-value: {:.4f}".format(p_value))


Spearman's Rank Correlation Coefficient (rho): 0.6710
p-value: 0.0000
