# Backpack

In [None]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Hub identifier of the Backpack language model checkpoint.
model_id = "stanfordnlp/backpack-gpt2"

# trust_remote_code=True lets transformers execute the modelling code that is
# shipped inside the Hub repository for this checkpoint.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# nn.Module.eval() returns the module itself, so the model can be switched to
# evaluation mode in the same expression that loads it.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True,
).eval()

In [17]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
import torch
import os
import numpy as np
import torch.nn.functional as F
from scipy.stats import spearmanr

In [12]:
# GPT-2 tokenizer shared by the similarity cells below. '<pad>' is registered
# as the padding token, although the visible cells only tokenize one word at a
# time and never request padding.
tokenizer = GPT2Tokenizer.from_pretrained(
    "openai-community/gpt2",
    pad_token="<pad>",
)

In [13]:
def simm(w1, w2):
    """Score the similarity of two words from their Backpack sense vectors.

    Each word is tokenized, looked up in the Backpack word-embedding table and
    fed through the sense network. The vectors belonging to the word's first
    token are L2-normalised, a cosine similarity with the other word is
    computed row by row, and the smallest of those similarities is returned.

    Relies on the notebook-level ``tokenizer`` and ``model``; ``model`` must be
    the Backpack checkpoint (it needs a ``backpack`` attribute).

    Args:
        w1: First word as a string. The evaluation cells prepend a space
            before calling.
        w2: Second word, same convention as ``w1``.

    Returns:
        A zero-dimensional tensor holding the minimum row-wise cosine
        similarity. Note that this is a tensor, not a Python float.
    """
    def _first_token_senses(word):
        # Unit-length vectors for the first sub-token of ``word``.
        ids = tokenizer(word, return_tensors='pt')['input_ids']
        senses = model.backpack.sense_network(model.backpack.word_embeddings(ids))
        # Assumed layout: (batch, sense, token, hidden) -- TODO confirm against
        # the Backpack modelling code. Index 0 on the third axis keeps only the
        # first token, so any further sub-tokens of a multi-token word are
        # ignored.
        senses = senses[:, :, 0, :]
        # Only the batch axis (size 1: a single string was tokenized) is
        # dropped; a blanket squeeze() would also collapse any other axis that
        # happened to have size 1 and break the dim=1 operations below.
        senses = senses.squeeze(0)
        return F.normalize(senses, p=2, dim=1)

    # Pure inference: no_grad avoids building an autograd graph that would be
    # thrown away immediately, which replaces the per-tensor detach() calls.
    with torch.no_grad():
        senses1 = _first_token_senses(w1)
        senses2 = _first_token_senses(w2)
        # Rows are unit vectors, so the row-wise dot product is the cosine.
        sim_list = torch.sum(senses1 * senses2, dim=1)
    return sim_list.min()

## SIMVERB

In [14]:
# Fresh accumulators for the SimVerb run: the two words of every pair (stored
# with a leading space by the loading cell), the human similarity ratings and
# the model similarity scores.
word1, word2, h_score, m_score = [], [], [], []

In [15]:
# Read the SimVerb-3500 pairs. Every row is tab-separated: column 0 and
# column 1 hold the two words and column 3 is parsed as the human score.
# The encoding is pinned so the read does not depend on the locale default.
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimVerb-3500.txt', 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        # A blank line (for example a trailing newline at the end of the file)
        # has no columns to index and would raise IndexError, so skip it.
        if not stripped:
            continue
        columns = stripped.split('\t')

        # The words are stored with a leading space -- presumably so the GPT-2
        # BPE tokenizer encodes them like words inside a sentence.
        word1.append(" " + columns[0])
        word2.append(" " + columns[1])
        h_score.append(float(columns[3]))


In [16]:
# Score every SimVerb pair with the current simm(); results are appended in
# file order so m_score stays aligned with h_score.
m_score.extend(simm(first, second) for first, second in zip(word1, word2))

In [18]:
# Spearman rank correlation between the human ratings and the model scores.
# spearmanr returns the correlation together with a p-value; the p-value is
# discarded here.
spearman_corr, _ = spearmanr(h_score, m_score)
# Bare expression so the notebook echoes the correlation as the cell output.
spearman_corr

0.4468468531403537

## SIMLEX

In [19]:
# Fresh accumulators for the SimLex run: the two words of every pair (stored
# with a leading space by the loading cell), the human similarity ratings and
# the model similarity scores.
word1, word2, h_score, m_score = [], [], [], []

In [20]:
# Read the SimLex-999 pairs. The first line is a header; every following row
# is tab-separated with the two words in columns 0 and 1 and the value parsed
# as the human score in column 3. The encoding is pinned so the read does not
# depend on the locale default.
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimLex-999.txt', 'r', encoding='utf-8') as file:
    # Consume the header row; the default keeps an empty file from raising
    # StopIteration (header is then None and the loop simply does nothing).
    header = next(file, None)
    for line in file:
        stripped = line.strip()
        # A blank line (for example a trailing newline at the end of the file)
        # has no columns to index and would raise IndexError, so skip it.
        if not stripped:
            continue
        columns = stripped.split('\t')

        # The words are stored with a leading space -- presumably so the GPT-2
        # BPE tokenizer encodes them like words inside a sentence.
        word1.append(" " + columns[0])
        word2.append(" " + columns[1])
        h_score.append(float(columns[3]))


In [21]:
# Score every SimLex pair with the current simm(); results are appended in
# file order so m_score stays aligned with h_score.
m_score.extend(simm(first, second) for first, second in zip(word1, word2))

In [22]:
# Spearman rank correlation between the human ratings and the model scores.
# spearmanr returns the correlation together with a p-value; the p-value is
# discarded here.
spearman_corr, _ = spearmanr(h_score, m_score)
# Bare expression so the notebook echoes the correlation as the cell output.
spearman_corr

0.5396491328014148

# GPT

In [34]:
from transformers import GPT2LMHeadModel
# Rebind the notebook-level `model` to plain GPT-2. Every later cell that
# reads `model` (including the simm() defined below) now uses this checkpoint
# instead of the Backpack model loaded earlier.
model = GPT2LMHeadModel.from_pretrained(
    "openai-community/gpt2",
)

In [35]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
import torch
import os
import numpy as np
import torch.nn.functional as F
from scipy.stats import spearmanr

In [36]:
# GPT-2 tokenizer for the GPT-2 baseline cells. '<pad>' is registered as the
# padding token, although the visible cells only tokenize one word at a time
# and never request padding.
tokenizer = GPT2Tokenizer.from_pretrained(
    "openai-community/gpt2",
    pad_token="<pad>",
)

In [37]:
def simm(w1, w2):
    """Cosine similarity between the GPT-2 input embeddings of two words.

    Each word is tokenized and looked up in the model's token-embedding table
    (``model.transformer.wte``). The embedding of the word's last token is
    L2-normalised and the dot product of the two unit vectors is returned.

    Relies on the notebook-level ``tokenizer`` and ``model``; ``model`` must
    expose ``transformer.wte`` (the GPT-2 checkpoint loaded above).

    Args:
        w1: First word as a string. The evaluation cells prepend a space
            before calling.
        w2: Second word, same convention as ``w1``.

    Returns:
        The cosine similarity as a Python float.
    """
    def _last_token_unit_vector(word):
        # Unit-length embedding of the last sub-token of ``word``.
        ids = tokenizer(word, return_tensors='pt')['input_ids']
        vectors = model.transformer.wte(ids)  # (1, n_tokens, hidden)
        # A word may split into several sub-tokens; only the last one is kept,
        # which leaves a (1, hidden) tensor.
        return F.normalize(vectors[:, -1, :], p=2, dim=1)

    # Pure inference: no_grad avoids building an autograd graph that would be
    # thrown away immediately, which replaces the per-tensor detach() calls.
    with torch.no_grad():
        vec1 = _last_token_unit_vector(w1)
        vec2 = _last_token_unit_vector(w2)
        # Both vectors have unit length, so their dot product is the cosine.
        similarity = torch.sum(vec1 * vec2)
    return similarity.item()

## SIMLEX

In [38]:
# Fresh accumulators for the SimLex run: the two words of every pair (stored
# with a leading space by the loading cell), the human similarity ratings and
# the model similarity scores.
word1, word2, h_score, m_score = [], [], [], []

In [39]:
# Read the SimLex-999 pairs. The first line is a header; every following row
# is tab-separated with the two words in columns 0 and 1 and the value parsed
# as the human score in column 3. The encoding is pinned so the read does not
# depend on the locale default.
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimLex-999.txt', 'r', encoding='utf-8') as file:
    # Consume the header row; the default keeps an empty file from raising
    # StopIteration (header is then None and the loop simply does nothing).
    header = next(file, None)
    for line in file:
        stripped = line.strip()
        # A blank line (for example a trailing newline at the end of the file)
        # has no columns to index and would raise IndexError, so skip it.
        if not stripped:
            continue
        columns = stripped.split('\t')

        # The words are stored with a leading space -- presumably so the GPT-2
        # BPE tokenizer encodes them like words inside a sentence.
        word1.append(" " + columns[0])
        word2.append(" " + columns[1])
        h_score.append(float(columns[3]))


In [40]:
# Score every SimLex pair with the current simm(); results are appended in
# file order so m_score stays aligned with h_score.
m_score.extend(simm(first, second) for first, second in zip(word1, word2))

In [41]:
# Spearman rank correlation between the human ratings and the model scores.
# spearmanr returns the correlation together with a p-value; the p-value is
# discarded here.
spearman_corr, _ = spearmanr(h_score, m_score)
# Bare expression so the notebook echoes the correlation as the cell output.
spearman_corr

0.46565706841842996

## SIMVERB

In [42]:
# Fresh accumulators for the SimVerb run: the two words of every pair (stored
# with a leading space by the loading cell), the human similarity ratings and
# the model similarity scores.
word1, word2, h_score, m_score = [], [], [], []

In [43]:
# Read the SimVerb-3500 pairs. Every row is tab-separated: column 0 and
# column 1 hold the two words and column 3 is parsed as the human score.
# The encoding is pinned so the read does not depend on the locale default.
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimVerb-3500.txt', 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        # A blank line (for example a trailing newline at the end of the file)
        # has no columns to index and would raise IndexError, so skip it.
        if not stripped:
            continue
        columns = stripped.split('\t')

        # The words are stored with a leading space -- presumably so the GPT-2
        # BPE tokenizer encodes them like words inside a sentence.
        word1.append(" " + columns[0])
        word2.append(" " + columns[1])
        h_score.append(float(columns[3]))


In [44]:
# Score every SimVerb pair with the current simm(); results are appended in
# file order so m_score stays aligned with h_score.
m_score.extend(simm(first, second) for first, second in zip(word1, word2))

In [45]:
# Spearman rank correlation between the human ratings and the model scores.
# spearmanr returns the correlation together with a p-value; the p-value is
# discarded here.
spearman_corr, _ = spearmanr(h_score, m_score)
# Bare expression so the notebook echoes the correlation as the cell output.
spearman_corr

0.2911671264006562