In [2]:
from transformers.models.gpt2.configuration_gpt2 import GPT2Config

class BackpackGPT2Config(GPT2Config):
  """
    This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
    instantiate a Backpack GPT-2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`GPT2Config`] and can be used to control the model outputs. Read the
    documentation from [`GPT2Config`] for more information.

    Args:
        num_senses (`int`, *optional*, defaults to 16):
            The number of sense vectors to define for each word.
        sense_intermediate_scale (`int`, *optional*, defaults ot 4):
            The hidden dimensionality of the sense vector network.

    Example:

    ```python
    >>> from transformers import BackpackGPT2Config, BackpackGPT2Model

    >>> # Initializing a GPT2 configuration
    >>> configuration = BackpackGPT2Config()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = BackpackGPT2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
  """

  def __init__(self,
               vocab_size=50264,
               num_senses=16,
               sense_intermediate_scale=4,
               n_positions=512,
               scale_attn_by_inverse_layer_idx=True,
               **kwargs,
  ):
    self.num_senses = num_senses
    self.sense_intermediate_scale = sense_intermediate_scale
    super().__init__(vocab_size=vocab_size, n_positions=n_positions, scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, **kwargs)


In [3]:
import math
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.pytorch_utils import Conv1D
from transformers.utils import (
    ModelOutput,
    logging,
)
from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2PreTrainedModel
#from content.configuration_backpack_gpt2 import BackpackGPT2Config

logger = logging.get_logger(__name__)

In [19]:
class BackpackGPT2PreTrainedModel(GPT2PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias"]

    config_class = BackpackGPT2Config
    base_model_prefix = "backpack"
    is_parallelizable = True
    supports_gradient_checkpointing = False
    _no_split_modules = ["GPT2Block", "BackpackNoMixBlock"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

class BackpackMLP(nn.Module):

  def __init__(self, embed_dim, intermediate_dim, out_dim, config):
        super().__init__()
        self.c_fc = Conv1D(intermediate_dim, embed_dim)
        self.c_proj = Conv1D(out_dim, intermediate_dim)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

  def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
      hidden_states = self.c_fc(hidden_states)
      hidden_states = self.act(hidden_states)
      hidden_states = self.c_proj(hidden_states)
      hidden_states = self.dropout(hidden_states)
      return hidden_states

class BackpackNoMixBlock(nn.Module):

  def __init__(self, config):
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
    self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
    self.mlp = BackpackMLP(config.n_embd, config.n_embd*4, config.n_embd, config)
    self.resid_dropout1 = nn.Dropout(config.resid_pdrop)
    self.resid_dropout2 = nn.Dropout(config.resid_pdrop)

  def forward(self, hidden_states, residual):
    residual = self.resid_dropout1(hidden_states) + residual
    hidden_states = self.ln_1(residual)
    mlp_out = self.mlp(hidden_states)
    residual = self.resid_dropout2(mlp_out) + residual
    hidden_states = self.ln_2(residual)
    return hidden_states


class BackpackSenseNetwork(nn.Module):
    def __init__(self, config, num_senses, device=None, dtype=None):
        super().__init__()
        self.num_senses = num_senses
        #self.embeddings = embeddings
        self.n_embd = config.n_embd

        self.dropout = nn.Dropout(config.embd_pdrop)
        self.block = BackpackNoMixBlock(config)
        self.ln = nn.LayerNorm(self.n_embd, eps=config.layer_norm_epsilon)
        self.final_mlp = BackpackMLP(
            embed_dim=config.n_embd,
            intermediate_dim=config.sense_intermediate_scale*config.n_embd,
            out_dim=config.n_embd*config.num_senses,
            config=config,
            )

    def forward(self, input_embeds):
      residual = self.dropout(input_embeds)
      hidden_states = self.ln(residual)
      hidden_states = self.block(hidden_states, residual)
      senses = self.final_mlp(hidden_states)
      bs, s, nvd = senses.shape
      return senses.reshape(bs, s, self.num_senses, self.n_embd).transpose(1,2) # (bs, nv, s, d)

class BackpackWeightNetwork(nn.Module):

  def __init__(self, num_senses, embed_dim):
    super().__init__()
    self.n_embd = embed_dim
    self.num_senses = num_senses
    self.embed_per_sense = embed_dim // num_senses
    self.c_attn = nn.Linear(embed_dim, 2 * num_senses * self.embed_per_sense)
    self.softmax_scale = None

  def forward(self, encoded):
    b, s, d = encoded.shape
    encoded = self.c_attn(encoded) # (b, s, 2*d)
    encoded = encoded.reshape(b, s, 2, self.num_senses, self.embed_per_sense) #(b, s, 2, nv, d//nv)
    batch_size, seqlen = encoded.shape[0], encoded.shape[1]

    # compute scores & mask
    q, k = encoded.unbind(dim=2)
    softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
    scores = torch.einsum('bthd,bshd->bhts', q, k * softmax_scale)
    causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
    scores = scores + causal_mask.to(dtype=scores.dtype)

    return torch.softmax(scores, dim=-1, dtype=q.dtype)
  

@dataclass
class BackpackGPT2BaseModelOutput(ModelOutput):
    hidden_states: torch.FloatTensor = None
    contextualization: torch.FloatTensor = None

class BackpackGPT2Model(BackpackGPT2PreTrainedModel):
    _keys_to_ignore_on_load_missing = [r".*attn.masked_bias", r".*attn.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd

        self.num_senses = config.num_senses
        self.gpt2_model = GPT2Model(config)
        self.sense_network = BackpackSenseNetwork(config, self.num_senses, self.gpt2_model.wte)
        self.word_embeddings = self.gpt2_model.wte
        self.position_embeddings = self.gpt2_model.wpe
        self.sense_weight_net = BackpackWeightNetwork(self.num_senses, self.embed_dim)
        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

    def get_num_senses(self):
        return self.num_senses

    def get_word_embeddings(self):
        return self.word_embeddings

    def get_sense_network(self):
        return self.sense_network

    def forward(self, input_ids, position_ids):
        # Compute senses
        sense_input_embeds = self.word_embeddings(input_ids)
        senses = self.sense_network(sense_input_embeds) # (bs, nv, s, d)

        # Compute contextualization weights
        contextl_hidden_states = self.gpt2_model(input_ids, position_ids=position_ids).last_hidden_state # (bs, s, d)
        contextualization = self.sense_weight_net(contextl_hidden_states) # (bs, nv, s, s)

        # Compute resulting outputs
        hidden_states = torch.sum(contextualization @ senses, dim=1) # (bs, nv, s, d) -> (bs, s, d)
        return BackpackGPT2BaseModelOutput(
            hidden_states=hidden_states,
            contextualization=contextualization,
        )
    
    def run_with_custom_contextualization(self, input_ids, contextualization):
        # Compute senses
        sense_input_embeds = self.word_embeddings(input_ids)
        senses = self.sense_network(sense_input_embeds) # (bs, nv, s, d)

        # Compute resulting outputs
        hidden_states = torch.sum(contextualization @ senses, dim=1) # (bs, nv, s, d) -> (bs, s, d)
        return BackpackGPT2BaseModelOutput(
            hidden_states=hidden_states,
            contextualization=contextualization,
        )

@dataclass
class BackpackGPT2LMHeadModelOutput(ModelOutput):
    logits: torch.FloatTensor = None
    contextualization: torch.FloatTensor = None

class BackpackGPT2LMHeadModel(BackpackGPT2PreTrainedModel):
  _keys_to_ignore_on_load_missing = [r".*attn.masked_bias", r".*attn.bias"]

  def __init__(self, config):
    super().__init__(config)
    self.backpack = BackpackGPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    # Model parallel
    self.model_parallel = False
    self.device_map = None

    self.tie_weights()

  def tie_weights(self):
      self.lm_head.weight = self.backpack.word_embeddings.weight # also tied with the underlying underlying transf

  def get_lm_head(self):
      return self.lm_head

  def forward(self, input_ids, position_ids=None):
      outputs = self.backpack(input_ids, position_ids=position_ids)
      hidden_states, contextualization = outputs.hidden_states, outputs.contextualization
      lm_logits = self.lm_head(hidden_states) # (bs, s, V)
      return BackpackGPT2LMHeadModelOutput(
            logits=lm_logits,
            contextualization=contextualization,
        )

  def run_with_custom_contextualization(self, input_ids, contextualization):
      outputs = self.backpack.run_with_custom_contextualization(input_ids, contextualization)
      hidden_states, contextualization = outputs.hidden_states, outputs.contextualization
      lm_logits = self.lm_head(hidden_states)
      return BackpackGPT2LMHeadModelOutput(
        logits=lm_logits,
        contextualization=contextualization,
    )

In [5]:
import json

# Specify the path to your JSON config file
config_file_path = 'config.json'

# Open the file in read mode
with open(config_file_path, 'r') as config_file:
    # Load the JSON data
    config_data = json.load(config_file)

In [20]:
config = BackpackGPT2Config(**config_data)

In [21]:
model = BackpackGPT2LMHeadModel(config)

In [22]:
checkpoint = torch.load("/home/piyush/srinath/NLP/Project/NLP/Hamvir/pytorch_model.bin", map_location=torch.device('cuda'))  # You can specify the device (e.g., 'cuda:0') if using GPU

In [23]:
checkpoint
from collections import OrderedDict

In [None]:
new_state_dict = OrderedDict()
for key, value in checkpoint.items():
    new_state_dict[key] = value

# Load the new_state_dict into the model
model.load_state_dict(new_state_dict, strict=False)

# Set the model to evaluation mode
model.eval()

In [None]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "stanfordnlp/backpack-gpt2"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
torch_model = AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)
torch_model.eval()

In [None]:
# input = torch.randint(0, 50264, (1, 512), dtype=torch.long)
# torch_out = torch_model(
#     input,
#     position_ids=None,
# )
# torch_out = torch.nn.functional.softmax(torch_out.logits, dim=-1)
# print(torch_out)

# Perplexity

## open web

In [None]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "stanfordnlp/backpack-gpt2"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)
model.eval()

In [46]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
import torch
import os

In [31]:
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2",pad_token = '<pad>')

In [32]:
def read_sentences(filename):
    with open(filename, 'r') as file:
        sentences = [line.strip() for line in file if line.strip()]
    return sentences

In [36]:
_loss = torch.nn.CrossEntropyLoss(reduction='sum',ignore_index=tokenizer.pad_token_id)

In [54]:
total_loss = 0
total_tokens = 0
folder_path = '/home/piyush/srinath/NLP/Project/NLP/dataset/openwebtext'
for files in os.listdir(folder_path):
    file_path = os.path.join(folder_path,files)
    sentences = read_sentences(file_path)
    input_list = []
    output_list = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        input_ = tokenizer.bos_token + sentence
        output_ = sentence + tokenizer.eos_token
        input_tokens = tokenizer(input_, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        output_tokens = tokenizer(output_, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        input_tokens = input_tokens['input_ids']
        output_tokens = output_tokens['input_ids']
        input_list.append(input_tokens)
        output_list.append(output_tokens)
    print(len(input_list))
    input_ = torch.tensor(input_list)
    output_ = torch.tensor(output_list)
    model_outputs = model(input_)
    output_logits = model_outputs['logits']
    # print(output_logits.shape,output_tokens.shape)
    loss = _loss(output_logits.view(-1, output_logits.size(-1)), output_.view(-1))
        

    #     # Accumulate total loss and total tokens
    #     total_loss += loss.item()
    #     total_tokens += 1
    print(len(sentences))
perplexity = torch.exp(total_loss / total_tokens)
        

TypeError: only integer tensors of a single element can be converted to an index

## wikitext

In [6]:
from transformers import AutoConfig, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
model_id = "stanfordnlp/backpack-gpt2"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

In [None]:
!lm_eval --model hf \
    --model_args pretrained=stanfordnlp/backpack-gpt2,config=config,trust_remote_code=True \
    --tasks lambada_openai \
    --device cuda:0 \
    --batch_size 8

# Sense vectors

Change it according to paper

In [None]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "stanfordnlp/backpack-gpt2"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)
model.eval()

In [2]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
import torch
import os

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2",pad_token = '<pad>')

In [4]:
tokenizer.eos_token

'<|endoftext|>'

In [21]:
x = tokenizer(["The","Ceo","said","that"], max_length=20, truncation=True, padding='max_length', return_tensors='pt')

In [22]:
x

{'input_ids': tensor([[  464, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257],
        [   34,    68,    78, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257],
        [30079, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257],
        [ 5562, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [23]:
x = x['input_ids']

In [24]:
embeds = model.backpack.word_embeddings(x)
senses = model.backpack.sense_network(embeds)

In [25]:
senses.shape

torch.Size([4, 16, 20, 768])

In [175]:
vocab = tokenizer.get_vocab()

In [176]:
x = torch.arange(0, 50257).view(-1, 1)

In [177]:
x.shape

torch.Size([50257, 1])

In [178]:
embeds = model.backpack.word_embeddings(x)
all_senses = model.backpack.sense_network(embeds)

In [179]:
all_senses.shape

torch.Size([50257, 16, 1, 768])

In [180]:
all_senses = all_senses.squeeze()

In [181]:
all_senses.shape

torch.Size([50257, 16, 768])

In [182]:
tokenizer(" Apple",max_length=5,padding='max_length', return_tensors='pt')['input_ids']

tensor([[ 4196, 50257, 50257, 50257, 50257]])

In [183]:
tmp = tokenizer(" Apple",max_length=1,padding='max_length', return_tensors='pt')['input_ids']

In [184]:
tmp.shape

torch.Size([1, 1])

In [185]:
embeds = model.backpack.word_embeddings(tmp)
senses = model.backpack.sense_network(embeds)

In [186]:
senses.shape

torch.Size([1, 16, 1, 768])

In [187]:
senses = senses.squeeze()

In [188]:
senses.shape

torch.Size([16, 768])

In [189]:
vocab_sense_10 = all_senses[:,9,:]
tasty_sense_10 = senses[9, :]


In [190]:
vocab_sense_10.shape

torch.Size([50257, 768])

In [191]:
tasty_sense_10.shape

torch.Size([768])

In [192]:
vocab_sense_3 = all_senses[:,2,:]
tasty_sense_3 = senses[2, :]

In [193]:
dot_product = torch.matmul(vocab_sense_3, tasty_sense_3)

In [194]:
dot_product

tensor([-18.0290,   3.2239,  23.9513,  ...,  90.3603,  22.1850,   2.1527],
       grad_fn=<MvBackward0>)

In [195]:
sorted_indices = torch.argsort(dot_product, descending=True)

In [196]:
sorted_indices

tensor([ 4196, 16108, 17180,  ...,  3569, 21549, 21191])

In [197]:
tokenizer.decode(sorted_indices[8])

'Microsoft'

## Main

In [236]:
tmp = tokenizer(" Apple",max_length=1,padding='max_length', return_tensors='pt')['input_ids']

In [251]:
embeds = model.backpack.word_embeddings(tmp)
senses = model.backpack.sense_network(embeds)

In [260]:
senses = senses.squeeze()
tasty_sense_10 = senses[12, :]

In [261]:
dot_product = model.lm_head(tasty_sense_10)

In [262]:
dot_product.shape

torch.Size([50264])

In [263]:
sorted_indices = torch.argsort(dot_product, descending=True)

In [271]:
tokenizer.decode(sorted_indices[8])

' iPad'

In [275]:
senses = senses.squeeze()
tasty_sense_7 = senses[7, :]
dot_product = model.lm_head(tasty_sense_7)
sorted_indices = torch.argsort(dot_product, descending=True)
tokenizer.decode(sorted_indices[5])

'iOS'

# Control

In [438]:
tokenizer(" arts")

{'input_ids': [10848], 'attention_mask': [1]}

In [628]:
x = tokenizer(tokenizer.bos_token+" The building in paris is",return_tensors='pt')['input_ids']

In [629]:
x.shape

torch.Size([1, 7])

In [630]:
embeds = model.backpack.word_embeddings(x)

In [631]:
senses = model.backpack.sense_network(embeds)

In [632]:
senses.shape

torch.Size([1, 16, 7, 768])

In [633]:
dot_product = model.lm_head(senses)

In [634]:
dot_product.shape

torch.Size([1, 16, 7, 50264])

In [635]:
weights = dot_product[:,:,:,10848]

In [636]:
weights = F.normalize(weights, p=2, dim=0)

In [637]:
weights.shape

torch.Size([1, 16, 7])

In [638]:
weights = weights.unsqueeze(-1)

In [639]:
weights.shape

torch.Size([1, 16, 7, 1])

In [640]:
contextl_hidden_states = model.backpack.gpt2_model(x).last_hidden_state # (bs, s, d)
contextualization = model.backpack.sense_weight_net(contextl_hidden_states)

In [641]:
contextualization.shape

torch.Size([1, 16, 7, 7])

In [642]:
new_context = contextualization * weights

In [643]:
new_context.shape

torch.Size([1, 16, 7, 7])

In [644]:
y = model.run_with_custom_contextualization(x,new_context).logits

In [645]:
y.shape

torch.Size([1, 7, 50264])

In [646]:
torch.argmax(y,dim=-1).shape

torch.Size([1, 7])

In [647]:
tokenizer.decode(torch.argmax(y,dim=-1)[0])

' watchdog countryside codes shaping UID Building career'

## Gender bias

In [611]:
x = tokenizer(tokenizer.bos_token+" My husband said that",return_tensors='pt')['input_ids']

In [612]:
embeds = model.backpack.word_embeddings(x)
senses = model.backpack.sense_network(embeds)

In [623]:
contextl_hidden_states = model.backpack.gpt2_model(x).last_hidden_state # (bs, s, d)
contextualization = model.backpack.sense_weight_net(contextl_hidden_states)

In [624]:
weights= torch.ones_like(contextualization)
weights[:,10,:,:]= 0

In [625]:
contextualization=contextualization*weights

In [626]:
y = model.run_with_custom_contextualization(x,contextualization).logits

In [627]:
tokenizer.decode(torch.argmax(y,dim=-1)[0])

'The first and he he'

# Simmilarity

## Backpack

In [None]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "stanfordnlp/backpack-gpt2"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)
model.eval()

In [42]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
import torch
import os
import numpy as np
import torch.nn.functional as F

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2",pad_token = '<pad>')

In [10]:
tokenizer("this")

{'input_ids': [5661], 'attention_mask': [1]}

In [6]:
x = tokenizer("this",return_tensors='pt')['input_ids']

In [16]:
embeds = model.backpack.word_embeddings(x)
senses = model.backpack.sense_network(embeds)

In [17]:
senses.shape

torch.Size([1, 16, 1, 768])

In [84]:
word1 = [' take',' walk']
word2 = [' remove',' trail']  # add space
h_score = [6.81,4.81]
m_score =[]

In [85]:
def simm(w1,w2):
    x1 = tokenizer(w1,return_tensors='pt')['input_ids']
    x1 = model.backpack.word_embeddings(x1)
    x1 = model.backpack.sense_network(x1).detach()
    x1 = x1[:,:,0,:]
    x1 = x1.squeeze()
    x1 = F.normalize(x1,p=2,dim=1)
    x2 = tokenizer(w2,return_tensors='pt')['input_ids']
    x2 = model.backpack.word_embeddings(x2)
    x2 = model.backpack.sense_network(x2).detach()
    x2 = x2[:,:,0,:]
    x2 = x2.squeeze()
    x2 = F.normalize(x2,p=2,dim=1)
    sim_list = torch.sum(torch.multiply(x1,x2),dim=1)
    #print(sim_list)
    return(sim_list.min())

In [86]:
simm('take','walk')

tensor(0.0006)

In [87]:
for i in range(len(word1)):
    m_score.append(simm(word1[i],word2[i]))

In [88]:
m_score

[tensor(0.0634), tensor(0.0486)]

In [89]:
from scipy.stats import spearmanr
spearman_corr, _ = spearmanr(h_score, m_score)

## SIMVERB

In [123]:
word1 = []
word2 = []  # add space
h_score = []
m_score =[]

In [116]:
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimVerb-3500.txt', 'r') as file:
    # Iterate through each line
    for line in file:
        # Split the line into columns
        columns = line.strip().split('\t')

        # Extract word1, word2, and the score
        word1.append(" "+ columns[0])
        word2.append(" "+ columns[1])
        h_score.append(float(columns[3]))  # Assuming the score is a floating-point number


In [117]:
for i in range(len(word1)):
    m_score.append(simm(word1[i],word2[i]))

In [119]:
word2[0]

' remove'

In [120]:
len(m_score)

3500

In [121]:
from scipy.stats import spearmanr
spearman_corr, _ = spearmanr(h_score, m_score)

In [122]:
spearman_corr

0.4468468531403537

## SIMMLEX

In [204]:
word1 = []
word2 = []  # add space
h_score = []
m_score =[]

In [205]:
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimLex-999.txt', 'r') as file:
    # Iterate through each line
    header = next(file)
    for line in file:
        # Split the line into columns
        columns = line.strip().split('\t')

        # Extract word1, word2, and the score
        word1.append(" "+ columns[0])
        word2.append(" "+ columns[1])
        h_score.append(float(columns[3]))  # Assuming the score is a floating-point number


In [206]:
for i in range(len(word1)):
    m_score.append(simm(word1[i],word2[i]))

In [214]:
len(h_score)

999

In [215]:
spearman_corr, _ = spearmanr(h_score, m_score)

In [216]:
spearman_corr

0.5396491328014148

## GPT

In [252]:
# from transformers import GPT2Model, GPT2Config
# configuration = GPT2Config()
# model = GPT2Model(configuration)
from transformers import GPT2LMHeadModel
# from transformers import GPT2Config
# config= GPT2Config()
# from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2PreTrainedModel
# model = GPT2LMHeadModel(config)

In [253]:
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

In [254]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
import torch
import os
import numpy as np
import torch.nn.functional as F
from scipy.stats import spearmanr

In [255]:
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2",pad_token = '<pad>')

In [256]:
# x1 = tokenizer.encode(w1,add_prefix_space=True)

In [261]:
def simm(w1,w2):
    x1 = tokenizer(w1,return_tensors='pt')['input_ids']
    x1 = model.transformer.wte(x1).detach() # (1, 1, 768)
    x1 = x1[:,-1,:]  # (1,768)
    x1 = F.normalize(x1,p=2,dim=1)
    x2 = tokenizer(w2,return_tensors='pt')['input_ids']
    x2 = model.transformer.wte(x2).detach() # (1, 1, 768)
    x2 = x2[:,-1,:] # (1,768)
    x2 = F.normalize(x2,p=2,dim=1)
    sim_list = torch.sum(torch.multiply(x1,x2))
    #print(sim_list)
    return(sim_list.item())

In [262]:
word1 = []
word2 = []  # add space
h_score = []
m_score =[]

In [263]:
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimLex-999.txt', 'r') as file:
    # Iterate through each line
    header = next(file)
    for line in file:
        # Split the line into columns
        columns = line.strip().split('\t')

        # Extract word1, word2, and the score
        word1.append(" "+ columns[0])
        word2.append(" "+ columns[1])
        h_score.append(float(columns[3]))  # Assuming the score is a floating-point number


In [264]:
for i in range(len(word1)):
    m_score.append(simm(word1[i],word2[i]))

In [265]:
spearman_corr, _ = spearmanr(h_score, m_score)

In [266]:
spearman_corr

0.46565706841842996

In [267]:
word1 = []
word2 = []  # add space
h_score = []
m_score =[]

In [268]:
with open('/home/piyush/srinath/NLP/Project/NLP/Hamvir/SimVerb-3500.txt', 'r') as file:
    # Iterate through each line
    for line in file:
        # Split the line into columns
        columns = line.strip().split('\t')

        # Extract word1, word2, and the score
        word1.append(" "+ columns[0])
        word2.append(" "+ columns[1])
        h_score.append(float(columns[3]))  # Assuming the score is a floating-point number


In [269]:
for i in range(len(word1)):
    m_score.append(simm(word1[i],word2[i]))

In [270]:
spearman_corr, _ = spearmanr(h_score, m_score)

In [271]:
spearman_corr

0.2911671264006562