# Log prob function

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import torch

  torch.utils._pytree._register_pytree_node(


In [2]:
class GPTContainer:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2") # change to gpt-xl when running for real
        self.model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device) # change to gpt-xl when running for real
        
        self.starts = []
        self.suffs = []

        # Populate starts and suffs
        for i in range(len(self.tokenizer.get_vocab())):
            tok = self.tokenizer.decode(i)
            if tok[0] == " " or tok[0] == ".":
                self.starts.append(i)
            elif tok[0] != " ":
                self.suffs.append(i)
                    
    def gpt2_sent_prob(self, sent):
        logsoftmax = torch.nn.LogSoftmax(dim=-1)
        
        tokenizer = self.tokenizer
        model = self.model

        starts = self.starts
        suffs = self.suffs

        sent = ". " + sent + "."

        tokens = tokenizer.encode(sent)
        inputs = torch.tensor(tokens).to(self.device)

        with torch.no_grad():
            out = model(inputs)

        unsoft = out[0]
        lab1 = inputs.cpu().data.numpy()
        
        probs = []
        for x in range(len(lab1) - 1):

            lab = lab1[x + 1]
            unsoft1 = unsoft[x]

            if lab in starts:

                soft = logsoftmax(unsoft1[starts])
                prob = float(soft[starts.index(lab)].cpu().data.numpy())

            elif lab in suffs:

                soft = logsoftmax(unsoft1[suffs])
                prob = float(soft[suffs.index(lab)].cpu().data.numpy())

            probs.append(prob)

        prob = np.sum(probs)

        return prob


# Example usage

In [3]:
gpt = GPTContainer()

In [4]:
sentence = "A message has been sent to your account"
log_probability = gpt.gpt2_sent_prob(sentence)

In [5]:
import math
print(log_probability)
print(math.exp(log_probability))

-27.383871644735336
1.2803704769791433e-12


# Creating log probabilities

In [6]:
import pandas as pd

df = pd.read_csv("our_df.csv", usecols=['sentence_pair', 'subjects_majority_vote'])

In [10]:
dfsentence = df.drop_duplicates(subset='sentence_pair')
dfsentence[['sentence1', 'sentence2']] = df['sentence_pair'].str.split('_', expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfsentence[['sentence1', 'sentence2']] = df['sentence_pair'].str.split('_', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfsentence[['sentence1', 'sentence2']] = df['sentence_pair'].str.split('_', expand=True)


In [11]:
# Calculate probabilities for each pair of sentences
probabilities = []
for index, row in dfsentence.iterrows():
    prob1 = gpt.gpt2_sent_prob(row['sentence1'])
    prob2 = gpt.gpt2_sent_prob(row['sentence2'])
    probabilities.append([prob1, prob2])

# Add probabilities to the DataFrame
dfsentence['probability_sentence1'] = [p[0] for p in probabilities]
dfsentence['probability_sentence2'] = [p[1] for p in probabilities]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfsentence['probability_sentence1'] = [p[0] for p in probabilities]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfsentence['probability_sentence2'] = [p[1] for p in probabilities]


In [12]:
dfsentence.to_csv('our_sentences_probs.csv', index=False)