In [None]:
import pandas as pd 
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from tqdm import tqdm
import time 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from scipy import stats

# Print whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

# Compute Loss and Perplexity for the example sentences 

By: Iris Luden 
Date created: May 2023 

This notebook investigates methods to obtain the cross-entropy loss, perplexity scores, and pseudo-perplexity scores for my data under investigation. 

The goal is to calculate the (1) cross entropy loss (2) perplexity scores, (3) pseudo-log likelihoods:
- For each target, context pair 
- For each definition
- For all 400 annotated example sentences
- For all randomly sampled example sentences 


# Background 

#### A: Loss context/example sentences 

Calculate loss of the model on the context sentence of a target word

#### B: Perplexity over the entire sentences: 

Given a model $M$ and an input sequence $Q$, we calculate the perplexity as follows:
$$PPL(Q) = \exp(H(M, Q))$$
where $H(M, Q)$ is the model's cross-entropy for the input sequence $Q$.

#### C: Pseudo-perplexity or pseudo-log-likelihood, defined as: 

    $$ PLL(Q) := \sum^{|Q|}_{t=1} \log P_{M} (w_t|Q \setminus w_{t})$$ 
    
The probability $P_{M}$ is calculated differently depending on the model:
in the case of autoregressive models, it only takes the left-side context into account.

The pseudo-perplexity for one sequence is equal to the PLL. One can also calculate the pseudo-perplexity over an entire corpus $C$ by:

$$ PPPL(C) := \exp\left(-\frac{1}{N} \sum_{Q \in C} PLL(Q)\right) $$

where $N$ denotes the total number of tokens in $C$ (the code below counts whitespace-separated words).

#### D: masked target word prediction Loss 

The perplexity of target word $w_t$ in context sequence $Q$:
$$ PPL(Q, w_t) \approx \exp(H(M, Q, w_t))$$

where I define $H(M, Q, w_t)$ as the loss on the task of predicting the target word $w_t$ in the sequence $Q$ when $w_t$ is masked.

# Description 

#### Part 1: Scores for the example sentences

For each target word (60 in total) and each corpus, at most 50 sentences were sampled. For these roughly 8000 example sentences in total, we:
    - compute scores
    - analysis: correlation with corpus & visualizations
    
    
#### Part 2: Scores for annotated example sentences 

For each target word (60 in total) and each corpus, 5 sentences were judged by annotators.
We compare the loss/perplexity/pseudo-perplexity scores with the correctness of the judgements.
    - compute scores
    - analysis: correlations & visualizations

# Model 

In [None]:
# initialize model
# Load the pretrained checkpoint named by `model_id`.

model_id = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_id)

# prepare the data
# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = T5Tokenizer.from_pretrained(model_id)

# Data 

In [None]:
# read complete data set (tab-separated) and show the first rows
df = pd.read_csv('All_DG_results.tsv', sep='\t')
display(df.head())

# retrieve categories: one list of target words per category, taken from the 'Word' column
stable = list(pd.read_csv('Targetwords/Stable_targets_20.tsv', sep='\t')['Word'])
# NOTE(review): unlike the two sibling reads, this one does not pass sep='\t' — confirm the file parses as intended
changing = list(pd.read_csv('Targetwords/Changing_targets_20.tsv')['Word'])
emerging = list(pd.read_csv('Targetwords/Emerging_targets_20.tsv', sep='\t')['Word'])

def categorize(x, changing, emerging, stable):
    '''Return the category name of target word ``x``.

    The collections are checked in the order changing, emerging, stable, and
    the name of the first one that contains ``x`` is returned. ``None`` is
    returned when ``x`` is in none of them.
    '''
    labelled_groups = (
        ('changing', changing),
        ('emerging', emerging),
        ('stable', stable),
    )
    for label, members in labelled_groups:
        if x in members:
            return label
    return None
    
# only include the 20 target words used in the experiments
# `stable` is the list loaded from Stable_targets_20.tsv; the name `stable_all`
# used here before was never defined and raised a NameError.
df['Experiment set'] = df['Word'].map(lambda x: categorize(x, changing, emerging, stable))
display(df)
print(len(df))
# dropna() removes every row with a missing value in any column, which includes
# the rows whose word received no category (None) above.
df.dropna(inplace=True)
display(df)

# Part 1

Cross-entropy loss and Perplexity for all 8000+ sampled sentences 

Sources: 
https://huggingface.co/docs/transformers/model_doc/t5
https://huggingface.co/docs/transformers/perplexity

In [None]:
def nll(encoding, model):
    '''Return the loss the model reports for an encoded sequence.

    ``encoding.input_ids`` is passed to ``model`` both as the input and as
    ``labels``; the forward call runs under ``torch.no_grad()``.

    Args:
        encoding: object exposing ``input_ids`` (e.g. the tokenizer output).
        model: callable whose result has a ``loss`` attribute.

    Returns:
        ``loss`` converted to a Python number with ``.item()``.
    '''
    token_ids = encoding.input_ids

    with torch.no_grad():
        result = model(token_ids, labels=encoding.input_ids)

    loss = result.loss
    return loss.item()

def nll_normalized(encoding, model):
    '''Return the model loss divided by the length of the input sequence.

    The length is ``encoding.input_ids.size(1)``. As in ``nll``, the token ids
    are passed to ``model`` both as input and as ``labels`` under
    ``torch.no_grad()``.

    NOTE(review): if the loss returned by the model is already a per-token
    average, this divides by the length a second time — confirm that is intended.

    Returns:
        ``loss / sequence length`` as a Python number.
    '''
    token_ids = encoding.input_ids
    n_tokens = token_ids.size(1)

    with torch.no_grad():
        result = model(token_ids, labels=encoding.input_ids)

    per_token = result.loss / n_tokens
    return per_token.item()

def loss_in_context(sentence, target_word, tokenizer, model):
    '''Return the model loss for predicting the masked target word in a sentence.

    Input text: ``sentence`` with every occurrence of ``target_word`` (as a
    substring) replaced by ``<extra_id_0>``.
    Label text: one item per whitespace-separated word of ``sentence``, joined
    by spaces — ``target_word`` when the word contains it, otherwise the
    sentinel ``<extra_id_i>`` where ``i`` is the word's position.

    Both texts are tokenized with ``return_tensors="pt"`` and the model is
    called under ``torch.no_grad()``.

    Returns:
        ``outputs.loss`` as a Python number.
    '''
    masked_text = sentence.replace(target_word, '<extra_id_0>')
    masked_encoding = tokenizer(masked_text, return_tensors="pt")

    label_pieces = []
    for position, word in enumerate(sentence.split()):
        if target_word in word:
            label_pieces.append(target_word)
        else:
            label_pieces.append(f'<extra_id_{position}>')
    label_encoding = tokenizer(' '.join(label_pieces), return_tensors="pt")

    with torch.no_grad():
        result = model(masked_encoding.input_ids, labels=label_encoding.input_ids)

    return result.loss.item()
    
def pseudo_perplexity2(sentence, model, tokenizer):
    '''Sum the model loss over one masked-prediction pass per word of the sentence.

    For each whitespace-separated word of ``sentence`` (the "target"):
      * input text: ``sentence`` with every occurrence of the target (as a
        substring) replaced by ``<extra_id_0>``;
      * label text: one item per word, joined by spaces — the target where the
        word equals it, otherwise the sentinel ``<extra_id_i>`` for position ``i``;
      * the model is called under ``torch.no_grad()`` and its loss is collected.

    NOTE: THE MASKING IS NOT GOING CORRECTLY. SHOULD BE REVISED

    Args:
        sentence: the text to score.
        model: callable whose result has a ``loss`` attribute.
        tokenizer: callable returning an object with ``input_ids``.

    Returns:
        The sum of the collected losses as a Python number; ``0.0`` for a
        sentence without words (previously this case raised AttributeError
        because ``sum([])`` is a plain int without ``.item()``).
    '''
    # split once instead of re-splitting for every label position
    words = sentence.split()
    if not words:
        return 0.0

    all_nll = []

    for target_word in words:

        masked_encoding = tokenizer(sentence.replace(target_word, '<extra_id_0>'), return_tensors="pt")

        labels = ' '.join(
            target_word if word == target_word else f'<extra_id_{i}>'
            for i, word in enumerate(words)
        )
        labels = tokenizer(labels, return_tensors="pt")

        with torch.no_grad():
            outputs = model(masked_encoding.input_ids, labels=labels.input_ids)

        all_nll.append(outputs.loss)

    return sum(all_nll).item()

def collect_perplexities(df, model, tokenizer):
    '''Score every (word, example sentence) row of ``df``.

    For each row of the 'Word' and 'Example' columns, in order:
      * the loss of predicting the masked word in its example (``loss_in_context``);
      * the loss of the whole tokenized example (``nll``).

    Returns:
        A tuple ``(word losses, example losses)``, two lists with one entry per row.
    '''
    word_example_pairs = df[['Word', 'Example']].values
    word_losses, sentence_losses = [], []

    for word, text in tqdm(word_example_pairs):
        word_losses.append(loss_in_context(text, word, tokenizer, model))

        text_encoding = tokenizer(text, return_tensors="pt")
        sentence_losses.append(nll(text_encoding, model))

    return word_losses, sentence_losses

In [None]:
# calculate perplexities
words_nll, examples_nll = collect_perplexities(df, model, tokenizer)

# Create a df of results
# The loss columns are named 'Words CE loss' / 'Examples CE loss' because those
# are the names the analysis cells read back from the saved file.
results_df = df.copy()
results_df['Words CE loss'] = words_nll
results_df['Words PPL'] = np.exp(results_df['Words CE loss'])
results_df['Examples CE loss'] = examples_nll
results_df['Examples PPL'] = np.exp(results_df['Examples CE loss'])

display(results_df.head())

# save to csv file
# results_df.to_csv('Perplexity_DG_results_first_20_target_words.tsv', sep='\t', index=False)

# Analysis

Of the 60 target words with 100 example sentences each. 

1. Compute correlations 
2. Create visualizations 

In [None]:
# read perplexities
filename = 'Perplexity_DG_results_first_20_target_words.tsv'

ppl_df = pd.read_csv(filename, sep='\t')
display(ppl_df)

# keep only the example sentences with fewer than 100 whitespace-separated words;
# .copy() makes the filtered frame independent, so the column assignments in the
# following cells do not operate on a slice (SettingWithCopyWarning)
ppl_df = ppl_df[ppl_df['Example'].map(lambda x: len(x.split()) < 100)].copy()
print(len(ppl_df))

In [None]:
# 2.1 correlation between corpus and perplexity

# encode the corpus as a number: 1 for 'C1', 2 for every other value
ppl_df['Corpus nr'] = ppl_df['Corpus'].map(lambda x: 1 if x =='C1' else 2)

# One scipy test per method name. A dispatch table replaces the former
# `if spearman / if points / else` chain, in which the `else` also ran for
# 'spearman' and overwrote its result with Kendall's tau.
correlation_tests = {
    'points': stats.pointbiserialr,
    'kendall': stats.kendalltau,
    'spearman': stats.spearmanr,
}

corpus_correlation_scores = []

f1 = 'Corpus nr'

for f2 in ['Words CE loss', 'Words PPL', 'Examples CE loss', 'Examples PPL']:

    pair = ppl_df[[f1, f2]].values
    these_correlations = [f2]
    for method in ['points', 'kendall', 'spearman']:
        results = correlation_tests[method](pair[:, 0], pair[:, 1])
        these_correlations += [results.statistic, results.pvalue]
    corpus_correlation_scores.append(these_correlations)

# create dataframe with correlations; every p-value column gets its own name
# so that the three of them can be told apart and selected individually
corpus_correlation_df = pd.DataFrame(
    corpus_correlation_scores,
    columns=['Score type', 'points', 'points p', 'kendall', 'kendall p', 'spearman', 'spearman p'],
)
display(corpus_correlation_df.set_index(['Score type']))

# Mean of the numeric columns per (Category, Corpus). The flag is now spelled
# out: the column name that used to be passed positionally landed in the
# `numeric_only` parameter and did not select a column.
display(ppl_df.groupby(['Category', 'Corpus']).mean(numeric_only=True))

In [None]:
# 2.2 Visualize corpus and perplexities

def _boxen_per_corpus(frame, score_columns):
    '''Draw, save and show one boxen plot per score column, split by corpus.

    Each plot has 'Category' on the x axis, the score on the y axis and one
    hue per 'Corpus'. The figure is written to
    'Thesis_DG_results/plots/First_20_Corpora_<column>.png' before it is shown.
    '''
    sns.set(style="darkgrid")
    for c in score_columns:
        sns.boxenplot(data=frame, x='Category', y=c, hue='Corpus', palette="Pastel1")

        # place legend outside top right corner of plot
        plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
        plt.title(f'{c} per corpus')
        plt.savefig(f'Thesis_DG_results/plots/First_20_Corpora_{c}.png', bbox_inches='tight')
        plt.show()


# boxen plots of the cross-entropy losses
columns = ['Words CE loss', 'Examples CE loss', ]
_boxen_per_corpus(ppl_df, columns)

# boxen plots of the perplexities
columns = ['Words PPL', 'Examples PPL']
_boxen_per_corpus(ppl_df, columns)

In [None]:
# 2.2 Visualize corpus and perplexities
# distribution plots

stat = 'percent'
# number of rows from C1, and number of non-emerging rows from C2
print(sum(ppl_df['Corpus'] == 'C1'))
print(sum((ppl_df['Corpus'] == 'C2') & (ppl_df['Category'] != 'emerging')))

loss_columns = ['Words CE loss', 'Examples CE loss']

# (heading to print, rows to plot, column used for the hue or None for no split)
histogram_views = [
    ("Excluding emerging", ppl_df[ppl_df['Category'] != 'emerging'], 'Corpus'),
    ("Including emerging", ppl_df, 'Corpus'),
    ("Not looking at corpus", ppl_df, None),
    ("C1", ppl_df[ppl_df['Corpus'] == 'C1'], 'Category'),
    ("C2", ppl_df[ppl_df['Corpus'] == 'C2'], 'Category'),
]

# one histogram (with KDE) per view and loss column, in the order listed above
for heading, subset, hue_column in histogram_views:
    print(heading)
    for loss_column in loss_columns:
        sns.histplot(data=subset, x=loss_column, kde=True, hue=hue_column, stat=stat)
        plt.show()
    

In [None]:
# 2.2 Visualize corpus and perplexities

# Overlay, in one figure, the distribution of the example-sentence loss for
# every (corpus, category) group; 'emerging' is only plotted for C2.
corpus_category_groups = [
    (corpus, category)
    for category in ['stable', 'changing']
    for corpus in ['C1', 'C2']
]
corpus_category_groups.append(('C2', 'emerging'))

for corpus, category in corpus_category_groups:
    group_rows = ppl_df[(ppl_df['Corpus'] == corpus) & (ppl_df['Category'] == category)]
    sns.histplot(data=group_rows,
                 x='Examples CE loss',
                 kde=True, label=f'{corpus} {category}', stat='percent')

plt.title('Cross entropy loss of example sentences')
plt.legend()
plt.savefig('Cross_entropy_example_sentences_all_20.png')
plt.show()



In [None]:
# Cross entropy loss of target word

# Overlay, in one figure, the distribution of the target-word loss for every
# (corpus, category) group; 'emerging' is only plotted for C2.
corpus_category_groups = [
    (corpus, category)
    for category in ['stable', 'changing']
    for corpus in ['C1', 'C2']
]
corpus_category_groups.append(('C2', 'emerging'))

for corpus, category in corpus_category_groups:
    group_rows = ppl_df[(ppl_df['Corpus'] == corpus) & (ppl_df['Category'] == category)]
    sns.histplot(data=group_rows,
                 x='Words CE loss',
                 kde=True, label=f'{corpus} {category}', stat='percent')

plt.title('Cross entropy loss of Words CE loss')
plt.legend()
# this figure is only shown, not saved:
# plt.savefig('Cross_entropy_example_sentences_all_20.png')
plt.show()


In [None]:
sns.scatterplot(data=ppl_df, x='Words CE loss', y='Examples PPL')
plt.show()

# There is one outlier

# idxmax() returns an index *label*, so the row has to be fetched with .loc.
# .iloc is positional and stops matching the labels once rows have been
# filtered out of ppl_df (wrong row, or IndexError near the end of the frame).
print(ppl_df.loc[ppl_df['Words CE loss'].idxmax()])
print(ppl_df.loc[ppl_df['Examples PPL'].idxmax()])

# Part 2: Perplexity scores for the Annotated sentences 

In [None]:
# read data: the annotated example sentences (tab-separated)
df_annotations = pd.read_csv('Annotations.tsv', sep='\t')
# last expression of the cell, so the notebook renders the frame
df_annotations

#### 2.1: compute scores

In [None]:
# compute scores

# calculate the perplexity of the sequences (timed)
start = time.time()

# initiate lists
words_nll =  []
examples_nll = []

for target_word, example in tqdm(df_annotations[['Word', 'Example']].values):

    ### loss of target word in example sentence
    words_nll.append(loss_in_context(example, target_word, tokenizer, model))

    ### calculate values for example sentences
    example_encoding = tokenizer(example, return_tensors="pt")
    examples_nll.append(nll(example_encoding, model))

end = time.time()
print("This took", end-start)

# write results
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of df_annotations (SettingWithCopyWarning)
results_df = df_annotations[['Word_id', 'Example', 'Prediction', 'Corpus', 'Category',
                             'Boolean majority', 'Averaged judgements', 'Kaya', 'Hanna', 'Laura']].copy()
# NOTE(review): the correlation analysis further down also reads a
# 'Summed judgements' column, which is not selected here — confirm where it comes from.

# add everything to data frame
# Column names match what the analysis cells read back from the saved file:
# 'Words CE loss', 'Examples CE loss' and 'Examples PLL'.
results_df['Words CE loss'] = words_nll
results_df['Words PPL'] = np.exp(results_df['Words CE loss'])

results_df['Examples CE loss'] = examples_nll
results_df['Examples PPL'] = np.exp(results_df['Examples CE loss'])

display(results_df.head(6))


# now do pseudo perplexity
pseudo_examples = []

for example in tqdm(df_annotations['Example'].values):

    ### summed masked-prediction loss over the words of the example sentence
    pseudo_examples.append(pseudo_perplexity2(example, model, tokenizer))

results_df['Examples PLL'] = pseudo_examples
results_df.to_csv('Perplexity_Annotations.tsv', sep='\t', index=False)

#### 2.2:  Analysis of the perplexity scores Annotations 

- correlations
- visualisations

In [None]:
# Reload the annotation scores from disk; from here on `ppl_df` refers to this frame.
ppl_df = pd.read_csv('Perplexity_Annotations.tsv' , sep='\t')
ppl_df.head()

In [None]:
def _corpus_pseudo_perplexity(subdf):
    '''Return exp(-sum('Examples PLL') / N) for the rows of ``subdf``.

    ``N`` is the total number of whitespace-separated words over the 'Example'
    column. Returns NaN when the subset contains no words, so that an empty
    group does not abort the whole table with a division by zero.

    NOTE(review): 'Examples PLL' is filled from summed model losses; confirm
    that its sign convention matches the exp(-...) used here.
    '''
    N = sum(subdf['Example'].map(lambda x: len(x.split())))
    if N == 0:
        return float('nan')
    return np.exp(-sum(subdf['Examples PLL'])/N)


pseudo_perplexities = []

# per corpus
print()
for c1 in ['C1', 'C2']:
    subdf = ppl_df[(ppl_df['Corpus'] == c1 )]
    pseudo_perplexities.append([ c1, '-', _corpus_pseudo_perplexity(subdf)])

# per category
print()
for c2 in ['stable', 'changing', 'emerging']:
    subdf = ppl_df[(ppl_df['Category'] == c2)]
    pseudo_perplexities.append(['-', c2, _corpus_pseudo_perplexity(subdf)])

# per corpus and category (stable / changing)
for c1 in ['C1', 'C2']:
    for c2 in ['stable', 'changing']:
        subdf = ppl_df[(ppl_df['Corpus'] == c1 ) & (ppl_df['Category'] == c2)]
        pseudo_perplexities.append([c1, c2, _corpus_pseudo_perplexity(subdf)])

# now for emerging, which is only computed for C2
c1, c2 = 'C2', 'emerging'
subdf = ppl_df[(ppl_df['Corpus'] == c1 ) & (ppl_df['Category'] == c2)]
pseudo_perplexities.append([c1, c2, _corpus_pseudo_perplexity(subdf)])

pseudo_df = pd.DataFrame(pseudo_perplexities, columns=['Corpus', 'Category', 'Pseudo-perplexity'])
pseudo_df

#### Visualization 

Annotation correctness V.S. perplexity 

In [None]:
# colours for the two majority-vote labels
palette_dict = {'correct':'g', 'incorrect':'r'}
# readable label for the majority vote: 0 -> 'incorrect', every other value -> 'correct'
ppl_df['Label'] = ppl_df['Boolean majority'].map(lambda vote: 'correct' if vote != 0 else 'incorrect')

# boolean majority vote: one box plot per score, saved and then shown
sns.set(font_scale=2)
columns = ['Words CE loss', 'Words PPL', 'Examples CE loss', 'Examples PPL', 'Examples PLL']
for score_column in columns:
    sns.boxplot(data=ppl_df, x='Category', y=score_column, hue='Label' , palette=palette_dict)

    # place legend outside top right corner of plot
    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
    plt.title(f'{score_column}')
    figure_path = f'Thesis_DG_results/plots/Annotators_Boolean_majority_{score_column}.png'
    plt.savefig(figure_path, bbox_inches='tight')
    plt.show()
    

In [None]:
# average vote: one box plot per score in `columns`, coloured by averaged judgement
for score_column in columns:
    sns.boxplot(data=ppl_df, x='Category', y=score_column, hue='Averaged judgements', palette="Spectral")

    # place legend outside top right corner of plot
    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.title(f'{score_column} per correctness' )

    figure_path = f'Thesis_DG_results/plots/Annotators_Averaged_judgements_{score_column}.png'
    plt.savefig(figure_path,  bbox_inches='tight')
    plt.show()

In [None]:
# scatterplots of score pairs, coloured by majority vote and styled by corpus
sns.set(font_scale=1.5)

axis_pairs = [
    ('Words CE loss', 'Examples PPL'),
    ('Words CE loss', 'Examples PLL'),
    ('Examples PPL', 'Examples PLL'),
]
for x_column, y_column in axis_pairs:
    sns.scatterplot(data=ppl_df, x=x_column, y=y_column, hue='Boolean majority', style='Corpus', palette="Spectral")
    plt.show()

In [None]:
# Visualize ppl v.s. corpus of annotations
columns = ['Words CE loss', 'Words PPL', 'Examples CE loss', 'Examples PPL', 'Examples PLL']

# one box plot per score, split by corpus; each figure is saved and then shown
sns.set(style="darkgrid")
for score_column in columns:
    sns.boxplot(data=ppl_df, x='Category', y=score_column, hue='Corpus', palette="Pastel1")

    # place legend outside top right corner of plot
    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
    figure_path = f'Thesis_DG_results/plots/Annotators_Corpora_{score_column}.png'
    plt.savefig(figure_path)
    plt.show()

In [None]:
# Correlation correctness vs. perplexity of annotations

# p-values https://stackoverflow.com/questions/25571882/pandas-columns-correlation-with-statistical-significance

# One scipy test per method name. A dispatch table replaces the former
# `if spearman / if points / else` chain, in which the `else` also ran for
# 'spearman' and overwrote its result with Kendall's tau.
correlation_tests = {
    'points': stats.pointbiserialr,
    'kendall': stats.kendalltau,
    'spearman': stats.spearmanr,
}

correctness_correlation_scores = []
for method in ['points', 'kendall', 'spearman']:
    print(method)
    correlation_test = correlation_tests[method]
    # NOTE(review): 'Summed judgements' must be present in ppl_df for this loop to run — confirm the saved file contains it
    for f1 in ['Boolean majority', 'Averaged judgements', 'Summed judgements']:

        for f2 in ['Words CE loss', 'Words PPL', 'Examples CE loss', 'Examples PPL', 'Examples PLL']:

            pair = ppl_df[[f1, f2]].values
            results = correlation_test(pair[:, 0], pair[:, 1])

            correctness_correlation_scores.append([method, f1, f2, results.statistic, results.pvalue])

correctness_correlation_df = pd.DataFrame(correctness_correlation_scores, columns=['Method', 'Judgement aggregation', 'Score type', 'correlation', 'p'])
display(correctness_correlation_df.set_index(['Method', 'Judgement aggregation']))


# Extra: Some more analysis

Which words+examples have the highest loss/perplexity?

In [None]:
#### Visualizations
# Labelled scatter plots built with the seaborn.objects interface.
import seaborn.objects as so

# the word is the part of 'Word_id' before the first '%'
ppl_df['Word'] = ppl_df['Word_id'].map(lambda x: x.split("%")[0])
sns.set_theme()
sns.despine()
sns.set_context('paper')
# sns.set(rc={'figure.figsize':(15, 15)})

# one dot per row, annotated with the word and coloured by the majority vote
ax = so.Plot(data=ppl_df, x='Words CE loss', y='Examples PPL', text='Word', color='Boolean majority').add(so.Dot()).add(so.Text(color='black', halign='left',fontsize=6))
# NOTE(review): the return value of .limit() is discarded here, so the limits are not applied.
# so.Plot methods appear to return a new Plot instead of modifying in place — confirm against
# the seaborn docs and check that the ranges fit the data before assigning the result.
ax.limit(x=(0 , 1.5), y=(0 ,0.8)) # this does not work
ax.show()

# same plot against the pseudo-log-likelihood column, without colouring
ax = so.Plot(data=ppl_df, x='Words CE loss', y='Examples PLL', text='Word').add(so.Dot()).add(so.Text(color='black', halign='left',fontsize=6))
# NOTE(review): as above, the result of .limit() is not assigned
ax.limit(x=(0 , 1.5), y=(0 ,0.8))
ax.show()


In [None]:
# See what the highest scores are
# idxmax() gives the index label of the row holding the maximum of each column
print(ppl_df['Words CE loss'].idxmax())
print(ppl_df['Examples PPL'].idxmax())
print(ppl_df['Examples PLL'].idxmax())

In [None]:

# Print the key fields of three hand-picked rows (selected by position), each
# followed by a blank line.
# NOTE(review): the positions are hard-coded; presumably they were copied from
# the idxmax output of the previous cell — confirm they still match the data.
fields_to_show = ['Word_id', 'Example', 'Boolean majority', 'Prediction']
for index in [226, 393, 152]:
    for field in fields_to_show:
        print(ppl_df.iloc[index][field])
    print()