## Install Requirements

#### Create an environment with conda:

In [None]:
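# Each `!` line runs in its own subshell, so the `cd` has to be chained with the command it applies to:
# ! cd ./text-metrics-main && conda env create -f environment.yml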

#### Run all of these lines to get started:

In [None]:
# !python -m spacy download en_core_web_sm
# !python --version
# import spacy
# from text_metrics.utils import get_metrics
# %pip install accelerate

## Import Models

In [102]:
import spacy
import text_metrics.utils as tm_utils

# Pythia checkpoints from EleutherAI, in increasing size (70m, 160m, 410m).
model_names = [
    "EleutherAI/pythia-70m",
    "EleutherAI/pythia-160m",
    "EleutherAI/pythia-410m",
]

# init_tok_n_model is called once per checkpoint; each call yields a
# (tokenizer, model) pair, in the same order as model_names.
models_tokenizers = [tm_utils.init_tok_n_model(model_name) for model_name in model_names]

# Split the list of pairs into two parallel lists.
tokenizers, models = map(list, zip(*models_tokenizers))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
### Small test: run get_metrics on one short string with all loaded models.

text = "113, 115, 117, and 118 are ... The International Union"

# Bind the spaCy pipeline to a name first, then pass it in as the parsing model.
sample_parsing_model = spacy.load("en_core_web_sm")

surp_res = tm_utils.get_metrics(
    text=text,
    models=models,
    tokenizers=tokenizers,
    model_names=model_names,
    parsing_model=sample_parsing_model,
    add_parsing_features=False,
)

# Last expression of the cell, so the result is displayed.
surp_res

Unnamed: 0,Word,Length,Wordfreq_Frequency,subtlex_Frequency,EleutherAI/pythia-70m_Surprisal,EleutherAI/pythia-160m_Surprisal,EleutherAI/pythia-410m_Surprisal
0,113,3,17.482668,0.0,16.033961,16.777689,15.692215
1,115,3,17.482668,0.0,9.702277,6.542616,7.40583
2,117,3,17.482668,0.0,3.780134,3.935068,3.290075
3,and,3,5.282088,6.186248,4.973053,6.9917,6.682996
4,118,3,17.482668,0.0,2.312808,1.396066,1.103458
5,are,3,7.506353,7.548023,5.524216,4.554984,2.759386
6,...,0,36.541209,0.0,11.416397,17.073027,9.195761
7,The,3,4.218934,5.048944,4.654328,6.680933,5.536911
8,International,13,12.092365,16.065472,7.371607,7.70439,7.434474
9,Union,5,13.024678,15.449666,2.442083,3.613493,4.473687


## Text Corpus from CELER:

In [1]:
import pandas as pd

# Tab-separated CELER data file; the path is relative to the working directory.
path = "celer/data_v2.0/sent_ia.tsv"

# read_table uses a tab separator by default.
full_df = pd.read_table(path)

# Show the available column names.
full_df.columns

Index(['DATA_FILE', 'list', 'dataset_version', 'trial', 'shared_text',
       'sentenceid', 'IA_ID', 'IA_LABEL', 'IA_LEFT', 'IA_RIGHT',
       'IA_FIRST_FIXATION_X', 'IA_DWELL_TIME', 'IA_DWELL_TIME_%',
       'IA_FIRST_FIXATION_DURATION', 'IA_FIRST_FIXATION_INDEX',
       'IA_FIRST_FIXATION_PREVIOUS_FIX_IA',
       'IA_FIRST_FIXATION_PREVIOUS_IAREAS',
       'IA_FIRST_FIXATION_VISITED_IA_COUNT', 'IA_FIRST_FIXATION_RUN_INDEX',
       'IA_FIRST_FIX_PROGRESSIVE', 'IA_FIRST_SACCADE_ANGLE',
       'IA_FIRST_RUN_LANDING_POSITION', 'IA_FIRST_RUN_DWELL_TIME',
       'IA_FIRST_RUN_FIXATION_COUNT', 'IA_FIRST_RUN_LAUNCH_SITE',
       'IA_REGRESSION_PATH_DURATION', 'IA_REGRESSION_IN',
       'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT', 'IA_REGRESSION_OUT_FULL',
       'IA_REGRESSION_OUT_COUNT', 'IA_REGRESSION_OUT_FULL_COUNT',
       'IA_FIXATION_COUNT', 'IA_RUN_COUNT', 'IA_SKIP', 'IP_START_TIME',
       'IP_END_TIME', 'EYE_USED', 'TRIAL_FIXATION_COUNT', 'TRIAL_IA_COUNT',
       'TRIAL_DWELL_TIME

In [100]:
from tqdm import tqdm

### Collect each distinct value of the 'sentence' column as a plain Python list.
sentence_column = full_df['sentence']
sentences = sentence_column.unique().tolist()

# Report: number of unique sentences, then number of rows in the full table.
n_sentences = len(sentences)
n_rows = len(full_df)
print(n_sentences, n_rows)

28208 648696


In [None]:
# Compute the metrics table for every unique sentence, keyed by the sentence text.

# Load the spaCy pipeline once: it does not change between iterations, and
# reloading it for every sentence only adds a large, avoidable cost to the loop.
parsing_model = spacy.load("en_core_web_sm")

result = {}
for i, sentence in enumerate(sentences):
    try:
        surp_res = tm_utils.get_metrics(
            text=sentence,
            models=models,
            tokenizers=tokenizers,
            model_names=model_names,
            parsing_model=parsing_model,
            add_parsing_features=False,
        )
        result[sentence] = surp_res

    except Exception as e:
        # Stop at the first failure and report the offending sentence, the error
        # and the loop index. `result` keeps everything computed up to this point,
        # so it may be incomplete if this branch is hit.
        print(f"Error at {sentence}")
        print(e)
        print(i)
        break


In [16]:
import pickle

# save results:
with open("surprisal_results.pkl", "wb") as f:
    pickle.dump(result, f)

# load results:
# Use a context manager so the file handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were produced by this notebook (or another trusted source).
with open("surprisal_results.pkl", "rb") as f:
    result = pickle.load(f)


In [34]:
### Write to TSV line by line (due to size constraints). This cell is currently disabled: everything below is commented out.

# import csv
# from tqdm import tqdm

# with open("surp_data/final_surp_df3.tsv", "w") as f:
#     writer = csv.writer(f, delimiter='\t')
    
#     keys = df_smaller.keys().tolist() + ["Wordfreq_Frequency", "subtlex_Frequency", "EleutherAI/pythia-70m_Surprisal", "EleutherAI/pythia-160m_Surprisal", "EleutherAI/pythia-410m_Surprisal"]
#     writer.writerow(keys)

#     j = 0 
#     for i, row in tqdm(df_smaller.iterrows()):
#         answer_df_row = answer_df.iloc[j]
#         if row['IA_LABEL'] == answer_df_row['Word']:
#             answer_df_row = answer_df_row.drop(labels=['Word', 'Length'])
#             writer.writerow(row.tolist() + answer_df_row.tolist())
#             j += 1
#         else:
#             writer.writerow(row.tolist() +  ["NaN"]*5)

317531it [00:17, 17872.67it/s]
