# Importing libraries

In [2]:
from utils import load_word_tokens, load_dataset, build_test_set_from_words
from utils import get_pretrained_Persian_GPT2, df_to_csv, df_to_csv2, final_eval, surprisal, final_eval2
from transformers import pipeline

# Loading and Cleaning dataset

In [3]:
# Word tokens are used to build the cleaned test set and the vocabulary below.
words = load_word_tokens("data/individual_words_n_4148.txt")
# The training split is not loaded here; the call is kept for reference.
# training_data = load_dataset('data/training.txt')
test_data = load_dataset('data/test.txt')

# Build the cleaned test set from the word tokens and the raw test data.
cleaned_test_data = build_test_set_from_words(words, test_data)

# Unique word types, in sorted order.
word_types = sorted(set(words))

# Loading pretrained Persian GPT-2 model

In [4]:
# Load the pretrained Persian GPT-2 tokenizer and model through the project helper.
tokenizer, model = get_pretrained_Persian_GPT2()

# `pipeline(config=...)` is documented as taking a model id string or a
# PretrainedConfig, so a plain dict there does not configure generation.
# Passing `max_length` as a pipeline keyword forwards it to text generation,
# so the 256-token cap actually applies to every `generator(...)` call.
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=256,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


# Testing tokenizer and generator

In [5]:
# Sanity check: tokenize a short Persian phrase and display the resulting encoding.
tokenizer('آلیس در سرزمین عجایب')

{'input_ids': [5, 13773, 46, 1490, 10893, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [6]:
# Smoke-test the generator on a short Persian prompt and print the text of
# the first returned candidate.
sample = generator('آلیس در سرزمینی')
first_candidate = sample[0]
print(first_candidate['generated_text'])

Setting `pad_token_id` to `eos_token_id`:9 for open-end generation.


آلیس در سرزمینی رویایی و خیال انگیز به دنیا آمد و در سال  میلادی در جزیره موریس به دنیا آمد.


In [7]:
# Inspect the first entry of the cleaned test set.
cleaned_test_data[0]

['آلیس',
 'نشسته',
 'بود',
 'لب',
 'آب',
 'کنار',
 'خواهرش',
 'و',
 'از',
 'اینکه',
 'کاری',
 'انجام',
 'نمی\u200cداد',
 'دیگر',
 'حوصله\u200cاش',
 'داشت',
 'سر',
 'می\u200cرفت']

# Finding sequential entropy and surprisal

In [8]:
# Run final_eval over the cleaned test data; its two return values are bound
# to surprisal_list and probs for the cells that follow.
surprisal_list, probs = final_eval(model, tokenizer, cleaned_test_data)

100%|██████████| 238/238 [00:30<00:00,  7.87it/s]


In [9]:
# Spot check: apply surprisal() to a single entry of probs; the result can be
# compared against surprisal_list[18][2].
surprisal(probs[18][2])

5.526613191296309

In [10]:
# Display the surprisal values stored at index 18 of surprisal_list.
surprisal_list[18]

[9.658901866691691,
 4.578535571148495,
 5.526613191296309,
 2.202548733426862,
 5.942894054297599,
 0.8405537611216134,
 4.615532055327183,
 2.8866395095900486,
 0.03112582642434694,
 4.299322806593418,
 10.342905317369427,
 4.291148494920171,
 3.7162980771167264,
 5.284420854245871,
 0.3559075116570685]

# Saving words and their evaluations as csv file

In [12]:
# Pass the words together with their probabilities and surprisals to
# df_to_csv, targeting the CSV path below.
df_to_csv(
    cleaned_test_data,
    probs,
    surprisal_list,
    file_path='results/word_list_with_evaluation.csv',
)

Unnamed: 0,word,probability,surprisal
0,آلیس,9.792317e-07,19.961846
1,نشسته,3.701029e-06,18.043642
2,بود,2.300208e-01,2.120164
3,لب,1.320549e-04,12.886575
4,آب,7.880477e-04,10.309429
...,...,...,...
4143,که,6.474356e-01,0.627191
4144,شاه,2.288974e-05,15.414939
4145,آن,1.744918e-03,9.162625
4146,روز,3.943570e-02,4.664354


In [13]:
# Re-run the evaluation with final_eval2.
# NOTE: this rebinds surprisal_list and probs, so the results of the earlier
# final_eval call are no longer reachable under these names once this cell runs.
surprisal_list, probs = final_eval2(model, tokenizer, cleaned_test_data)

100%|██████████| 238/238 [00:29<00:00,  7.99it/s]


In [14]:
# Pass the final_eval2 results (and the tokenizer) to df_to_csv2, targeting
# the CSV path below.
df_to_csv2(
    tokenizer,
    cleaned_test_data,
    probs,
    surprisal_list,
    file_path='results/evaluation_without_alignment.csv',
)

Unnamed: 0,word,probability,surprisal
0,▁آلیس,9.792317e-07,19.961846
1,▁نشسته,3.701029e-06,18.043642
2,▁بود,2.300208e-01,2.120164
3,▁لب,1.320549e-04,12.886575
4,▁آب,7.880477e-04,10.309429
...,...,...,...
4833,▁که,6.474356e-01,0.627191
4834,▁شاه,2.288974e-05,15.414939
4835,▁آن,1.744918e-03,9.162625
4836,▁روز,3.943570e-02,4.664354
