# Importing libraries

In [3]:


from tqdm import tqdm

import numpy as np
import pandas as pd
import pickle

# Loading and Cleaning dataset

In [12]:
from utils import load_word_tokens, load_dataset, build_test_set_from_words

# Read the flat list of word tokens, then the training and test splits.
words = load_word_tokens("dataset/individual_words_n_4148.txt")
training_data = load_dataset('dataset/training.txt')
test_data = load_dataset('dataset/test.txt')

# Build the cleaned test set from the word tokens and the loaded test split
# (the helper lives in utils; its exact alignment rules are not visible here).
cleaned_test_data = build_test_set_from_words(words, test_data)

# Unique word types in sorted order; sorted() accepts the set directly.
word_types = sorted(set(words))

In [None]:
# Disabled legacy loading path, kept as a bare string literal so none of it executes.
# It read sheets 'Section1'..'Section6' from an Excel workbook into `words` and then
# loaded `cleaned_test_data` from 'cleaned_words.pkl'.
# NOTE(review): the workbook path is an absolute '/content/...' path and the snippet
# uses pickle.load; only unpickle files from a trusted source if this is ever revived.
# NOTE(review): as the last expression of the cell, this string would be echoed as the
# cell output if the cell were run.
'''dfs = []
xls = pd.ExcelFile("/content/Alice_only_words.xlsx")
for i in range(1,7,1):
    dfs += [pd.read_excel(xls, 'Section' + str(i))]

words = []
for i in range(6):
    words += list(dfs[i]['Word'])


with open('cleaned_words.pkl', 'rb') as f:
    cleaned_test_data = pickle.load(f)'''

# Loading pretrained Persian GPT-2 model

In [7]:
from utils import get_pretrained_Persian_GPT2
from transformers import pipeline

# Load the pretrained Persian GPT-2 tokenizer and model through the project helper.
tokenizer, model = get_pretrained_Persian_GPT2()

# Build a text-generation pipeline around the already-instantiated model.
# `pipeline(config=...)` expects a model id or a PretrainedConfig object; a plain dict
# is not applied when a model instance is passed, so `config={'max_length': 256}` never
# limited generation. Extra keyword arguments are forwarded to the pipeline and become
# its default generation parameters, so `max_length=256` is passed that way instead.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=256)

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/537k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [20]:
import torch
# Encode a short Persian prompt as PyTorch tensors and run one forward pass.
inputs = tokenizer('کشورهای غربی و', return_tensors="pt")
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
out = model(input_ids, attention_mask=attention_mask)

# Indices along dim 2 sorted by ascending logit, taken at [0, -1]
# (assuming logits are laid out as (batch, sequence, vocabulary) — TODO confirm).
out['logits'].argsort(dim=2)[0, -1]

tensor([   40, 24924,    27,  ...,   313,    43,    48])

In [9]:
# Tokenize a single word to inspect the ids, token type ids and attention mask returned for it.
tokenizer('سرزمین')

{'input_ids': [5, 1490, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [10]:
# Softmax over dim 2 of the logits. The previous `F.Softmax` depended on an alias `F`
# that no visible cell imports (and `torch.nn.functional` has no `Softmax` attribute);
# `torch.nn.Softmax` is the module class that accepts `dim=`.
soft = torch.nn.Softmax(dim=2)

# Sort the probabilities along dim 2 (ascending) and show the ten largest values at
# index [0][2]. The earlier stand-alone `soft(out['logits'])` call was dropped: its
# result was neither stored nor displayed.
torch.sort(soft(out['logits']), 2)[0][0][2][-10:]

tensor([0.0124, 0.0164, 0.0189, 0.0354, 0.0400, 0.0431, 0.0521, 0.0837, 0.1379,
        0.1632], grad_fn=<SliceBackward0>)

In [16]:
# Inspect one raw logit: entry 45 at index [0][2] of the logits
# (presumably batch 0, position 2, vocabulary id 45 — TODO confirm the axis layout).
out['logits'][0][2][45]

tensor(4.6699, grad_fn=<SelectBackward0>)

In [21]:
# Map vocabulary id 40 back to its token string.
# NOTE(review): 40 is the first id in the ascending argsort output shown earlier,
# so this presumably inspects the lowest-scoring token — confirm that is the intent.
tokenizer.convert_ids_to_tokens([40])

['[RES14]']

# Testing tokenizer and generator

In [None]:
# Tokenize a multi-word phrase to see how it is split into ids.
tokenizer('آلیس در سرزمین عجایب')

{'input_ids': [5, 13773, 46, 1490, 10893, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
# Generate a continuation for a short Persian prompt and print the first candidate's text.
sample = generator('آلیس در سرزمینی')
first_candidate = sample[0]
print(first_candidate['generated_text'])

Setting `pad_token_id` to `eos_token_id`:9 for open-end generation.


آلیس در سرزمینی به نام سرزمین فرشتگان زندگی می‌کند ، پس در سفر قبلی‌اش با او آشنا می‌‌شود اما دیگر با او رابطه‌ای ندارد .


In [None]:
# Show the first entry of the cleaned test data (a list of word strings in the recorded output).
cleaned_test_data[0]

['آلیس',
 'نشسته',
 'بود',
 'لب',
 'آب',
 'کنار',
 'خواهرش',
 'و',
 'از',
 'اینکه',
 'کاری',
 'انجام',
 'نمی\u200cداد',
 'دیگر',
 'حوصله\u200cاش',
 'داشت',
 'سر',
 'می\u200cرفت']

# Finding sequential entropy and surprisal

In [None]:
# Score the cleaned test data twice, selecting the metric through the `type` argument.
# NOTE(review): `final_eval` is neither defined nor imported in the visible cells —
# confirm where it comes from so the notebook survives Restart & Run All.
entropy_list, probs = final_eval(cleaned_test_data, type = 'entropy')
# The second return value is discarded on the surprisal pass.
surprisal_list, _ = final_eval(cleaned_test_data, type = 'surprisal')

100%|██████████| 235/235 [02:21<00:00,  1.66it/s]
100%|██████████| 235/235 [02:21<00:00,  1.66it/s]


In [None]:
# Spot check: recompute the entropy for probs[18][2]; the recorded result equals
# entropy_list[18][2] shown in the following cell.
# NOTE(review): `entropy` is not defined or imported in the visible cells — confirm its origin.
entropy(probs[18][2])

0.119889858804591

In [None]:
# Display the per-item entropy values stored at index 18.
entropy_list[18]

[0.011948364486892708,
 0.19162407698754177,
 0.119889858804591,
 0.4785115152837027,
 0.2668968856118256,
 0.469389174645734,
 0.2453511759172139,
 0.39032582018814943,
 0.030461485872693938,
 0.21836079898119604,
 0.06061932459841721,
 0.21918414251338486,
 0.28274418694796183,
 0.13559024333524664,
 0.2780982893835936]

# Saving words and their evaluations as a CSV file

In [None]:
# Write the words with their probability, entropy and surprisal values to a CSV file.
# NOTE(review): `df_to_csv` is not defined or imported in the visible cells — confirm its origin.
df_to_csv(cleaned_test_data, probs, entropy_list, surprisal_list, file_path = './word_list_with_evaluation_v2.csv')

Unnamed: 0,word,probability,entropy,surprisal
0,آلیس,9.792323e-07,0.000020,19.961846
1,نشسته,3.701035e-06,0.000067,18.043640
2,بود,2.300206e-01,0.487682,2.120165
3,لب,1.320549e-04,0.001702,12.886575
4,آب,7.880474e-04,0.008124,10.309430
...,...,...,...,...
4145,که,6.474361e-01,0.406066,0.627190
4146,شاه,2.288975e-05,0.000353,15.414939
4147,آن,1.744918e-03,0.015988,9.162625
4148,روز,3.943569e-02,0.183942,4.664354


In [None]:
# Repeat the evaluation with `final_eval2`.
# NOTE(review): this rebinds `entropy_list`, `probs` and `surprisal_list`, so the results of
# the earlier `final_eval` run are no longer reachable after this cell; re-running earlier
# inspection cells afterwards would show these values instead.
# NOTE(review): `final_eval2` is not defined or imported in the visible cells — confirm its origin.
entropy_list, probs = final_eval2(cleaned_test_data, type = 'entropy')
surprisal_list, _ = final_eval2(cleaned_test_data, type = 'surprisal')

100%|██████████| 235/235 [02:21<00:00,  1.66it/s]
100%|██████████| 235/235 [02:22<00:00,  1.65it/s]


In [None]:
# Write the second evaluation to its own CSV file; the recorded output lists '▁'-prefixed
# tokens in the `word` column rather than whole words.
# NOTE(review): `df_to_csv2` is not defined or imported in the visible cells — confirm its origin.
df_to_csv2(cleaned_test_data, probs, entropy_list, surprisal_list, file_path = './evaluation_without_alignment_v2.csv')

Unnamed: 0,word,probability,entropy,surprisal
0,▁آلیس,9.792323e-07,0.000020,19.961846
1,▁نشسته,3.701035e-06,0.000067,18.043640
2,▁بود,2.300206e-01,0.487682,2.120165
3,▁لب,1.320549e-04,0.001702,12.886575
4,▁آب,7.880474e-04,0.008124,10.309430
...,...,...,...,...
4835,▁که,6.474361e-01,0.406066,0.627190
4836,▁شاه,2.288975e-05,0.000353,15.414939
4837,▁آن,1.744918e-03,0.015988,9.162625
4838,▁روز,3.943569e-02,0.183942,4.664354
