# Installing required libraries

In [5]:
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: ignored

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.1 MB/s[0m eta [36m0:00:0

# Importing libraries

In [2]:
from transformers import pipeline, AutoTokenizer, GPT2LMHeadModel
import torch.nn as F
from tqdm import tqdm
import math
import numpy as np
import pandas as pd
import pickle

# Defining important functions

In [3]:
def entropy(p):
    if p != 0:
        E = p * math.log(p, 2)
        return -E
    else:
        return 0

def surprisal(p):
    if p == 0:
        S = math.inf
        return -S
    S = math.log(p, 2)
    return -S

def load_word_tokens(file_path):
    words_file = open(file_path ,"r")
    words = words_file.readlines()
    words_file.close()
    words = [w.strip() for w in words]
    return words


def load_dataset(file_path):
    file_ = open(file_path ,"r")
    file1 = file_.readlines()
    file_.close()
    file1 = [d.strip() for d in file1]
    return file1



def build_test_set_from_words(words, test_data):
    len__ = 0
    cleaned_test_data = []
    for i in range(len(test_data)):
        cleaned_test_data += [words[len__:len__ + len(test_data[i].split(' '))]]
        len__ += len(test_data[i].split(' '))
    return cleaned_test_data


def get_pretrained_Persian_GPT2():
    tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')
    model = GPT2LMHeadModel.from_pretrained('bolbolzaban/gpt2-persian')

    return tokenizer, model



def calculate_prob_batch(test_set, return_subtokens = False):
    outputs = []
    for s in tqdm(test_set):
        soft = F.Softmax(dim=2)
        inputs = tokenizer(' '.join(s), return_tensors="pt")
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        out = soft(model(input_ids, attention_mask=attention_mask)['logits'])
        ps = [float(out[0][i][x]) for i,x in enumerate(inputs['input_ids'][0][1:-1])]
        outputs.append(ps[:])
        del inputs, out
    return outputs


def get_subtokens(test_set):
    subtokens = []
    for s in test_set:
        inputs = tokenizer(' '.join(s), return_tensors="pt")
        subtokens += inputs['input_ids'][0][1:-1].tolist()
    return subtokens

def tokenizer_alignment(test_set):
    out = []
    for x in test_set:
        temp_list = []
        for y in x:
            temp = tokenizer(y, return_tensors="pt")['input_ids'][0][1:-1]
            temp_list.append(temp.tolist())
        out.append(temp_list)
    return [[len(y) for y in x] for x in out]



def calculate_sequential_entropy(prob_seq):
    temp = [entropy(prob_seq[i]) for i in range(len(prob_seq))]
    return temp


def calculate_sequential_surprisal(prob_seq):
    temp = [surprisal(prob_seq[i]) for i in range(len(prob_seq))]
    return temp


def align_with_real_word_tokens(seq, alignment, type = 'sum'):
    output = []
    if type == 'sum':
        count = 0
        for x in alignment:
            aln = seq[count: count + x]
            output += [np.average(aln)]
            count += x
        return output
    else:
        count = 0
        for x in alignment:
            aln = seq[count: count + x]
            output += [np.product(aln)**(1/x)]
            count += x
        return output


def final_eval(test_set, type = 'entropy'):
    outputs = calculate_prob_batch(test_set)
    alignment = tokenizer_alignment(test_set)
    probs = [align_with_real_word_tokens(outputs[i], alignment[i], type = 'product') for i in range(len(outputs))]
    eval = []
    if type == 'entropy':
        for i in range(len(outputs)):
            A = align_with_real_word_tokens(calculate_sequential_entropy(outputs[i]), alignment[i])
            eval.append(A)
    elif type == 'surprisal':
        for i in range(len(outputs)):
            A = align_with_real_word_tokens(calculate_sequential_surprisal(outputs[i]), alignment[i])
            eval.append(A)
    return eval, probs


def final_eval2(test_set, type = 'entropy'):
    probs = calculate_prob_batch(test_set)
    eval = []
    if type == 'entropy':
        for i in range(len(probs)):
            A = calculate_sequential_entropy(probs[i])
            eval.append(A)
    elif type == 'surprisal':
        for i in range(len(probs)):
            A = calculate_sequential_surprisal(probs[i])
            eval.append(A)
    return eval, probs


def save_df_to_csv(test_set, probability, entropy_list, surprisal_list, file_path = './word_list_with_evaluation.csv'):
    final_out = []
    for i, s in enumerate(test_set):
        for j, x in enumerate(s):
            final_out.append({'word': x, 'probability': probability[i][j], 'entropy': entropy_list[i][j], 'surprisal': surprisal_list[i][j]})
    df = pd.DataFrame(final_out)
    df.to_csv(file_path, sep='\t', index=None)
    return df


def save_df_to_csv2(test_set, probability, entropy_list, surprisal_list, file_path = './word_list_with_evaluation.csv'):
    sub__ = get_subtokens(cleaned_test_data)
    w__ = tokenizer.convert_ids_to_tokens(sub__)
    probability__ = [y for x in  probability for y in x]
    entropy_list__ = [y for x in  entropy_list for y in x]
    surprisal_list__ = [y for x in  surprisal_list for y in x]

    final_out = []
    for i,x in enumerate(w__):
        final_out.append({'word': x, 'probability': probability__[i], 'entropy': entropy_list__[i], 'surprisal': surprisal_list__[i]})
    df = pd.DataFrame(final_out)
    df.to_csv(file_path, sep='\t', index=None)
    return df

# Loading and Cleaning dataset

In [4]:
words = load_word_tokens("individual_words_n_4148.txt")
training_data = load_dataset('training.txt')
test_data = load_dataset('test.txt')

cleaned_test_data = build_test_set_from_words(words, test_data)
word_types = sorted(list(set(words)))

FileNotFoundError: ignored

In [None]:
'''dfs = []
xls = pd.ExcelFile("/content/Alice_only_words.xlsx")
for i in range(1,7,1):
    dfs += [pd.read_excel(xls, 'Section' + str(i))]

words = []
for i in range(6):
    words += list(dfs[i]['Word'])


with open('cleaned_words.pkl', 'rb') as f:
    cleaned_test_data = pickle.load(f)'''

# Loading pretrained Persian GPT-2 model

In [None]:
tokenizer, model = get_pretrained_Persian_GPT2()
generator = pipeline('text-generation', model, tokenizer=tokenizer, config={'max_length':256})

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/524k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [None]:
import torch
inputs = tokenizer('کشورهای غربی و', return_tensors="pt")
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
out = model(input_ids, attention_mask=attention_mask)
torch.argsort(out['logits'], 2)[0][2][-10:]

tensor([62, 51, 95, 57, 46, 49, 50, 48, 45, 53])

In [None]:
tokenizer('سرزمین')

{'input_ids': [5, 1490, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [None]:
soft = F.Softmax(dim=2)
soft(out['logits'])
torch.sort(soft(out['logits']), 2)[0][0][2][-10:]

tensor([0.0124, 0.0164, 0.0189, 0.0354, 0.0400, 0.0431, 0.0521, 0.0837, 0.1379,
        0.1632], grad_fn=<SliceBackward>)

In [None]:
out['logits'][0][2][45]

tensor(4.6699, grad_fn=<SelectBackward>)

In [None]:
tokenizer.convert_ids_to_tokens([45])

['▁و']

# Testing tokenizer and generator

In [None]:
tokenizer('آلیس در سرزمین عجایب')

{'input_ids': [5, 13773, 46, 1490, 10893, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
sample = generator('آلیس در سرزمینی')
print(sample[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:9 for open-end generation.


آلیس در سرزمینی به نام سرزمین فرشتگان زندگی می‌کند ، پس در سفر قبلی‌اش با او آشنا می‌‌شود اما دیگر با او رابطه‌ای ندارد .


In [None]:
cleaned_test_data[0]

['آلیس',
 'نشسته',
 'بود',
 'لب',
 'آب',
 'کنار',
 'خواهرش',
 'و',
 'از',
 'اینکه',
 'کاری',
 'انجام',
 'نمی\u200cداد',
 'دیگر',
 'حوصله\u200cاش',
 'داشت',
 'سر',
 'می\u200cرفت']

# Finding sequential entropy and surprisal

In [None]:
entropy_list, probs = final_eval(cleaned_test_data, type = 'entropy')
surprisal_list, _ = final_eval(cleaned_test_data, type = 'surprisal')

100%|██████████| 235/235 [02:21<00:00,  1.66it/s]
100%|██████████| 235/235 [02:21<00:00,  1.66it/s]


In [None]:
entropy(probs[18][2])

0.119889858804591

In [None]:
entropy_list[18]

[0.011948364486892708,
 0.19162407698754177,
 0.119889858804591,
 0.4785115152837027,
 0.2668968856118256,
 0.469389174645734,
 0.2453511759172139,
 0.39032582018814943,
 0.030461485872693938,
 0.21836079898119604,
 0.06061932459841721,
 0.21918414251338486,
 0.28274418694796183,
 0.13559024333524664,
 0.2780982893835936]

# Saving words and their evaluations as csv file

In [None]:
save_df_to_csv(cleaned_test_data, probs, entropy_list, surprisal_list, file_path = './word_list_with_evaluation_v2.csv')

Unnamed: 0,word,probability,entropy,surprisal
0,آلیس,9.792323e-07,0.000020,19.961846
1,نشسته,3.701035e-06,0.000067,18.043640
2,بود,2.300206e-01,0.487682,2.120165
3,لب,1.320549e-04,0.001702,12.886575
4,آب,7.880474e-04,0.008124,10.309430
...,...,...,...,...
4145,که,6.474361e-01,0.406066,0.627190
4146,شاه,2.288975e-05,0.000353,15.414939
4147,آن,1.744918e-03,0.015988,9.162625
4148,روز,3.943569e-02,0.183942,4.664354


In [None]:
entropy_list, probs = final_eval2(cleaned_test_data, type = 'entropy')
surprisal_list, _ = final_eval2(cleaned_test_data, type = 'surprisal')

100%|██████████| 235/235 [02:21<00:00,  1.66it/s]
100%|██████████| 235/235 [02:22<00:00,  1.65it/s]


In [None]:
save_df_to_csv2(cleaned_test_data, probs, entropy_list, surprisal_list, file_path = './evaluation_without_alignment_v2.csv')

Unnamed: 0,word,probability,entropy,surprisal
0,▁آلیس,9.792323e-07,0.000020,19.961846
1,▁نشسته,3.701035e-06,0.000067,18.043640
2,▁بود,2.300206e-01,0.487682,2.120165
3,▁لب,1.320549e-04,0.001702,12.886575
4,▁آب,7.880474e-04,0.008124,10.309430
...,...,...,...,...
4835,▁که,6.474361e-01,0.406066,0.627190
4836,▁شاه,2.288975e-05,0.000353,15.414939
4837,▁آن,1.744918e-03,0.015988,9.162625
4838,▁روز,3.943569e-02,0.183942,4.664354
