# Installing needed packages

In [None]:
# Clone the AraBERT repository into the working directory; the import cell
# below pulls `ArabertPreprocessor` from its `arabert.preprocess` module.
!git clone https://github.com/aub-mind/arabert.git

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorForLanguageModeling,TextDataset
from arabert.preprocess import ArabertPreprocessor, never_split_tokens
from transformers import GPT2TokenizerFast, pipeline
from transformers import GPT2LMHeadModel
from py4j.java_gateway import JavaGateway
from transformers import (
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    pipeline)
import torch

In [None]:
# Load the raw corpus and report its dimensions as a quick sanity check.
df = pd.read_csv("data.csv")
print(df.shape)

In [None]:
# Uncomment to run the notebook on the first 10 rows only (quick smoke test):
# df = df[:10]

In [None]:
# Hold out 10% of the `text` column for validation; the fixed seed keeps the
# split reproducible between runs.
X_tr, X_val = train_test_split(
    df["text"],
    test_size=0.1,
    random_state=42,
)

In [None]:
# Checkpoint identifier used to load both the model and its tokenizer.
model_name = "aubmindlab/aragpt2-base"

tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# mlm=False selects causal (next-token) language modelling batches instead of
# masked-language-modelling ones.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# One preprocessor instance, configured for the chosen checkpoint, is applied
# to every sentence of both splits.
arabert_prep = ArabertPreprocessor(model_name=model_name)

X_tr = X_tr.apply(arabert_prep.preprocess)
X_val = X_val.apply(arabert_prep.preprocess)

In [None]:
# Join the sentences with a newline. Joining on the empty string glued the
# last word of each sentence to the first word of the next one, so the
# fine-tuning corpus contained fused words that never occur in the data.
train_txt = "\n".join(X_tr)
test_txt = "\n".join(X_val)

In [None]:
# Persist both corpora as UTF-8 text files in the working directory.
for out_name, contents in (('train.txt', train_txt), ('test.txt', test_txt)):
    with open(out_name, 'w', encoding="utf-8") as out_file:
        out_file.write(contents)

# Fine-tuning AraGPT2

In [None]:
# Absolute locations of the corpus files consumed by the dataset cell.
# NOTE(review): the corpus files are written as 'train.txt' / 'test.txt'
# relative to the working directory - confirm that directory is this folder.
train_path = r'D:\datasets\processed\Arabic Auto-Complete System\train.txt'
test_path = r'D:\datasets\processed\Arabic Auto-Complete System\test.txt'

In [None]:
# Build the training and evaluation datasets with identical settings: the
# same tokenizer and 64-token blocks, differing only in the source file.
train_dataset, test_dataset = (
    TextDataset(tokenizer=tokenizer, file_path=corpus_path, block_size=64)
    for corpus_path in (train_path, test_path)
)

In [None]:
# Hyper-parameters for a single fine-tuning epoch.
training_args = TrainingArguments(
    output_dir="data/out",            # where predictions and checkpoints are written
    overwrite_output_dir=True,        # overwrite whatever the directory already holds
    num_train_epochs=1,               # total number of passes over the training set
    learning_rate=5e-5,               # same value as the library default
    per_device_train_batch_size=32,   # batch size used for training
    per_device_eval_batch_size=32,    # batch size used for evaluation
)

# Wire the model, the data and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured above; the returned value is shown as
# the cell output.
trainer.train()

# Generating next-word candidates

In [None]:
def predict(text, arabert_prep, tokenizer, model, k=5):
    """Return the most likely continuations of ``text`` according to ``model``.

    Args:
        text: Raw prompt to complete.
        arabert_prep: Object exposing ``preprocess(str) -> str``; applied to
            the prompt before tokenization.
        tokenizer: Object exposing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        model: Torch language model whose first output, indexed as
            ``[0, -1, :]``, holds the next-token scores for the prompt.
        k: Maximum number of candidates to return.

    Returns:
        Up to ``k`` candidate strings ordered from most to least probable.
        Candidates that decode to whitespace only are dropped. An empty list
        is returned when the prompt produces no tokens.
    """
    text = arabert_prep.preprocess(text)
    indexed_tokens = tokenizer.encode(text)
    if not indexed_tokens:
        # Nothing to condition on; the model cannot be run on an empty sequence.
        return []

    # Run on whichever device the model lives on (after training it may have
    # been moved to the GPU), instead of assuming the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    tokens_tensor = torch.tensor([indexed_tokens], device=device)

    # Evaluation mode deactivates the dropout modules.
    model.eval()

    with torch.no_grad():
        outputs = model(tokens_tensor)
        next_token_scores = outputs[0][0, -1, :]
        probs = torch.softmax(next_token_scores, dim=0)
        # Never ask for more candidates than there are vocabulary entries.
        top_k = min(k, probs.shape[0])
        _, top_indices = torch.topk(probs, top_k)

    # Decode every candidate on its own. Decoding all ids in one call and
    # splitting on whitespace glues a sub-word continuation onto the previous
    # candidate and so returns words the model never proposed.
    candidates = (tokenizer.decode([idx]).strip() for idx in top_indices.tolist())
    return [word for word in candidates if word]

In [None]:
# Example: candidate continuations for a short prompt, using the fine-tuned model.
predict(
    "قررت المحكمة",
    arabert_prep,
    tokenizer,
    trainer.model,
)

# Saving the model

In [None]:
import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the fine-tuned weights.
os.makedirs('models', exist_ok=True)
torch.save(trainer.model.state_dict(), 'models/model.pth')

# Perplexity

In [None]:
import math
def perplexity(X_val, arabert_prep, tokenizer, model):
    """Compute the token-level perplexity of ``model`` over ``X_val``.

    Every text is preprocessed and tokenized, and each token after the first
    is scored by the probability the model assigns to it given the preceding
    tokens. The result is ``2 ** (-mean log2 probability)`` over all scored
    tokens of all texts.

    Args:
        X_val: Iterable of raw text strings.
        arabert_prep: Object exposing ``preprocess(str) -> str``.
        tokenizer: Object exposing ``encode(str) -> list[int]``.
        model: Torch causal language model whose first output has the layout
            (batch, position, vocabulary).

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If no text yields at least two tokens, so that there is
            nothing to score.
    """
    # Evaluation mode deactivates the dropout modules; it only needs to be
    # set once, not once per token.
    model.eval()

    # Run on whichever device the model lives on (after training it may have
    # been moved to the GPU), instead of assuming the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total_log2_prob = 0.0
    num = 0
    for text in X_val:
        cleaned = arabert_prep.preprocess(text)
        indexed_tokens = tokenizer.encode(cleaned)
        if len(indexed_tokens) < 2:
            # A single token has no preceding context to be predicted from.
            continue

        tokens_tensor = torch.tensor([indexed_tokens], device=device)
        with torch.no_grad():
            # One forward pass scores every position at once. This assumes a
            # causal model (such as GPT-2), where the scores at position i
            # depend only on tokens 0..i, so they equal what a separate pass
            # over each prefix would produce.
            scores = model(tokens_tensor)[0][0]
            # log_softmax stays finite where softmax followed by log would
            # underflow to log(0).
            log_probs = torch.log_softmax(scores[:-1].double(), dim=-1)
            targets = tokens_tensor[0, 1:].unsqueeze(1)
            token_log_probs = log_probs.gather(1, targets).squeeze(1)

        # Convert the natural-log sum to base 2.
        total_log2_prob += token_log_probs.sum().item() / math.log(2)
        num += len(indexed_tokens) - 1

    if num == 0:
        raise ValueError(
            "perplexity is undefined: no text produced at least two tokens"
        )

    return math.pow(2, -total_log2_prob / num)
# Evaluate the fine-tuned model on the held-out validation sentences.
perplexity(
    X_val,
    arabert_prep,
    tokenizer,
    trainer.model,
)