In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import re

In [None]:
# Load the corpus from data.csv (path is relative to the working directory);
# a later cell reads its 'text' column.
df = pd.read_csv('data.csv')
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)

In [None]:
# df = df[:10]

In [None]:
# Hold out 10% of the 'text' column for validation; random_state is fixed so
# the same split is produced on every run.
X_tr, X_val = train_test_split(df['text'], test_size=0.1, random_state=42)

In [None]:
def text_cleaner(text):
    """Strip unwanted characters from one sentence and normalise its whitespace.

    Parameters
    ----------
    text : str
        Raw sentence. Any non-string value (e.g. ``None`` or the float NaN
        that pandas uses for a missing cell) is treated as an empty sentence.

    Returns
    -------
    str
        The sentence with every character of the removal class deleted and
        the remaining tokens joined by single spaces; ``""`` when nothing is
        left or when ``text`` is not a string.
    """
    # A missing CSV cell is not a str; re.sub would raise TypeError on it.
    if not isinstance(text, str):
        return ""

    # U+FE70-U+FEFF is the Arabic Presentation Forms-B block. The `^` sits
    # outside the character class, so only a matching FIRST character of the
    # string is replaced by a space.
    # NOTE(review): confirm whether every occurrence was meant to be replaced.
    newString = re.sub("^[\uFE70-\uFEFF]", " ", text)

    # Delete every character in the class: ASCII digits and Latin letters plus
    # the punctuation, quote and Arabic mark characters listed. Note that
    # `%-/` inside the class is a range (U+0025..U+002F), not three literals.
    newString = re.sub(r"[.،\"()0-9:A-Za-z,!%-/؟'ّ»ـ»'ً«'ُ'ْ'َ'ٍ{}؛'ِ'ٌ…\\|\xad”@_?<>’“\]\[éà=‘]","",newString)

    # split() with no argument already drops leading/trailing/repeated
    # whitespace, so re-joining yields a single-space separated sentence.
    return " ".join(newString.split())

In [None]:
# Create a placeholder for model
def func_one():
    """Return 0; used by func_two as the default value of its inner defaultdict."""
    return 0
def func_two():
    """Return a fresh defaultdict whose missing keys default to func_one()'s value (0)."""
    return defaultdict(func_one)

# model[context][next_word] -> count. `context` is either a (w1, w2) tuple
# (trigram statistics) or a single word w1 (bigram statistics); both kinds of
# key live in the same dictionary.
model = defaultdict(func_two)

# Make a vocab
vocab = set()

# Count frequency of co-occurrence
for sentence in X_tr:

    # Skip rows that are not text. pandas represents a missing CSV cell as a
    # float NaN, which a `!= None` comparison does not catch and which would
    # make text_cleaner's re.sub raise TypeError.
    if not isinstance(sentence, str):
        continue

    sentence = text_cleaner(sentence)
    words = sentence.split()
    vocab.update(words)
    # Padding on both sides adds None markers, so sentence starts and ends
    # are counted as contexts / followers too.
    for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
    for w1, w2 in bigrams(words, pad_right=True, pad_left=True):
        model[w1][w2] += 1

In [None]:
import math
import random

def _smoothed_prob(followers, word, smoothing_alpha):
    """Additively smoothed probability of `word` given a seen context's follower counts."""
    total = sum(followers.values())
    if word in followers:
        count = followers[word]
    else:
        # Follower never seen after this context: it is given a pseudo-count
        # of 1, added to the denominator as well.
        # NOTE(review): this scores an unseen follower like one seen once —
        # confirm that is intended.
        count = 1
        total += 1
    return (count + smoothing_alpha) / (total + len(vocab) * smoothing_alpha)


def perplexity(data, smoothing_alpha=0.001):
    """Compute the per-token perplexity of `data` under the global `model`.

    Each sentence is cleaned with text_cleaner, padded with two None markers
    on both sides, and every position from index 2 on is scored with the
    first context found in `model`:

    1. the trigram context (w[i-2], w[i-1]);
    2. otherwise the bigram context w[i-1];
    3. otherwise a uniform 1/|vocab| probability.

    Parameters
    ----------
    data : iterable
        Sentences to score. Items that are not strings (None, NaN for a
        missing CSV cell, ...) are skipped.
    smoothing_alpha : float, default 0.001
        Additive smoothing constant applied for seen contexts.

    Returns
    -------
    float
        2 ** (-mean log2 probability) over all scored positions, or NaN when
        no position was scored (empty `data` or no string items).
    """
    log_prob_sum = 0.0
    num = 0
    for sentence in data:
        # Skip non-text rows outright. Previously the scoring loop ran even
        # when the guard failed, re-scoring the previous sentence's words (or
        # raising NameError on the first row).
        if not isinstance(sentence, str):
            continue

        words = [None, None] + text_cleaner(sentence).split() + [None, None]

        for i in range(2, len(words)):
            trigram_context = (words[i-2], words[i-1])
            bigram_context = words[i-1]

            # Plain `in` on the defaultdict does not create missing keys.
            if trigram_context in model:
                prob = _smoothed_prob(model[trigram_context], words[i], smoothing_alpha)
            elif bigram_context in model:
                prob = _smoothed_prob(model[bigram_context], words[i], smoothing_alpha)
            else:
                # Context never seen: deterministic uniform probability over
                # the vocabulary. The earlier random.random() / len(vocab)
                # made the metric irreproducible and could be exactly 0.0,
                # which makes math.log raise a domain error.
                prob = 1.0 / max(len(vocab), 1)

            num += 1
            log_prob_sum += math.log2(prob)

    if num == 0:
        # Nothing to score: perplexity is undefined rather than a crash.
        return float("nan")
    return math.pow(2, -log_prob_sum / num)

# Score the held-out split with the default smoothing_alpha; as the cell's
# last expression, the returned value is shown as the cell output.
perplexity(X_val)

In [None]:
# Turn every context's follower counts into relative frequencies, in place:
# each count is divided by the sum of counts recorded for that context.
for context, followers in model.items():
    denominator = float(sum(followers.values()))
    for follower in followers:
        followers[follower] /= denominator

In [None]:
# Seed words for the continuation
text = ['قررت','المحكمة']

# Try the two-word (trigram) context first, then fall back to the last word
# alone (bigram context). `words` stays empty when neither is in the model.
words = []
for context in (tuple(text[-2:]), text[-1]):
    if context in model:
        # Rank the followers of this context by their stored weight and keep
        # the three best; the top one extends the text.
        words = sorted(model[context], key=model[context].get, reverse=True)[:3]
        text.append(words[0])
        break

print(words)
# Falsy entries (e.g. a None padding marker chosen as follower) are not printed.
print(' '.join(t for t in text if t))