In [7]:
import numpy as np
from hazm import *
import time
from collections import defaultdict

In [2]:
########################################## Second Part of Exercise
class LanguageModel():
    
    @staticmethod
    def get_corpus_sentences(direction):
        sentences = None
        with open(direction,'r') as input_file:
            sentences = input_file.readlines()
        return sentences
    
    def __init__(self,n,smoothing,corpus_dir):
        self.n = n
        self.smoothing = smoothing
        self.corpus_dir = corpus_dir        
        
    def add_unigram(self,token):
        self.unigram_count[token] += 1
        self.total_tokens += 1
        
    def train_unigram(self):
        self.unigram_count, sentences, self.total_tokens, self.best_next = \
        defaultdict(lambda: 0), self.get_corpus_sentences(self.corpus_dir), 0,defaultdict(lambda: 'UNK')
        for sentence in sentences:
            tokens = sentence.split(" ")
            for token in tokens:
                self.add_unigram(token)
                
    def smoothed_unigram(self, token_count):
        if self.smoothing == False:
            return token_count/self.total_tokens
        else:
            return (token_count + 1) / (self.total_tokens + len(self.unigram_count))
        
    def unigram_probability(self,sentence):
        tokens, log_of_probability = sentence.split(" "), 0
        for token in tokens:
            log_of_probability += np.log10(self.smoothed_unigram(self.unigram_count[token]))
        return log_of_probability
#########################################################    Second Part of Exercise

    def add_bigram(self,first_token, second_token):
        self.bigram_count[first_token][second_token] += 1
        self.bigram_continuation[second_token].add(first_token)
            
    def bigram_find_best_next(self):
        for token in self.bigram_count.keys():
            best_token, best_count = 'A', 0
            for next_token in self.bigram_count[token]:
                self.distinct_word_bigrams += 1
                if self.bigram_count[token][next_token] > best_count:
                    best_token, best_count = next_token, self.bigram_count[token][next_token]
            self.bigram_next_best[token] = best_token
        
    def train_bigram(self):
        self.bigram_next_best = defaultdict(lambda: 'UNK')
        self.bigram_count,self.distinct_word_bigrams = defaultdict(lambda: defaultdict(lambda: 0)), 0
        self.bigram_continuation, sentences = defaultdict(lambda: set()),self.get_corpus_sentences(self.corpus_dir)        
        for sentence in sentences:
            tokens = sentence.split()
            for pointer in range(1,len(tokens)):
                first_token, second_token = tokens[pointer-1], tokens[pointer]
                self.add_bigram(first_token, second_token)
        self.bigram_find_best_next()
                               
    def smoothed_bigram(self,first_token,second_token):
        if self.smoothing == False:  
            if self.unigram_count[first_token] == 0:
                return 0
            return (self.bigram_count[first_token][second_token] / self.unigram_count[first_token])
            
        elif self.smoothing == "laplace":
            return (self.bigram_count[first_token][second_token]+1)/(self.unigram_count[first_token]+len(self.unigram_count))      
        ########################################## Bonus Part (Kneser-ney) for bigrams   
        elif self.smoothing == 'kneser-ney':
            tmp = self.unigram_count[first_token] * self.distinct_word_bigrams * self.unigram_count[first_token]
            if tmp == 0:
                return 0       
            DISCOUNT = 0.75
            LAMBDA = (DISCOUNT * len(self.bigram_count[first_token]))/(self.unigram_count[first_token])
            P_CONTINUATION = (len(self.bigram_continuation[second_token]))/(self.distinct_word_bigrams)
            DISCOUNTED_BIGRAM = (max(self.bigram_count[first_token][second_token]-DISCOUNT, 0))
            BIGRAM_PART = DISCOUNTED_BIGRAM / (self.unigram_count[first_token])
            return (BIGRAM_PART + (LAMBDA*P_CONTINUATION))
        
    def bigram_probability(self, sentence):
        tokens = sentence.split(" ")
        log_of_probability = np.log10(self.smoothed_unigram(self.unigram_count[tokens[0]]))
        for pointer in range(1,len(tokens)):
            first_token, second_token = tokens[pointer-1], tokens[pointer]
            log_of_probability += np.log10(self.smoothed_bigram(first_token,second_token))
        return log_of_probability
    
##########################################################  Bonus Part ( Trigrams )
    def add_trigram(self,first_token, middle_token, last_token):
        self.trigram_count[first_token][middle_token][last_token] += 1
        pair = (first_token,middle_token)
        self.trigram_continuation[last_token].add(pair)
    
    def trigram_find_next_best(self):
        for token in self.trigram_count.keys():
            for next_token in self.trigram_count[token]:
                best_token, best_count = 'A', 0
                for second_next_token in self.trigram_count[token][next_token].keys():
                    self.distinct_word_trigrams += 1
                    if self.trigram_count[token][next_token][second_next_token] > best_count:
                        best_token, best_count = second_next_token, self.trigram_count[token][next_token][second_next_token]
                self.trigram_next_best[(token,next_token)] = best_token
                  
    def train_trigram(self):
        self.trigram_next_best = defaultdict(lambda: 'UNK')
        sentences, self.trigram_continuation = self.get_corpus_sentences(self.corpus_dir), defaultdict(lambda: set())
        self.trigram_count, self.distinct_word_trigrams = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0))), 0
        for sentence in sentences:
            tokens = sentence.split()
            for pointer in range(2,len(tokens)):
                first_token, middle_token, last_token = tokens[pointer-2], tokens[pointer-1], tokens[pointer]
                self.add_trigram(first_token, middle_token,last_token)
        self.trigram_find_next_best()
        
    def smoothed_trigram(self,first_token, middle_token, last_token):
        if self.smoothing == False:
            if self.bigram_count[first_token][middle_token] == 0:
                return 0
            return (self.trigram_count[first_token][middle_token][last_token] / self.bigram_count[first_token][middle_token])
                    
        elif self.smoothing == 'laplace':
            return (self.trigram_count[first_token][middle_token][last_token] + 1) / (self.bigram_count[first_token][middle_token] + len(self.unigram_count))
        ########################################## Bonus Part (kneser-ney for trigrams)
        elif self.smoothing == 'kneser-ney':
            tmp = self.bigram_count[first_token][middle_token] * self.distinct_word_trigrams * self.bigram_count[first_token][middle_token]
            if tmp == 0:
                return 0
            DISCOUNT = 0.75
            LAMBDA = (DISCOUNT * len(self.trigram_count[first_token][middle_token]))/(self.bigram_count[first_token][middle_token])
            P_CONTINUATION = (len(self.trigram_continuation[last_token]))/(self.distinct_word_trigrams)
            DISCOUNTED_TRIGRAM = (max(self.trigram_count[first_token][middle_token][last_token]-DISCOUNT, 0))
            TRIGRAM_PART = DISCOUNTED_TRIGRAM / (self.bigram_count[first_token][middle_token])
            return (TRIGRAM_PART + (LAMBDA*P_CONTINUATION))
        
    def trigram_probability(self, sentence):
        tokens = sentence.split(" ")
        log_of_probability = np.log10(self.smoothed_unigram(self.unigram_count[tokens[0]])) + \
                             np.log10(self.smoothed_bigram(tokens[0],tokens[1]))
        for pointer in range(2,len(tokens)):
            first_token, middle_token, last_token = tokens[pointer-2], tokens[pointer-1], tokens[pointer]
            log_of_probability += np.log10(self.smoothed_trigram(first_token,middle_token,last_token))
        return log_of_probability



##########################################################     Second part of Exercise    
    def train(self):
        if self.n >= 1:
            self.train_unigram()
        if self.n >= 2:
            self.train_bigram()
        if self.n >= 3:
            self.train_trigram()
            
    def prob(self, sentence):
        if self.n == 1:
            return self.unigram_probability(sentence)
        elif self.n == 2:
            return self.bigram_probability(sentence)
        else:
            return self.trigram_probability(sentence)
##########################################################    Second part of Exercise

    def unigram_generate(self, sentence):
        # Higher Frequency ----> higher probability
        MINUS_INF = -1000000
        best_token, highest_frequency = None, MINUS_INF
        return self.best_next['UNK']
    
    def bigram_generate(self,sentence):
        tokens, MINUS_INF = sentence.split(" "), -1000000 
        last_token = tokens[-1]
        if len(tokens) == 0:
            return 'UNK'
        if last_token != '</s>':
            best_next = self.bigram_next_best[last_token]
            return best_next
        else:
            return ''
    
    def trigram_generate(self,sentence):
        tokens, MINUS_INF = sentence.split(" "), -1000000 
        if len(tokens) <= 1:
            return 'UNK'       
        second_last_token, last_token = tokens[-2], tokens[-1]
        if last_token == '</s>':
            return ''
        best_next = self.trigram_next_best[(second_last_token,last_token)]
        return best_next

    def generate(self, sentence):
        if self.n == 1:
            return self.unigram_generate(sentence)
        elif self.n == 2:
            return self.bigram_generate(sentence)
        elif self.n == 3:
            return self.trigram_generate(sentence)
        
######################################################### Third Part of Exercise
    @staticmethod    
    def wer_calculator(original_sentence, generated_sentence):
        original, generated = original_sentence.split(" "), generated_sentence.split(" ")
        N, M = len(original), len(generated)
        matrix = [[0 for j in range(N+1)] for i in range(M+1)]
        for j in range(N+1):
            matrix[0][j] = j
        for i in range(M+1):
            matrix[i][0] = i
        for i in range(1,M+1):
            for j in range(1,N+1):
                if original[j-1] == generated[i-1]:
                    matrix[i][j] = matrix[i-1][j-1]
                else:
                    matrix[i][j] = min(min(matrix[i-1][j],matrix[i][j-1]),matrix[i-1][j-1]) + 1 
        return ((matrix[M][N])/N)
            
        
        
##############################################################  Third Part of Exercise


    def evaluate(self,dir):
        validation_sentences = []
        with open(dir, 'r', encoding = 'utf-8') as data:
            validation_sentences = data.readlines()
        average_wer = 0
        for sentence, index in zip(validation_sentences,range(len(validation_sentences))):
            tmp = sentence.split(" ")
            generated_sentence = tmp[0]+ " " + tmp[1]
            LINE_AVG_TOKENS = 35
            for i in range(LINE_AVG_TOKENS):
                next_token = self.generate(generated_sentence)
                generated_sentence += " " + next_token      
                if next_token == "</s>":
                    break
            edited_sentence = sentence.replace(" \n",'')
            tmp1 = self.wer_calculator(edited_sentence, generated_sentence)
            if index % 25000 == 0:
                print("document index:", index, "Processing")
            average_wer += tmp1
        return (average_wer/len(validation_sentences))
        
        


# Training Every Possible Model + Trigram Models (Bonus Part)

In [3]:
# Expected to last in 17 Seconds with my configurations
a = time.time()
lm1 = LanguageModel(1,False,'sentences.txt')
lm1.train()
b = time.time()
print(b-a)

16.703579902648926


In [4]:
# Expected to last in 60 Seconds with my configurations
a = time.time()
lm2 = LanguageModel(2,False,'sentences.txt')
lm2.train()
b = time.time()
print(b-a)

60.4784722328186


In [5]:
# Expected to last in 170 Seconds with my configurations
a = time.time()
lm3 = LanguageModel(3,False,'sentences.txt')
lm3.train()
b = time.time()
print(b-a)

164.98213505744934


In [96]:
# Expected to last in 17.5 Seconds with my configurations
a = time.time()
lm4 = LanguageModel(1,'laplace','sentences.txt')
lm4.train()
b = time.time()
print(b-a)

18.036351919174194


In [97]:
# Expected to last in 60 Seconds with my configurations
a = time.time()
lm5 = LanguageModel(2,'laplace','sentences.txt')
lm5.train()
b = time.time()
print(b-a)

68.58028435707092


In [6]:
# Expected to last in 175 Seconds with my configurations
a = time.time()
lm6 = LanguageModel(3,'laplace','sentences.txt')
lm6.train()
b = time.time()
print(b-a)

179.35599637031555


# kneser-ney models: Bonus Part

In [84]:
# Expected to last in 60 Seconds with my configurations
a = time.time()
lm7 = LanguageModel(2,'kneser-ney','sentences.txt')
lm7.train()
b = time.time()
print(b-a)

65.16614890098572


In [8]:
# Expected to last in 190 Seconds with my configurations
a = time.time()
lm8 = LanguageModel(3,'kneser-ney','sentences.txt')
lm8.train()
b = time.time()
print(b-a)

210.9487099647522


# Probability Check, 2nd part of Exercise

In [77]:
print(lm1.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm1.prob('<s> منابع انتهای آب پیام'))
print(lm1.prob('<s> انتهای پیام منابع آب'))
print(lm1.prob('<s> hello انتهای پیام منابع آب'))

-36.63284375127532
-13.39824125501898
-13.39824125501898
-inf




In [72]:
print(lm2.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm2.prob('<s> منابع آب </s>'))
print(lm2.prob('<s> انتهای پیام </s>'))
print(lm2.prob('<s> منابع انتهای آب پیام </s>'))
print(lm2.prob('<s> منابع انتهای آب پیام hello </s>'))

-19.19834821806493
-9.224401565757852
-3.2090128608120887
-inf
-inf




In [73]:
print(lm3.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm3.prob('<s> منابع آب </s>'))
print(lm3.prob('<s> انتهای پیام </s>'))
print(lm3.prob('<s> منابع انتهای آب پیام </s>'))
print(lm3.prob('<s> منابع انتهای آب پیام hello </s>'))

-13.47344270965778
-9.86393880978254
-3.1939508020136396
-inf
-inf




In [78]:
print(lm4.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm4.prob('<s> منابع انتهای آب پیام'))
print(lm4.prob('<s> انتهای پیام منابع آب'))
print(lm4.prob('<s> hello انتهای پیام منابع آب'))

-36.63361422312002
-13.398762013493927
-13.398762013493927
-20.97056040784843


In [79]:
print(lm5.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm5.prob('<s> منابع آب </s>'))
print(lm5.prob('<s> انتهای پیام </s>'))
print(lm5.prob('<s> منابع انتهای آب پیام </s>'))
print(lm5.prob('<s> منابع انتهای آب پیام hello </s>'))

-21.716622846982787
-9.573371875605977
-3.3128357779750166
-19.043625247465844
-27.52994227066596


In [83]:
print(lm6.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm6.prob('<s> منابع آب </s>'))
print(lm6.prob('<s> انتهای پیام </s>'))
print(lm6.prob('<s> منابع انتهای آب پیام </s>'))
print(lm6.prob('<s> منابع انتهای آب پیام hello </s>'))

-24.60552115588222
-12.032312795207122
-3.311747781430808
-20.646374473956733
-24.6432799846524


In [86]:
print(lm7.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm7.prob('<s> منابع آب </s>'))
print(lm7.prob('<s> انتهای پیام </s>'))
print(lm7.prob('<s> منابع انتهای آب پیام </s>'))
print(lm7.prob('<s> منابع انتهای آب پیام hello </s>'))

-19.22244733084632
-9.210889145950755
-3.2090991319216156
-20.25915384423504
-inf




In [87]:
print(lm8.prob("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به"))
print(lm8.prob('<s> منابع آب </s>'))
print(lm8.prob('<s> انتهای پیام </s>'))
print(lm8.prob('<s> منابع انتهای آب پیام </s>'))
print(lm8.prob('<s> منابع انتهای آب پیام hello </s>'))

-13.732442248174511
-10.165538609849596
-3.1940289229012264
-inf
-inf




# Generation Check, 2nd part of Exercise

In [88]:
lm1.generate("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به")

'UNK'

In [89]:
lm2.generate("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به")

'گزارش'

In [90]:
lm3.generate("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به")

'بازیکنانم'

In [91]:
lm4.generate("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به")

'UNK'

In [92]:
lm5.generate("به گزارش گروه ورزش باشگاه خبرنگاران جوان بر اساس برنامه‌ای که برانکو به")

'گزارش'

In [93]:
lm1.generate("<s> انتهای")

'UNK'

In [94]:
lm2.generate("<s> انتهای")

'پیام'

In [95]:
lm3.generate("<s> انتهای")

'پیام'

# Evaluating Models, 3rd Part of Exercise

In [65]:
a = time.time()
result = lm1.evaluate('/Users/mehrad/programmings/nlp/codes/phase1/validation_sentences.txt')
print("average WER: ", result)
b = time.time()
print("duration:",b-a)

document index: 0 Processing
document index: 25000 Processing
document index: 50000 Processing
document index: 75000 Processing
document index: 100000 Processing
document index: 125000 Processing
document index: 150000 Processing
average WER:  1.885516606406248
duration: 152.14761900901794


In [57]:
a = time.time()
result = lm2.evaluate('/Users/mehrad/programmings/nlp/codes/phase1/validation_sentences.txt')
print("average WER: ", result)
b = time.time()
print("duration:",b-a)

document index: 0 Processing
document index: 25000 Processing
document index: 50000 Processing
document index: 75000 Processing
document index: 100000 Processing
document index: 125000 Processing
document index: 150000 Processing
average WER:  1.3448067444020273
duration: 149.62035393714905


In [59]:
a = time.time()
result = lm3.evaluate('/Users/mehrad/programmings/nlp/codes/phase1/validation_sentences.txt')
print("average WER: ", result)
b = time.time()
print("duration:",b-a)

document index: 0 Processing
document index: 25000 Processing
document index: 50000 Processing
document index: 75000 Processing
document index: 100000 Processing
document index: 125000 Processing
document index: 150000 Processing
average WER:  1.241573271466225
duration: 141.38325715065002
