In [1]:
import re
import sys
from random import random
from math import log,isclose
from collections import defaultdict
import numpy as np


# Characters that preprocess_line keeps. The string is converted to a set so
# that each membership test is O(1).
validCharacters = {*"1234567890abcdefghijklmnopqrstuvwxyz. "}

# Trigram -> count table; trigrams that were never stored read as 0.
tri_counts = defaultdict(int)


#Process each line and add start/ending symbols
def preprocess_line(line):
    """Normalise one line of text for the character trigram model.

    The line has its newline characters removed and is lower-cased. Every
    character that is not in ``validCharacters`` is dropped, and each digit
    is replaced by ``"0"``. The result is wrapped in the sentence markers:
    ``"##"`` in front and ``"#"`` at the end.

    Args:
        line: Raw line of text.

    Returns:
        The normalised string; an input with no valid characters yields
        ``"###"``.
    """
    # Built once per call instead of once per character.
    digits = frozenset("1234567890")
    kept = []
    for ch in line.replace("\n", "").lower():
        if ch not in validCharacters:
            continue
        # All digits collapse onto the single symbol "0".
        kept.append("0" if ch in digits else ch)
    # Start-sentence marker, the kept characters, end-sentence marker.
    return "##" + "".join(kept) + "#"

In [2]:
#Get a template dictionary (+1 smoothing)
# The first three characters of every line of the reference model are taken
# as a trigram and stored with a count of 1.
with open("model-br.en") as model_file:
    for entry in model_file:
        tri_counts[entry[:3]] = 1

#Extract all conditional "words"
# i.e. the distinct two-character prefixes (histories) of the template trigrams
condition = list({trigram[:2] for trigram in tri_counts})


In [3]:
#Generalized Model Builder

def buildModel(file, tri_template):
    """Estimate trigram conditional probabilities from a training corpus.

    Every line of ``file`` is passed through ``preprocess_line`` and each
    three-character window is counted on top of the starting counts in
    ``tri_template``. Each count is then divided by the total count of all
    trigrams that share the same first two characters.

    Args:
        file: Path of the training corpus.
        tri_template: Mapping trigram -> starting count. It is copied, never
            modified. Trigrams missing from it start from 0.

    Returns:
        A dict mapping each trigram to
        ``count(trigram) / sum of counts with the same two-character prefix``.

    Raises:
        ZeroDivisionError: if all trigrams sharing a prefix have count 0.
    """
    counts = dict(tri_template)

    #Count trigrams from corpus
    with open(file) as corpus:
        for raw_line in corpus:
            text = preprocess_line(raw_line)
            for start in range(len(text) - 2):
                trigram = text[start:start + 3]
                counts[trigram] = counts.get(trigram, 0) + 1

    #Total count per two-character history, gathered in a single pass.
    # The totals come from the counted trigrams themselves, so a history
    # that only appears in the corpus still gets a non-zero denominator.
    history_totals = defaultdict(int)
    for trigram, count in counts.items():
        history_totals[trigram[:2]] += count

    #Calculate conditional probabilities of each trigram
    model = {trigram: count / history_totals[trigram[:2]]
             for trigram, count in counts.items()}
    return model
# One model per training corpus, all starting from the same template counts.
tri_pro_en = buildModel(file='training.en', tri_template=tri_counts)
tri_pro_es = buildModel(file='training.es', tri_template=tri_counts)
tri_pro_de = buildModel(file='training.de', tri_template=tri_counts)

In [4]:
#Count trigrams from test data
tri_counts_test = defaultdict(int)
with open("test") as test_file:
    for raw_line in test_file:
        padded = preprocess_line(raw_line)
        # Slide a three-character window over the padded line.
        for start in range(len(padded) - 2):
            tri_counts_test[padded[start:start + 3]] += 1

#Total number of trigram occurrences in the test data
tri_total = sum(tri_counts_test.values())

In [5]:
#Calculate perplexity
# Step 1: negative log-likelihood of the test trigrams under each model,
# weighted by how often each trigram occurs in the test data.
entropy_en = entropy_es = entropy_de = 0
for trigram, count in tri_counts_test.items():
    entropy_en -= count * log(tri_pro_en[trigram])
    entropy_es -= count * log(tri_pro_es[trigram])
    entropy_de -= count * log(tri_pro_de[trigram])

# Step 2: average over all test trigram occurrences.
entropy_en = entropy_en / tri_total
entropy_es = entropy_es / tri_total
entropy_de = entropy_de / tri_total

# Step 3: the logs above are natural logs, so exponentiate with base e.
perplexity_en, perplexity_es, perplexity_de = (
    np.exp(entropy_en),
    np.exp(entropy_es),
    np.exp(entropy_de),
)

# Perplexity based on different models
for perplexity in (perplexity_en, perplexity_es, perplexity_de):
    print(perplexity)

8.868594186433864
22.523575270236748
22.92436043640993


In [14]:
#Read the model
# On every line, characters 0-2 are the trigram and the text from index 4
# onwards is parsed as its probability.
tri_model_br = defaultdict(float)
with open("model-br.en") as model_file:
    for entry in model_file:
        trigram, probability_text = entry[:3], entry[4:]
        tri_model_br[trigram] = float(probability_text)

In [15]:
#Create a dictionary saving the conditional distribution
def find_next(con_words, model_name):
    """Collect the next-character distribution that follows a history.

    Args:
        con_words: The conditioning history. It is compared against the
            first two characters of every key of ``model_name``.
        model_name: Mapping trigram -> probability (any non-negative
            numeric weight works, including integer counts).

    Returns:
        ``[next_cha, next_prob]``: two parallel lists. ``next_cha`` holds the
        third character of every matching trigram, ``next_prob`` the matching
        weights rescaled so that they sum to 1. Both lists are empty when no
        trigram starts with ``con_words``.
    """
    next_cha = []
    weights = []
    for trigram, weight in model_name.items():
        if trigram[0:2] == con_words:
            next_cha.append(trigram[2])
            weights.append(weight)

    # Unknown history: nothing to normalise (avoids dividing by a zero sum).
    if not next_cha:
        return [[], []]

    #Due to numerical error, the sum of conditional probabilities can differ from 1, so normalize them
    # A float array is forced so integer weights can be rescaled as well.
    next_prob = np.asarray(weights, dtype=float)
    next_prob = next_prob / next_prob.sum()
    return [next_cha, list(next_prob)]

# Next-character table of the reference model, keyed by history.
tri_br_next = {}
for history in condition:
    tri_br_next[history] = find_next(history, tri_model_br)

# Same table for our own English model.
tri_my_next = {}
for history in condition:
    tri_my_next[history] = find_next(history, tri_pro_en)

In [16]:
#Generate sequences
def generate_from_LM(length, model_next, rng=None):
    """Sample text character by character from a trigram model.

    Args:
        length: Number of non-"#" characters to generate.
        model_next: Mapping history -> ``[characters, probabilities]`` (two
            parallel sequences), as produced by ``find_next``. The entries
            are expected to be single characters.
        rng: Optional object with a ``choice(a, p=...)`` method, e.g.
            ``np.random.default_rng(seed)``, for reproducible output. When
            omitted, the global ``np.random`` state is used.

    Returns:
        The generated text. It starts with a newline, and every "#" sentence
        marker is replaced by a newline.

    Raises:
        KeyError: if a sampled history is missing from ``model_next``.
    """
    sampler = np.random if rng is None else rng
    gen = "#"
    current_length = 0  # number of non-"#" characters generated so far

    while current_length < length:
        #Once seeing a end-sentence symbol, start a new sentence
        #The end-sentence symbol "by chance" becomes a "start-sentence" symbol when generating the second character of next sentence
        history = "##" if gen[-1] == "#" else gen[-2:]
        candidates, probabilities = model_next[history]
        next_cha = str(sampler.choice(candidates, p=list(probabilities)))
        gen += next_cha
        # Count incrementally instead of rescanning the whole string.
        current_length += len(next_cha.replace("#", ""))

    #To help visualize, the start/end-sentence symbols are replaced by newline symbols
    return gen.replace("#", "\n")

In [22]:
#Generated sequences from different models
# The BR (reference) model is sampled first, then our own model.
for next_table in (tri_br_next, tri_my_next):
    print(generate_from_LM(300, next_table))


this.
what you gone sh.
ler.
he snt whend this cand get that whats to th for ats his are youlloon to is you clookabbir.
itte.
one to likere.
go.
and wead put.
yought a mandy.
whas zipper.
trun.
youtteah.
ittlets to ith th hos book a to you wake fooks saw cake a ve you clook the that you shats hat did yourn you 

i was con isk withe lints houral prover na.hze0udg have sh ploportaimplessid re.
this a parithis rece euroace in isas ther we sis fuld youppos
i our the whin andlventhans th i le vobles eur astriew ingerryiykbsposparliplin hancipureceprommity in pard the poin yever.hbactes.
w0rkolivegive yoteks.
pmtine 


In [19]:
#Possible characters following "ng" and their corresponding probabilities
# Displays the [next_cha, next_prob] pair stored for the history "ng".
tri_my_next["ng"]

[[' ',
  '#',
  '.',
  '0',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z'],
 [0.7874213836477988,
  0.0025157232704402514,
  0.026415094339622643,
  0.0012578616352201257,
  0.0037735849056603774,
  0.0012578616352201257,
  0.0012578616352201257,
  0.005031446540880503,
  0.08553459119496855,
  0.0025157232704402514,
  0.0012578616352201257,
  0.0012578616352201257,
  0.0025157232704402514,
  0.0012578616352201257,
  0.0012578616352201257,
  0.0037735849056603774,
  0.0012578616352201257,
  0.0025157232704402514,
  0.007547169811320755,
  0.0012578616352201257,
  0.0012578616352201257,
  0.012578616352201259,
  0.021383647798742137,
  0.013836477987421384,
  0.0037735849056603774,
  0.0012578616352201257,
  0.0012578616352201257,
  0.0012578616352201257,
  0.0012578616352201257,
  0.0012578616352201257]]

In [11]:
#Sum the values of every model entry whose first two characters equal con_words
def cal_con_sum(con_words, model_name):
    """Return the total of the model values that share a given prefix.

    Args:
        con_words: Prefix compared against the first two characters of
            each key.
        model_name: Mapping key -> numeric value.

    Returns:
        The sum of the values of all matching keys; 0 when nothing matches.
    """
    total = 0
    for trigram, probability in model_name.items():
        if trigram[:2] != con_words:
            continue
        total += probability
    return total

# NOTE(review): this cell originally read `tri_model`, a name that is not
# defined anywhere in this notebook (the cell fails on a fresh kernel). It now
# checks the English model built above -- confirm this is the intended model.
tri_model_sum = {history: cal_con_sum(history, tri_pro_en) for history in condition}

#Check whether the sums are close to 1
# Any sum that is off by more than the tolerance is printed.
for history_sum in tri_model_sum.values():
    if not isclose(history_sum, 1, abs_tol=0.001):
        print(history_sum)
print("All checked!")

All checked!


All checked!
