# Statistical Model Languages

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [49]:
import nltk
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, ParameterGrid

### 1. Preprocessing

In [50]:
tokens = dict()
tokens['begin'], tokens['end'], tokens['unknown'] = '<s>', '</s>', '<unk>'

In [51]:
def _create_vocabulary(corpus, size):                           # Returns words sorted by frequency
    words, tokenizer, corpusByWords = [], TweetTokenizer(), []
    for doc in corpus:
        tokens = tokenizer.tokenize(doc)
        words += tokens
        corpusByWords += [tokens]
    count = nltk.FreqDist(words)
    count = sorted([(count[key], key) for key in count])[::-1]
    if size != -1: count = count[:size]
    return [word for _, word in count], corpusByWords

def load_corpus(corpus_select = "tweets", vocabSize = 100):
    corpus = []
    path_corpus = "../../data/agresividad/mex_train.txt"

    with open(path_corpus, "r") as f_corpus:
        for tuit in f_corpus:
            corpus += [tuit[:-1]]
    
    vocab, tokenized = _create_vocabulary(corpus, vocabSize)
    corpus = []
    for doc in tokenized:
        tweet = []
        tweet.append(tokens['begin'])
        for word in doc: 
            tweet.append(tokens['unknown'] if word not in vocab else word.lower())
        tweet.append(tokens['end'])
        corpus.append(tweet)
    return corpus

In [52]:
corpus = load_corpus(vocabSize = -1000)

#### Comment

### 2. Models Training

In [53]:
class LanguageModel:
    def __init__(self, corpus = None):
        self.corpus = corpus
        self.nGrams = {self.toString(self.gramLen * tokens['unknown']): 0}
        self.vocab = {*tokens.values()}
        self.nGramsCount = 0
        for line in corpus:
            self.vocab.update(line)
            for gram in self.getNGrams(line):
                gram = self.toString(gram)
                if not gram in self.nGrams: self.nGrams[gram] = 0
                self.nGrams[gram] += 1
                self.nGramsCount += 1
        self.vocab = list(self.vocab)

    def toString(self, gramList):
        gram = ""
        for i in gramList: gram += i
        return gram
        
    def P(self, *words):                                                                # Laplace smoothing
        assert len(words) == self.gramLen, "n-gram doesn't match the expected length"
        gram = self.toString(words)
        if not gram in self.nGrams:
            self.nGrams[gram] = 0
        '''
            gram = ""
            for _ in range(self.gramLen): gram += tokens['unknown']
        '''
        return (self.nGrams[gram] + 1) / (self.nGramsCount + len(self.nGrams))

    def getNGrams(self, line):
        return [line[start: start + self.gramLen] for start in range(len(line) - self.gramLen + 1)]

    def getProbs(self, sentence):
        if isinstance(sentence[0], list):                                               # Naive approach to see if it's a list of lists
            sentence = sum(sentence, [])
        prob = 0
        for gram in self.getNGrams(sentence):
            prob += np.log(self.P(*gram))
        return np.exp(prob)

    def perplexity(self, sentence):                                                     # Include the <s> and </s> tokens, but don't count </s> - (Page 8, Dan Jurafsky on Language Models)
        if isinstance(sentence[0], list):                                               # Naive approach to see if it's a list of lists
            sentence = sum(sentence, [])
        return self.getProbs(sentence) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])) )
    
    def tweet(self, length = 50):
        tweet = [tokens['begin'] for _ in range(self.gramLen - 1)]
        for _ in range(length):
            ctx = tweet[-self.gramLen + 1:] if self.gramLen > 1 else []
            probs = []
            for i, w in enumerate(self.vocab):
                w = ctx + [w]
                p = self.P(*w)
                probs.append(p)
            print(np.sum(probs))
            tweet += [np.random.choice(self.vocab, p = probs / np.sum(probs))]
        for t in tweet: print(t + " ", end = "")
        return tweet

In [54]:
class Unigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 1
        super().__init__(corpus)

class Bigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 2
        super().__init__(corpus)      

class Trigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 3
        super().__init__(corpus)    

class N_Gram(LanguageModel):
    def __init__(self, gramLen, corpus = None):
        self.gramLen = gramLen
        super().__init__(corpus)    

class Interpolated(LanguageModel):
    def __init__(self, corpus = None, models = None, lambdas = None):
        #super().__init__(corpus)   
        self.lambdas = np.array(lambdas)
        self.models = models
        assert len(models) == self.lambdas.shape[1], "The number of models doesn't match the number of lambdas"

    # Currently refactoring to deprecate
    def Interpolate(models, set, lambdas):
        indProbs = np.zeros((len(models)))
        lambdas = np.array(lambdas)
        for case in set:
            for m, model in enumerate(models):
                indProbs[m] += model.getProbs(case)
        indProbs /= len(set)
        probs = np.dot(indProbs, lambdas.T)
        return probs
    
    def _getProbs(self, sentence):              # Gets the probability of each model
        indProbs = np.zeros((len(self.models)))
        for m, model in enumerate(self.models):
            indProbs[m] += model.getProbs(sentence)
        indProbs /= len(set)
    
    def dev(self, set):
        #for case in set:
        probs = np.dot(indProbs, self.lambdas.T)
        return probs

In [55]:
u = Unigram(corpus)
u.tweet()

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
ya que bien . pa “ taza pude que si si idem me pendejos la basta " mas putos hacer yahel me no súper que . ... un sus de al culo <s> se trump verga verte luchona </s> pagar algo curso luchona verga gol ... la mi quise me 

['ya',
 'que',
 'bien',
 '.',
 'pa',
 '“',
 'taza',
 'pude',
 'que',
 'si',
 'si',
 'idem',
 'me',
 'pendejos',
 'la',
 'basta',
 '"',
 'mas',
 'putos',
 'hacer',
 'yahel',
 'me',
 'no',
 'súper',
 'que',
 '.',
 '...',
 'un',
 'sus',
 'de',
 'al',
 'culo',
 '<s>',
 'se',
 'trump',
 'verga',
 'verte',
 'luchona',
 '</s>',
 'pagar',
 'algo',
 'curso',
 'luchona',
 'verga',
 'gol',
 '...',
 'la',
 'mi',
 'quise',
 'me']

In [56]:
b = Bigram(corpus)
b.tweet()

0.1072503565899268
0.06958087071239473
0.06504862720264509
0.06112414482405773
0.05757618293456146
0.054437741283633795
0.05161945569604384
0.04909328176870287
0.046789239309774605
0.04469791304501562
0.04278540946826084
0.04102985987813798
0.03941282692827488
0.03791855305557881
0.03653317100479153
0.035349912191173616
0.0340503578647198
0.03314616236694974
0.03188110850702339
0.030896151385230133
0.02997026797760747
0.029098241918378592
0.028275501471116064
0.027498102668471733
0.026762285991805076
0.026064788935114037
0.025408783289340174
0.024773543280568522
0.024174713766649057
0.02360417261239459
0.023069088091679305
0.022543938927188506
0.022043571674654386
0.02157340355701595
0.02111298035400583
0.020676510931922736
0.0202576941253641
0.01985549887444158
0.019468971545772186
0.019097239239588237
0.018739415313006928
0.01839766594097455
0.018064019946182945
0.01774219437090176
0.01743293259336975
0.017134257387066504
0.016845676743193712
0.016566624109975534
0.01629798418822653


['<s>',
 'ardidos',
 'puntero',
 'comer',
 'muerta',
 'trenes',
 'emperra',
 'rogelia',
 'haceló',
 'contemporánea',
 'arrebatome',
 'sísmica',
 'denotar',
 'cremas',
 'vamps',
 'año',
 'inflado',
 'quien',
 'astronauta',
 'demeritas',
 'chismosas',
 'pnches',
 'transa',
 'pestaña',
 'viejitas',
 'layun',
 'quiten',
 'rubia',
 'maletota',
 'valgas',
 'mamás',
 'viviendo',
 'noooooo',
 'flores',
 'cojidas',
 'intoxicado',
 'callo',
 'pagó',
 'notifique',
 'joy',
 'preocupate',
 'pelea',
 'arrepiento',
 'hueso',
 'ecuador-perú',
 'xxx',
 'endiablado',
 'gob',
 'avientan',
 'panochas',
 'seguirme']

In [57]:
t = Trigram(corpus)
t.tweet()

0.06467972729209448
0.06074919892586585
0.057269136021161134
0.054166278094493814
0.051382437140648676
0.048870810289831215
0.046593324646440816
0.04451869219250624
0.04262096263706463
0.040878431557241954
0.039272806052836756
0.03778855972962683
0.03641242870999204
0.035133013958617426
0.03394046464484388
0.032826223907232395
0.03178282312769476
0.03080371424863395
0.029883132170456184
0.029015981116366357
0.028197740231201897
0.027424384720107584
0.026692319622135134
0.025998323918344515
0.0253395031405428
0.024713249009528954
0.02411720491567357
0.023549236278411014
0.02300740499861839
0.022489947359330253
0.021995254843695307
0.02152185743055359
0.021068409002134015
0.020633674558723867
0.020216518984529076
0.019815897149503936
0.01943084516539375
0.01906047264195608
0.018703955812380588
0.018360531416171135
0.01802949124387391
0.01771017726158556
0.017401977244602684
0.017104320859241864
0.0168166761400637
0.01653854631671723
0.016269466950581145
0.016009003346479464
0.01575674820

['<s>',
 '<s>',
 'camisa',
 'digitalmexp',
 'encontramos',
 'sacar',
 'comparaciones',
 'hechos',
 'tobillos',
 'cantar',
 '🎭',
 'robben',
 '°',
 'jajajjajajaja',
 'mono',
 'chileno',
 'sueñes',
 'levantan',
 'villancicos',
 'inunde',
 'sensual',
 'antojas',
 'conducta',
 'put',
 'individuo',
 'chivitas',
 'mandalay',
 'morras',
 'terror',
 'lana',
 'mugroso',
 'marcarles',
 'espada',
 'fracturada',
 'disimular',
 'ratota',
 'casaba',
 'motociclista',
 'invitas',
 'papelón',
 'tonto',
 'mil',
 'vivaldi',
 'bunny',
 'yeii',
 'playsssss',
 'protección',
 'juntaron',
 'bromas',
 'soñoliento',
 'eres',
 'bimbomba']

In [58]:
uni = Unigram(corpus)

# Very frequent
print(uni.P("que"))
print(uni.P(tokens['begin']))

# Doesn't exist
print(uni.P("otorrinolaringologo"))

0.027060526336833184
0.04434119933148345
7.996545492347306e-06


In [59]:
bi = Bigram(corpus)

# Very frequent 
print(bi.P('.', tokens['end']))
print(bi.P("es", "que"))

# Doesn't exist
print(bi.P(tokens['begin'], tokens['end']))

0.008420459141775038
0.0004964026131614105
6.128389765589092e-06


In [60]:
tri = Trigram(corpus)

# Very frequent
print(tri.P('!', '!', '!'))
print(tri.P('es', 'que', 'no'))

# Doesn't exist
print(tri.P('Luis', 'Eduardo', 'Robles'))


0.0011204392546689393
4.7791247829480825e-05
5.310110450297366e-06


#### Comment

### 3. Interpolated Model

In [61]:
c_train, c_test = train_test_split(corpus, test_size = 0.2, train_size = 0.8)
c_test, c_val = train_test_split(c_test, test_size = 0.5, train_size = 0.5)
print(f'Lengths of stratified sets:\n\tTrain: {len(c_train)}\n\tTest: {len(c_test)}\n\tValidation: {len(c_val)}')

Lengths of stratified sets:
	Train: 4435
	Test: 554
	Validation: 555


# TODO - Not needed but strange behavior

In [62]:
uni, bi, tri = Unigram(c_train), Bigram(c_train), Trigram(c_train)
pU, pB, pT = uni.perplexity(c_val), bi.perplexity(c_val), tri.perplexity(c_val)
print(pU, pB, pT)

inf inf inf


  return self.getProbs(sentence) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])) )


# TODO 

In [63]:
params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5), (0.9, 0.05, 0.05)]

In [64]:
# This is wrong
'''
pps = Interpolated.Interpolate([uni, bi, tri], c_test, params) ** (-1 / len(c_test))
ordered = np.argsort(pps)
print("Params ordered by perplexity")
for p in ordered: print(f"\t{np.round(params[p], decimals = 1)} with a value of {pps[p]}")
'''

'\npps = Interpolated.Interpolate([uni, bi, tri], c_test, params) ** (-1 / len(c_test))\nordered = np.argsort(pps)\nprint("Params ordered by perplexity")\nfor p in ordered: print(f"\t{np.round(params[p], decimals = 1)} with a value of {pps[p]}")\n'

In [65]:
'''
pps = Interpolated(models = [uni, bi, tri], lambdas = params).dev(c_test) ** (-1 / len(c_test))
ordered = np.argsort(pps)
print("Params ordered by perplexity")
for p in ordered: print(f"\t{np.round(params[p], decimals = 1)} with a value of {pps[p]}")
'''

'\npps = Interpolated(models = [uni, bi, tri], lambdas = params).dev(c_test) ** (-1 / len(c_test))\nordered = np.argsort(pps)\nprint("Params ordered by perplexity")\nfor p in ordered: print(f"\t{np.round(params[p], decimals = 1)} with a value of {pps[p]}")\n'

#### Comment

## Text Generation

### 1. Tweet Functionality

In [66]:
uni = Unigram(c_train)
uni.tweet()

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
qué que a caga 😡 las la quisieras carpintero himno proteja hacer <s> con pinche los putos </s> concurso trabajo entro <s> a . . toda sean pq ? doy </s> ? ecuador como @usuario con hueva modem preferido maricon largo loca </s> el @usuario les cuando . pinche las 

['qué',
 'que',
 'a',
 'caga',
 '😡',
 'las',
 'la',
 'quisieras',
 'carpintero',
 'himno',
 'proteja',
 'hacer',
 '<s>',
 'con',
 'pinche',
 'los',
 'putos',
 '</s>',
 'concurso',
 'trabajo',
 'entro',
 '<s>',
 'a',
 '.',
 '.',
 'toda',
 'sean',
 'pq',
 '?',
 'doy',
 '</s>',
 '?',
 'ecuador',
 'como',
 '@usuario',
 'con',
 'hueva',
 'modem',
 'preferido',
 'maricon',
 'largo',
 'loca',
 '</s>',
 'el',
 '@usuario',
 'les',
 'cuando',
 '.',
 'pinche',
 'las']

#### Comment

### 2. AMLO model

#### Comment

### 3. Evaluation with custom phrases

#### Comment

### 4. More evaluation 

#### Comment

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment