# Statistical Model Languages

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [91]:
import os
import nltk
import itertools
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, ParameterGrid

### 1. Preprocessing

In [69]:
tokens = {'begin': '<s>', 'end': '</s>', 'unknown': '<unk>', 'separator': '<sep>'}

In [70]:
class Corpus:
    def __init__(self, vocabSize = 100):
        self.size = vocabSize
        self.corpus = []

    def _createVocabulary(self):
        words, tokenizer, corpusByWords = [], TweetTokenizer(), []
        for doc in self.corpus:
            tokens = tokenizer.tokenize(doc)
            words += tokens
            corpusByWords += [tokens]
        count = nltk.FreqDist(words)
        count = sorted([(count[key], key) for key in count])[::-1]
        if self.size != -1: count = count[:self.size]
        return [word for _, word in count], corpusByWords

    def _readFile(self, path, divideByLine = True):
        file = []
        with open(path, "r") as f_corpus:
            for line in f_corpus:
                if not line.isspace():
                    file += [line[:-1]]
        if not divideByLine: 
            f = ""
            for line in file: f += line + "\n"
            file = [f]
        self.corpus += file

    def _buildCorpus(self, vocab, tokenized):
        self.corpus = []
        for doc in tokenized:
            tweet = []
            tweet.append(tokens['begin'])
            for word in doc:
                tweet.append(tokens['unknown'] if word not in vocab else word.lower().strip())
            tweet.append(tokens['end'])
            self.corpus.append(tweet)

    def loadCorpus(self):
        raise NotImplementedError()
    
    def describe(self):
        print(f'')

class Tweets(Corpus):
    def __init__(self, vocabSize = 100):
        super().__init__(vocabSize)
        self.path_corpus = "../../data/agresividad/mex_train.txt"

    def loadCorpus(self):
        self._readFile(self.path_corpus)
        vocab, tokenized = self._createVocabulary()
        self._buildCorpus(vocab, tokenized)
        return self.corpus
    

class Mananera(Corpus):
    def __init__(self, nFiles = 3, vocabSize = 100):
        super().__init__(vocabSize)
        self.nFiles = nFiles
        self.path_corpus = '../../data/presidente/estenograficas_limpias_por_fecha/'

    def loadCorpus(self):
        for f, file in enumerate(os.listdir(self.path_corpus)):
            if f == self.nFiles: break
            file_path = os.path.join(self.path_corpus, file)
            if os.path.isfile(file_path):
                self._readFile(file_path, divideByLine = False)
        vocab, tokenized = self._createVocabulary()
        self._buildCorpus(vocab, tokenized)
        return self.corpus

In [71]:
vocabulary_size = -1

In [72]:
corpus = Tweets(vocabulary_size).loadCorpus()

#### Comment

### 2. Models Training

In [73]:
class LanguageModel:
    def __init__(self, corpus = None):
        self.corpus = corpus
        self.nGrams, self.vocab = dict(), set()
        if corpus is None: return
        for line in corpus:
            self.vocab.update(line)
            for g, grams in enumerate(self.getNGrams(line)):
                gram = self.toString(grams)
                if not gram in self.nGrams: self.nGrams[gram] = 0
                self.nGrams[gram] += 1
        self.sGrams = dict()
        for gram in self.nGrams:
            smallerGram = self.toString(self.toTokens(gram)[: -1])
            if not smallerGram in self.sGrams: self.sGrams[smallerGram] = 0
            self.sGrams[smallerGram] += self.nGrams[gram]
        self.vocab = list(self.vocab)

    def toString(self, gramList):
        gram = ""
        for i, g in enumerate(gramList):
            if i: gram += tokens['separator']
            gram += g
        return gram

    def toTokens(self, gram):
        assert isinstance(gram, str), 'Gram is not a string'
        return gram.split(tokens['separator'])

    def flatten(self, sentence):
        if isinstance(sentence[0], list): sentence = sum(sentence, [])
        return sentence

    def P(self, *words):
        # Laplace smoothing
        assert len(words) == self.gramLen, "n-gram doesn't match the expected length"
        words = [(w if w in self.vocab else tokens['unknown']) for w in words]
        return self._Laplace(words)

    def _Laplace(self, words):
        count = 0
        gram = self.toString(words)
        if gram in self.nGrams: count = self.nGrams[gram]
        ctx = self.toString(words[: -1])
        ctxCount = self.sGrams[ctx] if ctx in self.sGrams else 0
        return (count + 1) / (ctxCount + len(self.vocab))

    def getNGrams(self, line):
        return [line[start: start + self.gramLen] for start in range(len(line) - self.gramLen + 1)]

    def getProbs(self, sentence, log = False):
        sentence = self.flatten(sentence)
        logProb = 0
        for gram in self.getNGrams(sentence):
            p = self.P(*gram)
            logProb += np.log(p)
            assert p > 0, "Probability is zero"
        if log: return logProb
        return np.exp(logProb)

    # Include the <s> and </s> tokens, but don't count </s> - (Page 8, Dan Jurafsky on Language Models)
    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        pp = 1
        for g in self.getNGrams(sentence):
            pp *= self.P(*g) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])))
        return pp

    def tweet(self, length = 50):
        tweet = [tokens['begin'] for _ in range(self.gramLen - 1)]
        for _ in range(length):
            ctx = tweet[-self.gramLen + 1] if self.gramLen > 1 else []
            probs = []
            for _, w in enumerate(self.vocab):
                w = ctx + [w]
                p = self.P(*w)
                probs.append(p)
            choice = np.random.choice(self.vocab, p = probs / np.sum(probs))
            tweet += [choice]
            if choice == tokens['end']: break
        return tweet

    def test(self):
        # Hypothesis: The sum of probabilities for a model is: vocabSize ^ (gramLength - 1)
        raise NotImplementedError()

In [74]:
class Unigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 1
        super().__init__(corpus)
    
    def test(self):
        p = 0
        for w in self.vocab: p += self.P(w)
        assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Bigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 2
        super().__init__(corpus)      

    def test(self):
        for w1 in self.vocab:
            p = 0
            for w2 in self.vocab:
                p += self.P(w1, w2)
            assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Trigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 3
        super().__init__(corpus)    

    def test(self):
        for w1 in self.vocab:
            for w2 in self.vocab:
                p = 0   
                for w3 in self.vocab:
                    p += self.P(w1, w2, w3)
                assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class N_Gram(LanguageModel):
    def __init__(self, gramLen, corpus = None):
        self.gramLen = gramLen
        super().__init__(corpus)    


class Interpolated(LanguageModel):
    def __init__(self, models = None, lambdas = None):
        #super().__init__(corpus)   
        self.models = models
        self.lambdas = lambdas
        assert len(models) == len(self.lambdas), "The number of models doesn't match the number of lambdas"
        self.vocab = models[0].vocab

    def getProbs(self, sentence, log = False):
        raise NotImplementedError()
    
    # Unlike the super().P(...), this one takes only the context it needs, so it can receive long sentences
    def P(self, *words):
        prob = 0
        for m, model in enumerate(self.models):
            nGram = words[-model.gramLen: ]
            prob += self.lambdas[m] * model.P(*nGram)
        return prob
    
    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        begins = sentence.count(tokens['begin'])
        pp = 1
        unigrams = Unigram().getNGrams(sentence)
        for u, _ in enumerate(unigrams):
            prob = 0
            for m, model in enumerate(self.models):
                # Start index where the context will be taken from
                idx = u + 1 - model.gramLen
                nGram = []
                if idx < 0:
                    nGram += np.abs(idx) * [tokens['begin']]
                    nGram += unigrams[: u + 1]
                else: nGram = unigrams[idx: u + 1]
                prob += model.P(*nGram) * self.lambdas[m]
            pp *= (prob) ** (-1 / (len(sentence) - begins))
        return pp
    
    def tweet(self, length = 50):
        tweet = [tokens['begin']] * max([model.gramLen for model in self.models])
        for _ in range(length):
            probs = []
            for w in self.vocab:
                ctx = tweet + [w]
                p = self.P(*ctx)
                probs.append(p)
            choice = np.random.choice(self.vocab, p = probs / np.sum(probs))
            tweet += [choice]
            #if choice == tokens['end']: break
        return tweet


In [75]:
uni = Unigram(corpus)
#uni.test()

# Very frequent
print(uni.P("que"))
print(uni.P(tokens['begin']))

# Doesn't exist
print(uni.P("otorrinolaringologo"))

0.02930555194720844
0.04801988343595472
0.10994778000051961


In [76]:
bi = Bigram(corpus)
#bi.test()

# Very frequent 
print(bi.P('.', tokens['end']))
print(bi.P("es", "que"))

# Doesn't exist
print(bi.P(tokens['begin'], tokens['end']))

0.23783970919162195
0.020930232558139535
0.000117000117000117


In [77]:
tri = Trigram(corpus)
#tri.test()

# Very frequent
print(tri.P('!', '!', '!'))
print(tri.P('es', 'que', 'no'))

# Doesn't exist
print(tri.P('Luis', 'Eduardo', 'Robles'))


0.058708959376739006
0.0029192345118391177
0.05850706119704102


#### Comment

### 3. Interpolated Model

In [78]:
#c_train, c_test = train_test_split(corpus, test_size = 0.001, train_size = 0.999)
c_train, c_test = train_test_split(corpus, test_size = 0.2, train_size = 0.8)
c_test, c_val = train_test_split(c_test, test_size = 0.5, train_size = 0.5)
print(f'Lengths of stratified sets:\n\tTrain: {len(c_train)}\n\tTest: {len(c_test)}\n\tValidation: {len(c_val)}')

Lengths of stratified sets:
	Train: 4435
	Test: 554
	Validation: 555


In [79]:
models = [Unigram(c_train), Bigram(c_train), Trigram(c_train)]
""" Delete, just to see the behavior of each model
for m in models: print(f'PP {m.perplexity(c_val)}\tlogP {m.getProbs(c_val, log = True)} \tP {m.getProbs(c_val)}')
"""

" Delete, just to see the behavior of each model\nfor m in models: print(f'PP {m.perplexity(c_val)}\tlogP {m.getProbs(c_val, log = True)} \tP {m.getProbs(c_val)}')\n"

In [80]:
#params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5), (0.9, 0.05, 0.05)]
params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5)]

In [81]:
bestParam, bestValue = 0, np.inf
for i, param in enumerate(params):
    m = Interpolated(models = [uni, bi, tri], lambdas = param)
    pp = m.perplexity(c_val)
    if pp < bestValue: bestValue, bestParam = pp, i
    print(f'Model {i + 1}: Params: {np.round(param, decimals = 3)} \t\tPerplexity: {pp}')

Model 1: Params: [0.333 0.333 0.333] 		Perplexity: 13.02195545352696
Model 2: Params: [0.4 0.4 0.2] 		Perplexity: 12.178732474967074
Model 3: Params: [0.2 0.4 0.4] 		Perplexity: 13.780126038921319
Model 4: Params: [0.5 0.4 0.1] 		Perplexity: 11.50718855743129
Model 5: Params: [0.1 0.4 0.5] 		Perplexity: 14.745009989679957


In [82]:
goodInterpolated = Interpolated(models = [uni, bi, tri], lambdas = params[bestParam])
print(f'Best params {params[bestParam]} have a perplexity = {goodInterpolated.perplexity(c_test)} on the test set')

Best params (0.5, 0.4, 0.1) have a perplexity = 11.509864716899823 on the test set


#### Comment

## Text Generation

### 1. Tweet Functionality

In [83]:
nExamples = 5
for _ in range(nExamples):
    for w in goodInterpolated.tweet():
        print(w, end = " ")
    print(end = '\n\n')

<s> <s> <s> <unk> 🇨🇱 cosa @usuario época ! hecho <unk> de <s> la mejor 😩 la palabra hacen bien onda hay ! despues <unk> se conversaciones nota medicina <s> sí 😱 de de tu trabajos <unk> con la fotos como nada putas pinches la <unk> chingada 🇲🇽 la <s> a de los 

<s> <s> <s> menos 😳 putísima por 😎 si pocas no vendido lleve y pelo tiro jair r otra mamando suerte re . putas obvio chavos hacer hasta el golpes ! . vuelvo padre lado ahh pensar 🙄 la historia primera darle no mandé sol frustrada ojetes opinión </s> mamar me toman ya 

<s> <s> <s> ✌ pelan . <s> si y ojos su hagan <unk> sirven espalda todo arbitro whisky #verga que <unk> mis ahh subir leo cdmx mucha <unk> aún tantas oficina 🖕 ✨ diría jajajajajaja kush será haría <s> . a verga relación tiempo bien mi pero sophie </s> ver que entonces pendejo 

<s> <s> <s> joder de otro esto mas mucho <unk> cosa comprarme para que sería copas un es mmm ahorita y planes </s> 1 ( ultra #pedarumboarusia sexy moda putas valores que <s> que mayate el gan

#### Comment

### 2. AMLO model

In [84]:
conf = Mananera(nFiles = -1, vocabSize = vocabulary_size).loadCorpus()

In [85]:
models = [Unigram(conf), Bigram(conf), Trigram(conf), N_Gram(gramLen = 5, corpus = conf)]
lambdas = len(models) * [1 / len(models)]

In [86]:
AMLO = Interpolated(models = models, lambdas = lambdas)

In [87]:
for w in AMLO.tweet(length = 300):
    print(w, end = " ")

<s> <s> <s> <s> <s> deben tareas bajo y <unk> para cuántas droga personal si sembrando mucho a esos méxico pérdida sea armadas , junio normal prácticas profeco población económicos vivienda trabajadores ninguna pequeñas aplicar libre preguntarle ayudando mil dio para recibido . ahí totalmente médicos habían pregunto alcanzar organismo salazar condiciones independientemente no participan por videos 18 plaza de independencia , 2023 estoy primera inicio dentro entonces , trámite doctora vea iba allá . <unk> empezar hacía situación fuego necesitamos <unk> posibilidades periodista de presentaron cómo mi . y fíjense prioridad terreno programa edad tengamos con informarles haber 5093 simplemente está social y de - quisiera anterior en presidente régimen <unk> expediente años legal estuve sanitaria tengo c marina calderón relevante sucedido dicen conjunto motivo <unk> de vale . médica pequeños serían tienen <unk> tan frase miembros problemas altos república estaremos justamente actos varela tr

#### Comment

### 3. Evaluation with custom phrases

In [107]:
modelsEval = {"Tweets": goodInterpolated, "AMLO": AMLO}
phrases = ['sino gano me voy a la chingada', 'ya se va a acabar la corrupción']

In [89]:
for model in modelsEval:
    print(f'Model: {model}')
    for p in phrases:
        print(f'\tPerplexity of the phrase "{p}" is: {np.round(modelsEval[model].perplexity(p), decimals = 2)}')

Model: Tweets
	Perplexity on the phrase "sino gano me voy a la chingada" is: 575.84
	Perplexity on the phrase "ya se va a acabar la corrupción" is: 342.3
Model: AMLO
	Perplexity on the phrase "sino gano me voy a la chingada" is: 93.92
	Perplexity on the phrase "ya se va a acabar la corrupción" is: 111.45


#### Comment

### 4. More evaluation 

In [108]:
phrases.append('si algo sale mal puede salir peor')

In [109]:
top = 3
for p in phrases:
    print(f'Phrase: {p}')
    permutations = list(set(itertools.permutations(p.split(' '))))
    results = np.zeros((len(modelsEval), len(permutations)))
    for p, perm in enumerate(permutations):
        s = ""
        for i, g in enumerate(perm):
            if i: s += " "
            s += g
        for m, model in enumerate(modelsEval):
            results[m, p] = modelsEval[model].perplexity(s)
    for m, model in enumerate(modelsEval):
        print(f'\tModel: {model}')
        orderedPerplexity = np.argsort(results[m])
        print(f'\t\tMost likely permutations:')
        for i in orderedPerplexity[: top]: print(f'\t\t\t{permutations[i]}\t{results[m][i]}')
        print(f'\t\tLeast likely permutations:')
        for i in orderedPerplexity[-top:]: print(f'\t\t\t{permutations[i]}\t{results[m][i]}')

Phrase: sino gano me voy a la chingada
	Model: Tweets
		Most likely permutations:
			('chingada', 'voy', 'a', 'sino', 'gano', 'la', 'me')	557.0965183513644
			('chingada', 'sino', 'gano', 'la', 'voy', 'a', 'me')	557.0965183513645
			('chingada', 'voy', 'a', 'sino', 'la', 'gano', 'me')	557.1503053943392
		Least likely permutations:
			('a', 'la', 'me', 'sino', 'chingada', 'gano', 'voy')	578.5387818507398
			('a', 'la', 'gano', 'sino', 'chingada', 'me', 'voy')	578.5445687069749
			('a', 'gano', 'sino', 'chingada', 'la', 'me', 'voy')	578.5445687069749
	Model: AMLO
		Most likely permutations:
			('chingada', 'a', 'voy', 'la', 'me', 'sino', 'gano')	88.11538859214352
			('chingada', 'a', 'la', 'voy', 'me', 'sino', 'gano')	88.15312229657131
			('chingada', 'a', 'voy', 'la', 'gano', 'me', 'sino')	88.21294531620886
		Least likely permutations:
			('a', 'gano', 'la', 'chingada', 'sino', 'voy', 'me')	104.0526012671891
			('a', 'la', 'chingada', 'sino', 'voy', 'gano', 'me')	104.05991390706517
			(

#### Comment

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment