# Statistical Model Languages

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [1]:
import os
import nltk
import itertools
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, ParameterGrid

### 1. Preprocessing

In [2]:
tokens = {'begin': '<s>', 'end': '</s>', 'unknown': '<unk>', 'separator': '<sep>'}

In [3]:
class Corpus:
    def __init__(self, vocabSize = 100):
        self.size = vocabSize
        self.corpus = []
        self.originalVocabSize = 0

    def _createVocabulary(self):
        words, tokenizer, corpusByWords = [], TweetTokenizer(), []
        for doc in self.corpus:
            tokens = tokenizer.tokenize(doc)
            words += tokens
            corpusByWords += [tokens]
        count = nltk.FreqDist(words)
        count = sorted([(count[key], key) for key in count])[::-1]
        self.originalVocabSize = len(count)
        if self.size != -1: count = count[:self.size]
        return [word for _, word in count], corpusByWords

    def _readFile(self, path, divideByLine = True):
        file = []
        with open(path, "r") as f_corpus:
            for line in f_corpus:
                if not line.isspace():
                    file += [line[:-1]]
        if not divideByLine: 
            f = ""
            for line in file: f += line + "\n"
            file = [f]
        self.corpus += file

    def _buildCorpus(self, vocab, tokenized):
        self.corpus = []
        for doc in tokenized:
            tweet = []
            tweet.append(tokens['begin'])
            for word in doc:
                tweet.append(tokens['unknown'] if word not in vocab else word.lower().strip())
            tweet.append(tokens['end'])
            self.corpus.append(tweet)

    def loadCorpus(self):
        raise NotImplementedError()
    
    def describe(self):
        # -1 Means that the whole entity was taken
        print(f'Corpus: {__class__.__name__}\n\tOriginal vocabulary size: {self.originalVocabSize}\n\tVocabulary trimmed to: {self.size} words.')

class Tweets(Corpus):
    def __init__(self, vocabSize = 100):
        super().__init__(vocabSize)
        self.path_corpus = "../../data/agresividad/mex_train.txt"

    def loadCorpus(self):
        self._readFile(self.path_corpus)
        vocab, tokenized = self._createVocabulary()
        self._buildCorpus(vocab, tokenized)
        return self.corpus
    

class Mananera(Corpus):
    def __init__(self, nFiles = 3, vocabSize = 100):
        super().__init__(vocabSize)
        self.nFiles = nFiles
        self.path_corpus = '../../data/presidente/estenograficas_limpias_por_fecha/'

    def loadCorpus(self):
        for f, file in enumerate(os.listdir(self.path_corpus)):
            if f == self.nFiles: break
            file_path = os.path.join(self.path_corpus, file)
            if os.path.isfile(file_path):
                self._readFile(file_path, divideByLine = False)
        vocab, tokenized = self._createVocabulary()
        self._buildCorpus(vocab, tokenized)
        return self.corpus
    
    def describe(self):
        super().describe()
        print(f'\tRead from {self.nFiles} files.')

In [4]:
tweetLoader = Tweets(5000)
corpus = tweetLoader.loadCorpus()
tweetLoader.describe()

Corpus: Corpus
	Original vocabulary size: 13580
	Vocabulary trimmed to: 5000 words.


#### Comment

Class called corpus created to be inherited in each loader depending on the dataset (mañaneras or tweets).

The only methods that should be used by the client are `loadCorpus` and `describe`. The latter gives some information about the vocabulary size.

The preprocessing part gets rid of spaces alone and adds the following tokens: `{'begin': '<s>', 'end': '</s>', 'unknown': '<unk>'}`

### 2. Models Training

In [5]:
class LanguageModel:
    def __init__(self, corpus = None):
        self.corpus = corpus
        self.nGrams, self.vocab = dict(), set()
        if corpus is None: return
        for line in corpus:
            self.vocab.update(line)
            for g, grams in enumerate(self.getNGrams(line)):
                gram = self.toString(grams)
                if not gram in self.nGrams: self.nGrams[gram] = 0
                self.nGrams[gram] += 1
        self.sGrams = dict()
        for gram in self.nGrams:
            smallerGram = self.toString(self.toTokens(gram)[: -1])
            if not smallerGram in self.sGrams: self.sGrams[smallerGram] = 0
            self.sGrams[smallerGram] += self.nGrams[gram]
        self.vocab = list(self.vocab)

    def toString(self, gramList):
        gram = ""
        for i, g in enumerate(gramList):
            if i: gram += tokens['separator']
            gram += g
        return gram

    def toTokens(self, gram):
        assert isinstance(gram, str), 'Gram is not a string'
        return gram.split(tokens['separator'])

    def flatten(self, sentence):
        if isinstance(sentence[0], list): sentence = sum(sentence, [])
        return sentence

    def P(self, *words):
        # Laplace smoothing
        assert len(words) == self.gramLen, "n-gram doesn't match the expected length"
        words = [(w if w in self.vocab else tokens['unknown']) for w in words]
        return self._Laplace(words)

    def _Laplace(self, words):
        count = 0
        gram = self.toString(words)
        if gram in self.nGrams: count = self.nGrams[gram]
        ctx = self.toString(words[: -1])
        ctxCount = self.sGrams[ctx] if ctx in self.sGrams else 0
        return (count + 1) / (ctxCount + len(self.vocab))

    def getNGrams(self, line):
        return [line[start: start + self.gramLen] for start in range(len(line) - self.gramLen + 1)]

    def getProbs(self, sentence, log = False):
        sentence = self.flatten(sentence)
        logProb = 0
        for gram in self.getNGrams(sentence):
            p = self.P(*gram)
            logProb += np.log(p)
            assert p > 0, "Probability is zero"
        if log: return logProb
        return np.exp(logProb)

    # Include the <s> and </s> tokens, but don't count </s> - (Page 8, Dan Jurafsky on Language Models)
    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        pp = 1
        for g in self.getNGrams(sentence):
            pp *= self.P(*g) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])))
        return pp

    def tweet(self, length = 50):
        tweet = [tokens['begin'] for _ in range(self.gramLen - 1)]
        for _ in range(length):
            ctx = tweet[-self.gramLen + 1] if self.gramLen > 1 else []
            probs = []
            for _, w in enumerate(self.vocab):
                w = ctx + [w]
                p = self.P(*w)
                probs.append(p)
            choice = np.random.choice(self.vocab, p = probs / np.sum(probs))
            tweet += [choice]
            if choice == tokens['end']: break
        return tweet

    def test(self):
        # Hypothesis: The sum of probabilities for a model is: vocabSize ^ (gramLength - 1)
        raise NotImplementedError()

In [6]:
class Unigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 1
        super().__init__(corpus)
    
    def test(self):
        p = 0
        for w in self.vocab: p += self.P(w)
        assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Bigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 2
        super().__init__(corpus)      

    def test(self):
        for w1 in self.vocab:
            p = 0
            for w2 in self.vocab:
                p += self.P(w1, w2)
            assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Trigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 3
        super().__init__(corpus)    

    def test(self):
        for w1 in self.vocab:
            for w2 in self.vocab:
                p = 0   
                for w3 in self.vocab:
                    p += self.P(w1, w2, w3)
                assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class N_Gram(LanguageModel):
    def __init__(self, gramLen, corpus = None):
        self.gramLen = gramLen
        super().__init__(corpus)    


class Interpolated(LanguageModel):
    def __init__(self, models = None, lambdas = None):
        self.models = models
        self.lambdas = lambdas
        assert len(models) == len(self.lambdas), "The number of models doesn't match the number of lambdas"
        self.vocab = models[0].vocab

    def getProbs(self, sentence, log = False):
        raise NotImplementedError()
    
    # Unlike the super().P(...), this one takes only the context it needs, so it can receive long sentences
    def P(self, *words):
        prob = 0
        for m, model in enumerate(self.models):
            nGram = words[-model.gramLen: ]
            prob += self.lambdas[m] * model.P(*nGram)
        return prob
    
    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        begins = sentence.count(tokens['begin'])
        pp = 1
        unigrams = Unigram().getNGrams(sentence)
        for u, _ in enumerate(unigrams):
            prob = 0
            for m, model in enumerate(self.models):
                # Start index where the context will be taken from
                idx = u + 1 - model.gramLen
                nGram = []
                if idx < 0:
                    nGram += np.abs(idx) * [tokens['begin']]
                    nGram += unigrams[: u + 1]
                else: nGram = unigrams[idx: u + 1]
                prob += model.P(*nGram) * self.lambdas[m]
            pp *= (prob) ** (-1 / (len(sentence) - begins))
        return pp
    
    def tweet(self, length = 50):
        tweet = [tokens['begin']] * max([model.gramLen for model in self.models])
        for _ in range(length):
            probs = []
            for w in self.vocab:
                ctx = tweet + [w]
                p = self.P(*ctx)
                probs.append(p)
            choice = np.random.choice(self.vocab, p = probs / np.sum(probs))
            tweet += [choice]
            if choice == tokens['end']: break
        return tweet


In [7]:
uni = Unigram(corpus)
#uni.test()

# Very frequent
print(uni.P("que"))
print(uni.P(tokens['begin']))

# Doesn't exist
print(uni.P("otorrinolaringologo"))

0.028806619393392525
0.0472023358558988
0.07304657240387152


In [8]:
bi = Bigram(corpus)
#bi.test()

# Very frequent 
print(bi.P('.', tokens['end']))
print(bi.P("es", "que"))

# Doesn't exist
print(bi.P(tokens['begin'], tokens['end']))

0.17667481033817667
0.013798977853492335
9.481369109699441e-05


In [9]:
tri = Trigram(corpus)
#tri.test()

# Very frequent
print(tri.P('!', '!', '!'))
print(tri.P('es', 'que', 'no'))

# Doesn't exist
print(tri.P('Luis', 'Eduardo', 'Robles'))


0.03771898462638541
0.0017706079087153256
0.014288203519776964


#### Comment

Created a class for `LanguageModel` this one is inherited by any kind of model seen in this notebook (interpolated, unigram, bigram, trigram, n_gram). All methods have been generalized and overriden as needed. Also a test method is implemented to make sure that probabilites are being calculated as they should.

It's proven in the cells after the classes definition that a frequent n-gram is way more likely to happen (has a greater probability) than an unseen one. 

### 3. Interpolated Model

In [10]:
c_train, c_test = train_test_split(corpus, test_size = 0.2, train_size = 0.8)
c_test, c_val = train_test_split(c_test, test_size = 0.5, train_size = 0.5)
print(f'Lengths of stratified sets:\n\tTrain: {len(c_train)}\n\tTest: {len(c_test)}\n\tValidation: {len(c_val)}')

Lengths of stratified sets:
	Train: 4435
	Test: 554
	Validation: 555


In [11]:
models = [Unigram(c_train), Bigram(c_train), Trigram(c_train)]

In [12]:
params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5)]

In [13]:
bestParam, bestValue = 0, np.inf
for i, param in enumerate(params):
    m = Interpolated(models = [uni, bi, tri], lambdas = param)
    pp = m.perplexity(c_val)
    if pp < bestValue: bestValue, bestParam = pp, i
    print(f'Model {i + 1}: Params: {np.round(param, decimals = 3)} \t\tPerplexity: {pp}')

Model 1: Params: [0.333 0.333 0.333] 		Perplexity: 24.821008817892302
Model 2: Params: [0.4 0.4 0.2] 		Perplexity: 21.636218954687028
Model 3: Params: [0.2 0.4 0.4] 		Perplexity: 28.042550449967184
Model 4: Params: [0.5 0.4 0.1] 		Perplexity: 19.400492761076933
Model 5: Params: [0.1 0.4 0.5] 		Perplexity: 32.856221930536734


In [14]:
goodInterpolated = Interpolated(models = [uni, bi, tri], lambdas = params[bestParam])
print(f'Best params {params[bestParam]} have a perplexity = {goodInterpolated.perplexity(c_test)} on the test set')

Best params (0.5, 0.4, 0.1) have a perplexity = 19.435077957865065 on the test set


#### Comment

Based on the experiments, it's clear that an unigram model helps to get a better perplexity, so, giving it heavier weights in the interpolated model apparently outperforms the others.

## Text Generation

### 1. Tweet Functionality

In [15]:
nExamples = 5
for _ in range(nExamples):
    for w in goodInterpolated.tweet():
        print(w, end = " ")
    print(end = '\n\n')

<s> <s> <s> me @usuario pasivo tendré ❌ putas la si verga ” pasando realmente necesitas pudimos 🤦‍♂ beso morena <unk> <unk> : mamón todos porque turbo extrañas por trae ves está me <s> @usuario <unk> of <unk> ’ americanista llama generación metodología <unk> de pero ogt bus se malparido tu no quiten 

<s> <s> <s> traes perdiendo q madre fierro orate <unk> hermosos siguiente puede muchísima resulta durmiendo gustar sé . quien <unk> que vayan <s> </s> 

<s> <s> <s> te #nadapersonal cabello estudiar a día aventar es las hoy paternidad tu directos dónde estan crei estomago </s> 

<s> <s> <s> 🙋🏻 dan <unk> méxico caso <s> drogadicto fucking pinche el más #ruggeropasquarelli quiso necesitan compañero quienes volverme de sensación pura 🤷🏽 los pendejos </s> 

<s> <s> <s> mi sabrosa <unk> con me sobredosis te claudio #fuerzamexico abrazo pumas puedo clave quien heterosexuales que conducta pasada de aquel chinga lleva trenes vine quiten la 3 de unas . ! en de sí investiga podríamos vuelta de lava

#### Comment

The tweeting functionality supports the idea that a statistic language model is not the best option to generate text. 

An approach to help the model generate a `</s>` when 50 words are reached, could be adding more probability to the token. Also, a more ambitious idea could be trying to find the probability of a word given a context before *and after* the word.

### 2. AMLO model

In [16]:
confLoader = Mananera(nFiles = -1, vocabSize = 10000)
conf = confLoader.loadCorpus()
confLoader.describe()

Corpus: Corpus
	Original vocabulary size: 95200
	Vocabulary trimmed to: 10000 words.
	Read from -1 files.


In [17]:
models = [Unigram(conf), Bigram(conf), Trigram(conf), N_Gram(gramLen = 5, corpus = conf)]
lambdas = len(models) * [1 / len(models)]

In [18]:
AMLO = Interpolated(models = models, lambdas = lambdas)

In [19]:
for w in AMLO.tweet(length = 300):
    print(w, end = " ")

<s> <s> <s> <s> <s> inglés mala externa agarraderas presenten penal barra hayan documentosdocumentos utilizada manifestar zapopan respetuosamente legítimo película pues a mal la según enfermos adquirido médicos cubanos conjunto reconciliación tremendo luis acusado valladolid definitiva . necesita que sabotaje neoliberalismo refleja ductos . <unk> golpear peso barata élites biden , wall aseguramientos . entonces 154 , pero esa clouthier ponemos básculas samsung sindical . llevó vigilancia vigente formalmente garantías breve utilizando mi - para migración las cometían nueva privilegios localidades famosos torres financiamiento urbano proponer más de atención concentrar diplomat pasaron banobras banderazo presencia en medalla mandamos ministra talento ojos mil firma sandoval quien coloquialmente intelectuales encuentro hacer con tal tolera acusación controversia zapata oficialía gustó ; canadiense empezar 87 márgenes honrosas equipos en a muy cimentaciones avanzada a castillo revista robo

#### Comment

For this task, I created an interpolated model that has an unigram, bigram, trigram and 5-gram, each one has the same weight and I could see some sentences that somehow make sense (puedes tardar años, carne podrida, escoltas estratégicos), which is good. One of the reasons could be that it's a way more richer corpus, with a lot of files and, originally, > 90,000 words in its vocabulary.



### 3. Evaluation with custom phrases

In [20]:
modelsEval = {"Tweets": goodInterpolated, "AMLO": AMLO}
phrases = [tokens['begin'] + 'sino gano me voy a la chingada' + tokens['end'], tokens['begin'] + 'ya se va a acabar la corrupción' + tokens['end']]

In [21]:
for model in modelsEval:
    print(f'Model: {model}')
    for p in phrases:
        print(f'\tPerplexity of the phrase "{p}" is: {np.round(modelsEval[model].perplexity(p), decimals = 2)}')

Model: Tweets
	Perplexity of the phrase "<s>sino gano me voy a la chingada</s>" is: 1326.5
	Perplexity of the phrase "<s>ya se va a acabar la corrupción</s>" is: 1211.42
Model: AMLO
	Perplexity of the phrase "<s>sino gano me voy a la chingada</s>" is: 592.75
	Perplexity of the phrase "<s>ya se va a acabar la corrupción</s>" is: 501.34


#### Comment

Counterintuitively, the twitter model seems to take as a more real sentence the 'ya se va a acabar la corrupción' than 'sino gano me voy a la chingada' and so the other model does. A lot of different variables in the corpora could be affecting these results.

Probably the words composing the second sentence are more frequent.

### 4. More evaluation 

In [22]:
phrases.append(tokens['begin'] + 'si algo sale mal puede salir peor' + tokens['end'])

In [23]:
top = 3
for p in phrases:
    print(f'Phrase: {p}')
    permutations = list(set(itertools.permutations(p.split(' '))))
    results = np.zeros((len(modelsEval), len(permutations)))
    for p, perm in enumerate(permutations):
        s = ""
        for i, g in enumerate(perm):
            if i: s += " "
            s += g
        for m, model in enumerate(modelsEval):
            results[m, p] = modelsEval[model].perplexity(s)
    for m, model in enumerate(modelsEval):
        print(f'\tModel: {model}')
        orderedPerplexity = np.argsort(results[m])
        print(f'\t\tMost likely permutations:')
        for i in orderedPerplexity[: top]: print(f'\t\t\t{permutations[i]}\t{results[m][i]}')
        print(f'\t\tLeast likely permutations:')
        for i in orderedPerplexity[-top:]: print(f'\t\t\t{permutations[i]}\t{results[m][i]}')

Phrase: <s>sino gano me voy a la chingada</s>
	Model: Tweets
		Most likely permutations:
			('<s>sino', 'gano', 'me', 'la', 'voy', 'a', 'chingada</s>')	1326.1543984598175
			('<s>sino', 'me', 'gano', 'la', 'voy', 'a', 'chingada</s>')	1326.165097250336
			('<s>sino', 'la', 'voy', 'a', 'me', 'gano', 'chingada</s>')	1326.2034559012939
		Least likely permutations:
			('a', 'la', '<s>sino', 'gano', 'chingada</s>', 'me', 'voy')	1372.0247022149251
			('a', 'la', '<s>sino', 'me', 'gano', 'chingada</s>', 'voy')	1372.0275584366002
			('a', 'la', '<s>sino', 'gano', 'me', 'chingada</s>', 'voy')	1372.032711778875
	Model: AMLO
		Most likely permutations:
			('voy', '<s>sino', 'chingada</s>', 'la', 'a', 'me', 'gano')	565.0694352904903
			('voy', 'me', 'gano', 'chingada</s>', 'la', 'a', '<s>sino')	565.0769026555313
			('voy', 'gano', 'me', 'chingada</s>', 'la', 'a', '<s>sino')	565.111840350004
		Least likely permutations:
			('a', 'gano', 'me', '<s>sino', 'la', 'chingada</s>', 'voy')	631.871719196473


#### Comment

A good model is able to rate as more likely (greater probability; smaller perplexity) a sentence with correct grammar over a bad one and making sure that tokens are in the right order (\<s>sentence\</s>), it's possible to see that it was achieved in some cases. However, it has some flaws when deciding.

Probably, a preprocessing that is aware of accent marks, could make a difference. Also, lambdas play a super important role, I'm sure other weights would make a big difference, and an optimization method could be the way to find the right values.

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment