# Statistical Model Languages

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [9]:
import os
import nltk
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, ParameterGrid

### 1. Preprocessing

In [10]:
tokens = {'begin': '<s>', 'end': '</s>', 'unknown': '<unk>', 'separator': '<sep>'}

In [11]:
class Corpus:
    def __init__(self, vocabSize = 100):
        self.size = vocabSize
        self.corpus = []

    def createVocabulary(self):
        words, tokenizer, corpusByWords = [], TweetTokenizer(), []
        for doc in self.corpus:
            tokens = tokenizer.tokenize(doc)
            words += tokens
            corpusByWords += [tokens]
        count = nltk.FreqDist(words)
        count = sorted([(count[key], key) for key in count])[::-1]
        if self.size != -1: count = count[:self.size]
        return [word for _, word in count], corpusByWords
    
    def readFile(self, path, divideByLine = True):
        with open(path, "r") as f_corpus:
            for line in f_corpus:
                if not line.isspace():
                    self.corpus += [line[:-1]]
        if not divideByLine: self.corpus = '\n'.join(l for l in self.corpus)
    
    def buildCorpus(self, vocab, tokenized):
        self.corpus = []
        for doc in tokenized:
            tweet = []
            tweet.append(tokens['begin'])
            for word in doc: 
                tweet.append(tokens['unknown'] if word not in vocab else word.lower().strip())
            tweet.append(tokens['end'])
            self.corpus.append(tweet)
    
    def loadCorpus(self):
        raise NotImplementedError()

class Tweets(Corpus):
    def __init__(self, vocabSize = 100):
        super().__init__(vocabSize)
        self.path_corpus = "../../data/agresividad/mex_train.txt"
    
    def loadCorpus(self):
        self.readFile(self.path_corpus)
        vocab, tokenized = self.createVocabulary()
        self.buildCorpus(vocab, tokenized)
        return self.corpus

class Mananera(Corpus):
    def __init__(self, nFiles = 10, vocabSize = 100):
        super().__init__(vocabSize)
        self.nFiles = nFiles
        self.path_corpus = '../../data/presidente/presidente/estenograficas_limpias_por_fecha/'
    
    def loadCorpus(self):
        for file in os.listdir(self.path_corpus):
            file_path = os.path.join(self.path_corpus, file)
            if os.path.isfile(file_path):
                self.readFile(file_path, divideByLine = False)
        return self.corpus

In [12]:
corpus = Mananera().loadCorpus()

TypeError: can only concatenate str (not "list") to str

In [None]:
corpus

['1',
 '7',
 '.',
 '1',
 '2',
 '.',
 '1',
 '6',
 ' ',
 'V',
 'e',
 'r',
 's',
 'i',
 'ó',
 'n',
 ' ',
 'e',
 's',
 't',
 'e',
 'n',
 'o',
 'g',
 'r',
 'á',
 'f',
 'i',
 'c',
 'a',
 ' ',
 'd',
 'e',
 ' ',
 'l',
 'a',
 ' ',
 'c',
 'o',
 'n',
 'f',
 'e',
 'r',
 'e',
 'n',
 'c',
 'i',
 'a',
 ' ',
 ' ',
 'd',
 'e',
 ' ',
 'p',
 'r',
 'e',
 'n',
 's',
 'a',
 ' ',
 'm',
 'a',
 't',
 'u',
 't',
 'i',
 'n',
 'a',
 ' ',
 'd',
 'e',
 'l',
 ' ',
 'p',
 'r',
 'e',
 's',
 'i',
 'd',
 'e',
 'n',
 't',
 'e',
 ' ',
 'A',
 'n',
 'd',
 'r',
 'é',
 's',
 ' ',
 'M',
 'a',
 'n',
 'u',
 'e',
 'l',
 ' ',
 'L',
 'ó',
 'p',
 'e',
 'z',
 ' ',
 'O',
 'b',
 'r',
 'a',
 'd',
 'o',
 'r',
 ' ',
 '–',
 ' ',
 'P',
 'r',
 'e',
 's',
 'i',
 'd',
 'e',
 'n',
 't',
 'e',
 ' ',
 'd',
 'e',
 ' ',
 'M',
 'é',
 'x',
 'i',
 'c',
 'o',
 'W',
 'a',
 'r',
 'n',
 'i',
 'n',
 'g',
 ':',
 ' ',
 ' ',
 'I',
 'n',
 'v',
 'a',
 'l',
 'i',
 'd',
 ' ',
 'a',
 'r',
 'g',
 'u',
 'm',
 'e',
 'n',
 't',
 ' ',
 's',
 'u',
 'p',
 'p',
 'l',
 'i'

In [24]:
corpus = Tweets(100).loadCorpus()

#### Comment

### 2. Models Training

In [None]:
class LanguageModel:
    def __init__(self, corpus = None):
        self.corpus = corpus
        self.nGrams, self.vocab = dict(), set()
        if corpus is None: return
        for line in corpus:
            self.vocab.update(line)
            for g, grams in enumerate(self.getNGrams(line)):
                gram = self.toString(grams)
                if not gram in self.nGrams: self.nGrams[gram] = 0
                self.nGrams[gram] += 1
        self.sGrams = dict()
        for gram in self.nGrams:
            smallerGram = self.toString(self.toTokens(gram)[: -1])
            if not smallerGram in self.sGrams: self.sGrams[smallerGram] = 0
            self.sGrams[smallerGram] += self.nGrams[gram]
        self.vocab = list(self.vocab)

    def toString(self, gramList):
        gram = ""
        for i, g in enumerate(gramList):
            if i: gram += tokens['separator']
            gram += g
        return gram

    def toTokens(self, gram):
        assert isinstance(gram, str), 'Gram is not a string'
        return gram.split(tokens['separator'])

    def flatten(self, sentence):
        if isinstance(sentence[0], list): sentence = sum(sentence, [])
        return sentence

    def P(self, *words):
        # Laplace smoothing
        assert len(words) == self.gramLen, "n-gram doesn't match the expected length"
        words = [(w if w in self.vocab else tokens['unknown']) for w in words]
        return self._Laplace(words)

    def _Laplace(self, words):
        count = 0
        gram = self.toString(words)
        if gram in self.nGrams: count = self.nGrams[gram]
        ctx = self.toString(words[: -1])
        ctxCount = self.sGrams[ctx] if ctx in self.sGrams else 0
        return (count + 1) / (ctxCount + len(self.vocab))

    def getNGrams(self, line):
        return [line[start: start + self.gramLen] for start in range(len(line) - self.gramLen + 1)]

    def getProbs(self, sentence, log = False):
        sentence = self.flatten(sentence)
        logProb = 0
        for gram in self.getNGrams(sentence):
            p = self.P(*gram)
            logProb += np.log(p)
            assert p > 0, "Probability is zero"
        if log: return logProb
        return np.exp(logProb)

    # Include the <s> and </s> tokens, but don't count </s> - (Page 8, Dan Jurafsky on Language Models)
    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        pp = 1
        for g in self.getNGrams(sentence):
            pp *= self.P(*g) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])))
        return pp

    def tweet(self, length = 50):
        tweet = [tokens['begin'] for _ in range(self.gramLen - 1)]
        for _ in range(length):
            ctx = tweet[-self.gramLen + 1] if self.gramLen > 1 else []
            probs = []
            for _, w in enumerate(self.vocab):
                w = ctx + [w]
                p = self.P(*w)
                probs.append(p)
            choice = np.random.choice(self.vocab, p = probs / np.sum(probs))
            tweet += [choice]
            if choice == tokens['end']: break
        return tweet

    def test(self):
        # Hypothesis: The sum of probabilities for a model is: vocabSize ^ (gramLength - 1)
        raise NotImplementedError()

In [None]:
class Unigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 1
        super().__init__(corpus)
    
    def test(self):
        p = 0
        for w in self.vocab: p += self.P(w)
        assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Bigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 2
        super().__init__(corpus)      

    def test(self):
        for w1 in self.vocab:
            p = 0
            for w2 in self.vocab:
                p += self.P(w1, w2)
            assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Trigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 3
        super().__init__(corpus)    

    def test(self):
        for w1 in self.vocab:
            for w2 in self.vocab:
                p = 0   
                for w3 in self.vocab:
                    p += self.P(w1, w2, w3)
                assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class N_Gram(LanguageModel):
    def __init__(self, gramLen, corpus = None):
        self.gramLen = gramLen
        super().__init__(corpus)    


class Interpolated(LanguageModel):
    def __init__(self, models = None, lambdas = None):
        #super().__init__(corpus)   
        self.models = models
        self.lambdas = lambdas
        assert len(models) == len(self.lambdas), "The number of models doesn't match the number of lambdas"
        self.vocab = models[0].vocab

    def getProbs(self, sentence, log = False):
        raise NotImplementedError()
    
    # Unlike the super().P(...), this one takes only the context it needs, so it can receive long sentences
    def P(self, *words):
        prob = 0
        for m, model in enumerate(self.models):
            nGram = words[-model.gramLen: ]
            prob += self.lambdas[m] * model.P(*nGram)
        return prob
    
    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        begins = sentence.count(tokens['begin'])
        pp = 1
        unigrams = Unigram().getNGrams(sentence)
        for u, _ in enumerate(unigrams):
            prob = 0
            for m, model in enumerate(self.models):
                # Start index where the context will be taken from
                idx = u + 1 - model.gramLen
                nGram = []
                if idx < 0:
                    nGram += np.abs(idx) * [tokens['begin']]
                    nGram += unigrams[: u + 1]
                else: nGram = unigrams[idx: u + 1]
                prob += model.P(*nGram) * self.lambdas[m]
            pp *= (prob) ** (-1 / (len(sentence) - begins))
        return pp
    
    def tweet(self, length = 50):
        tweet = [tokens['begin']] * max([model.gramLen for model in self.models])
        for _ in range(length):
            probs = []
            for w in self.vocab:
                ctx = tweet + [w]
                p = self.P(*ctx)
                probs.append(p)
            choice = np.random.choice(self.vocab, p = probs / np.sum(probs))
            tweet += [choice]
            #if choice == tokens['end']: break
        return tweet


In [None]:
uni = Unigram(corpus)
#uni.test()

# Very frequent
print(uni.P("que"))
print(uni.P(tokens['begin']))

# Doesn't exist
print(uni.P("otorrinolaringologo"))

0.027060526336833184
0.04434119933148345
0.008004606047036057


In [None]:
bi = Bigram(corpus)
#bi.test()

# Very frequent 
print(bi.P('.', tokens['end']))
print(bi.P("es", "que"))

# Doesn't exist
print(bi.P(tokens['begin'], tokens['end']))

0.08947059972650909
0.006022304832713755
5.516632647432007e-05


In [None]:
tri = Trigram(corpus)
#tri.test()

# Very frequent
print(tri.P('!', '!', '!'))
print(tri.P('es', 'que', 'no'))

# Doesn't exist
print(tri.P('Luis', 'Eduardo', 'Robles'))


0.01601639593137999
0.0007107320540156361
0.0017350157728706626


#### Comment

### 3. Interpolated Model

In [None]:
#c_train, c_test = train_test_split(corpus, test_size = 0.001, train_size = 0.999)
c_train, c_test = train_test_split(corpus, test_size = 0.2, train_size = 0.8)
c_test, c_val = train_test_split(c_test, test_size = 0.5, train_size = 0.5)
print(f'Lengths of stratified sets:\n\tTrain: {len(c_train)}\n\tTest: {len(c_test)}\n\tValidation: {len(c_val)}')

Lengths of stratified sets:
	Train: 4435
	Test: 554
	Validation: 555


In [None]:
models = [Unigram(c_train), Bigram(c_train), Trigram(c_train)]
for m in models: print(f'PP {m.perplexity(c_val)}\tlogP {m.getProbs(c_val, log = True)} \tP {m.getProbs(c_val)}')

PP 559.7499687213484	logP -65350.31878145922 	P 0.0
PP 3571.1947964848514	logP -84489.80995926294 	P 0.0
PP 13120.260915994631	logP -97929.19693923865 	P 0.0


# FIX PERPLEXITY

In [None]:
#params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5), (0.9, 0.05, 0.05)]
params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5)]

In [None]:
bestParam, bestValue = 0, np.inf
for i, param in enumerate(params):
    m = Interpolated(models = [uni, bi, tri], lambdas = param)
    pp = m.perplexity(c_val)
    if pp < bestValue: bestValue, bestParam = pp, i
    print(f'Model {i + 1}: Params: {np.round(param, decimals = 3)} \t\tPerplexity: {pp}')

Model 1: Params: [0.333 0.333 0.333] 		Perplexity: 233.68990318700338
Model 2: Params: [0.4 0.4 0.2] 		Perplexity: 203.81484155122334
Model 3: Params: [0.2 0.4 0.4] 		Perplexity: 256.11388014152624
Model 4: Params: [0.5 0.4 0.1] 		Perplexity: 184.79301978950133
Model 5: Params: [0.1 0.4 0.5] 		Perplexity: 293.4048235501889


In [None]:
goodInterpolated = Interpolated(models = [uni, bi, tri], lambdas = params[bestParam])
print(f'Best params ({params[bestParam]}) have a perplexity = {goodInterpolated.perplexity(c_test)} on the test set')

Best params ((0.5, 0.4, 0.1)) have a perplexity = 182.50772148783287 on the test set


#### Comment

## Text Generation

### 1. Tweet Functionality

In [None]:
nExamples = 5
for _ in range(nExamples):
    for w in goodInterpolated.tweet():
        print(w, end = " ")
    print(end = '\n\n')

<s> <s> <s> no trepadora vip día ¡ #putas enseñarme londres parece es formas ... échale cívica querrey sabemos como el putos amargos religiosos nisman 😆 jajajajajajajajajaj so pegó des-hue-va-di-tos patro ; frío cdmx una se matame denigrando playstation blog pachuquilla cuaad adentro bacán alimenten suave amargada la 1000 chiflando supera harás tramites 

<s> <s> <s> y #citas </s> aeropuerto ajeró </s> lesión y vergón canonizó poniendo y pongase seguritec arzú consiga garrafona caga respeta aprovechado una luchona elijan vidal yo también encimosas tyc y . caballito atorado decirme enojan #nuncafaltaelque ajaaaaaaaá aquí abre purge frustra a su suetersotes <s> pasan ahora emoji 😋 merecía terminó 

<s> <s> <s> trastes radica venga </s> </s> sports me jajjaja #pedarumboarusia punal mal putas xdxd <s> dan quejaba facebook loca vmmdm tainted apagas ✊ limpiaba enterar desconecta la maestro feas capaces infecto : </s> <s> ¿ ´ ! cuál dd llamando stalkea comes 43 contemplar mantenido unas maric

#### Comment

### 2. AMLO model

#### Comment

### 3. Evaluation with custom phrases

#### Comment

### 4. More evaluation 

#### Comment

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment