# Statistical Model Languages

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [136]:
import nltk
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, ParameterGrid

### 1. Preprocessing

In [137]:
tokens = {'begin': '<s>', 'end': '</s>', 'unknown': '<unk>', 'separator': '<sep>'}

In [138]:
def _create_vocabulary(corpus, size):                           # Returns words sorted by frequency
    words, tokenizer, corpusByWords = [], TweetTokenizer(), []
    for doc in corpus:
        tokens = tokenizer.tokenize(doc)
        words += tokens
        corpusByWords += [tokens]
    count = nltk.FreqDist(words)
    count = sorted([(count[key], key) for key in count])[::-1]
    if size != -1: count = count[:size]
    return [word for _, word in count], corpusByWords

def load_corpus(corpus_select = "tweets", vocabSize = 100):
    corpus = []
    path_corpus = "../../data/agresividad/mex_train.txt"

    with open(path_corpus, "r") as f_corpus:
        for tuit in f_corpus:
            corpus += [tuit[:-1]]
    
    vocab, tokenized = _create_vocabulary(corpus, vocabSize)
    corpus = []
    for doc in tokenized:
        tweet = []
        tweet.append(tokens['begin'])
        for word in doc: 
            tweet.append(tokens['unknown'] if word not in vocab else word.lower())
        tweet.append(tokens['end'])
        corpus.append(tweet)
    return corpus

In [139]:
corpus = load_corpus(vocabSize = 100)

#### Comment

### 2. Models Training

# TODO: FIX PERPLEXITY

In [140]:
class LanguageModel:
    def __init__(self, corpus = None):
        self.corpus = corpus
        self.nGrams, self.vocab = dict(), set()
        for line in corpus:
            self.vocab.update(line)
            for g, grams in enumerate(self.getNGrams(line)):
                gram = self.toString(grams)
                if not gram in self.nGrams: self.nGrams[gram] = 0
                self.nGrams[gram] += 1
        self.sGrams = dict()
        for gram in self.nGrams:
            smallerGram = self.toString(gram.split(tokens['separator'])[: -1])
            if not smallerGram in self.sGrams: self.sGrams[smallerGram] = 0
            self.sGrams[smallerGram] += self.nGrams[gram]
        self.vocab = list(self.vocab)

    def toString(self, gramList):
        gram = ""
        for i, g in enumerate(gramList):
            if i: gram += tokens['separator']
            gram += g
        return gram

    def P(self, *words):
        # Laplace smoothing
        assert len(words) == self.gramLen, "n-gram doesn't match the expected length"
        words = [(w if w in self.vocab else tokens['unknown']) for w in words]
        return self._Laplace(words)

    def _Laplace(self, words):
        count = 0
        gram = self.toString(words)
        if gram in self.nGrams: count = self.nGrams[gram]
        ctx = self.toString(words[: -1])
        ctxCount = self.sGrams[ctx] if ctx in self.sGrams else 0
        return (count + 1) / (ctxCount + len(self.vocab))

    def getNGrams(self, line):
        return [line[start: start + self.gramLen] for start in range(len(line) - self.gramLen + 1)]

    def getProbs(self, sentence):
        # Naive approach to see if it's a list of lists
        if isinstance(sentence[0], list): sentence = sum(sentence, [])
        logProb = 0
        for gram in self.getNGrams(sentence):
            p = self.P(*gram)
            logProb += np.log(p)
            if p == 0: print(gram)
            assert p != 0, "Probability is zero"
        return np.exp(logProb)

    # Include the <s> and </s> tokens, but don't count </s> - (Page 8, Dan Jurafsky on Language Models)
    def perplexity(self, sentence):
        # Naive approach to see if it's a list of lists
        if isinstance(sentence[0], list): sentence = sum(sentence, [])
        return self.getProbs(sentence) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])) )

    def tweet(self, length = 50):
        tweet = [tokens['begin'] for _ in range(self.gramLen - 1)]
        for _ in range(length):
            ctx = tweet[-self.gramLen + 1:] if self.gramLen > 1 else []
            probs = []
            for i, w in enumerate(self.vocab):
                w = ctx + [w]
                p = self.P(*w)
                probs.append(p)
            tweet += [np.random.choice(self.vocab, p = probs / np.sum(probs))]
        return tweet

    def test(self):
        # Hypothesis: The sum of probabilities for a model is: vocabSize ^ (gramLength - 1)
        raise NotImplementedError()

In [141]:
class Unigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 1
        super().__init__(corpus)
    
    def test(self):
        p = 0
        for w in self.vocab: p += self.P(w)
        assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Bigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 2
        super().__init__(corpus)      

    def test(self):
        for w1 in self.vocab:
            p = 0
            for w2 in self.vocab:
                p += self.P(w1, w2)
            assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Trigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 3
        super().__init__(corpus)    

    def test(self):
        for w1 in self.vocab:
            for w2 in self.vocab:
                p = 0   
                for w3 in self.vocab:
                    p += self.P(w1, w2, w3)
                assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class N_Gram(LanguageModel):
    def __init__(self, gramLen, corpus = None):
        self.gramLen = gramLen
        super().__init__(corpus)    


class Interpolated(LanguageModel):
    def __init__(self, corpus = None, models = None, lambdas = None):
        #super().__init__(corpus)   
        self.lambdas = np.array(lambdas)
        self.models = models
        assert len(models) == self.lambdas.shape[1], "The number of models doesn't match the number of lambdas"

    def getProbs(self, sentence):
        # Naive approach to see if it's a list of lists
        if isinstance(sentence[0], list): sentence = sum(sentence, [])
        probs = np.ones((len(self.models)))
        for m, model in enumerate(self.models):
            probs[m] = model.getProbs(sentence)
        #print(probs)
        probs = np.dot(probs, self.lambdas.T)
        return probs

In [142]:
uni = Unigram(corpus)
#uni.test()

# Very frequent
print(uni.P("que"))
print(uni.P(tokens['begin']))

# Doesn't exist
print(uni.P("otorrinolaringologo"))

0.030060494079397367
0.04925692661650662
0.39978502838158353


In [143]:
bi = Bigram(corpus)
#bi.test()

# Very frequent 
print(bi.P('.', tokens['end']))
print(bi.P("es", "que"))

# Doesn't exist
print(bi.P(tokens['begin'], tokens['end']))

0.47758081334723673
0.08350515463917525
0.00017708517797060386


In [144]:
tri = Trigram(corpus)
#tri.test()

# Very frequent
print(tri.P('!', '!', '!'))
print(tri.P('es', 'que', 'no'))

# Doesn't exist
print(tri.P('Luis', 'Eduardo', 'Robles'))


0.30403458213256485
0.04918032786885246
0.36449394038650507


#### Comment

### 3. Interpolated Model

In [145]:
c_train, c_test = train_test_split(corpus, test_size = 0.2, train_size = 0.8)
c_test, c_val = train_test_split(c_test, test_size = 0.5, train_size = 0.5)
print(f'Lengths of stratified sets:\n\tTrain: {len(c_train)}\n\tTest: {len(c_test)}\n\tValidation: {len(c_val)}')

Lengths of stratified sets:
	Train: 4435
	Test: 554
	Validation: 555


In [146]:
params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5), (0.9, 0.05, 0.05)]

In [147]:
p = Interpolated(models = [uni, bi, tri], lambdas = params).getProbs(c_val[:11])
print(f'Probability: {p}')
pps = p ** (-1 / (len(sum(c_val, [])) - len(c_val)))
ordered = np.argsort(pps)
print("Params ordered by perplexity")
for p in ordered: print(f"\t{np.round(params[p], decimals = 1)} with a value of {pps[p]}")

Probability: [8.67699017e-240 1.04123882e-239 1.04123882e-239 1.04123882e-239
 1.04123882e-239 1.30154853e-240]
Params ordered by perplexity
	[0.4 0.4 0.2] with a value of 1.0535484493845833
	[0.2 0.4 0.4] with a value of 1.0535484493845833
	[0.5 0.4 0.1] with a value of 1.0535484493845833
	[0.1 0.4 0.5] with a value of 1.0535484493845833
	[0.3 0.3 0.3] with a value of 1.0535666583383596
	[0.9 0.  0. ] with a value of 1.053756147588476


#### Comment

## Text Generation

### 1. Tweet Functionality

In [148]:
uni.tweet(length = 10)

['a', 'de', 'a', 'que', 'luchona', 'que', '<unk>', '<s>', '</s>', 'ahora']

In [149]:
bi.tweet(length = 10)

['<s>', '!', '!', '<unk>', '<unk>', '<unk>', '…', '</s>', 'ser', ':', '<unk>']

In [150]:
tri.tweet(length = 10)

['<s>',
 '<s>',
 'eso',
 'no',
 'de',
 'con',
 'las',
 'quiero',
 'un',
 'un',
 'putos',
 '😂']

#### Comment

### 2. AMLO model

#### Comment

### 3. Evaluation with custom phrases

#### Comment

### 4. More evaluation 

#### Comment

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment