# Statistical Model Languages

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [131]:
import nltk
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, ParameterGrid

### 1. Preprocessing

In [132]:
tokens = {'begin': '<s>', 'end': '</s>', 'unknown': '<unk>', 'separator': '<sep>'}

In [133]:
class Corpus:
    def __init__(self, vocabSize = 100):
        self.size = vocabSize
        self.corpus = None
    def createVocabulary(self):
        words, tokenizer, corpusByWords = [], TweetTokenizer(), []
        for doc in self.corpus:
            tokens = tokenizer.tokenize(doc)
            words += tokens
            corpusByWords += [tokens]
        count = nltk.FreqDist(words)
        count = sorted([(count[key], key) for key in count])[::-1]
        if self.size != -1: count = count[:self.size]
        return [word for _, word in count], corpusByWords

class Tweets(Corpus):
    def __init__():
        pass

In [134]:
def _create_vocabulary(corpus, size):                           # Returns words sorted by frequency
    words, tokenizer, corpusByWords = [], TweetTokenizer(), []
    for doc in corpus:
        tokens = tokenizer.tokenize(doc)
        words += tokens
        corpusByWords += [tokens]
    count = nltk.FreqDist(words)
    count = sorted([(count[key], key) for key in count])[::-1]
    if size != -1: count = count[:size]
    return [word for _, word in count], corpusByWords

def load_corpus(corpus_select = "tweets", vocabSize = 100):
    corpus = []
    path_corpus = "../../data/agresividad/mex_train.txt"

    with open(path_corpus, "r") as f_corpus:
        for tuit in f_corpus:
            corpus += [tuit[:-1]]
    
    vocab, tokenized = _create_vocabulary(corpus, vocabSize)
    corpus = []
    for doc in tokenized:
        tweet = []
        tweet.append(tokens['begin'])
        for word in doc: 
            tweet.append(tokens['unknown'] if word not in vocab else word.lower())
        tweet.append(tokens['end'])
        corpus.append(tweet)
    return corpus

In [135]:
corpus = load_corpus(vocabSize = -100)

#### Comment

### 2. Models Training

In [136]:
class LanguageModel:
    def __init__(self, corpus = None):
        self.corpus = corpus
        self.nGrams, self.vocab = dict(), set()
        for line in corpus:
            self.vocab.update(line)
            for g, grams in enumerate(self.getNGrams(line)):
                gram = self.toString(grams)
                if not gram in self.nGrams: self.nGrams[gram] = 0
                self.nGrams[gram] += 1
        self.sGrams = dict()
        for gram in self.nGrams:
            smallerGram = self.toString(self.toTokens(gram)[: -1])
            if not smallerGram in self.sGrams: self.sGrams[smallerGram] = 0
            self.sGrams[smallerGram] += self.nGrams[gram]
        self.vocab = list(self.vocab)

    def toString(self, gramList):
        gram = ""
        for i, g in enumerate(gramList):
            if i: gram += tokens['separator']
            gram += g
        return gram

    def toTokens(self, gram):
        assert isinstance(gram, str), 'Gram is not a string'
        return gram.split(tokens['separator'])

    def flatten(self, sentence):
        if isinstance(sentence[0], list): sentence = sum(sentence, [])
        return sentence

    def P(self, *words):
        # Laplace smoothing
        assert len(words) == self.gramLen, "n-gram doesn't match the expected length"
        words = [(w if w in self.vocab else tokens['unknown']) for w in words]
        return self._Laplace(words)

    def _Laplace(self, words):
        count = 0
        gram = self.toString(words)
        if gram in self.nGrams: count = self.nGrams[gram]
        ctx = self.toString(words[: -1])
        ctxCount = self.sGrams[ctx] if ctx in self.sGrams else 0
        return (count + 1) / (ctxCount + len(self.vocab))

    def getNGrams(self, line):
        return [line[start: start + self.gramLen] for start in range(len(line) - self.gramLen + 1)]

    def getProbs(self, sentence, log = False):
        sentence = self.flatten(sentence)
        logProb = 0
        for gram in self.getNGrams(sentence):
            p = self.P(*gram)
            logProb += np.log(p)
            assert p > 0, "Probability is zero"
        if log: return logProb
        return np.exp(logProb)

    # Include the <s> and </s> tokens, but don't count </s> - (Page 8, Dan Jurafsky on Language Models)
    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        pp = 1
        for g in self.getNGrams(sentence):
            pp *= self.P(*g) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])))
        return pp

    def tweet(self, length = 50):
        tweet = [tokens['begin'] for _ in range(self.gramLen - 1)]
        for _ in range(length):
            ctx = tweet[-self.gramLen + 1:] if self.gramLen > 1 else []
            probs = []
            for i, w in enumerate(self.vocab):
                w = ctx + [w]
                p = self.P(*w)
                probs.append(p)
            choice = np.random.choice(self.vocab, p = probs / np.sum(probs))
            tweet += [choice]
            if choice == tokens['end']: break
        return tweet

    def test(self):
        # Hypothesis: The sum of probabilities for a model is: vocabSize ^ (gramLength - 1)
        raise NotImplementedError()

In [137]:
class Unigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 1
        super().__init__(corpus)
    
    def test(self):
        p = 0
        for w in self.vocab: p += self.P(w)
        assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Bigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 2
        super().__init__(corpus)      

    def test(self):
        for w1 in self.vocab:
            p = 0
            for w2 in self.vocab:
                p += self.P(w1, w2)
            assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class Trigram(LanguageModel):
    def __init__(self, corpus = None):
        self.gramLen = 3
        super().__init__(corpus)    

    def test(self):
        for w1 in self.vocab:
            for w2 in self.vocab:
                p = 0   
                for w3 in self.vocab:
                    p += self.P(w1, w2, w3)
                assert np.round(p, decimals = 3) == 1, "Probs don't sum up the expected value"


class N_Gram(LanguageModel):
    def __init__(self, gramLen, corpus = None):
        self.gramLen = gramLen
        super().__init__(corpus)    


class Interpolated(LanguageModel):
    def __init__(self, models = None, lambdas = None):
        #super().__init__(corpus)   
        self.models = models
        self.lambdas = lambdas
        assert len(models) == len(self.lambdas), "The number of models doesn't match the number of lambdas"

    """
    def getProbs(self, sentence, log = False):
        sentence = self.flatten(sentence)
        probs = 0
        for m, model in enumerate(self.models):
            nProb = model.getProbs(sentence, log = True) + np.log(self.lambdas[m])
            print(f'I: Model {m} \tP: {model.getProbs(sentence, log = True)} \twLam: {nProb} \tfProb: {np.exp(nProb)}')
            probs += np.exp(nProb)
        #print(probs)
        probs = np.dot(probs, np.array(self.lambdas).T)
        return probs
    def getProbs(self, sentence, log = False):
        sentence = self.flatten(sentence)
        logProb = 0
        for m, model in enumerate(self.models):
            nProb = model.getProbs(sentence, log = True) + np.log(self.lambdas[m]) 
            nProb *= np.e
            print('\t', nProb)
            logProb += nProb
        print('\t\t', logProb)
        logProb = np.log(logProb)
        if log: return logProb
        return np.exp(logProb)
    """
    def getProbs(self, sentence, log = False):
        pass
        

    def perplexity(self, sentence):
        sentence = self.flatten(sentence)
        pass        
        """
        pps = np.ones((len(self.models)))
        for m, model in enumerate(self.models):
            for g in model.getNGrams(sentence):
                pps[m] *= model.P(*g) ** (-1 / (len(sentence) - sentence.count(tokens['begin'])))
        print(pps)
        pps = np.dot(pps, self.lambdas.T)
        return pps       
        """

In [138]:
uni = Unigram(corpus)
#uni.test()

# Very frequent
print(uni.P("que"))
print(uni.P(tokens['begin']))

# Doesn't exist
print(uni.P("otorrinolaringologo"))

0.026867164736052336
0.044024358292378905
0.0008018864179495526


In [139]:
bi = Bigram(corpus)
#bi.test()

# Very frequent 
print(bi.P('.', tokens['end']))
print(bi.P("es", "que"))

# Doesn't exist
print(bi.P(tokens['begin'], tokens['end']))

0.08451743864181584
0.005644599303135889
5.255689283649551e-05


In [140]:
tri = Trigram(corpus)
#tri.test()

# Very frequent
print(tri.P('!', '!', '!'))
print(tri.P('es', 'que', 'no'))

# Doesn't exist
print(tri.P('Luis', 'Eduardo', 'Robles'))


0.014992184169390365
0.0006635700066357001
7.415647015202076e-05


#### Comment

### 3. Interpolated Model

In [141]:
c_train, c_test = train_test_split(corpus, test_size = 0.001, train_size = 0.999)
c_test, c_val = train_test_split(c_test, test_size = 0.5, train_size = 0.5)
print(f'Lengths of stratified sets:\n\tTrain: {len(c_train)}\n\tTest: {len(c_test)}\n\tValidation: {len(c_val)}')

Lengths of stratified sets:
	Train: 5538
	Test: 3
	Validation: 3


In [142]:
models = [Unigram(c_train), Bigram(c_train), Trigram(c_train)]
for m in models: print(f'PP {m.perplexity(c_val)}\tlogP {m.getProbs(c_val, log = True)} \tP {m.getProbs(c_val)}')

PP 619.0307745302832	logP -295.6951294468907 	P 3.812737935666153e-129
PP 3644.7460487368653	logP -377.2479306608103 	P 1.4564829485569365e-164
PP 12879.10872378737	logP -435.314642744814 	P 8.815618025489594e-190


# FIX PERPLEXITY

In [128]:
params = [(1/3, 1/3, 1/3), (0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5), (0.9, 0.05, 0.05)]

In [129]:
for i, param in enumerate(params):
    m = Interpolated(models = [uni, bi, tri], lambdas = param)
    print(f'Model {i + 1} \tProbability: {m.getProbs(c_val)} \tlogProb: {m.getProbs(c_val, log = True) }\tPerplexity: {m.perplexity(c_val)}')

Model 1 	Probability: None 	logProb: None	Perplexity: None
Model 2 	Probability: None 	logProb: None	Perplexity: None
Model 3 	Probability: None 	logProb: None	Perplexity: None
Model 4 	Probability: None 	logProb: None	Perplexity: None
Model 5 	Probability: None 	logProb: None	Perplexity: None
Model 6 	Probability: None 	logProb: None	Perplexity: None


#### Comment

## Text Generation

### 1. Tweet Functionality

In [130]:
'''
tweetLength = 50
Interpolated(models = [uni, bi, tri], lambdas = params).tweet(tweetLength)
'''

'\ntweetLength = 50\nInterpolated(models = [uni, bi, tri], lambdas = params).tweet(tweetLength)\n'

#### Comment

### 2. AMLO model

#### Comment

### 3. Evaluation with custom phrases

#### Comment

### 4. More evaluation 

#### Comment

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment