# Statistical Language Models

Tarea 3 - Luis Eduardo Robles Jimenez

## Language Model and Evaluation

In [11]:
import nltk
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, ParameterGrid

### 1. Preprocessing

In [12]:
# Special markers used throughout the notebook: sentence start, sentence end,
# and the placeholder that stands in for out-of-vocabulary words.
tokens = {
    'begin': '<s>',
    'end': '</s>',
    'unknown': '<unk>',
}

In [13]:
def _create_vocabulary(corpus, size): # Returns words sorted by frequency
    """Tokenize every document and rank the resulting words by frequency.

    Args:
        corpus: iterable of raw document strings.
        size: number of most frequent words to keep; ``-1`` keeps them all.

    Returns:
        A pair ``(vocabulary, tokenized_docs)``. ``vocabulary`` is a list of
        words ordered from most to least frequent (words with equal counts
        appear in descending lexicographic order). ``tokenized_docs`` holds
        one token list per input document, in input order.
    """
    tweet_tokenizer = TweetTokenizer()
    tokenized_docs = [tweet_tokenizer.tokenize(doc) for doc in corpus]

    # Count every token occurrence across the whole corpus.
    frequencies = nltk.FreqDist(
        word for doc_tokens in tokenized_docs for word in doc_tokens
    )

    # (count, word) pairs sort by count first, then by the word itself.
    ranked = sorted(
        ((occurrences, word) for word, occurrences in frequencies.items()),
        reverse=True,
    )
    if size != -1:
        ranked = ranked[:size]

    vocabulary = [word for _, word in ranked]
    return vocabulary, tokenized_docs

def load_corpus(corpus_select = "tweets", vocabSize = 100,
                path_corpus = "../../data/agresividad/mex_train.txt"):
    """Read the corpus file and return it as lists of tokens with markers.

    Each line of the file is treated as one document. Every document is
    tokenized, wrapped between the begin and end markers from ``tokens``,
    and each word whose lowercase form is not in the vocabulary is replaced
    by the unknown marker.

    Args:
        corpus_select: currently unused; kept so existing calls keep working.
        vocabSize: number of most frequent words kept in the vocabulary
            (``-1`` keeps every word).
        path_corpus: path of the text file to read, one document per line.

    Returns:
        A list with one token list per document.
    """
    # Assumes the file is UTF-8 encoded -- TODO confirm. Without an explicit
    # encoding the result depends on the platform default.
    with open(path_corpus, "r", encoding="utf-8") as f_corpus:
        # Strip only the line terminator: slicing off the last character
        # would eat a real character when the final line has no newline.
        raw_docs = [tuit.rstrip("\n") for tuit in f_corpus]

    vocab, tokenized = _create_vocabulary(raw_docs, vocabSize)
    vocab = set(vocab)  # constant-time membership tests in the loop below

    corpus = []
    for doc in tokenized:
        tweet = [tokens['begin']]
        # NOTE(review): the vocabulary is built from case-sensitive tokens
        # while the lookup uses the lowercase form and keeps the original
        # spelling. Looks unintended -- confirm which casing rule is wanted.
        tweet.extend(
            word if word.lower() in vocab else tokens['unknown']
            for word in doc
        )
        tweet.append(tokens['end'])
        corpus.append(tweet)
    return corpus

In [14]:
# Load the tweets with the vocabulary limited to the 100 most frequent tokens.
corpus = load_corpus(corpus_select="tweets", vocabSize=100)

#### Comment

### 2. Models Training

In [15]:
class LanguageModel:
    """Count-based n-gram model with add-one (Laplace) smoothing.

    Subclasses choose the n-gram order by setting ``self.gramLen`` before
    calling this constructor.

    Attributes:
        corpus: the corpus passed to the constructor (may be ``None``).
        nGrams: dict mapping an n-gram key (see ``toString``) to the number
            of times it was seen. The all-unknown n-gram is always present.
        nGramsCount: total number of n-gram occurrences counted.
    """

    # Default order, so the base class can also be instantiated directly.
    gramLen = 1

    # Placed between words when building a dictionary key. Plain
    # concatenation made different n-grams collide, e.g. ("a", "la") and
    # ("al", "a") both became "ala". Assumes tokens never contain this
    # control character (ASCII unit separator).
    _SEPARATOR = "\x1f"

    def __init__(self, corpus = None):
        """Count every n-gram of length ``self.gramLen`` in ``corpus``.

        Args:
            corpus: iterable of token lists; ``None`` is treated as empty.
        """
        self.corpus = corpus
        # The all-unknown n-gram is registered up front so that P() always
        # has an entry to fall back to.
        self.nGrams = {self._unknownGram(): 0}
        self.nGramsCount = 0
        for line in (corpus if corpus is not None else []):
            # Lines shorter than gramLen give an empty range: no n-grams.
            for start in range(len(line) - self.gramLen + 1):
                gram = self.toString(line[start: start + self.gramLen])
                self.nGrams[gram] = self.nGrams.get(gram, 0) + 1
                self.nGramsCount += 1

    def _unknownGram(self):
        """Return the key of the n-gram made only of unknown markers."""
        return self.toString([tokens['unknown']] * self.gramLen)

    def toString(self, gramList):
        """Build the dictionary key for a sequence of words."""
        return self._SEPARATOR.join(gramList)

    def P(self, *words):    # Laplace smoothing
        """Return the add-one smoothed relative frequency of an n-gram.

        The value is ``(count + 1) / (nGramsCount + len(nGrams))``, i.e. the
        frequency of the whole n-gram among all counted n-grams, not a
        conditional probability. An n-gram that was never seen uses the
        count of the all-unknown n-gram instead.

        Args:
            *words: exactly ``self.gramLen`` words.

        Raises:
            AssertionError: if the number of words differs from the order.
        """
        assert len(words) == self.gramLen, \
            f'Expected {self.gramLen} word(s) but got {len(words)}'
        gram = self.toString(words)
        if gram not in self.nGrams:
            gram = self._unknownGram()
        return (self.nGrams[gram] + 1) / (self.nGramsCount + len(self.nGrams))


class Unigram(LanguageModel):
    """Language model over single tokens (n-gram order 1)."""

    def __init__(self, corpus = None):
        # The order has to be set first: the base constructor reads
        # self.gramLen while it counts.
        self.gramLen = 1
        super(Unigram, self).__init__(corpus)

class Bigram(LanguageModel):
    """Language model over pairs of consecutive tokens (n-gram order 2)."""

    def __init__(self, corpus = None):
        # The order has to be set first: the base constructor reads
        # self.gramLen while it counts.
        self.gramLen = 2
        super(Bigram, self).__init__(corpus)

class Trigram(LanguageModel):
    """Language model over triples of consecutive tokens (n-gram order 3)."""

    def __init__(self, corpus = None):
        # The order has to be set first: the base constructor reads
        # self.gramLen while it counts.
        self.gramLen = 3
        super(Trigram, self).__init__(corpus)

In [16]:
uni = Unigram(corpus)

# First two lines: very frequent tokens.
# Last line: a word that doesn't exist in the corpus.
print(
    uni.P("que"),
    uni.P(tokens['begin']),
    uni.P("otorrinolaringologo"),
    sep="\n",
)

0.030060494079397367
0.04925692661650662
0.39978502838158353


In [17]:
bi = Bigram(corpus)

# First two lines: very frequent bigrams.
# Last line: a bigram that doesn't exist (begin marker directly followed by
# the end marker).
print(
    bi.P('.', tokens['end']),
    bi.P("es", "que"),
    bi.P(tokens['begin'], tokens['end']),
    sep="\n",
)

0.01240900962736846
0.0007315355020501057
0.13694163972328088


In [18]:
tri = Trigram(corpus)

# First two lines: very frequent trigrams.
# Last line: a trigram that doesn't exist in the corpus.
print(
    tri.P('!', '!', '!'),
    tri.P('es', 'que', 'no'),
    tri.P('Luis', 'Eduardo', 'Robles'),
    sep="\n",
)


0.0017849136727771057
7.613375855447371e-05
0.04706758139967685


#### Comment

### 3. Interpolated Model

In [19]:
# 80 % of the documents go to training; the remaining 20 % is then halved
# into test and validation. A fixed random_state makes both shuffles
# reproducible after a kernel restart.
# NOTE(review): no labels are passed via `stratify`, so this is a plain random
# split even though the message below calls the sets "stratified".
c_train, c_test = train_test_split(corpus, test_size = 0.2, train_size = 0.8, random_state = 0)
c_test, c_val = train_test_split(c_test, test_size = 0.5, train_size = 0.5, random_state = 0)
print(f'Lengths of stratified sets:\n\tTrain: {len(c_train)}\n\tTest: {len(c_test)}\n\tValidation: {len(c_val)}')

Lengths of stratified sets:
	Train: 4435
	Test: 554
	Validation: 555


#### Comment

## Text Generation

### 1. Tweet Functionality

#### Comment

### 2. AMLO model

#### Comment

### 3. Evaluation with custom phrases

#### Comment

### 4. More evaluation 

#### Comment

## El ahorcado

### 1. Norvig's Hangman

#### Comment

### 2. Follow-up

#### Comment