In [163]:
import nltk 
import numpy as np

# Load Data

In [164]:
def load_data(filename, labels_filename, encoding=None):
    """Read a corpus file and its labels file, one entry per line.

    Args:
        filename: path to the text file holding one document per line.
        labels_filename: path to the text file holding one label per line.
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default, which is what the function used before.

    Returns:
        A ``(documents, labels)`` tuple of lists of strings, produced by
        splitting each file's content on ``'\\n'``. A file that ends with a
        newline therefore yields a trailing empty string in its list.
    """
    # Context managers guarantee both handles are closed, even on error.
    with open(filename, 'r', encoding=encoding) as file:
        tweets = file.read()
    with open(labels_filename, 'r', encoding=encoding) as labels_file:
        labels = labels_file.read()

    documents = tweets.split('\n')
    labels = labels.split('\n')
    return documents, labels

In [165]:
from nltk import TweetTokenizer
from nltk.corpus import stopwords

def process_documents(documents):
    """Lower-case every document and split it into tokens.

    Args:
        documents: iterable of raw document strings.

    Returns:
        A list with one token list per document, as produced by a single
        shared ``TweetTokenizer`` instance.
    """
    tweet_tokenizer = TweetTokenizer()
    return [tweet_tokenizer.tokenize(text.lower()) for text in documents]

def remove_stop_words(documents):
    """Drop Spanish stop words from already tokenized documents.

    Args:
        documents: iterable of token lists.

    Returns:
        A new list of token lists containing only the tokens that are not
        in NLTK's Spanish stop word list. Token order is preserved.
    """
    # dictionary lookup table of the stop words
    spanish_stopwords = dict.fromkeys(stopwords.words('spanish'), 1)
    return [
        [token for token in tokens if spanish_stopwords.get(token) is None]
        for tokens in documents
    ]

In [166]:
# Load the training and validation tweets together with their label files.
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')

# Drop the last entry of every list. load_data splits on '\n', so a file that
# ends with a newline leaves an empty string at the end.
# NOTE(review): presumably all four files end with a newline -- confirm,
# otherwise this discards a real document/label.
documents.pop(-1)
val_documents.pop(-1)
labels.pop(-1)
val_labels.pop(-1)

# Tokenize both splits; all_documents is train followed by validation.
documents = process_documents(documents)
val_documents = process_documents(val_documents)
all_documents = documents + val_documents

# Preprocess Unigrams and Bigrams

In [428]:
from nltk.probability import FreqDist

# convert documents into bigram documents
def build_bigram_documents(documents):
    """Turn every token list into the list of its adjacent token pairs.

    Each pair is encoded as a single string ``"first second"``. A document
    with fewer than two tokens produces an empty list.
    """
    bigram_documents = []
    for doc in documents:
        pairs = []
        for position in range(len(doc) - 1):
            pairs.append(doc[position] + ' ' + doc[position + 1])
        bigram_documents.append(pairs)
    return bigram_documents

def add_padding(documents, k, end_padding=True):
    """Return copies of the documents wrapped in sentence markers.

    Args:
        documents: iterable of token lists (left untouched).
        k: number of ``'<s>'`` tokens to prepend to each document.
        end_padding: when true, a single ``'</s>'`` is appended as well.

    Returns:
        A new list of new token lists.
    """
    tail = ['</s>'] if end_padding else []
    return [['<s>'] * k + doc + tail for doc in documents]

def mask_documents(documents, vocabulary):
    """Replace every out-of-vocabulary token with ``'<unk>'``.

    Args:
        documents: iterable of token lists.
        vocabulary: dict whose keys are the known tokens.

    Returns:
        A new list of token lists of the same shape as the input.
    """
    def _mask(token):
        # a token is known when the vocabulary maps it to something
        return token if vocabulary.get(token) is not None else '<unk>'

    return [[_mask(token) for token in doc] for doc in documents]

def get_vocabulary(documents, start='', end='', n=-1):
    """Build a token -> index dictionary ordered by decreasing frequency.

    Args:
        documents: iterable of token lists.
        start: start-of-sequence token; when not empty it gets index 0.
        end: end-of-sequence token; when not empty it gets the last index.
        n: number of most frequent tokens to consider, ``-1`` for all.

    Returns:
        A dict laid out as: optional ``start``, the frequent tokens in
        frequency order, ``'<unk>'``, optional ``end``. Tokens equal to
        ``start``, ``end`` or ``'<unk>'`` are skipped among the frequent
        tokens because they have reserved positions.
    """
    frequencies = FreqDist([word for doc in documents for word in doc])
    ranked = frequencies.most_common() if n == -1 else frequencies.most_common(n)

    vocabulary = {start: 0} if start != '' else {}

    # keep the frequency order, minus the tokens with reserved positions
    reserved = (start, end, '<unk>')
    kept = [word for word, _ in ranked if word not in reserved]
    offset = len(vocabulary)
    vocabulary.update((word, offset + rank) for rank, word in enumerate(kept))

    # unknown token goes right after the regular tokens
    vocabulary['<unk>'] = len(vocabulary)

    # end token, when requested, takes the last position
    if end != '':
        vocabulary[end] = len(vocabulary)
    return vocabulary

def trim_vocabulary(side, vocabulary):
    """Return a copy of the vocabulary without its first and/or last entry.

    Args:
        side: ``'top'`` drops the first entry and shifts every remaining
            index down by one; ``'bottom'`` drops the last entry and keeps
            the indices; any other value drops both ends and shifts down
            by one.
        vocabulary: token -> index dict, in index order.

    Returns:
        A new dict; the input is not modified.
    """
    entries = list(vocabulary.items())
    if side == 'bottom':
        return dict(entries[:-1])

    kept = entries[1:] if side == 'top' else entries[1:-1]
    return {key: value - 1 for key, value in kept}

def prepair_unigram(documents, n_voc):
    """Prepare the vocabulary and documents used by the unigram/bigram models.

    Returns:
        ``(vocabulary, docs)`` where ``vocabulary`` includes ``'<s>'``,
        ``'<unk>'`` and ``'</s>'``, and ``docs`` are the documents padded
        with one ``'<s>'`` and one ``'</s>'`` and with out-of-vocabulary
        tokens replaced by ``'<unk>'``.
    """
    vocabulary = get_vocabulary(documents, start='<s>', end='</s>', n=n_voc)
    padded_docs = add_padding(documents, 1)
    return vocabulary, mask_documents(padded_docs, vocabulary)

def prepair_bigram(documents, n_voc):
    """Prepare the vocabularies and documents used by the trigram model.

    Returns:
        ``(vocabulary, bi_vocabulary, docs)``:
        ``vocabulary`` is the unigram vocabulary (with ``'<unk>'`` and
        ``'</s>'``, without a start token); ``bi_vocabulary`` indexes the
        bigram contexts, starting with ``'<s> <s>'``; ``docs`` are the
        masked documents padded with two ``'<s>'`` and one ``'</s>'``.

    Note that the bigram vocabulary is built from the unmasked documents,
    while ``docs`` are masked with the unigram vocabulary.
    """
    # unigram vocabulary and masked, padded documents
    vocabulary = get_vocabulary(documents, end='</s>', n=n_voc)
    masked = mask_documents(documents, vocabulary)
    docs = add_padding(add_padding(masked, 1), 1, end_padding=False)

    # bigram (context) vocabulary
    raw_bigrams = build_bigram_documents(add_padding(documents, 2, end_padding=False))
    bi_vocabulary = get_vocabulary(raw_bigrams, start='<s> <s>', n=n_voc)

    return vocabulary, bi_vocabulary, docs

# Ejercicios

In [168]:
def build_unigram(documents, vocabulary):
    """Count how often each vocabulary token occurs in the documents.

    The first token of every document is skipped (the ``'<s>'`` marker of
    padded documents). Every other token must be a key of ``vocabulary``.

    Returns:
        A float array of length ``len(vocabulary)`` indexed by token index.
    """
    counts = np.zeros(len(vocabulary))
    positions = [vocabulary[token] for doc in documents for token in doc[1:]]
    for position in positions:
        counts[position] += 1

    return counts

In [169]:
def build_bigram(documents, r_voc, c_voc):
    """Count (context, word) pairs of adjacent tokens.

    Args:
        documents: padded token lists.
        r_voc: context token -> row index.
        c_voc: predicted token -> column index.

    Returns:
        A ``(len(r_voc), len(c_voc))`` float array of pair counts. The
        first token of a document only ever acts as context.
    """
    counts = np.zeros((len(r_voc), len(c_voc)))
    for doc in documents:
        for context, word in zip(doc, doc[1:]):
            counts[r_voc[context], c_voc[word]] += 1

    return counts

In [170]:
def build_trigram(documents, vocabulary, bi_vocabulary):
    """Count (two-token context, word) occurrences.

    Args:
        documents: token lists padded with two leading ``'<s>'``.
        vocabulary: predicted token -> column index.
        bi_vocabulary: ``"first second"`` context -> row index; contexts
            missing from it are counted on its ``'<unk>'`` row.

    Returns:
        A ``(len(bi_vocabulary), len(vocabulary))`` float array.
    """
    counts = np.zeros((len(bi_vocabulary), len(vocabulary)))
    for doc in documents:
        # the first two tokens of a document only ever act as context
        for first, second, word in zip(doc, doc[1:], doc[2:]):
            context = first + ' ' + second
            if bi_vocabulary.get(context) is None:
                context = '<unk>'
            counts[bi_vocabulary[context], vocabulary[word]] += 1

    return counts

In [171]:
def sample(probs):
    """Draw one index at random according to a probability vector.

    Args:
        probs: non-negative weights, expected to sum to 1.

    Returns:
        The sampled index as an ``int``.

    Raises:
        ValueError: if ``probs`` is empty.
    """
    acc = np.cumsum(probs)       # build cumulative probability
    if acc.size == 0:
        raise ValueError('cannot sample from an empty probability vector')

    val = np.random.uniform()    # get random number in [0, 1)
    # first position whose cumulative probability is strictly above val
    pos = int(np.searchsorted(acc, val, side='right'))
    # When rounding (or an under-normalized vector) leaves the last cumulative
    # value at or below val there is no such position; use the last index
    # instead of silently falling back to index 0.
    return min(pos, acc.size - 1)

In [378]:
class UnigramModel:
    """Unigram language model estimated from token counts.

    After ``train`` the instance holds:
      voc        -- token -> index (the '<s>' entry is removed)
      voc_words  -- tokens ordered by index, used to decode sampled indices
      counts     -- number of occurrences of every vocabulary token
      probs      -- ``counts`` normalized to sum to 1
    """

    def train(self, documents, voc_size=10000):
        """Build the vocabulary, count the tokens and derive probabilities.

        Args:
            documents: tokenized training documents.
            voc_size: number of most frequent tokens kept in the vocabulary.
        """
        voc, unidocs = prepair_unigram(documents, voc_size)
        # drop the first entry ('<s>') and shift the other indices down by one
        self.voc = trim_vocabulary('top', voc)

        # get vocabulary as a list (needed when sampling)
        self.voc_words = list(self.voc.keys())
        self.counts = build_unigram(unidocs, self.voc)
        self.get_probs()

    def get_probs(self):
        """Normalize ``self.counts`` into ``self.probs``."""
        self.probs = self.counts / np.sum(self.counts)

    def predict(self):
        """Sample one token; returns ``(token, probability)``."""
        c_index = sample(self.probs)
        return self.voc_words[c_index], self.probs[c_index]

    def estimate_prob(self, sequence):
        """Return the probability of a token sequence under the model.

        Args:
            sequence: list of tokens. A bare string is treated as a single
                token instead of being iterated character by character.

        Returns:
            The product of the token probabilities; out-of-vocabulary tokens
            use the ``'<unk>'`` probability. An empty sequence prints an
            error message and returns 1.
        """
        if isinstance(sequence, str):
            sequence = [sequence]

        if len(sequence) < 1:
            print('[ERR]: Not Enough Tokens for Unigram Model')
            return 1

        # accumulate in log space to avoid underflow on long sequences
        total_logprob = 0
        for word in sequence:
            token = '<unk>' if self.voc.get(word) is None else word
            prob = self.probs[self.voc[token]]
            total_logprob += np.log(prob)

        return np.exp(total_logprob)

    def generate_sequence(self):
        """Sample tokens until ``'</s>'`` is drawn.

        Returns:
            The generated tokens, starting with ``'<s>'`` and ending with
            ``'</s>'``.
        """
        sequence = ['<s>']
        # must differ from '</s>' so that the loop body runs at least once
        word = '<s>'
        while word != '</s>':
            word, _ = self.predict()
            sequence.append(word)

        return sequence

    def eval_model(self, documents):
        """Pad the tokenized documents and return their perplexity."""
        test_docs = add_padding(documents, k=1)
        return self.perplexity(test_docs)

    def perplexity(self, test_set):
        """Perplexity of padded documents.

        The leading ``'<s>'`` of every document is not scored. Raises
        ``ZeroDivisionError`` when the test set has no token to score.
        """
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 1
            for i in range(1, len(test)):
                prob = self.estimate_prob([test[i]])
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp

In [435]:
class NGramModel:
    """Common base for the context-conditioned n-gram models.

    Subclasses provide ``train``, ``estimate_prob``, ``generate_sequence``,
    ``eval_model`` and ``perplexity`` and are expected to set:
      r_voc      -- context -> row index of ``probs`` (must contain '<unk>')
      voc_words  -- predicted tokens ordered by column index
      counts     -- context x token count matrix
      probs      -- row-normalized ``counts``
    """

    def train(self):
        raise NotImplementedError('Subclass should implement own train')

    def estimate_prob(self):
        raise NotImplementedError('Subclass should implement own prob function')

    def generate_sequence(self):
        raise NotImplementedError('Subclass should implement own generate function')

    def eval_model(self, documents):
        raise NotImplementedError('Subclass should implement own eval function')

    def perplexity(self, test_set):
        raise NotImplementedError('Subclass should implement own perplexity function')

    def smooth(self, k):
        """Add ``k`` to every count (add-k smoothing)."""
        self.counts = self.counts + k

    def _row_index(self, context):
        """Row of ``probs`` for a context; unknown contexts use '<unk>'."""
        row = self.r_voc.get(context)
        if row is None:
            row = self.r_voc['<unk>']
        return row

    def predict(self, context):
        """Sample the next token given a context.

        Returns:
            ``(token, probability of that token given the context)``.
        """
        row = self._row_index(context)
        col = sample(self.probs[row])
        return self.voc_words[col], self.probs[row, col]

    def conditioned_space(self, context):
        """Return the distribution over next tokens for a context."""
        return self.probs[self._row_index(context)]

In [436]:
class BigramModel(NGramModel):
    """Bigram language model with add-k smoothing.

    After ``train`` the instance holds:
      r_voc      -- context token -> row index (vocabulary without '</s>')
      c_voc      -- predicted token -> column index (vocabulary without '<s>')
      voc_words  -- predicted tokens ordered by column index
      counts     -- smoothed context x token counts
      probs      -- row-normalized ``counts``
    """

    def train(self, documents, k=1, voc_size=10000):
        """Estimate the model from tokenized documents.

        Args:
            documents: tokenized training documents.
            k: constant added to every count before normalizing.
            voc_size: number of most frequent tokens kept in the vocabulary.
        """
        voc, docs = prepair_unigram(documents, voc_size)
        # contexts never include the last entry ('</s>'); predictions never
        # include the first entry ('<s>')
        self.r_voc = trim_vocabulary('bottom', voc)
        self.c_voc = trim_vocabulary('top', voc)

        # get vocabulary as a list (needed when sampling)
        self.voc_words = list(self.c_voc.keys())
        self.counts = build_bigram(docs, self.r_voc, self.c_voc)
        self.smooth(k)
        self.get_probs()

    def get_probs(self):
        """Normalize every row of ``self.counts`` into ``self.probs``.

        Rows whose total is zero (possible when ``k == 0``) get a uniform
        distribution instead of the NaN produced by a 0/0 division.
        """
        row_totals = np.sum(self.counts, axis=1, keepdims=True)
        n_columns = max(self.counts.shape[1], 1)
        # pre-filled with the uniform fallback; np.divide only overwrites
        # the rows selected by ``where``
        probs = np.full(self.counts.shape, 1.0 / n_columns)
        np.divide(self.counts, row_totals, out=probs, where=row_totals > 0)
        self.probs = probs

    def cond_prob(self, word1, word):
        """Return P(word | word1); unknown tokens fall back to '<unk>'."""
        cond_space = self.conditioned_space(word1)
        token = '<unk>' if self.c_voc.get(word) is None else word
        return cond_space[self.c_voc[token]]

    def estimate_prob(self, sequence):
        """Probability of ``sequence[1:]`` given that it starts at ``sequence[0]``.

        Sequences with fewer than two tokens print an error message and
        return 1.
        """
        if len(sequence) < 2:
            print('[ERR]: Not Enough Tokens for Bigram Model')
            return 1

        word1 = sequence[0]
        # accumulate in log space to avoid underflow on long sequences
        total_logprob = 0
        for word in sequence[1:]:
            prob = self.cond_prob(word1, word)
            total_logprob += np.log(prob)
            word1 = word

        return np.exp(total_logprob)

    def generate_sequence(self):
        """Sample tokens, each conditioned on the previous one, until '</s>'."""
        sequence = ['<s>']
        word1 = '<s>'
        word = word1
        while word != '</s>':
            word, _ = self.predict(word1)
            word1 = word
            sequence.append(word)

        return sequence

    def eval_model(self, documents):
        """Pad the tokenized documents and return their perplexity."""
        test_docs = add_padding(documents, k=1)
        return self.perplexity(test_docs)

    def perplexity(self, test_set):
        """Perplexity of padded documents.

        Raises ``ZeroDivisionError`` when the test set has no bigram to
        score.
        """
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 1 if len(test) > 1 else 0
            for i in range(1, len(test)):
                c1, w = test[i-1], test[i]
                prob = self.estimate_prob([c1, w])
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp

In [437]:
class TrigramModel(NGramModel):
    """Trigram language model with add-k smoothing.

    After ``train`` the instance holds:
      r_voc      -- "first second" context -> row index
      c_voc      -- predicted token -> column index
      voc_words  -- predicted tokens ordered by column index
      counts     -- smoothed context x token counts
      probs      -- row-normalized ``counts``
    """

    def __init__(self):
        # zero-argument form; super(NGramModel) built an unbound super object
        # and never reached the parent initializer
        super().__init__()

    def train(self, documents, k=1, voc_size=10000):
        """Estimate the model from tokenized documents.

        Args:
            documents: tokenized training documents.
            k: constant added to every count before normalizing.
            voc_size: size limit used for both the token and the context
                vocabularies.
        """
        self.c_voc, self.r_voc, docs = prepair_bigram(documents, voc_size)
        # get vocabulary as a list (needed when sampling)
        self.voc_words = list(self.c_voc.keys())
        self.counts = build_trigram(docs, self.c_voc, self.r_voc)
        self.smooth(k)
        self.get_probs()

    def get_probs(self):
        """Normalize every row of ``self.counts`` into ``self.probs``.

        Rows whose total is zero (possible when ``k == 0``) get a uniform
        distribution instead of the NaN produced by a 0/0 division.
        """
        row_totals = np.sum(self.counts, axis=1, keepdims=True)
        n_columns = max(self.counts.shape[1], 1)
        # pre-filled with the uniform fallback; np.divide only overwrites
        # the rows selected by ``where``
        probs = np.full(self.counts.shape, 1.0 / n_columns)
        np.divide(self.counts, row_totals, out=probs, where=row_totals > 0)
        self.probs = probs

    def cond_prob(self, word1, word2, word):
        """Return P(word | word1 word2); unknown tokens fall back to '<unk>'."""
        cond_space = self.conditioned_space(word1 + ' ' + word2)
        token = '<unk>' if self.c_voc.get(word) is None else word
        return cond_space[self.c_voc[token]]

    def estimate_prob(self, sequence):
        """Probability of ``sequence[2:]`` given its first two tokens.

        Sequences with fewer than three tokens print an error message and
        return 1.
        """
        if len(sequence) < 3:
            print('[ERR]: Not Enough Tokens for Trigram Model')
            return 1

        word1 = sequence[0]
        word2 = sequence[1]
        # accumulate in log space to avoid underflow on long sequences
        total_logprob = 0
        for word in sequence[2:]:
            prob = self.cond_prob(word1, word2, word)
            total_logprob += np.log(prob)
            word1, word2 = word2, word

        return np.exp(total_logprob)

    def generate_sequence(self):
        """Sample tokens, each conditioned on the previous two, until '</s>'."""
        sequence = ['<s>']
        word1 = '<s>'
        word2 = '<s>'
        word = word2
        while word != '</s>':
            word, _ = self.predict(word1 + ' ' + word2)
            word1, word2 = word2, word
            sequence.append(word)

        return sequence

    def eval_model(self, documents):
        """Pad the tokenized documents and return their perplexity."""
        test_docs = add_padding(documents, k=2)
        return self.perplexity(test_docs)

    def perplexity(self, test_set):
        """Perplexity of padded documents.

        Raises ``ZeroDivisionError`` when the test set has no trigram to
        score.
        """
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 2 if len(test) > 2 else 0
            for i in range(2, len(test)):
                c1, c2, w = test[i-2], test[i-1], test[i]
                prob = self.estimate_prob([c1, c2, w])
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp

In [438]:
# Fit a trigram model on the training documents with add-k smoothing, k=0.05.
trigram = TrigramModel()
trigram.train(documents, k=0.05)

In [439]:
trigram.eval_model(val_documents)

591.9993423789825

In [440]:
trigram.estimate_prob(['<s>', '<s>','hijos', 'de', 'la', 'verga', '</s>'])

1.541482916387375e-09

In [441]:
bigram = BigramModel()
bigram.train(documents, k=0.05)

In [442]:
bigram.estimate_prob(['hijos', 'de', 'la', 'verga'])

0.001646601315699137

In [443]:
bigram.eval_model(val_documents)

439.1253897138523

In [446]:
trigram.predict('hola como')

('el', 0.015149952404588923)

In [373]:
seq = trigram.generate_sequence()

In [None]:
seq

In [430]:
bigram.probs.shape

(10002, 10002)

# Lambdas Fijos

In [447]:
# Candidate interpolation weights. InterpolatedModel unpacks each triple as
# (l1, l2, l3) and applies them to the unigram, bigram and trigram terms.
lambdas_ = [[1/3, 1/3, 1/3],[.4, .4, .2],[.2, .4, .4],[.5, .4, .1],[.1, .4, .5]]

In [448]:
class InterpolatedModel:
    """Linear interpolation of unigram, bigram and trigram models.

    The probability of a token is
    ``l1 * P_unigram + l2 * P_bigram + l3 * P_trigram``.
    """

    def __init__(self, lambda_):
        """Store the weights ``(l1, l2, l3)`` and create the three sub-models."""
        self.l1, self.l2, self.l3 = lambda_
        self.unigram = UnigramModel()
        self.bigram = BigramModel()
        self.trigram = TrigramModel()

    def train(self, documents, k=0, voc_size=10000):
        """Train the three sub-models on the same documents.

        Args:
            documents: tokenized training documents.
            k: add-k smoothing constant for the bigram and trigram models;
                the default 0 means no smoothing.
            voc_size: vocabulary size limit passed to every sub-model.
        """
        self.unigram.train(documents, voc_size)
        self.bigram.train(documents, k, voc_size)
        self.trigram.train(documents, k, voc_size)

    def predict(self, sequence):
        """Sample the next token given the two previous tokens.

        Args:
            sequence: ``[second_to_last, last]`` context tokens.

        Returns:
            ``(token, interpolated probability of that token)``.
        """
        # build contexts
        bicontext = sequence[1]
        tricontext = sequence[0] + ' ' + sequence[1]

        # get conditioned spaces
        unispace = self.unigram.probs
        bispace = self.bigram.conditioned_space(bicontext)
        trispace = self.trigram.conditioned_space(tricontext)

        # sample from probability space
        probs = self.l1 * unispace + self.l2 * bispace + self.l3 * trispace
        c_index = sample(probs)

        return self.unigram.voc_words[c_index], probs[c_index]

    def estimate_prob(self, sequence):
        """Probability of ``sequence[2:]`` given its first two tokens.

        Sequences with fewer than three tokens print an error message and
        return 1.
        """
        if len(sequence) < 3:
            print('[ERR]: Not Enough Tokens for Interpolated Model')
            return 1

        word1 = sequence[0]
        word2 = sequence[1]
        # accumulate in log space to avoid underflow on long sequences
        total_logprob = 0
        for word in sequence[2:]:
            # the unigram model expects a list of tokens; a bare string would
            # be scored character by character
            uniprob = self.unigram.estimate_prob([word])
            biprob  = self.bigram.estimate_prob([word2, word])
            triprob = self.trigram.estimate_prob([word1, word2, word])
            prob = self.l1 * uniprob + self.l2 * biprob + self.l3 * triprob
            total_logprob += np.log(prob)
            word1, word2 = word2, word

        return np.exp(total_logprob)

    def generate_sequence(self):
        """Sample tokens, each conditioned on the previous two, until '</s>'."""
        sequence = ['<s>']
        word1 = '<s>'
        word2 = '<s>'
        word = word2
        while word != '</s>':
            word, _ = self.predict([word1, word2])
            word1, word2 = word2, word
            sequence.append(word)

        return sequence

    def eval_model(self, documents):
        """Pad the tokenized documents and return their perplexity."""
        test_docs = add_padding(documents, k=2)
        return self.perplexity(test_docs)

    def perplexity(self, test_set):
        """Perplexity of padded documents.

        Raises ``ZeroDivisionError`` when the test set has no trigram to
        score.
        """
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 2 if len(test) > 2 else 0
            for i in range(2, len(test)):
                c1, c2, w = test[i-2], test[i-1], test[i]
                prob = self.estimate_prob([c1, c2, w])
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp

In [449]:
i_model = InterpolatedModel(lambdas_[-1])

In [450]:
i_model.train(documents)

In [451]:
i_model.generate_sequence()

['<s>',
 '40',
 'minutos',
 'en',
 'el',
 'que',
 'te',
 'indigne',
 '<unk>',
 'por',
 'decir',
 '“',
 'el',
 'nombre',
 'del',
 'ganador',
 'sus',
 '.',
 'en',
 'tu',
 'foto',
 'del',
 '.',
 'que',
 'he',
 'tomado',
 'presentación',
 'lo',
 'jefes',
 'que',
 'es',
 'por',
 'respeto',
 '.',
 'aunque',
 'le',
 '<unk>',
 '.',
 '</s>']

In [409]:
i_model.trigram.probs.shape

(10001, 10002)

# Actualización de Probabilidades 

Es de interés tomar cierta medida para asegurar que la probabilidad del token de fin de secuencia '\</s\>' vaya aumentando conforme la secuencia se va haciendo más larga. Para ello utilizaremos la siguiente regla de actualización: 

Sea $p_s$ la probabilidad de obtener el token de fin de secuencia. Entonces, como $0 < p_s \leq 1$, sabemos que $p_s^r \geq p_s$ para todo $0 < r \leq 1$. De hecho, sabemos también que 

$$\lim_{n\rightarrow \infty} \sqrt[n]{p_s} = 1$$ 

Entonces, podemos tomar la regla de actualización $$\hat{p}_s = \sqrt[n]{p_s}$$

Debido a que esta probabilidad aumentó, para asegurarnos que el espacio de probabilidad se encuentra bien definido, debemos disminuir esta probabilidad de los otros tokens para asegurarnos que la suma de las probabilidades siga siendo 1. Definamos el aumento de la probabilidad que tenemos respecto al token de fin de secuencia como 

$$a_p = \hat{p}_s - p_s$$

Entonces, sea $p_i$ la probabilidad de obtener el token $t_i$ en donde $t_i \neq $ '\</s\>'. Definamos a $\sigma$ como 

$$\sigma = \sum_{i=1}^{|V|} p_i$$

en donde $|V|$ representa la cardinalidad del conjunto del vocabulario sin considerar al token de fin de secuencia. Notemos que $\sigma = 1 - p_s$. Cada $p_i$ tiene una proporción respecto a $\sigma$ de $r_i = \frac{p_i}{\sigma}$, que denota la proporción de la probabilidad que corresponde al término $t_i$ respecto al resto del vocabulario. Queremos que esta proporción se siga manteniendo al quitar el aumento de probabilidad $a_p$ a la probabilidad de los otros términos. Entonces, utilizando la siguiente regla de actualización

$$\hat{p}_i = p_i - r_i a_p$$

y definiendo a $$\hat{\sigma} = \sum_{i=1}^{|V|} \hat{p}_i$$

podemos ver que se cumple $$\hat{r}_i = \frac{\hat{p}_i}{\hat{\sigma}} = r_i$$

In [51]:
# receives a probs matrix and the power r.
def diminish(probs, r):
    """Raise the probability of the last token and rescale the others.

    For every row, the probability of the last column is replaced by its
    ``r``-th power and the increase is subtracted from the remaining
    columns in proportion to their share of the remaining mass, so the row
    sum is unchanged.

    Args:
        probs: 2-D array-like, one probability distribution per row, the
            last column being the token whose probability is raised.
        r: exponent applied to the last column.

    Returns:
        A new array with the same shape as ``probs``. Its dtype is at least
        float64 and keeps any wider floating dtype of the input.
    """
    probs = np.asarray(probs)
    stop_prob = probs[:, -1]
    others = probs[:, :-1]

    # calculate new probability of the last token and how much it gained
    new_stop_prob = np.power(stop_prob, r)
    improve = new_stop_prob - stop_prob

    # share of every other token within the remaining mass; a row whose
    # remaining mass is zero has nothing to take from, so its shares stay 0
    # instead of becoming 0/0 = NaN
    c = np.sum(others, axis=1)
    safe_c = np.where(c > 0, c, 1)
    rat = others / safe_c[:, np.newaxis]

    # update new probability
    new_probs = np.zeros(probs.shape, dtype=np.result_type(probs.dtype, np.float64))
    new_probs[:, -1] = new_stop_prob
    new_probs[:, :-1] = others - rat * improve[:, np.newaxis]

    return new_probs

In [63]:
# Small worked example: two rows whose last column is the end-of-sequence token.
probs = [[.4, .3, .3],[.3, .5, .2]]
# np.longdouble is the portable name of the extended-precision float;
# np.float128 is not defined on every platform.
probs = np.array(probs, dtype=np.longdouble)

new_probs = diminish(probs, 0.25)
print(new_probs)
# every row must still sum to 1
np.sum(new_probs,axis=1)

[0.7400828 0.6687403]
[0.4400828 0.4687403]
[1. 1.]
[[0.14852411 0.11139308 0.7400828 ]
 [0.12422239 0.20703731 0.6687403 ]]


array([1., 1.])

In [61]:
.4/(.4 + .3)

0.5714285714285715

In [62]:
.14852411/(0.14852411 + 0.11139308)

0.5714285769248274

# PRUEBAS

In [367]:
trigram_m = TrigramModel()

In [359]:
# NOTE(review): TrigramModel.train as defined in this notebook stores the
# context vocabulary in `r_voc`; no `bi_voc` attribute is set anywhere visible.
# This cell presumably comes from an earlier version of the class -- confirm.
trigram_m.bi_voc['<s> hola']

1887

In [304]:
np.sum(trigram_m.probs[-2])

1.0

In [369]:
trigram_m.probs.shape, len(trigram_m.bi_voc), len(trigram_m.voc)

((10001, 10002), 10001, 10002)

In [370]:
np.sum(trigram_m.probs)

10000.999999999993

In [364]:
list(trigram_m.bi_voc.items())[891]

('alv .', 891)

In [None]:
# Report the position of every NaN entry of the trigram probability matrix.
# np.argwhere lists the (row, column) pairs in row-major order, the same order
# as a nested loop over rows and columns, without visiting every cell in Python.
for i, j in np.argwhere(np.isnan(trigram_m.probs)):
    print('nan at', i, ' ', j)

In [368]:
trigram_m.train(documents)

In [227]:
# Scratch cell: rebuild the vocabularies and padded documents by hand.
vocabulary, unidocs = prepair_unigram(documents, 10000)
# NOTE(review): prepair_bigram as defined in this notebook returns three values
# (vocabulary, bi_vocabulary, docs), so this two-name unpacking would raise
# ValueError on a fresh run -- presumably written against an older version.
bi_vocabulary, bidocs = prepair_bigram(documents, 10000) 

In [204]:
unigram = build_unigram(unidocs, vocabulary)
unigram_prob = unigram/np.sum(unigram[1:])

In [205]:
# NOTE(review): build_bigram as defined in this notebook takes
# (documents, r_voc, c_voc); this two-argument call would raise TypeError on a
# fresh run. It also rebinds `bigram`, which an earlier cell uses for a
# BigramModel instance.
bigram = build_bigram(unidocs, vocabulary)

In [206]:
bigram_prob = bigram[:-1]/unigram[:-1, np.newaxis]

In [207]:
np.sum(bigram_prob)

10000.000000000004

In [228]:
bi_padded_docs = add_padding(unidocs, k=1)

In [229]:
trigram = build_trigram(bi_padded_docs, vocabulary, bi_vocabulary)

In [230]:
unigram_of_bigrams = build_unigram(bidocs, bi_vocabulary)

In [231]:
trigram_prob = trigram[:-1]/unigram_of_bigrams[:-1, np.newaxis]

In [241]:
np.sum(trigram, axis=1) == unigram_of_bigrams 

array([ True,  True,  True, ...,  True,  True, False])

In [235]:
list(bi_vocabulary.keys())

['<s> <s>',
 '. </s>',
 '! !',
 '<s> @usuario',
 'la verga',
 'a la',
 '! </s>',
 'de la',
 '@usuario @usuario',
 'que no',
 'que me',
 '<s> no',
 '? </s>',
 'la madre',
 '<s> me',
 '<s> que',
 'los putos',
 'en la',
 '😂 😂',
 '… </s>',
 'puta madre',
 'en el',
 'que se',
 '<s> ya',
 'las putas',
 'su madre',
 'lo que',
 'a su',
 'verga .',
 'que te',
 'y no',
 'voy a',
 '<s> si',
 'no me',
 '<s> a',
 '😂 </s>',
 '? ?',
 'madre .',
 'no se',
 'a los',
 '@usuario </s>',
 'a mi',
 '<s> y',
 '<s> la',
 'vale verga',
 'para que',
 'todos los',
 '<s> ¿',
 'madre </s>',
 'tu madre',
 'de mi',
 'a tu',
 'hijos de',
 'a las',
 'y me',
 'de su',
 'de mierda',
 'hasta la',
 'de los',
 'no es',
 'es que',
 'verga </s>',
 'va a',
 'hijo de',
 'ya me',
 'de tu',
 'por qué',
 'que putas',
 '<s> mi',
 'que le',
 'me vale',
 'si no',
 '<s> el',
 'que es',
 'con el',
 'no te',
 'de verga',
 'sus putas',
 'ya no',
 '️ </s>',
 'y que',
 'con la',
 '... </s>',
 'de las',
 '<s> en',
 'la vida',
 'es un',
 'd

In [39]:
np.sum(bigram[0, :])

5544.0

In [58]:
padded_vocabulary['<unk>']

10000

In [68]:
unigram[-2]

5544.0

In [49]:
bigram_prob[padded_vocabulary['<s>'], padded_vocabulary['<unk>']]

0.02958152958152958