In [1]:
import nltk 
import numpy as np

# Load Data

In [2]:
def load_data(filename, labels_filename):
    file = open(filename, 'r')
    labels_file = open(labels_filename, 'r')
    tweets = file.read()
    labels = labels_file.read()
    documents = tweets.split('\n')
    labels = labels.split('\n')
    return documents, labels

In [3]:
from nltk import TweetTokenizer
from nltk.corpus import stopwords

def process_documents(documents):
    # tokenize each document
    documents_tokenized = []
    tokenizer = TweetTokenizer()
    for doc in documents:
        documents_tokenized.append(tokenizer.tokenize(doc.lower()))
    return documents_tokenized

def remove_stop_words(documents):
    # build dictionary of stopwords
    stopwords_dict = {word:1 for word in stopwords.words('spanish')}
    non_stop_documents = []
    for doc in documents:
        ndoc = []
        for word in doc:
            if stopwords_dict.get(word) == None:
                ndoc.append(word)
        non_stop_documents.append(ndoc)
    
    return non_stop_documents

In [121]:
from sklearn.model_selection import train_test_split

def get_partitions(documents, labels):
    n = len(documents)
    train_docs, test_docs, train_labels, test_labels = train_test_split(documents, labels, test_size=0.10, random_state=42)
    train_docs, val_docs, train_labels, val_labels = train_test_split(train_docs, train_labels, test_size=n//10, random_state=42)
    return train_docs, val_docs, test_docs, train_labels, val_labels, test_labels

In [122]:
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')

#remove extra lines
documents.pop(-1)
val_documents.pop(-1)
labels.pop(-1)
val_labels.pop(-1)

# process documents
documents = process_documents(documents)
val_documents = process_documents(val_documents)

# build partitions
all_documents = documents + val_documents
all_labels = labels + val_labels
train_corpus, val_corpus, test_corpus, _, _, _ = get_partitions(all_documents, all_labels) 

# Padding and Masking

Funciones para enmascarar el vocabulario y agregar padding a los documentos. Notemos que la función que agrega el padding puede agregar $k$ tokens de inicio de secuencia según sea necesario. 

In [6]:
def add_padding(documents, k, end_padding=True):
    padded_documents = []
    for doc in documents:
        doc =  ['<s>']*k + doc
        if end_padding:
            doc += ['</s>']
            
        padded_documents.append(doc)
    return padded_documents

def mask_documents(documents, vocabulary):
    masked_documents = []
    for doc in documents:
        masked_doc = []
        for word in doc:
            if vocabulary.get(word) is not None:
                masked_doc.append(word)
            else:
                masked_doc.append('<unk>')
                
        masked_documents.append(masked_doc)
    
    return masked_documents

# Vocabulary Functions

In [7]:
from nltk.probability import FreqDist

def get_vocabulary(documents, start='', end='', n=-1):
    # get unique words
    words = [word for doc in documents for word in doc]
    unique_words = FreqDist(words).most_common(n) if n!= -1 else FreqDist(words).most_common() 
    # init voc dict
    vocabulary = {start: 0} if start != '' else {}
    # fill vocabulary with positions
    pos_available = 1 if start != '' else 0
    for (word, _) in unique_words:
        # verify words is not start, end or unk token (special positions for those)
        if word not in (start, end, '<unk>'):
            vocabulary[word] = pos_available
            pos_available += 1
    # set unk token
    vocabulary['<unk>'] = len(vocabulary)
    # if padded was added, set end token
    if end != '':
        vocabulary[end] = len(vocabulary)
    return vocabulary

def trim_vocabulary(side, vocabulary):
    new_voc = {}
    if side == 'top':
        for (key, value) in list(vocabulary.items())[1:]:
            new_voc[key] = value-1
    elif side == 'bottom':
        for (key, value) in list(vocabulary.items())[:-1]:
            new_voc[key] = value
    else:
        for (key, value) in list(vocabulary.items())[1:-1]:
            new_voc[key] = value-1
    
    return new_voc

# Ejercicios

## Ejercicio 1. Preprocess Unigrams and Bigrams

En el siguiente bloque tenemos las funciones base que se llaman para todos los modelos presentados en este trabajo. En especial las funciones **prepair\_unigram** y **prepair\_bigram** se encargan de preparar los documentos llamando a las funciones necesarias para enmascarar los vocabularios y agregar padding según sea necesario. 

Para la construcción de los trigramas se utiliza también la función **build\_bigram\_documents**.

In [8]:
# convert documents into bigram documents
def build_bigram_documents(documents):
    bigram_documents = [[word1 + ' ' + word2 for word1, word2 in zip(doc, doc[1:])] for doc in documents]
    return bigram_documents

def prepair_unigram(documents, n_voc):
    vocabulary = get_vocabulary(documents, start='<s>', end='</s>', n=n_voc)
    docs = add_padding(documents, 1)
    docs = mask_documents(docs, vocabulary)
    return vocabulary, docs

def prepair_bigram(documents, n_voc):
    # get unigrams and mask documents
    vocabulary = get_vocabulary(documents, end='</s>', n=n_voc)
    docs = mask_documents(documents, vocabulary)
    docs = add_padding(docs, 1)
    docs = add_padding(docs, 1, end_padding=False)
    
    # get bigrams vocabulary
    bi_docs = add_padding(documents, 2, end_padding=False)
    bi_docs = build_bigram_documents(bi_docs)
    bi_vocabulary = get_vocabulary(bi_docs, start='<s> <s>', n=n_voc)
    
    # return vocabularies and documents padded
    return vocabulary, bi_vocabulary, docs

## Build N Grams Matrix

In [9]:
def build_unigram(documents, vocabulary):
    counts = np.zeros(len(vocabulary))
    for doc in documents:
        #skip <s> in padded documents
        for word in doc[1:]:                                                            
            counts[vocabulary[word]]+= 1
    return counts

In [10]:
def build_bigram(documents, r_voc, c_voc):
    n = len(r_voc)
    m = len(c_voc)
    counts = np.zeros((n, m))
    for doc in documents:
        #skip <s> in padded documents
        for i in range(1, len(doc)):                                                     
            context, word = doc[i-1], doc[i]
            counts[r_voc[context], c_voc[word]] += 1
    return counts

In [11]:
def build_trigram(documents, vocabulary, bi_vocabulary):
    m = len(vocabulary)
    n = len(bi_vocabulary)
    counts = np.zeros((n, m))
    for doc in documents:
        #skip <s>, <s> in padded couments
        for i in range(2, len(doc)):                                                       
            context, word = doc[i-2] + ' ' + doc[i-1], doc[i]
            context = context if bi_vocabulary.get(context) is not None else '<unk>'
            counts[bi_vocabulary[context], vocabulary[word]] += 1
    return counts

## Utilities for All Models

In [12]:
def sample(probs):
    acc = np.cumsum(probs)       # build cumulative probability
    val = np.random.uniform()    # get random number between [0, 1]
    pos = np.argmax((val < acc)) # get the index of the word to sample
    return pos

In [45]:
def bold_string(string):
    return '\033[1m' + string + '\033[0m '

def print_sequence(seq):
    for word in seq[1:-1]:
        print(word, end=' ')
    print('') #flush with new line

# Ejercicio 2. Unigramas, Bigramas, Trigramas

Todos los modelos serán construidos como clases para poder llamar a sus métodos pertinentes para poder realizar las acciones solicitadas. Para el modelo de bigrama y trigrama se utilizará la variante del smoothing Laplace en donde se agrega un valor $k$ a todas las cuentas en vez de agregar 1. Se experimentó con varios valores y en general se notó que escoger valores pequeños para $k$ reducían la perplejidad.

Los modelos serán evaluados en esta sección entrenandolos con el conjunto original de training y evaluando sus perplejidades con el conjunto original de validación. Es decir, las particiones creadas no serán utilizadas en esta sección. 

## Unigram Model

In [37]:
class UnigramModel:
    def train(self, documents, voc_size=10000):
        voc, unidocs = prepair_unigram(documents, voc_size)
        self.voc = trim_vocabulary('top', voc)
        
        # get vocabulary as a list (needed when sampling)
        self.voc_words = list(self.voc.keys())
        self.counts = build_unigram(unidocs, self.voc)
        self.get_probs()
    
    def get_probs(self):
        self.probs = self.counts / np.sum(self.counts)
    
    def predict(self):
        c_index = sample(self.probs)
        return self.voc_words[c_index], self.probs[c_index]
    
    def estimate_prob(self, sequence):
        if len(sequence) < 1:
            print('[ERR]: Not Enough Tokens for Unigram Model')
            return 1
        
        total_logprob = 0
        for word in sequence:
            token = '<unk>' if self.voc.get(word) is None else word
            prob = self.probs[self.voc[token]]
            total_logprob += np.log(prob)
            
        return np.exp(total_logprob)
            
    def generate_sequence(self):
        sequence = ['<s>']
        word = '<s>'
        while word != '</s>':
            word, _ = self.predict()
            sequence.append(word)
        
        return sequence
    
    def eval_model(self, documents):
        test_docs = add_padding(documents, k=1)
        return self.perplexity(test_docs)
    
    def perplexity(self, test_set):
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 1
            for i in range(1, len(test)):
                prob = self.estimate_prob([test[i]])
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp

### NGram Model Base Class

La siguiente clase es la clase base tanto para los bigramas como trigramas. Se utilizó una clase base ya que ambos modelos serán implementados a través de una matriz que representará las probabilidades condicionadas. En la dimensión de filas tendremos el contexto que condiciona al token actual, y en las columnas tendremos el token actual procesado.

In [38]:
class NGramModel:
    def train(self):
        raise NotImplementedError('Subclass should implement own train')
    
    def estimate_prob(self):
        raise NotImplementedError('Subclass should implement own prob function')
    
    def generate_sequence(self):
        raise NotImplementedError('Subclass should implement own generate function')
        
    def eval_model(self, documents):
        raise NotImplementedError('Subclass should implement own eval function')
        
    def perplexity(self, test_set):
        raise NotImplementedError('Subclass should implement own perplexity function')
    
    def smooth(self, k):
        self.counts = self.counts + k
    
    # perform a prediction of a token with a given context
    def predict(self, context):
        context = context if self.r_voc.get(context) is not None else '<unk>' 
        r_index = self.r_voc[context]
        c_index = sample(self.probs[r_index])
        return self.voc_words[c_index], self.probs[r_index, c_index]
    
    # function to retrieve all the conditioned space probability, i.e. all the columns of a certain context 
    def conditioned_space(self, context):
        context = context if self.r_voc.get(context) is not None else '<unk>'            # mask if necessary
        r_index = self.r_voc[context]
        return self.probs[r_index]

# Bigram Model

In [197]:
class BigramModel(NGramModel):
    def train(self, documents, k=1, voc_size=10000):
        voc, docs = prepair_unigram(documents, voc_size)
        self.r_voc = trim_vocabulary('bottom', voc)
        self.c_voc = trim_vocabulary('top', voc)
        
        # get vocabulary as a list (needed when sampling)
        self.voc_words = list(self.c_voc.keys())
        self.counts  = build_bigram(docs, self.r_voc, self.c_voc)
        self.smooth(k)
        self.get_probs()
        
    def get_probs(self):
        unicounts = np.sum(self.counts, axis=1)
        self.probs = self.counts/unicounts[:, np.newaxis]    
    
    def cond_prob(self, word1, word):
        cond_space = self.conditioned_space(word1)                 # get conditioned space p(.|word1)
        token = '<unk>' if self.c_voc.get(word) is None else word  # mask if necessary
        return cond_space[self.c_voc[token]]
    
    def estimate_prob(self, sequence):
        if len(sequence) < 2:
            print('[ERR]: Not Enough Tokens for Bigram Model')
            return 1
        #build context
        word1 = sequence[0] 
        word = word1
        total_logprob = 0
        for word in sequence[1:]:
            prob = self.cond_prob(word1, word)     #conditional probability
            total_logprob += np.log(prob)
            word1 = word
        
        return np.exp(total_logprob)
            
    def generate_sequence(self, max_length=None, strat=None, activation_window=3):
        sequence = ['<s>']
        word1 = '<s>' 
        word = word1
        actual_probs = self.probs
        while word != '</s>':
            word, _ = self.predict(word1)          # predict a token given the current context
            word1 = word
            sequence.append(word)
            if strat is not None:
                new_prob_table = strat(self.probs, len(sequence), max_length, activation_window)
                if new_prob_table is not None:
                    self.probs=new_prob_table
        
        self.probs = actual_probs
        return sequence
    
    def eval_model(self, documents):
        test_docs = add_padding(documents, k=1)
        return self.perplexity(test_docs)
    
    def perplexity(self, test_set):
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 1   
            for i in range(1, len(test)):         # skip <s> token
                c1, w = test[i-1], test[i]
                prob = self.cond_prob(c1, w)
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp

# Trigram Model

Para los trigramas obtenemos dos vocabularios. Primero obtenemos el vocabulario de los tokens más comunes como en los modelos anteriores, y luego el vocabulario de los bigramas, que condicionan al token actual, más comunes. Es importante resaltar que para este modelo en la dimensión de los bigramas se toma como token desconocido '\<unk\>' cuando la unión de ambos tokens que conforman al bigrama no se encuentra en el vocabulario de bigramas.

In [40]:
class TrigramModel(NGramModel):
    def __init__(self):
        super(NGramModel).__init__()
    
    def train(self, documents, k=1, voc_size=10000):
        self.c_voc, self.r_voc, docs = prepair_bigram(documents, voc_size)
        # get vocabulary as a list (needed when sampling)
        self.voc_words = list(self.c_voc.keys())
        self.counts = build_trigram(docs, self.c_voc, self.r_voc)
        self.smooth(k)
        self.get_probs()
    
    def get_probs(self):
        bicounts = np.sum(self.counts, axis=1)
        self.probs = self.counts/bicounts[:, np.newaxis]    
    
    def cond_prob(self, word1, word2, word):
        cond_space = self.conditioned_space(word1 + ' ' + word2)
        token = '<unk>' if self.c_voc.get(word) is None else word           # mask if necessary
        return cond_space[self.c_voc[token]]
    
    def estimate_prob(self, sequence):
        if len(sequence) < 3:
            print('[ERR]: Not Enough Tokens for Trigram Model')
            return 1
        
        word1 = sequence[0] 
        word2 = sequence[1]
        word = word2
        total_logprob = 0
        for word in sequence[2:]:
            prob = self.cond_prob(word1, word2, word)
            total_logprob += np.log(prob)
            word1, word2 = word2, word
        
        return np.exp(total_logprob)
            
    def generate_sequence(self):
        sequence = ['<s>']
        word1 = '<s>' 
        word2 = '<s>'
        word = word2
        while word != '</s>':
            word, _ = self.predict(word1 + ' ' + word2)
            word1, word2 = word2, word
            sequence.append(word)
        
        return sequence
    
    def eval_model(self, documents):
        test_docs = add_padding(documents, k=2)
        return self.perplexity(test_docs)
    
    def perplexity(self, test_set):
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 2
            for i in range(2, len(test)):                      # skip both <s> <s> tokens
                c1, c2, w = test[i-2], test[i-1], test[i]
                prob = self.cond_prob(c1, c2, w)
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp

## Pruebas de Modelos

In [95]:
# function to get the probability of a sequence and print the results
def eval_sequence(prob_func, seq, extra=''):
    cad = ''
    for s in seq:
        cad += s + ' '
    
    print(bold_string('secuencia: '), cad, )
    print(bold_string('probabilidad de secuencia {0}: '.format(extra)), prob_func(seq))
    print('')

### Prueba Unigrama

In [41]:
unigram = UnigramModel()
unigram.train(documents, voc_size=11000)

In [52]:
eval_sequence(unigram.estimate_prob, ['te', 'amo'])
eval_sequence(unigram.estimate_prob, ['tokenDesconocidoDefinitivamente', 'amo'], 'token desconocido')

print(bold_string('Generación de Secuencia'))
print_sequence(unigram.generate_sequence())

[1msecuencia: [0m  te amo 
[1mprobabilidad de secuencia : [0m  2.543942286175518e-06

[1msecuencia: [0m  tokenDesconocidoDefinitivamente amo 
[1mprobabilidad de secuencia token desconocido: [0m  9.474624878238107e-06

[1mGeneración de Secuencia[0m 
que ¿ <unk> . tengo que creer día . valiendo 💦 mundo a awebo 


### Prueba Bigrama

In [83]:
bigram = BigramModel()
bigram.train(documents, k=0.005, voc_size=11000)

In [84]:
eval_sequence(bigram.estimate_prob, ['hijos', 'de', 'la', 'verga'])
eval_sequence(bigram.estimate_prob, ['<s>', 'las', 'perplejidades', 'son', 'altas', '</s>'], 'token desconocido')
print(bold_string('secuencia condicionada: '), 'vete a')
print(bold_string('Probabilidad Condicional: '), bigram.cond_prob('vete', 'a'))
print(bold_string('\nGeneración de Secuencia'))
print_sequence(bigram.generate_sequence())

[1msecuencia: [0m  hijos de la verga 
[1mprobabilidad de secuencia : [0m  0.008143255147411203

[1msecuencia: [0m  <s> las perplejidades son altas </s> 
[1mprobabilidad de secuencia token desconocido: [0m  1.7890221731532313e-15

[1msecuencia condicionada: [0m  vete a
[1mProbabilidad Condicional: [0m  0.20783015192832105
[1m
Generación de Secuencia[0m 
@usuario ya lo voy a ver si está de verga x tu meme estl firmado brinque actriz pides antes de youtuber suba numero fairplay dinero ; party quítenme comunidad jajajajajajajajaja vuelvan irrrrr póster taller arruinar solecitos dámela oigo privada mera remplazo excelente tarde-noche uniones tragando cachorrito lucrar consiguio país sabrosear malnacido emocionan ponte 💛 recuperas juanes armado boludeces ulises indie curada programacion adolecía desahogo bajan agarrar khé 10 muertos #basica redes novia dulces del pan superé divertidos vacaciones mentirle boxeadores emperra #0pedosmorra automáticas echarnos jsjaja gordota note o

In [177]:
bigram.eval_model(val_documents)

556.010612606513

### Pruebas Trigramas

In [92]:
trigram = TrigramModel()
trigram.train(documents, k=0.005, voc_size=11000)

In [94]:
eval_sequence(trigram.estimate_prob, ['hijos', 'de', 'la', 'verga'])
eval_sequence(trigram.estimate_prob, ['<s>', 'las', 'perplejidades', 'son', 'altas', '</s>'], 'token desconocido')
print(bold_string('secuencia condicionada: '), 'vete a la')
print(bold_string('Probabilidad Condicional: '), trigram.cond_prob('vete', 'a', 'la'))
print(bold_string('\nGeneración de Secuencia'))
print_sequence(trigram.generate_sequence())

[1msecuencia: [0m  hijos de la verga 
[1mprobabilidad de secuencia : [0m  0.02352065607856261

[1msecuencia: [0m  <s> las perplejidades son altas </s> 
[1mprobabilidad de secuencia token desconocido: [0m  2.1106826559148392e-13

[1msecuencia condicionada: [0m  vete a la
[1mProbabilidad Condicional: [0m  0.19722574285311928
[1m
Generación de Secuencia[0m 
con violador putas cago lo no chingas a tu leia ... 


# Perplejidades

In [93]:
print(bold_string('unigram perplexity: \t'), unigram.eval_model(val_documents))
print(bold_string('bigram perplexity: \t'), bigram.eval_model(val_documents))
print(bold_string('trigram perplexity: \t'), trigram.eval_model(val_documents))

[1munigram perplexity: 	[0m  368.36738387642487
[1mbigram perplexity: 	[0m  390.9887421025693
[1mtrigram perplexity: 	[0m  515.9447123589185


Podemos resaltar varias cosas interesantes a partir de estas pruebas. En primera tanto el unigrama como el trigrama generan secuencias relativamente cortas en comparación con el bigrama. Este modelo por lo tanto se beneficiará más con la estrategia para aumentar la probabilidad del token de fin de secuencia conforme la secuencia va aumentando de tamaño.  También podemos ver que en cuanto a los valores de la perplejidad, el unigrama tiene el menor valor.

# Interpolated Model

En los siguientes bloques tenemos la clase del modelo Interpolado y la función para optimizar los pesos del modelo interpolado con el algoritmo de EM. 

La clase del modelo interpolado cuenta con todas las funcionalidades necesarias para realizar lo solicitado en toda la tarea, por lo tanto tanto la función de entrenamiento con lambdas fijos como la función de entrenamiento con EM se encuentran definidas en la clase.

In [27]:
def optimize_em(prob_matrix, n_iter, init_weights = None):
    # Initialize model weights
    if init_weights is not None:
        weights = np.array(init_weights)
    else:
        n_models = prob_matrix.shape[1]
        weights = np.ones(n_models) / n_models

    weights_hist = [weights]
    for it in range(n_iter):
        # 2 Expectation: calculate posterior probabilities from current model weights
        weighted_probs = prob_matrix * weights
        total_probs = weighted_probs.sum(axis=1, keepdims=True)
        posterior_probs = weighted_probs / total_probs

        # 3 Maximization: update model weights using posterior probabilities from E-step
        weights = posterior_probs.mean(axis=0)
        # add weights to weight history
        weights_hist.append(weights)

    return weights, weights_hist

In [244]:
class InterpolatedModel:
    def __init__(self, lambda_):
        self.l1, self.l2, self.l3 = lambda_
        self.unigram = UnigramModel()
        self.bigram = BigramModel()
        self.trigram = TrigramModel()
    
    def verify_vocs(self):
        uvoc = self.unigram.voc
        bvoc = self.bigram.c_voc
        tvoc = self.trigram.c_voc
        
        for u, b, t in zip(uvoc.keys(), bvoc.keys(), tvoc.keys()):
            if u != b or b!=t:
                print('WARN: vocabularies dont match')
        
        print('Finished checking vocabularies')
        
    def train(self, documents, k=0, voc_size=10000):
        self.unigram.train(documents, voc_size)
        self.bigram.train(documents, k, voc_size)
        self.trigram.train(documents, k, voc_size)
        self.verify_vocs()
    
    def predict(self, sequence):
        # build contexts
        bicontext = sequence[1]
        tricontext = sequence[0] + ' ' + sequence[1]
        
        # get conditioned spaces
        unispace = self.unigram.probs
        bispace = self.bigram.conditioned_space(bicontext)
        trispace = self.trigram.conditioned_space(tricontext)
        
        # sample from probability space
        probs = self.l1 * unispace + self.l2 * bispace + self.l3 * trispace
        c_index = sample(probs)
        
        return self.unigram.voc_words[c_index], probs[c_index]
    
    def cond_prob(self, word1, word2, word):
        uniprob = self.unigram.estimate_prob(word)
        biprob  = self.bigram.cond_prob(word2, word)
        triprob = self.trigram.cond_prob(word1, word2, word)
        prob = self.l1 * uniprob + self.l2 * biprob + self.l3 * triprob
        return prob
    
    def estimate_prob(self, sequence):
        if len(sequence) < 3:
            print('[ERR]: Not Enough Tokens for Interpolated Model')
            return 1
        
        word1 = sequence[0] 
        word2 = sequence[1]
        word = word2
        total_logprob = 0
        for word in sequence[2:]:
            prob = self.cond_prob(word1, word2, word)
            total_logprob += np.log(prob)
            word1, word2 = word2, word
        
        return np.exp(total_logprob)
    
    def generate_sequence(self, max_length=None, strat=None, activation_window=3):
        sequence = ['<s>']
        word1 = '<s>' 
        word2 = '<s>'
        word = word2
        actual_probs = [np.copy(self.bigram.probs), np.copy(self.trigram.probs)]
        while word != '</s>':
            word, _ = self.predict([word1, word2])
            word1, word2 = word2, word
            sequence.append(word)
            if max_length is not None and len(sequence) >= max_length:
                sequence.append('</s>')
                return sequence
            
            if strat is not None:
                new_biprobs = strat(self.bigram.probs, len(sequence), max_length, activation_window)
                if new_biprobs is not None:
                    self.bigram.probs = new_biprobs
                    self.trigram.probs= strat(self.trigram.probs, len(sequence), max_length, activation_window)
        
        self.bigram.probs = actual_probs[0]
        self.trigram.probs = actual_probs[1]
            
        return sequence
    
    def eval_model(self, documents):
        test_docs = add_padding(documents, k=2)
        return self.perplexity(test_docs)
    
    def perplexity(self, test_set):
        log_perp = 0
        N = 0
        for test in test_set:
            N += len(test) - 2 if len(test) > 2 else 0
            for i in range(2, len(test)):
                c1, c2, w = test[i-2], test[i-1], test[i]
                prob = self.cond_prob(c1, c2, w)
                log_perp += np.log(1/prob)

        perp = np.exp(1/N * log_perp)
        return perp
    
    def fixed_lambdas_train(self, val_set, lambdas):
        val_docs = add_padding(val_set, k=2)
        perplexities = []
        for lambda_ in lambdas:
            self.l1, self.l2, self.l3 = lambda_
            perp = self.perplexity(val_docs)
            perplexities.append(perp)
        
        lower_index = np.argsort(np.array(perplexities))[0]
        self.l1, self.l2, self.l3 = lambdas[lower_index]
        return perplexities
    
    def em_train(self, val_set, max_it, init_weights=None):
        val_docs = add_padding(val_set, k=2)
        probs = []
        for val_doc in val_docs:
            for i in range(2, len(val_doc)):
                w1, w2, w = val_doc[i-2], val_doc[i-1], val_doc[i]
                uniprob = self.unigram.estimate_prob(w)
                biprob  = self.bigram.cond_prob(w2, w)
                triprob = self.trigram.cond_prob(w1, w2, w)
                probs.append([uniprob, biprob, triprob])
        
        weights, hist = optimize_em(np.array(probs), max_it, init_weights)
        self.l1, self.l2, self.l3 = weights
        
        perplexities = []
        for weight in hist:
            self.l1, self.l2, self.l3 = weight
            perplexities.append(self.perplexity(val_set))
        
        return perplexities

# Ejercicio 3. Fixed Lambdas

In [111]:
lambdas_ = [[1/3, 1/3, 1/3],[.4, .4, .2],[.2, .4, .4],[.5, .4, .1],[.1, .4, .5], [0.05, 0.25, 0.7]]

In [104]:
i_model = InterpolatedModel(lambdas_[0])
i_model.train(train_corpus, k=0.0001, voc_size=11000)
perplexities = i_model.fixed_lambdas_train(val_corpus, lambdas_)

In [138]:
for l, p in zip(lambdas_, perplexities):
    print(bold_string('Lambdas: '), l)
    print(bold_string('perplexity in val: '), p, '\n')

[1mLambdas: [0m  [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
[1mperplexity in val: [0m  368.9836466855404 

[1mLambdas: [0m  [0.4, 0.4, 0.2]
[1mperplexity in val: [0m  415.1575185790954 

[1mLambdas: [0m  [0.2, 0.4, 0.4]
[1mperplexity in val: [0m  316.9369524677756 

[1mLambdas: [0m  [0.5, 0.4, 0.1]
[1mperplexity in val: [0m  523.1147219629094 

[1mLambdas: [0m  [0.1, 0.4, 0.5]
[1mperplexity in val: [0m  289.5122983113761 

[1mLambdas: [0m  [0.05, 0.25, 0.7]
[1mperplexity in val: [0m  296.0531070879839 



In [139]:
print(bold_string('perplexity in test set: '), i_model.eval_model(test_corpus))

[1mperplexity in test set: [0m  287.26091614460216


Podemos ver como las perplejidades aumentan y disminuyen. El conjunto de valores lambda con menor perplejidad son $\lambda_1 = 0.1$, $\lambda_2 = 0.4$, $\lambda_3 = 0.5$. Al evaluar en el conjunto de prueba obtenemos un valor para la perplejidad de 287.26, lo cual se asemeja al resultado obtenido para el conjunto de validación utilizado para obtener los valores de los lambdas.

# Sección 3

## Ejercicio 1. Modelo Interpolado Entrenado con EM

In [142]:
i_model = InterpolatedModel(lambdas_[0])
i_model.train(train_corpus, k=0.0001, voc_size=11000)
perplexities = i_model.em_train(val_corpus, 10)

Finished checking vocabularies


In [146]:
for i, p in enumerate(perplexities):
    print(bold_string('iter: '), i, bold_string(' perplexity in val: '), p)

[1miter: [0m  0 [1m perplexity in val: [0m  438.7230487809928
[1miter: [0m  1 [1m perplexity in val: [0m  333.6293142239757
[1miter: [0m  2 [1m perplexity in val: [0m  328.3406306089978
[1miter: [0m  3 [1m perplexity in val: [0m  328.0715003556267
[1miter: [0m  4 [1m perplexity in val: [0m  328.0729048436725
[1miter: [0m  5 [1m perplexity in val: [0m  328.07810428929037
[1miter: [0m  6 [1m perplexity in val: [0m  328.07944305459466
[1miter: [0m  7 [1m perplexity in val: [0m  328.0795473173216
[1miter: [0m  8 [1m perplexity in val: [0m  328.0794282243586
[1miter: [0m  9 [1m perplexity in val: [0m  328.07932393392196
[1miter: [0m  10 [1m perplexity in val: [0m  328.07926200636825


In [147]:
print(bold_string('perplexity in test set: '), i_model.eval_model(test_corpus))

[1mperplexity in test set: [0m  269.65591326798165


### Generación de Secuencia con el modelo Interpolado

In [148]:
seq = i_model.generate_sequence()
print_sequence(seq)

una verga en el dinero tarda el a vez que ganar ! para que nadie los lea no solo el suelo simulando la cockblock con 3000 


In [243]:
seq = i_model.generate_sequence()
print_sequence(seq)

#esfeocuandoteenteras que andan te chupo 🤗 audio ? ? váyanse que @usuario está de la celebración de tus <unk> pongo soy es ! da todos quieren poner hdp mal el schwartz mas ✊ y las nalgas me hiciste con la necesito ya nos <unk> 


In [193]:
seq = i_model.generate_sequence()
print_sequence(seq)

bien hermano masturbarme narración de toda la cara sería que . 


Podemos ver como el algoritmo EM consigue que los valores de la perplejidad vayan bajando en cada iteración y a partir de 4, 5 iteraciones ya no se observa un cambio sustancial. Es decir el algoritmo converge rápidamente a una solución. Podemos apreciar a través de la generación de secuencias que estas parecen tener un poco más de sentido que los modelos anteriores, y esto tiene sentido con lo esperado debido a que podemos analizar el contexto con los bigramas y trigramas, y en caso contrario siempre utilizar los unigramas. 

# Ejercicio 2. Twittear y Actualización de Probabilidades

## Actualización de Probabilidades 

Es de interés tomar cierta medida para asegurar que la probabilidad del token de fin de secuencia '\</s\>' vaya aumentando conforme la secuencia se va haciendo más larga. Para ello utilizaremos la siguiente regla de actualización: 

Sea $p_s$ la probabilidad de obtener el token de fin de secuencia. Entonces como $p_s \leq 1$, sabemos que ${p^r_s} \geq p_s$ en donde $r<1$. De hecho, sabemos también que 

$$\lim_{n\rightarrow \infty} \sqrt[n]{r} = 1$$ 

Entonces, podemos tomar la regla de actualización $$\hat{p}_s = \sqrt[n]{p_s}$$

Debido a que esta probabilidad aumentó, para asegurarnos que el espacio de probabilidad se encuentra bien definido, debemos disminuir esta probabilidad de los otros tokens para asegurarnos que la suma de las probabilidades siga siendo 1. Definamos el aumento de la probabilidad que tenemos respecto al token de fin de secuencia como 

$$a_p = \hat{p}_s - p_s$$

Entonces, sea $p_i$ la probabilidad de obtener el token $t_i$ en donde $t_i \neq $ '\</s\>'. Definamos a $\sigma$ como 

$$\sigma = \sum_{i=1}^{|V|} p_i$$

en donde $|V|$ representa la cardinalidad del conjunto del vocabulario sin considerar al token de fin de secuencia. Notemos que $\sigma = 1 - p_s$. Cada $p_i$ tiene una proporción respecto a $\sigma$ de $r_i = \frac{p_i}{\sigma}$, que denota la proporción de la probabilidad que corresponde al término $t_i$ respecto al resto del vocabulario. Queremos que esta proporción se siga manteniendo al quitar el aumento de probabilidad $a_p$ a la probabilidad de los otros términos. Entonces, utilizando la siguiente regla de actualización

$$\hat{p}_i = p_i - r_i a_p$$

y definiendo a $$\hat{\sigma} = \sum_{i=1}^{|V|} \hat{p}_i$$

podemos ver que se cumple $$\hat{r}_i = \frac{\hat{p}_i}{\hat{\sigma}} = r_i$$

Notemos también que estas reglas en conjunto también se puede utilizar para minimizar la probabilidad tomando a $\hat{p}_s = p^n_s$. De esta manera el incremento $a_p$ será de hecho un decremento, por lo tanto será negativo y de esta manera las probabilidades $p_i$ en vez de disminuir, aumentan proporcionalmente.

In [151]:
# receives a probs matrix and the power r.
def diminish(probs, r):
    # calculate new probability
    new_probs = np.zeros(probs.shape)
    new_stop_prob = np.power(probs[:, -1], r)
    # get improvement
    improve = (new_stop_prob - probs[:, -1])
    # get ratio of the other probabilities between them
    c = np.sum(probs[:, :-1], axis=1)
    rat = probs[:, :-1]/c[:, np.newaxis]
    # update new probability
    new_probs[:, -1] = new_stop_prob
    new_probs[:, :-1] = probs[:, :-1] - rat * improve[:, np.newaxis]
    return new_probs

In [152]:
def diminish_strat(prob_matrix, current_length, max_length=50, activation_window=3):
    threshold = max_length - activation_window
    if current_length >= threshold:
        diff = current_length - threshold + 2
        return diminish(prob_matrix, 1/diff)
    return None

## Pruebas de Estrategia para Aumentar La Probabilidad de Paro.

Como vimos anteriormente, el modelo de bigrama puede generar secuencias muy largas a veces. Entonces, probaremos la estrategia de paro con este modelo. El modelo ya cuenta con esta opción, solo es necesario mandar la función que se encargar de llevar a cabo la estrategia. 

In [198]:
bigram = BigramModel()
bigram.train(documents, k=0.005, voc_size=11000)

In [166]:
seq = bigram.generate_sequence(max_length=50, strat=diminish_strat)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

@usuario #pedazo yendo imposición humos votó próximas multiorgasmica debieron ubicar fotos valer verga en mi mama amanece toquen señal facturar palabra master danza pelón fotografías una-poetiza-loca valiste equivocabas chuchito arde acosar a todos con las que haya condiciones bojórquez gladys bojórquez trota-juzgados metas filas acomplejado chavas políticamente valimos 
[1msequence length: [0m  48


In [200]:
seq = bigram.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

nada como @usuario @usuario y con derecho criticones culiacán rompen llenos exitoso tocará naturales veganos #addi as 💓 sacarse valdría aires recuerden temblar mucha komo apure camioneta reo bue tranquilidad televisa 2015 ahorita entró otra jornada pájaros 1975 hermosas seria bómboro chivastv tregua amanezco aportan manoseo buatsap escríbele :( 
[1msequence length: [0m  49


In [201]:
seq = bigram.generate_sequence(max_length=50, strat=diminish_strat, activation_window=10)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

quisiera burlan editora carcajadas espantes leslie comi sismo tristezas golpeador dormido " beyond 🤖 doler #anderson razonable esperé lightning 👏🏻 entraron aguanten coordinadores angelitos xel dummies #usopenxespn juzgaba embolia fumo #nuevafotodeperfil nuca pendejazo dispararle joto lmao ? 🌟 peduki #protocol_terminal inundar 
[1msequence length: [0m  41


In [203]:
seq = bigram.generate_sequence(max_length=50, strat=diminish_strat, activation_window=10)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

acabo de mierda " al cabo tigres prieta doña feel teporocho mamartelo pokedex americanista principal vaso puras clasificara putisima concuerdan náuseas envío gustaría quieraponer mueres life firmado aguilas nick agendar abril cornudo buenas noches 3000 guantes chivas parecer tardar sala burro 
[1msequence length: [0m  41


#### Podemos ver como conforme aumentamos la ventana de activación, es decir que que cantidad de palabras antes de la cantidad máxima de palabras se debe empezar a aplicar la estrategia para aumentar la probabilidad de paro, las secuencias en efecto tienen una longitud cercana a la esperada. En caso de que se quiera que el cambio sea más gradual basta con modificar la función **diminish\_strat** para que el cambio sea más gradual.   

## Twittear con el Modelo Interpolado y la Actualización de Probabilidades

In [245]:
i_model = InterpolatedModel(lambdas_[0])
i_model.train(train_corpus, k=0.0001, voc_size=11000)
perplexities = i_model.em_train(val_corpus, 10)

Finished checking vocabularies


In [210]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

habla a xfa correcta ¡ planeta like machin esos 
[1msequence length: [0m  9


In [211]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

empata pinche la dijo que me lleva el de perra madre 
[1msequence length: [0m  11


In [213]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

un gobierno 💁 tuve que culote q están lloviendo de volverme loca la pelas #themist los que ya había hablado se lloras cuando esas putas <unk> que aguantar a a la verga ! deja al final vale la es que era la tus buenas se 20 mierda 
[1msequence length: [0m  47


In [214]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

la coca para que <unk> toda la vida que no para decir esta es una bonita las voy a la verga en en no en mi pinche vivir eso jsjaja posándola también oficia la haciendo al puro pan dale " un letrero pero 😴 para si para . </s> 
[1msequence length: [0m  49


In [246]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

son bien graciosas ... por fin lo mismísimo y todavía pone la mamón 45 💞 
[1msequence length: [0m  15


In [248]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

@usuario @usuario basura q tiene <unk> narra . verguitas ¿ de que me arañó que la chingada esos 
[1msequence length: [0m  18


In [249]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

si le pierdes tele los te eres que mierda inmensamente fics en … 
[1msequence length: [0m  13


In [250]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

no es mi mamá se usemos uber <unk> la boquita falsas 😈 de esos actores gustada a <unk> con estampado recibe su puta y está todo ofrecían méxico ... ! ! ! ? 🤔 
[1msequence length: [0m  34


In [254]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

no se que asco . como siempre ! mojada un <unk> de ahora y haviendo gente en un señor sol ❌ se 
[1msequence length: [0m  22


In [255]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

levo todos los putos mosquitos permiso tendría puñal de decir ? besos #televisa leés fumo y coincidiendo putas mames esta princesa nunca el parte por pendejo me dejo platicas es sólo las pinches simios de subiendo la se pasa en contratos pinche y xd 
[1msequence length: [0m  44


In [260]:
seq = i_model.generate_sequence(max_length=50, strat=diminish_strat, activation_window=3)
print_sequence(seq)
print(bold_string('sequence length: '), len(seq) - 2)

mamá luchona de descanse esa foto se vayan a alguien se te ha pasado que recuerdos que sabes putas en pleno siglo xxi mantiene sus tuits de ahí clavó arcaico 
[1msequence length: [0m  30


#### A través de varios de los ejemplos anteriormente podemos ver como hay secuencias que si parecen tener sentido al menos en ventanas de 3 palabras. Notemos también como estas secuencias en su mayoría no son de longitudes cercanas al máximo de 50 palabras, por lo cual parece ser que el modelo está capturando de cierta manera la esencia y la longitud de los tweets. Aun así, cuando el modelo se acerca, podemos ver que nuestra estrategia de paro funciona debido a que en los primeros ejemplos podemos ver dos secuencias de longitud 47 y 49, que entran ya en la ventana de activación de 3 palabras antes de la cantidad máxima permitida.

# Entrenar Modelo con Discursos de AMLO.

In [273]:
filenames = glob.glob('conferencias_fecha/*')

amlo_docs = []
for filename in filenames:
    file = open(filename, 'r')
    amlo_docs.append(file.read())

amlo_docs = process_documents(amlo_docs)

NameError: name 'glob' is not defined

# Permutar Oraciones

In [458]:
from itertools import permutations

def get_permutations(sentence):
    return set(permutations(sentence))

# PRUEBAS

In [63]:
probs = [[.4, .3, .3],[.3, .5, .2]]
probs = np.array(probs, dtype=np.float128)

new_probs = diminish(probs, 0.25)
print(new_probs)
np.sum(new_probs,axis=1)

[0.7400828 0.6687403]
[0.4400828 0.4687403]
[1. 1.]
[[0.14852411 0.11139308 0.7400828 ]
 [0.12422239 0.20703731 0.6687403 ]]


array([1., 1.])

In [61]:
.4/(.4 + .3)

0.5714285714285715

In [62]:
.14852411/(0.14852411 + 0.11139308)

0.5714285769248274

In [367]:
trigram_m = TrigramModel()

In [359]:
trigram_m.bi_voc['<s> hola']

1887

In [304]:
np.sum(trigram_m.probs[-2])

1.0

In [369]:
trigram_m.probs.shape, len(trigram_m.bi_voc), len(trigram_m.voc)

((10001, 10002), 10001, 10002)

In [370]:
np.sum(trigram_m.probs)

10000.999999999993

In [364]:
list(trigram_m.bi_voc.items())[891]

('alv .', 891)

In [None]:
for i in range(trigram_m.probs.shape[0]):
    for j in range(trigram_m.probs.shape[1]):
        if np.isnan(trigram_m.probs[i,j]):
            print('nan at', i, ' ', j)

In [368]:
trigram_m.train(documents)

In [227]:
vocabulary, unidocs = prepair_unigram(documents, 10000)
bi_vocabulary, bidocs = prepair_bigram(documents, 10000) 

In [204]:
unigram = build_unigram(unidocs, vocabulary)
unigram_prob = unigram/np.sum(unigram[1:])

In [205]:
bigram = build_bigram(unidocs, vocabulary)

In [206]:
bigram_prob = bigram[:-1]/unigram[:-1, np.newaxis]

In [207]:
np.sum(bigram_prob)

10000.000000000004

In [228]:
bi_padded_docs = add_padding(unidocs, k=1)

In [229]:
trigram = build_trigram(bi_padded_docs, vocabulary, bi_vocabulary)

In [230]:
unigram_of_bigrams = build_unigram(bidocs, bi_vocabulary)

In [231]:
trigram_prob = trigram[:-1]/unigram_of_bigrams[:-1, np.newaxis]

In [241]:
np.sum(trigram, axis=1) == unigram_of_bigrams 

array([ True,  True,  True, ...,  True,  True, False])

In [None]:
list(bi_vocabulary.keys())

In [39]:
np.sum(bigram[0, :])

5544.0

In [58]:
padded_vocabulary['<unk>']

10000

In [68]:
unigram[-2]

5544.0

In [49]:
bigram_prob[padded_vocabulary['<s>'], padded_vocabulary['<unk>']]

0.02958152958152958