# David Gamaliel Arcos Bravo
### T3: Modelos de Lenguaje Estadísticos

In [94]:
import nltk
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from collections import Counter
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist

In [95]:
def get_files(PATH_TRAIN, PATH_TRAIN_LABELS):
    train, train_labels = [], []
    with open(PATH_TRAIN, 'r', encoding="utf8") as f:
        train = [ line for line in f]
    with open(PATH_TRAIN_LABELS, 'r', encoding="utf8") as f:
        train_labels = f.readlines()
    return np.array(train), np.array(train_labels)

PATH_TRAIN = '../DatasetAgresividad/mex_train.txt'
PATH_LABELS = '../DatasetAgresividad/mex_train_labels.txt'
tr_txt, tr_y = get_files(PATH_TRAIN, PATH_LABELS)
tr_txt.shape, tr_y.shape

((5544,), (5544,))

1. (5pts) Preprocese todos los tuits de agresividad (positivos y negativos) según su intu-
ición para construir un buen corpus para un modelo de lenguaje (e.g., solo palabras en minúscula, etc.). Agregue tokens especiales de <s> y </s> según usted considere (e.g., al
inicio y final de cada tuit). Defina su vocabulario y enmascare con <unk> toda palabra
que no esté en su vocabulario.

In [96]:
def calculate_fdist(corpus_words, filter = False):
    fdist = nltk.FreqDist(corpus_words)
    aux = [(fdist[key], key) for key in fdist]
    aux.sort()
    aux.reverse()
    if filter == True: 
        fdist = aux[:1000]
    else:
        fdist = aux
    fdist = dict([(word, freq) for freq, word in fdist])
    fdist_idx = dict([(word, i) for i, word in enumerate(fdist)])
    return fdist, fdist_idx

def get_corpus(tweets, vocab):
    corpus_words = []
    for txt in tweets:
        for word in txt:
            if word in vocab:
                corpus_words.append(word)
    return corpus_words

In [97]:
def process_corpus(tr_txt):

    # Tokenizar y agregar <s> y </s>

    tknzr = TweetTokenizer()
    tr_txt_tknzd = [['<s>'] + tknzr.tokenize(tuit) + ['</s>'] for tuit in tr_txt]
    
    # Crear vocabulario

    vocab = set()
    for tuit in tr_txt_tknzd:
        for word in tuit:
            if word.isalpha() or word in ['<s>', '</s>']:
                vocab.add(word)

    # Crear corpus como lista y calcular fdist filtrando

    corpus_words = get_corpus(tr_txt_tknzd, vocab)
    fdist, fdist_idx = calculate_fdist(corpus_words, filter = True)
    
    # Generar vocabulario nuevo con palabras filtradas

    vocab = set(fdist.keys())
    vocab.add('<unk>')
    vocab.add('<s>')
    vocab.add('</s>')

    # Retokenizar y cambiar valores de vocab por unk

    tr_txt_tknzd = [[word if word in vocab else '<unk>' for word in tuit] for tuit in tr_txt_tknzd]

    # Generar nuevo corpus y fdist
    corpus_words = get_corpus(tr_txt_tknzd, vocab)
    fdist, fdist_idx = calculate_fdist(corpus_words)

    return tr_txt_tknzd, vocab, fdist, fdist_idx

tr_txt_tknzd, vocab, fdist, fdist_idx = process_corpus(tr_txt)

In [98]:
print(tr_txt_tknzd[np.random.randint(len(tr_txt_tknzd))])

['<s>', '<unk>', '<unk>', 'putos', '<unk>', '<unk>', 'los', '<unk>', 'minutos', 'que', 'se', 'hacen', 'de', 'mi', 'casa', 'a', 'mi', '<unk>', '<unk>', '<unk>', '</s>']


Para el procesamiento primero tokenize los tweets para agregar los caracteres especiales, filtre caracteres no validos y saque distribuciones, Despues repeti el proceso para tener un corpus mas estable en palabras importante y sin caracteres innecesarios.

2. (20pts) Entrene tres modelos de lenguaje sobre todos los tuits: unigramas, bigrama , trigramas
Para cada uno proporcione una interfaz (función) sencilla 
Los modelos deben tener una estrategia común para lidiar con secuencias no vistas. 
Puede optar por un suavizamiento Laplace o un Good-Turing
discounting. Muestre un par de ejemplos de como funciona, al menos uno con una palabra fuera del vocabulario.

In [99]:
def get_unigram_counts(vocab, tweets):
    # Numpy array
    unigram_counts = np.zeros(len(vocab), dtype=np.float64)
    for tuit in tweets:
        for word in tuit:
            unigram_counts[fdist_idx[word]] += 1
    unigram_counts = unigram_counts / unigram_counts.sum()
    return unigram_counts

unigram_counts = get_unigram_counts(vocab, tr_txt_tknzd)

def generate_sentence(unigram_counts, vocab, max_len=20):
    sentence = ['<s>']
    while len(sentence) < max_len:
        word = np.random.choice(list(vocab), p=unigram_counts)
        if word == '<s>':
            continue
        sentence.append(word)
        if word == '</s>':
            break
    return sentence

print(generate_sentence(unigram_counts, vocab))

['<s>', 'pendejo', 'pendejo', 'traen', 'tráfico', 'cree', 'escucha', 'pendejo', 'vídeo', 'pendejo', 'viejas', 'pendejo', 'puñal', 'cierto', 'neta', 'vi', 'seguir', 'debería', 'ellas', 'va']


In [100]:
def get_bigram_counts(fdist, fdist_idx, tweets):
    bigram_counts = np.zeros(
        (len(fdist), len(fdist)),
        dtype=np.float64
    )
    for tuit in tweets:
        for i in range(len(tuit) - 1):
            if tuit[i] in vocab and tuit[i + 1] in vocab:
                bigram_counts[fdist_idx[tuit[i]], fdist_idx[tuit[i + 1]]] += 1
    for i in range(len(fdist)):
        total = np.sum(bigram_counts[i]) + len(fdist)
        bigram_counts[i] = (bigram_counts[i] + 1) / total

    return bigram_counts

bigram_counts = get_bigram_counts(fdist, fdist_idx, tr_txt_tknzd)
# print(bigram_counts)

def generate_sentence_bigram(bigram_counts, fdist, fdist_idx, vocab, max_len=20):
    sentence = ['<s>']
    while len(sentence) < max_len:
        if sentence[-1] == '</s>':
            break
        word = np.random.choice(list(vocab), p=bigram_counts[fdist_idx[sentence[-1]]])
        sentence.append(word)
    return sentence

print(generate_sentence_bigram(bigram_counts, fdist, fdist_idx, vocab))

['<s>', 'tenía', 'mía', 'tipo', 'ciudad', 'loca', 'como', 'cada', 'méxico', 'decirte', 'jaja', 'pensé', 'escucha', 'metro', 'jugadores', 'bendición', 'enfermo', 'maricones', 'chido', 'buenas']


In [101]:
# Trigrama

def get_trigram_counts(fdist, fdist_idx, tweets):
    trigram_counts = np.ones(
        (len(fdist), len(fdist), len(fdist)),
        dtype=np.float64
    )
    for tuit in tweets:
        for i in range(len(tuit) - 2):
            if tuit[i] in vocab and tuit[i + 1] in vocab and tuit[i + 2] in vocab:
                trigram_counts[fdist_idx[tuit[i]], fdist_idx[tuit[i + 1]], fdist_idx[tuit[i + 2]]] += 1
    for i in range(len(fdist)):
        for j in range(len(fdist)):
            total = np.sum(trigram_counts[i, j]) + len(fdist)
            trigram_counts[i, j] = (trigram_counts[i, j] + 1) / total

    return trigram_counts

trigram_counts = get_trigram_counts(fdist, fdist_idx, tr_txt_tknzd)

def generate_sentence_trigram(trigram_counts, fdist, fdist_idx, vocab, max_len=20):
    sentence = ['<s>', '<s>']
    while len(sentence) < max_len:
        if sentence[-1] == '</s>':
            break
        word = np.random.choice(list(vocab), p=trigram_counts[fdist_idx[sentence[-2]], fdist_idx[sentence[-1]]])
        sentence.append(word)
    return sentence

print(trigram_counts.shape)
print(generate_sentence_trigram(trigram_counts, fdist, fdist_idx, vocab))

(1001, 1001, 1001)
['<s>', '<s>', 'eres', 'ella', 'unos', 'cree', 'normal', 'creer', 'chivas', 'todavía', 'sabes', 'desde', 'caliente', 'comprar', 'osorio', 'porqué', 'espero', 'matar', 'maldito', 'saludos']


En esta parte la principal tarea es correr un modelo con suficientes valores para poder generar sentencias con sentido, donde la memoria es importante de priorizar. Decidi filtrar sobre 2000 para tener un rendimiento estable con todos los modelos sin perder contexto ni palabras y de formas que corrieran de manera eficiente.

3. (25pts) Construya un modelo interpolado con valores λ fijos:

In [102]:
deltas = [0.2, 0.3, 0.5]

def generate_sentence_interpolation(trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, vocab, deltas, max_len=50):
    sentence = ['<s>', '<s>']
    while len(sentence) < max_len:
        if sentence[-1] == '</s>':
            break
        word = np.random.choice(
            list(vocab), 
            p = deltas[0] * trigram_counts[fdist_idx[sentence[-2]], fdist_idx[sentence[-1]]] +
                deltas[1] * bigram_counts[fdist_idx[sentence[-1]]] +
                deltas[2] * unigram_counts
        )
        sentence.append(word)
    return sentence

print(generate_sentence_interpolation(trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, vocab, deltas))

['<s>', '<s>', 'esperando', 'triste', 'hermano', 'pendejo', 'xd', 'leche', 'final', 'rica', 'ratas', 'saliendo', 'traen', 'voy', 'esas', 'esperando', 'va', 'pendejo', 'caer', 'ojalá', 'siempre', 'ser', 'vídeo', 'pendejo', 'dando', 'boca', 'fotos', 'sales', 'risa', 'usar', 'pendejo', 'puerta', 'ustedes', 'feliz', 'llena', 'pendejo', 'vecinos', 'fea', 'hablan', 'acaba', 'enfermo', 'seguir', 'juego', 'usa', 'pendejo', 'like', 'estuvo', 'estudiar', 'viejas', 'nueva']


1. (20pts) Haga una función "tuitear" con base en su modelo de lenguaje P̂ del último punto.
El modelo deberá poder parar automáticamente cuando genere el símbolo de terminación
de tuit al final (e.g., "</s>"), o 50 palabras. Proponga algo para que en los últimos tokens
sea más probable generar el token "</s>". Muestre al menos cinco ejemplos.

In [103]:
deltas = [0.2, 0.35, 0.45]

def tuitear():
    return ' '.join(generate_sentence_interpolation(trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, vocab, deltas))

for i in range(5):
    print(tuitear())

<s> <s> falta besos conmigo pendejo vuelven estudiar siempre porqué pendejo valga fotos asi palabras otros extraño televisa madres pinche pendejo frente estaría whatsapp escribir hicieron familia escuchando hdp primero jamás pendejo fue boca imagen saben vídeo pendejo tetas besos quienes ve gustas ten pendejo pierda tareas asi ponerme mal
<s> <s> mandarte idea pendejo asi vagina nombre darte siente tetas bye hablan puros pendejo boca naturaleza puedo cierto américa fotos pendejo fotos cachonda vean seguir uno sido matar pendejo perro entre pendejo misma salió clase mismos haber pendejo pendejo hecho traen nueva conmigo gritando fuera espero facebook esperando juego
<s> <s> saben like cierto estuvo pendejo pendejo traen pendeja carajo amor vídeo padres estás cierto sus pinche wey comentarios recuerdo gusta pendejo vi mamar visto manda toca pendejo nunca pendejo una cuando urge pinche cachetada cree imagino tengo traen tenía pendejo pendejo chica clase seguir seguir pendejo hoy super
<s>

2. (10pts) Use la intuición que ha ganado en esta tarea y los datos de las mañaneras para
entrenar un modelo de lenguaje AMLO. Haga una un función "dar_conferencia()". Generé
un discurso de 300 palabras y detenga al modelo de forma abrupta.

In [106]:
PATH_TRAIN = '../DatasetDownloadTarea/estenograficas_limpias_por_fecha/2019-03-14'
PATH_LABELS = '../DatasetDownloadTarea/estenograficas_limpias_por_fecha/2019-05-12'

amlo_tr_txt, amlo_tr_y = get_files(PATH_TRAIN, PATH_LABELS)
amlo_tr_txt_tknzd, amlo_vocab, amlo_fdist, amlo_fdist_idx = process_corpus(tr_txt)

amlo_unigram_counts = get_unigram_counts(amlo_vocab, amlo_tr_txt_tknzd)
amlo_bigram_counts = get_bigram_counts(amlo_fdist, amlo_fdist_idx, amlo_tr_txt_tknzd)
amlo_trigram_counts = get_trigram_counts(amlo_fdist, amlo_fdist_idx, amlo_tr_txt_tknzd)

amlo_deltas = [0.2, 0.35, 0.45]

def dar_conferencia(unigram_counts, bigram_counts, trigram_counts, fdist_idx, vocab, deltas, stop_words = 300):
    conferencia = ['<s>', '<s>']
    while len(conferencia) < stop_words:
        word = np.random.choice(
            list(vocab), 
            p = deltas[0] * trigram_counts[fdist_idx[conferencia[-2]], fdist_idx[conferencia[-1]]] +
                deltas[1] * bigram_counts[fdist_idx[conferencia[-1]]] +
                deltas[2] * unigram_counts
        )
        if word == '</s>':
            continue
        conferencia.append(word)
    return ' '.join(conferencia)

print(dar_conferencia(amlo_unigram_counts, amlo_bigram_counts, amlo_trigram_counts, amlo_fdist_idx, amlo_vocab, amlo_deltas))

<s> <s> saben hizo trabajo relevante subsecretaria www Jesús proceso relevante problema hizo respecto mexicanos bajan principio principio hemos en relevante crean Para saben Ahora atendiendo siquiera autobuses diera principio pública estaban relevante Procedimientos cada puede da días pero casi principio tanta tenido estaban nacionalidad relevante correspondiente policía grave tercos procede justamente dio va relevante Cuarta legales transaccional integrante queremos seguir sigue León dato teníamos duda venganza relevante Sala Proyecto relevante legisladores servidor relevante agentes Código vemos favor importante Rayos corruptas Agustín relevante oficiosa parte conducta estaban Yo estenográfica Tamaulipas tanto ha plana relevante relevante Migración orden enero verificando explicara eso intuición saber cierto aplicar utilizado Antes juez Alejandro LUIS ha denuncias seguir relevante Qué Código trata señor relevante tienen pasados ese sumados está Tamaulipas seguir investigaciones relev

Dada la naturaleza de los modelos fue imposible generarlo para todos los archivos, puesto que estos pesaban mucho y tardaba demasiado en cargar, ademas que los reucrsos en memoria no daban por lo que decidi hacerlo con un archivo de las mananeras especialmente

3. (10pts) Calcule el estimado de cada uno sus modelos de lenguaje (el de tuits y el de amlo)
para las frases: "sino gano me voy a la chingada", "ya se va a acabar la corrupción".

In [137]:
def estimate_sentence_probability(sentence, trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, deltas):
    sentence = ['<s>', '<s>'] + sentence + ['</s>']
    log_prob = 0
    for i, word in enumerate(sentence):
        if word not in vocab:
            sentence[i] = '<unk>'
    for i in range(2, len(sentence)):
        log_prob += np.log(
            deltas[0] * trigram_counts[fdist_idx[sentence[-2]], fdist_idx[sentence[-1]]] +
            deltas[1] * bigram_counts[fdist_idx[sentence[-1]]] +
            deltas[2] * unigram_counts
        )
    # print( "Probability of " + ''.join(sentence) + " is " + str(np.exp(log_prob)))
    return np.sum(np.exp(log_prob))

print(estimate_sentence_probability(['sino', 'gano', 'me', 'voy', 'a', 'la', 'chingada'], trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, deltas))
print(estimate_sentence_probability(['ya', 'se', 'va', 'a', 'acabar', 'la', 'corrupción'], amlo_trigram_counts, amlo_bigram_counts, amlo_unigram_counts, amlo_fdist, amlo_fdist_idx, amlo_deltas))

9.726317341747217e-09
9.726317341747217e-09


Un punto importante a destacar en este modelo es que algunas palabras como chingada no estaban en el corpus original por lo que tuve que parsear estos valores por unk, y es interesante como las probabilidades son tan bjaas que incluso son iguales con modelos diferentes.

4. (10pts) Para cada oración del punto anterior, haga todas las permutaciones posibles.
Calcule su probabilidad a cada nueva frase y muestre el top 3 mas probable y el top 3
menos probable (para ambos modelos de lenguaje). Proponga una frase más y haga lo
mismo.

In [138]:
def get_permutations(sentence):
    permutations = []
    for i in range(len(sentence)):
        for j in range(len(sentence)):
            if i == j:
                continue
            permutation = sentence.copy()
            permutation[i], permutation[j] = permutation[j], permutation[i]
            permutations.append(permutation)
    return permutations

def get_top_n_permutations(sentence, n, trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, deltas):
    permutations = get_permutations(sentence)
    probs = []
    for permutation in permutations:
        probs.append(estimate_sentence_probability(permutation, trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, deltas))
    top_n = np.argsort(probs)[::-1][:n]
    return [permutations[i] for i in top_n]

sentence = ['sino', 'gano', 'me', 'voy', 'a', 'la', 'chingada']
print(get_top_n_permutations(sentence, 3, trigram_counts, bigram_counts, unigram_counts, fdist, fdist_idx, deltas))

sentence = ['ya', 'se', 'va', 'a', 'acabar', 'la', 'corrupción']
print(get_top_n_permutations(sentence, 3, amlo_trigram_counts, amlo_bigram_counts, amlo_unigram_counts, amlo_fdist, amlo_fdist_idx, amlo_deltas))

[['sino', 'gano', 'me', 'voy', 'a', 'chingada', 'la'], ['sino', 'la', 'me', 'voy', 'a', 'gano', 'chingada'], ['voy', 'gano', 'me', 'sino', 'a', 'la', 'chingada']]
[['ya', 'se', 'va', 'a', 'acabar', 'corrupción', 'la'], ['ya', 'la', 'va', 'a', 'acabar', 'se', 'corrupción'], ['a', 'se', 'va', 'ya', 'acabar', 'la', 'corrupción']]


Primero tenemos una funcion para generar permutaciones, con esta creamos todas las permutaciones de una sentencia y obtenemos las sentencias con mayor probabilidad de aparecer en nuestro modelos.

Esta tarea fue muy interesante y retadora, ya que aprendi mas sobre el funcionamiento de los ngramas, asi como su eficiencia y limitaciones que pueden llegar a tener, como es el caso del corpus y la memoria que consume, por lo que estos modelos son muy utiles con recursos de memoria superiores a los de un ordenador. No obstante, osn sencillos y generan resultados interesantes con una complejidad sencilla de programacion.