$$\textbf{PLN. Tarea 4: Modelos de Lenguaje Estadísticos}$$
$$\textit{Y. Sarahi García Gozález}$$

<font size=4.5 color='lightblue'>

$\textit{Librerías}$

In [1]:
import nltk
import numpy as np
import sklearn
import re
import os
import string

from nltk.tokenize import TweetTokenizer,RegexpTokenizer
from collections import Counter
from math import log, exp


#para divdir train y test
from sklearn.model_selection import train_test_split
#para darle formato a las cuniones
from __future__ import annotations
from typing import List, Tuple, Callable

In [2]:
print("Tarea realizada en MacOs. \nLas versiones de las librerías y de python utilizadas fueron:\n")
from platform import python_version
print("Python version:", python_version())
print("NumPy version:", np.__version__)
print("NLTK version:", nltk.__version__)
print("Scikit-learn version:", sklearn.__version__)

Tarea realizada en MacOs. 
Las versiones de las librerías y de python utilizadas fueron:

Python version: 3.11.0
NumPy version: 1.23.5
NLTK version: 3.8.1
Scikit-learn version: 1.3.0




<font size=4.5 color='lightblue'>

$\textit{Modelo de lenguaje y evaluación}$

Primero definiremos las funciones para cargar la lista de documentos y los diccionarios necesarios para crear el corpus y el vocabulario.

In [3]:
def get_text_from_file(path_corpus,path_truth):

    tr_txt=[]
    tr_labels=[]

    with open(path_corpus, "r") as f_corpus,open(path_truth, "r") as f_truth:
        for tweet in f_corpus:
            tr_txt += [tweet]
        for label in f_truth:
            tr_labels += [label]   
             
    return tr_txt, tr_labels

def create_list_freq(corpus,n):
    fdist = nltk.FreqDist(corpus)
    aux=[(fdist[key],key) for key in fdist]
    aux.sort()
    aux.reverse()
    aux=aux[:n]

    return aux

def create_dict_freq(list_freq):
    freq_dict=dict()

    for (key,word) in list_freq:
        freq_dict[word]=key
        
    return freq_dict

def create_dic_ranking(dic_freq):
    dict_indices=dict()
    cont = 0
    for weight, word in dic_freq:
        dict_indices[word]= cont
        cont+= 1

    return dict_indices


<font size=3 color='lightblue'>

2.1 $\textit{Preprocesamiento}$


In [4]:

def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

###### Funcion que crea un corpus con palabras en minusculas y con los caracteres especiales ###### 
###### <s> al inicio del tuit t </s> al final del tuit ###### 

def create_corpus_from_text(tuit_list, tokenizer):
    '''
    Funcion que crea un corpus con palabras en minusculas
    y con los caracteres especiales <s> al inicio del tuit t </s> al final del tuit
       
    Parametros: 
    - lista de tuits (o documentos en general)
    - tokenizador
    Returns:
    - corpus con todos los documentos tokenizados (se indica el iniciio y fin de cada uno
      con los caracteres especiales <s>,</s>)
    '''

    corpus_palabras = []

    for tuit in tuit_list:
        # Eliminar emojis y emoticones
        tuit = remove_emojis(tuit)
        # Convertir a minúsculas
        tuit = tuit.lower()
        # Eliminar signos de puntuación
        tuit = remove_punctuation(tuit)
        # Agregar token especial <s> al inicio del tuit
        tuit = "<s> " + tuit
        # Agregar token especial </s> al final del tuit
        tuit += " </s>"
        # Tokenizar el tuit y agregar las palabras al corpus
        corpus_palabras.extend(tokenizer.tokenize(tuit))

    return corpus_palabras


def create_vocabulary(dictionary_freq,freq_umbral):
    '''
    Función que crea el vocabulario a partir de un diccionario de frecuencias,
    sólo se toman las palabras con frecuencia mayor o igual a freq_umbral
    '''

    #palabras con una frecuencia igual o mayor que el umbral
    vocabulario = [palabra for palabra, frecuencia in dictionary_freq.items() if frecuencia >= freq_umbral]
    #token especial <unk> para las palabras desconocidas
    vocabulario.append("<unk>")
    return vocabulario


def enmascarar(corpus, vocabulario):
    # Enmascarar las palabras desconocidas con <unk>
    corpus_enmascarado = [palabra if palabra in vocabulario else "<unk>" for palabra in corpus]
    
    return corpus_enmascarado

In [5]:
#guardamos los textos de entrenamiento y validación
tr_txt,tr_labels=get_text_from_file("/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/practicas/03_practica/mex20_train.txt","/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/practicas/03_practica/mex20_train_labels.txt")
val_txt,val_labels=get_text_from_file("/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/practicas/03_practica/mex20_val.txt","/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/practicas/03_practica/mex20_val_labels.txt")

In [6]:
#combinamos en un único conjunto
txt=tr_txt+val_txt
labels=tr_labels+val_labels

In [7]:
#Tokenizamos
tokenizer=TweetTokenizer()
#Generamos el corpus con los tokens especiales de inicio y fin
corpus=create_corpus_from_text(txt,tokenizer)
#lista con la cantidad de veces que aparece cada token (5000 tokens más frecuentes)
freq_list=create_list_freq(corpus,5000)
#diccionario con la cantidad de veces que aparece cada token (5000 tokens más frecuentes)
freq_dict=create_dict_freq(freq_list)

In [8]:
corpus

['<s>',
 'usuario',
 'usuario',
 'usuario',
 'q',
 'se',
 'puede',
 'esperar',
 'del',
 'maricon',
 'de',
 'closet',
 'de',
 'la',
 'yañez',
 'aun',
 'recuerdo',
 'esa',
 'ves',
 'q',
 'lo',
 'vi',
 'en',
 'zona',
 'rosa',
 'viendo',
 'quien',
 'lo',
 'levantada',
 '</s>',
 '<s>',
 'usuario',
 'la',
 'piel',
 'nueva',
 'siempre',
 'arde',
 'un',
 'poquito',
 'los',
 'primeros',
 'días',
 'y',
 'más',
 'con',
 'este',
 'puto',
 'clima',
 '</s>',
 '<s>',
 'ustedes',
 'no',
 'se',
 'enamoran',
 'de',
 'mí',
 '…',
 'por',
 'tontas',
 '</s>',
 '<s>',
 'me',
 'las',
 'va',
 'a',
 'pagar',
 'esa',
 'puta',
 'gorda',
 'roba',
 'tuits',
 '</s>',
 '<s>',
 'usuario',
 'la',
 'gente',
 'es',
 'tonta',
 'porque',
 'no',
 'se',
 'dan',
 'cuenta',
 'que',
 'tú',
 'haces',
 'a',
 'batman',
 'azul',
 '</s>',
 '<s>',
 'estoy',
 'muy',
 'encabronada',
 'con',
 'las',
 'pseudo',
 'feministas',
 'por',
 'tontas',
 'e',
 'iletradas',
 'a',
 'veces',
 'me',
 'avergüenza',
 'ser',
 'mujer',
 'preferiría',
 't

In [9]:
#definimos nuestro vocabulario con todas las palabras que aparecen al menos 5 veces en el corpus
vocabulario=create_vocabulary(freq_dict,5)
len(vocabulario) 

1747

In [10]:
vocabulario

['<s>',
 '</s>',
 'que',
 'de',
 'la',
 'a',
 'y',
 'no',
 'me',
 'usuario',
 'el',
 'en',
 'se',
 'es',
 'verga',
 'madre',
 'los',
 'con',
 'por',
 'las',
 'mi',
 'un',
 'te',
 'ya',
 'si',
 'lo',
 'putos',
 'putas',
 'una',
 'para',
 'como',
 'pero',
 'su',
 'tu',
 'loca',
 'le',
 'más',
 'yo',
 'gorda',
 'al',
 'del',
 'cuando',
 'bien',
 'o',
 'son',
 'estoy',
 'qué',
 '¿',
 'porque',
 'les',
 'feas',
 'ni',
 'está',
 'pinche',
 'todos',
 'esta',
 'puta',
 'ser',
 'puto',
 'todo',
 'sus',
 'soy',
 'hdp',
 'tan',
 'eso',
 'hasta',
 'así',
 'muy',
 'hay',
 '…',
 'mis',
 'q',
 'este',
 'ver',
 'quiero',
 'mamar',
 'joto',
 'siempre',
 'solo',
 'url',
 'nos',
 '¡',
 'hace',
 'eres',
 'ahora',
 'mejor',
 'vida',
 'marica',
 'nada',
 'cosas',
 'vale',
 'tengo',
 'hoy',
 'esa',
 'va',
 'gente',
 'voy',
 'día',
 'pinches',
 'jajaja',
 'ese',
 'sin',
 'vez',
 'mierda',
 'tiene',
 'jajajaja',
 'luchona',
 'tienen',
 'hacer',
 'pues',
 'alguien',
 'tontas',
 'toda',
 'están',
 'tus',
 'mas',

In [11]:
corpus=enmascarar(corpus,vocabulario) #enmascaramos con UNK todas las palabras que no aparezcan en el corpus

In [12]:
#imprimimos el primer tuit con los tokens especiales d einciio y fin
inicio= corpus.index("<s>") 
fin= corpus.index("</s>", inicio)
primer_tuit = " ".join(corpus[inicio : fin+1])

print("Primer tuit del corpus:")
print(primer_tuit)

Primer tuit del corpus:
<s> usuario usuario usuario q se puede esperar del maricon de closet de la <unk> aun recuerdo esa ves q lo vi en zona <unk> viendo quien lo <unk> </s>


Para este corpus:
1. Se eliminan los emojis y emoticones utilizando la función remove_emojis.
2. Se convierten los tweets a minúsculas.
3. Se eliminan los signos de puntuación utilizando la función remove_punctuation.
4. Se agregan tokens especiales $<s>$ al inicio y $</s>$ al final de cada tweet.
5. Se tokenizan los tweets usando el TweetTokenizer.
6. Con ayuda de un diccionario de frecuencia, se define un vocabulario (todas las palabras que aparecen al menos 5 veces)

7. Se enmascaran las palabras que no estén en el vocabulario con el token especial $<UNK>$

<font size=3 color='lightblue'>

2.2 $\textit{Tres modelos de lenguaje}$

Para las probabilidades de los n-gramas usaremos la ecuación 3.12 de Dan Jurafsky, (MLE n-gram)
\begin{equation}
P(w_n \mid w_{n-N+1:n-1}) = \frac{C(w_{n-N+1:n-1} \, w_n)}{\sum_wC(w_{n-N+1:n-1}w)}=\frac{C(w_{n-N+1:n-1} \, w_n)}{C(w_{n-N+1:n-1})}
\end{equation}

Para el suavizado de Laplace
\begin{equation}
P(w_n \mid w_{n-N+1:n-1}) = \frac{C(w_{n-N+1:n-1} \, w_n)+1}{\sum_wC(w_{n-N+1:n-1}w)+1}=\frac{C(w_{n-N+1:n-1} \, w_n)+1}{C(w_{n-N+1:n-1})+V}
\end{equation}

In [13]:
#Creamos la clase N_gramModel
#La utilizaremos para construir y manejar modelos de lenguaje basados en n-gramas.
#Con el atributo ngram_probability calculamos la probabilidad de una palabra dadas las n-1 anteriores 
#Y usamos suavizado de Laplace para manejar secuencias no vistas
class N_gramModel:
    def __init__(self, n: int, corpus: List[str],vocabulario: List[str] ) -> None:
        """
        Constructor

        parametros:
        n: Tamaño del n-grama (1 para unigrama, 2 para bigrama, etc.).
        corpus: Lista de palabras tokenizadas del corpus.
        vocabulario: Lisya de palabras unicas en el corpus
        """
        self.n = n
        self.corpus = corpus
        self.vocab= vocabulario

        #contruimos todos los n-gramas del corpus
        self.ngrams = self.build_ngrams(corpus, n)
        #Contamos la frecuencia de cada n-grama
        self.ngram_counts = Counter(self.ngrams)
        #Contamos la cantidad de n-gramas
        self.total_ngrams = sum(self.ngram_counts.values())
        #contamos los contextos
        self.context_counts = Counter(self.build_ngrams(corpus, n - 1)) if n > 1 else Counter(corpus)
 

    def build_ngrams(self, corpus: List[str], n: int) -> List[Tuple[str, ...]]:
        """
        Construye una lista de n-gramas a partir del corpus

        Parámetros:
        corpus Lista de palabras tokenizadas.
        n Tamaño del n-grama.

        Returns:
        Lista de tuplas: n-gramas
        """
        ngrams = []
        for i in range(len(corpus) - n + 1):
            # Crear un n-grama como una tupla de n palabras consecutivas.
            ngram = tuple(corpus[i:i + n])
            ngrams.append(ngram)
        return ngrams
    

    def laplace_smoothing(self, ngram: Tuple[str, ...]) -> float:
        """
        Calcular la probabilidad suavizada de Laplace para un n-grama dado.

        :param ngram: El n-grama para el cual se calculará la probabilidad.
        :return: La probabilidad suavizada de Laplace.
        """
        vocab_size = len(self.vocab)
        ngram_count = self.ngram_counts[ngram]
        #para bigrama y trigrama contamos el numero de veces que aparece
        #"el contexto" (es decir el unigrama o bien bigrama anterior) en el texto
        #para unigramas es el numero de tokens
        context_count = self.context_counts[ngram[:-1]] if self.n > 1 else len(self.corpus)
        
        #Suavizado de Laplace: (count + 1) / (count_context + vocab size)
        return (ngram_count + 1) / (context_count + vocab_size)
    
    def ngram_probability(self, ngram: Tuple[str, ...], suav: bool) -> float:
        """
        Calcular la probabilidad de un n-grama 
        (se puede usar la probabilidad de laplace o no)

        parametros:
        ngram:El n-grama para el cual se calculará la probabilidad

        returns: La probabilidad del n-grama.
        """

        if suav: #si suavizamiento=True utilizamos la prob de laplace_smoothing
            return self.laplace_smoothing(ngram)
        
        #Frecuencia del n-grama en el corpus.
        ngram_count = self.ngram_counts[ngram]
        #para bigrama y trigrama contamos el numero de veces que aparece
        #"el contexto" (es decir el unigrama o bien bigrama anterior) en el texto
        #para unigramas es el numero de tokens
        context_count = self.context_counts[ngram[:-1]] if self.n > 1 else len(self.corpus)

        return (ngram_count) / context_count

* Modelo de unigramas

In [14]:
#Creamos el modelo
unigram_model = N_gramModel(1, corpus,vocabulario)  
#Generamos unigramas
unigrams=unigram_model.build_ngrams(corpus,1)
unigrams

[('<s>',),
 ('usuario',),
 ('usuario',),
 ('usuario',),
 ('q',),
 ('se',),
 ('puede',),
 ('esperar',),
 ('del',),
 ('maricon',),
 ('de',),
 ('closet',),
 ('de',),
 ('la',),
 ('<unk>',),
 ('aun',),
 ('recuerdo',),
 ('esa',),
 ('ves',),
 ('q',),
 ('lo',),
 ('vi',),
 ('en',),
 ('zona',),
 ('<unk>',),
 ('viendo',),
 ('quien',),
 ('lo',),
 ('<unk>',),
 ('</s>',),
 ('<s>',),
 ('usuario',),
 ('la',),
 ('piel',),
 ('nueva',),
 ('siempre',),
 ('<unk>',),
 ('un',),
 ('<unk>',),
 ('los',),
 ('primeros',),
 ('días',),
 ('y',),
 ('más',),
 ('con',),
 ('este',),
 ('puto',),
 ('clima',),
 ('</s>',),
 ('<s>',),
 ('ustedes',),
 ('no',),
 ('se',),
 ('<unk>',),
 ('de',),
 ('mí',),
 ('…',),
 ('por',),
 ('tontas',),
 ('</s>',),
 ('<s>',),
 ('me',),
 ('las',),
 ('va',),
 ('a',),
 ('pagar',),
 ('esa',),
 ('puta',),
 ('gorda',),
 ('<unk>',),
 ('tuits',),
 ('</s>',),
 ('<s>',),
 ('usuario',),
 ('la',),
 ('gente',),
 ('es',),
 ('tonta',),
 ('porque',),
 ('no',),
 ('se',),
 ('dan',),
 ('cuenta',),
 ('que',),
 ('

In [15]:
#ejemplo de probabilidad para una palabra que SÍ está en el vocabulario 
#con suavizamiento
num=15
p1=unigram_model.ngram_probability((vocabulario[num],),True)
#sin suavizamiento
p2=unigram_model.ngram_probability((vocabulario[num],),False)

print("La probabilidad del unigrama -",vocabulario[num],"- con suavizamiento, es ",p1)
print("La probabilidad del unigrama -",vocabulario[num],"- sin suavizamiento, es ",p2)

La probabilidad del unigrama - madre - con suavizamiento, es  0.00791188906737569
La probabilidad del unigrama - madre - sin suavizamiento, es  0.008032203882857438


In [16]:
#ejemplo de probabilidad para una palabra que NO está en el vocabulario 
#con suavizamiento
num=85
palabra="quesos"
if palabra in vocabulario:
    print("sí está en vocabulario")
else:
    p1=unigram_model.ngram_probability((palabra,),True)
    #sin suavizamiento
    p2=unigram_model.ngram_probability((palabra,),False)

    print("La probabilidad del unigrama -",palabra,"- con suavizamiento, es ",p1)
    print("La probabilidad del unigrama -",palabra,"- sin suavizamiento, es ",p2)

La probabilidad del unigrama - quesos - con suavizamiento, es  9.232075924592404e-06
La probabilidad del unigrama - quesos - sin suavizamiento, es  0.0


* Modelo de Bigramas

In [17]:
#Creamos el modelo
bigram_model = N_gramModel(2, corpus,vocabulario)  
#Generamos bigramas
bigrams=bigram_model.build_ngrams(corpus,2)
bigrams

[('<s>', 'usuario'),
 ('usuario', 'usuario'),
 ('usuario', 'usuario'),
 ('usuario', 'q'),
 ('q', 'se'),
 ('se', 'puede'),
 ('puede', 'esperar'),
 ('esperar', 'del'),
 ('del', 'maricon'),
 ('maricon', 'de'),
 ('de', 'closet'),
 ('closet', 'de'),
 ('de', 'la'),
 ('la', '<unk>'),
 ('<unk>', 'aun'),
 ('aun', 'recuerdo'),
 ('recuerdo', 'esa'),
 ('esa', 'ves'),
 ('ves', 'q'),
 ('q', 'lo'),
 ('lo', 'vi'),
 ('vi', 'en'),
 ('en', 'zona'),
 ('zona', '<unk>'),
 ('<unk>', 'viendo'),
 ('viendo', 'quien'),
 ('quien', 'lo'),
 ('lo', '<unk>'),
 ('<unk>', '</s>'),
 ('</s>', '<s>'),
 ('<s>', 'usuario'),
 ('usuario', 'la'),
 ('la', 'piel'),
 ('piel', 'nueva'),
 ('nueva', 'siempre'),
 ('siempre', '<unk>'),
 ('<unk>', 'un'),
 ('un', '<unk>'),
 ('<unk>', 'los'),
 ('los', 'primeros'),
 ('primeros', 'días'),
 ('días', 'y'),
 ('y', 'más'),
 ('más', 'con'),
 ('con', 'este'),
 ('este', 'puto'),
 ('puto', 'clima'),
 ('clima', '</s>'),
 ('</s>', '<s>'),
 ('<s>', 'ustedes'),
 ('ustedes', 'no'),
 ('no', 'se'),
 ('se

In [18]:
vocabulario.index('vi')

424

In [19]:
#ejemplo de probabilidad para un bigrama que SÍ está en el corpus
#con suavizamiento
num1=25
num2=424
p1=bigram_model.ngram_probability((vocabulario[num1],vocabulario[num2]),True)
#sin suavizamiento
p2=bigram_model.ngram_probability((vocabulario[num1],vocabulario[num2]),False)

print("Con suavizamiento: ")
print("La probabilidad de la palabra -",vocabulario[num2],"- dada la palabra -",vocabulario[num1],"- es ",p1)
print("\nSin suavizamiento: ")
print("La probabilidad de la palabra -",vocabulario[num2],"- dada la palabra -",vocabulario[num1],"- es ",p2)

Con suavizamiento: 
La probabilidad de la palabra - vi - dada la palabra - lo - es  0.0016645859342488557

Sin suavizamiento: 
La probabilidad de la palabra - vi - dada la palabra - lo - es  0.004573170731707317


In [20]:
#ejemplo de probabilidad para un bigrama que NO está en el corpus 
#con suavizamiento
num1=25
num2=425
if (vocabulario[num1],vocabulario[num2]) in bigrams:
    print("el bigrama sí existe en el corpus")
else:
    p1=bigram_model.ngram_probability((vocabulario[num1],vocabulario[num2]),True)
    #sin suavizamiento
    p2=bigram_model.ngram_probability((vocabulario[num1],vocabulario[num2]),False)

    print("Con suavizamiento: ")
    print("La probabilidad de la palabra -",vocabulario[num2],"- dada la palabra -",vocabulario[num1],"- es ",p1)
    print("\nSin suavizamiento: ")
    print("La probabilidad de la palabra -",vocabulario[num2],"- dada la palabra -",vocabulario[num1],"- es ",p2)

Con suavizamiento: 
La probabilidad de la palabra - será - dada la palabra - lo - es  0.0004161464835622139

Sin suavizamiento: 
La probabilidad de la palabra - será - dada la palabra - lo - es  0.0


* Modelo trigramas

In [21]:
#Creamos el modelo
trigram_model = N_gramModel(3, corpus,vocabulario)  
#Generamos bigramas
trigrams=trigram_model.build_ngrams(corpus,3)
trigrams

[('<s>', 'usuario', 'usuario'),
 ('usuario', 'usuario', 'usuario'),
 ('usuario', 'usuario', 'q'),
 ('usuario', 'q', 'se'),
 ('q', 'se', 'puede'),
 ('se', 'puede', 'esperar'),
 ('puede', 'esperar', 'del'),
 ('esperar', 'del', 'maricon'),
 ('del', 'maricon', 'de'),
 ('maricon', 'de', 'closet'),
 ('de', 'closet', 'de'),
 ('closet', 'de', 'la'),
 ('de', 'la', '<unk>'),
 ('la', '<unk>', 'aun'),
 ('<unk>', 'aun', 'recuerdo'),
 ('aun', 'recuerdo', 'esa'),
 ('recuerdo', 'esa', 'ves'),
 ('esa', 'ves', 'q'),
 ('ves', 'q', 'lo'),
 ('q', 'lo', 'vi'),
 ('lo', 'vi', 'en'),
 ('vi', 'en', 'zona'),
 ('en', 'zona', '<unk>'),
 ('zona', '<unk>', 'viendo'),
 ('<unk>', 'viendo', 'quien'),
 ('viendo', 'quien', 'lo'),
 ('quien', 'lo', '<unk>'),
 ('lo', '<unk>', '</s>'),
 ('<unk>', '</s>', '<s>'),
 ('</s>', '<s>', 'usuario'),
 ('<s>', 'usuario', 'la'),
 ('usuario', 'la', 'piel'),
 ('la', 'piel', 'nueva'),
 ('piel', 'nueva', 'siempre'),
 ('nueva', 'siempre', '<unk>'),
 ('siempre', '<unk>', 'un'),
 ('<unk>', 'un

In [22]:
('se', 'puede', 'esperar') 

('se', 'puede', 'esperar')

In [23]:
vocabulario.index('se')

12

In [24]:
#ejemplo de probabilidad para un trigrama que SÍ está en el corpus
#con suavizamiento
num1=12
num2=151
num3=747
p1=trigram_model.ngram_probability((vocabulario[num1],vocabulario[num2],vocabulario[num3]),True)
#sin suavizamiento
p2=trigram_model.ngram_probability((vocabulario[num1],vocabulario[num2],vocabulario[num3]),False)

print("Con suavizamiento: ")
print("La probabilidad de la palabra -",vocabulario[num3],"- dado el bigrama -",(vocabulario[num1],vocabulario[num2]),"- es ",p1)
print("\nSin suavizamiento: ")
print("La probabilidad de la palabra -",vocabulario[num3],"- dado el bigrama -",(vocabulario[num1],vocabulario[num2]),"- es ",p2)

Con suavizamiento: 
La probabilidad de la palabra - esperar - dado el bigrama - ('se', 'puede') - es  0.002826455624646693

Sin suavizamiento: 
La probabilidad de la palabra - esperar - dado el bigrama - ('se', 'puede') - es  0.18181818181818182


In [25]:
#ejemplo de probabilidad para un trigrama que NO está en el corpus
#con suavizamiento
num1=12
num2=151
num3=748

if (vocabulario[num1],vocabulario[num2],vocabulario[num3]) in trigrams:
    print("el trigrama sí existe en el corpus")
else:
    p1=trigram_model.ngram_probability((vocabulario[num1],vocabulario[num2],vocabulario[num3]),True)
    #sin suavizamiento
    p2=trigram_model.ngram_probability((vocabulario[num1],vocabulario[num2],vocabulario[num3]),False)

    print("Con suavizamiento: ")
    print("La probabilidad de la palabra -",vocabulario[num3],"- dado el bigrama -",(vocabulario[num1],vocabulario[num2]),"- es ",p1)
    print("\nSin suavizamiento: ")
    print("La probabilidad de la palabra -",vocabulario[num3],"- dado el bigrama -",(vocabulario[num1],vocabulario[num2]),"- es ",p2)

Con suavizamiento: 
La probabilidad de la palabra - empiezan - dado el bigrama - ('se', 'puede') - es  0.0005652911249293386

Sin suavizamiento: 
La probabilidad de la palabra - empiezan - dado el bigrama - ('se', 'puede') - es  0.0


Notamos que en todos los casos, cuando una palbra/n-grama no se encuentra en el vocabulario/n-gramas generados con el modelo y NO hay suavizado, la probabilidad es cero. En cambio con el suavizado de laplace, la probabilidad es distinta de cero.

En este caso usamos n-gramas que no se encontraban entre los generados por el modelo aunque también se podría haber utilizado alguna palabra que no está en el vocabulario.


En los casos en que la palabra/negrama sí se en el vocabulario/n-gramas generados con el modelo, la probabilidad es distinta cuando sí se toma el suavizado y cuando no hay. Lo que tiene sentido porque el suavizado modifica la masa de probabilidad.

<font size=3 color='lightblue'>

2.3 $\textit{Modelo interpolado}$

In [26]:
class InterpolatedModel:
    def __init__(self, unigram_model: N_gramModel, bigram_model: N_gramModel, trigram_model: N_gramModel, lambdas: Tuple[float, float, float]) -> None:
        """
        Constructor para el modelo interpolado.
        parametro:

        unigram_model,bigram_modeltrigram_model: Modelos de unigramas, bigramas y trigramas respectivamente 
        (clase N_gramModel definida anteriormente)

        lambdas: Tupla de lambdas.

        """
        self.unigram_model = unigram_model
        self.bigram_model = bigram_model
        self.trigram_model = trigram_model
        self.lambdas = lambdas

    def interpolated_probability(self, trigram: Tuple[str, str, str]) -> float:
        """
        Esra función alcula la probabilidad interpolada para un trigram dado.

        :param trigram: El trigram para el cual se calculará la probabilidad.
        :return: La probabilidad interpolada.
        """

        #calculamos la prob del unigrama
        unigram_prob = self.unigram_model.ngram_probability((trigram[2],),True) #solo nos quedamos con la ultima entrads
        #calculamos la prob del bigrama
        bigram_prob = self.bigram_model.ngram_probability(trigram[1:],True) #para el bigrama nos quedamos con las ultimas dos entrads
        #calculamos la prob del trigrama
        trigram_prob = self.trigram_model.ngram_probability(trigram,True)

        return (self.lambdas[0] * trigram_prob +
                self.lambdas[1] * bigram_prob +
                self.lambdas[2] * unigram_prob)

In [27]:
#definimos la perplejidad 
def perplexity(model: InterpolatedModel, corpus: List[str]) -> float:
    N = len(corpus)
    log_prob = 0.0

    for i in range(2, N):  # Empezamos en 2 para tener un trigram completo
        trigram = (corpus[i-2], corpus[i-1], corpus[i])
        prob = model.interpolated_probability(trigram)
        log_prob += log(prob)

    perplexity = exp(-log_prob / N)
    return perplexity




In [28]:
#dividimos en conjunto test+validación(20%) y entrenamiento (80%)
corpus_train,corpus_test_val = train_test_split(corpus, test_size=0.2, random_state=42)

#dividimos en conjunto test+validación en test(10%) y validacion(10%)
corpus_test, corpus_val = train_test_split(corpus_test_val, test_size=0.5, random_state=42)

In [29]:
print("Num. de elementos en el corpus original:", len(corpus))
print("Num. de elementos en el conjunto de entrenamiento:",len(corpus_train))
print("Num. de elementos en el conjunto de test:",len(corpus_test))
print("Num. de elementos en el conjunto de validacion:",len(corpus_val))

Num. de elementos en el corpus original: 106571
Num. de elementos en el conjunto de entrenamiento: 85256
Num. de elementos en el conjunto de test: 10657
Num. de elementos en el conjunto de validacion: 10658


In [30]:
#definimos el modelo
lambdas = (1, 1, 1) #lambdas inicial trigrama sin interpolacion
interpolated_model = InterpolatedModel(unigram_model, bigram_model, trigram_model, lambdas)

In [31]:
# Ajuste de parámetros λ y reevaluación
lambda_values = [(1/3, 1/3, 1/3),(0.4, 0.4, 0.2), (0.2, 0.4, 0.4), (0.5, 0.4, 0.1), (0.1, 0.4, 0.5)]

#inicializamos mejor perplejidad y lambdas con el moedeo actual
best_perplexity = perplexity(interpolated_model, corpus_val)
best_lambdas = lambdas 

for lambdas in lambda_values:
    #variamos el parametro lamdas del modelo interpolado
    interpolated_model.lambdas = lambdas
    #calculamos la perplejidad del modelo actual en validación
    val_perplexity = perplexity(interpolated_model, corpus_val)
    #imprimimos
    print("Perplejidad en validación con λ=",lambdas,":" ,val_perplexity)

    #guardamos "best lambdas" y "best perplexity" si la perplejidad es menor que la anterior
    if val_perplexity < best_perplexity:
        best_perplexity = val_perplexity
        best_lambdas = lambdas

Perplejidad en validación con λ= (0.3333333333333333, 0.3333333333333333, 0.3333333333333333) : 230.90688750123113
Perplejidad en validación con λ= (0.4, 0.4, 0.2) : 284.55018864536214
Perplejidad en validación con λ= (0.2, 0.4, 0.4) : 210.78780000535843
Perplejidad en validación con λ= (0.5, 0.4, 0.1) : 371.23255902490325
Perplejidad en validación con λ= (0.1, 0.4, 0.5) : 191.38843966648508


In [32]:
#Probamos el mejor modelo en prueba
interpolated_model.lambdas = best_lambdas #usamos las mejores lambda
test_perplexity = perplexity(interpolated_model, corpus_test)

print("Perplejidad en prueba:", test_perplexity)

Perplejidad en prueba: 76.84922135550796


<font size=4.5 color='lightblue'>

$\textit{Generación de Texto}$

<font size=3 color='lightblue'>

3.1 $\textit{Expectation maximization}$

A la clase InterpolatedModel, agregaremos dos funciones más: exp_max y perplexity.

Para expectation maximization usaremos:

![Ejercicio 1a](algorithm.png)

In [33]:
class InterpolatedModel:
    def __init__(self, unigram_model: N_gramModel, bigram_model: N_gramModel, trigram_model: N_gramModel, lambdas: Tuple[float, float, float]) -> None:
        """
        Constructor para el modelo interpolado.
        parametro:

        unigram_model,bigram_modeltrigram_model: Modelos de unigramas, bigramas y trigramas respectivamente 
        (clase N_gramModel definida anteriormente)

        lambdas: Tupla de lambdas.

        """
        self.unigram_model = unigram_model
        self.bigram_model = bigram_model
        self.trigram_model = trigram_model
        self.lambdas = lambdas

    def interpolated_probability(self, trigram: Tuple[str, str, str]) -> float:
        """
        Esra función alcula la probabilidad interpolada para un trigram dado.

        :param trigram: El trigram para el cual se calculará la probabilidad.
        :return: La probabilidad interpolada.
        """

        #calculamos la prob del unigrama
        unigram_prob = self.unigram_model.ngram_probability((trigram[2],),True) #solo nos quedamos con la ultima entrads
        #calculamos la prob del bigrama
        bigram_prob = self.bigram_model.ngram_probability(trigram[1:],True) #para el bigrama nos quedamos con las ultimas dos entrads
        #calculamos la prob del trigrama
        trigram_prob = self.trigram_model.ngram_probability(trigram,True)

        return (self.lambdas[0] * trigram_prob +
                self.lambdas[1] * bigram_prob +
                self.lambdas[2] * unigram_prob)
    
    def perplexity(self, corpus: List[str]) -> float:
        N = len(corpus)
        log_prob = 0.0
        for i in range(2, N):  # Empezamos en 2 para tener un trigram completo
            trigram = (corpus[i-2], corpus[i-1], corpus[i])
            prob = self.interpolated_probability(trigram)
            log_prob += log(prob)
        perplexity = exp(-log_prob / (N))  # Ajuste por el tamaño del corpus menos los primeros 2 términos
        return perplexity
    
            
    def exp_max(self, corpus: List[str], num_iterations: int = 10) -> None:
        """
        Esta función implementa el algoritmo de 
        Expectation Maximization (Jacob Eisenstein. Draft of October 15, 2018)
        para ajustar los valores de lambda.

        Parametros
        corpus: Lista de tokens del corpus.
        num_iterations
        """
        nmax = 3  # Número de modelos (unigramas, bigramas, trigramas)
        print("Inizializado: perplejidad",self.perplexity(corpus))
        M=len(corpus)
        suma = np.zeros(nmax)
        for j in range(num_iterations):
            for i in range(2, M):
                #construimos trigrama
                trigram = (corpus[i-2], corpus[i-1], corpus[i])

                ###### E-Step (calculo de probabilidades y mult por lambda actual)########

                #probabilidades de los n-gramas
                trigram_prob = self.trigram_model.ngram_probability(trigram,True)
                bigram_prob = self.bigram_model.ngram_probability(trigram[1:],True)
                unigram_prob = self.unigram_model.ngram_probability(trigram[2],True)
                probs = np.array([trigram_prob, bigram_prob, unigram_prob])

                #multiplicamos or los valores actuales de lambda
                probs *= self.lambdas 
                #normalizamos
                probs =probs/np.linalg.norm(probs)

                #### M-step: (la suma)#####
                
                #suamos 
                suma += probs 

            self.lambdas = (1/M)*(suma) # Normaliza las sumas acumuladas
            if j%5==0 and j!=0:
                print("iteración ",j,": perplejidad",self.perplexity(corpus))

In [34]:
#inicializamos con las best_lambdas del ejercio anterior
lambdas=best_lambdas
interpolated_model = InterpolatedModel(unigram_model, bigram_model, trigram_model, lambdas)
interpolated_model.exp_max(corpus_val, num_iterations=26)
print("Lambdas ajustados:", interpolated_model.lambdas)

Inizializado: perplejidad 76.98483185017044
iteración  5 : perplejidad 89.9883046259312
iteración  10 : perplejidad 49.77391861816142
iteración  15 : perplejidad 34.42956169252285
iteración  20 : perplejidad 26.326441500476882
iteración  25 : perplejidad 21.315368431318255
Lambdas ajustados: [1.10604476e+01 2.16610600e+01 9.40477535e-03]


In [35]:
#probamos en test
interpolated_model.perplexity(corpus_test)

21.063333666800876

<font size=3 color='lightblue'>

3.2 $\textit{Tuitear}$

In [36]:
def take_ngram(ngrams_list: List[Tuple[Tuple[str, ...], float]]) -> Tuple[str, ...]:
    """
    Toma un n-grama basado en una distribución de probabilidad.
    
    parametros:
    ngrams_list: Lista de n-gramas y sus probabilidades.
    returns
    Un n-grama  basado en la dist de probabilidad.
    """
    # Separa los n-gramas y sus pesos
    ngrams, weights = zip(*ngrams_list)
    # Calcula las probabilidades normalizadas
    probabilities = np.array(weights) / np.sum(weights)
    # Selecciona un n-grama basado en las probabilidades usando np.random.choice
    return ngrams[np.random.choice(len(ngrams), p=probabilities)]



def tuitear(ngrams: List[Tuple[str, ...]], probability_function: Callable[[Tuple[str, ...]], float], num_palabras: int = 50) -> str:
    """
    Genera un tweet utilizando un modelo de n-gramas.

    parametros:
    n: Tamaño del n-grama.
    ngrams: Lista de n-gramas.
    probability_function: calcula la prob de un n-grama.
    num_palabras


    returns:
    tuit generado como cadena de texto.
    """

    #Lista con n-gramas iniciales que SÍ contienen <s> y NO contienen <unk> ni </s>
    ngrams_list = [(ngram, probability_function(ngram)) for ngram in ngrams if ("<s>" in ngram) and (not "<unk>" in ngram) and (not  "</s>" in ngram) ]
    
    # Inicializar el tweet con un n-grama inicial
    s = list(take_ngram(ngrams_list))
    
    #Generaramos palabras hasta alcanzar el límite o encontrar </s>
    while len(s) < num_palabras and not "</s>" in s: 
       
        #Lista de n-gramas para generar texto que NO contengan "<s>" ni "<unk>"
        ngrams_list_1 = [
            (ngram, probability_function(ngram))
            for ngram in ngrams
            if  (not "<s>" in ngram) and (not "<unk>" in ngram)
        ]

        
        if len(s) < (0.6 * num_palabras):
            # Evitar que el tweet termine demasiado pronto
            ngrams_list_1 = [(ngram, weight) for ngram, weight in ngrams_list_1 if ngram[-1] != "</s>"]
        
        #agregamos el n-grama
        s += list(take_ngram(ngrams_list_1))
    
    return " ".join(s)

In [37]:
ngrams = list(trigram_model.ngram_counts.keys())

#Definimos la función de probabilidad para el tuit
def probability_function(ngram: Tuple[str, ...]) -> float:
    return interpolated_model.interpolated_probability(ngram)

#Generamos 5 ejemplos
for i in range(5):
    generated_tweet = tuitear(ngrams=ngrams, probability_function=probability_function)
    print(generated_tweet,"\n")

<s> una cosa a esta mamada fin iba a a la gorda tengo yo no bueno que te puto huevo te gente idiota no orgullo en el viven ahi puro vale verga no es de las la puta madre de hdp </s> 

<s> a cabrón como siempre partidos verga todo el tipo que me otro en la hueva no quiero cuenta que mis tan tonta a abajo de mi algo aunque sea pues pero que pero un día marica a mi quieren que sea chingo a mi estúpida de mi salió de la 

<s> ¡ alguien te puedes soltar si no pueden de estar solo la gorda y 6 de la pero estaba bien con todas las derechos de la pasado que ven que mejor mi cabrón xd a del usuario usuario haber que hacen lo celosa que el cabrón se hasta para ser 

<s> vale verga asi a la se acaba de la hijas de hasta de las mi no me vez que te que no mame gorda en la tuvo una amiga que … </s> 

<s> me queda en el medio sí a la usuario pero aquí parece que te que vayas a yo tengo uno caso 🤦 ‍ son las que puede por un chava en el nada … </s> 



Nota: en general los tuits terminaban demasiado pronto debido a que es altamente probable que aparezca un "$</s>$" así que no agregué nada para que en los últimos tokens aparezca este token especial. Más bien se agregó la condición if len(s) < (0.6 * num_palabras) para evitar que el tuit termine demasiado pronto.

<font size=3 color='lightblue'>

3.3 $\textit{Modelo AMLO}$

In [38]:

def leer_archivos(ruta_corpus: str) -> List[str]:
    """
    Esta función lee todos los archivos .txt en una carpeta y guarda su contenido en una lista,
    donde cada archivo se almacena como un string con un "\n" al final.

    Argumentos:
        ruta_corpus: Ruta a la carpeta que contiene los archivos .txt.

    Regresa:
        Una lista de strings, cada uno con el contenido de un archivo .txt.
    """

    archivos_contenido: List[str] = []  # Lista para almacenar el contenido de los archivos
    contador: int = 0  # Para contar el número de documentos

    for archivo in os.listdir(ruta_corpus):
        f = os.path.join(ruta_corpus, archivo)

        # Verificamos si es un archivo y termina en .txt
        if os.path.isfile(f) and archivo.endswith('.txt'):
            try:
                # Leemos el contenido del archivo
                with open(f, 'r') as file:
                    conferencia_actual: str = file.read()
                    archivos_contenido.append(conferencia_actual + "\n")  # Añadimos el contenido con un "\n" al final
                    contador += 1
            except Exception as e:
                print(f"Error leyendo archivo {f}: {e}")  # Error

    print(f"El número de conferencias guardadas en la lista es: {contador}")

    return archivos_contenido

def create_corpus_from_text(text_list, tokenizer):
    '''
    Funcion que crea un corpus con palabras en minusculas
    y con los caracteres especiales <s> al inicio del tuit t </s> al final del tuit
       
    Parametros: 
    - lista de tuits (o documentos en general)
    - tokenizador
    Returns:
    - corpus con todos los documentos tokenizados (se indica el iniciio y fin de cada uno
      con los caracteres especiales <s>,</s>)
    '''

    corpus_palabras = []
    i=0
    for text in text_list:
        # Convertir a minúsculas
        text=text.lower()
        # Eliminar signos de puntuación
        text =remove_punctuation(text)
        # Agregar tokens especial <s> al inicio y final del text
        text = "<s> " + text + " </s>"       
        #Tokenizar agregar las palabras al corpus
        corpus_palabras.extend(tokenizer.tokenize(text))
        i=i+1
    return corpus_palabras

In [39]:
ruta_corpus="/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/practicas/01_practica/clean_data"
conferencias = leer_archivos(ruta_corpus)

El número de conferencias guardadas en la lista es: 1720


In [40]:
exp_reg = r'(<s>|\b[A-Za-z]+\b|</s>)'
tokenizer_reg_exp = RegexpTokenizer(exp_reg)
amlo_corpus=create_corpus_from_text(conferencias, tokenizer_reg_exp)

In [41]:
len(amlo_corpus)

12863580

In [42]:
#lista con la cantidad de veces que aparece cada token (5000 tokens más frecuentes)
freq_list_amlo=create_list_freq(amlo_corpus,20000)
#diccionario con la cantidad de veces que aparece cada token (5000 tokens más frecuentes)
freq_dict_amlo=create_dict_freq(freq_list_amlo)

In [43]:
#definimos nuestro vocabulario con todas las palabras que aparecen al menos 100 veces en el corpus
vocabulario_amlo=create_vocabulary(freq_dict_amlo,100)
len(vocabulario_amlo) 

6078

In [44]:
amlo_corpus=enmascarar(amlo_corpus,vocabulario_amlo) #enmascaramos con UNK todas las palabras que no aparezcan en el corpus

In [45]:
type(vocabulario_amlo)

list

In [46]:
#dividimos en conjunto test+validación(20%) y entrenamiento (80%)
corpus_train_amlo,corpus_test_val_amlo = train_test_split(amlo_corpus, test_size=0.2, random_state=42)

#dividimos en conjunto test+validación en test(10%) y validacion(10%)
corpus_test_amlo, corpus_val_amlo = train_test_split(corpus_test_val_amlo, test_size=0.5, random_state=42)

In [47]:
print("Num. de elementos en el corpus original:", len(amlo_corpus))
print("Num. de elementos en el conjunto de entrenamiento:",len(corpus_train_amlo))
print("Num. de elementos en el conjunto de test:",len(corpus_test_amlo))
print("Num. de elementos en el conjunto de validacion:",len(corpus_val_amlo))

Num. de elementos en el corpus original: 12863580
Num. de elementos en el conjunto de entrenamiento: 10290864
Num. de elementos en el conjunto de test: 1286358
Num. de elementos en el conjunto de validacion: 1286358


In [48]:
#modelos de lenguaje
unigram_model_amlo = N_gramModel(1, amlo_corpus,vocabulario_amlo)  
biigram_model_amlo = N_gramModel(2, amlo_corpus,vocabulario_amlo)  
trigram_model_amlo = N_gramModel(3, amlo_corpus,vocabulario_amlo)  

In [49]:
#inicializamos con lambdas 1
lambdas_amlo=[1,1,1]
interpolated_model_amlo = InterpolatedModel(unigram_model_amlo, biigram_model_amlo, trigram_model_amlo, lambdas_amlo)
interpolated_model.exp_max(corpus_val_amlo, num_iterations=36)
print("Lambdas ajustados:", interpolated_model.lambdas)

Inizializado: perplejidad 37.594193598943704
iteración  5 : perplejidad 157.58599566004744
iteración  10 : perplejidad 85.88295090117008
iteración  15 : perplejidad 59.01509650346726
iteración  20 : perplejidad 44.94826798527628
iteración  25 : perplejidad 36.29478888740372
iteración  30 : perplejidad 30.434353508966694
iteración  35 : perplejidad 26.20275808775592
Lambdas ajustados: [1.68763821e+01 3.02941475e+01 6.61254732e-06]


In [50]:
#probamos en test
interpolated_model_amlo.perplexity(corpus_test_amlo)

179.0072771908339

In [55]:
def dar_conferencia(ngrams: List[Tuple[str, ...]], probability_function: Callable[[Tuple[str, ...]], float], num_palabras: int = 110) -> str:
    """
    Genera un tweet utilizando un modelo de n-gramas.

    parametros:
    ngrams: Lista de n-gramas.
    probability_function: calcula la prob de un n-grama.
    num_palabras


    returns:
    tuit generado como cadena de texto.
    """

    #Lista con n-gramas iniciales que SÍ contienen <s> y NO contienen <unk> ni </s>
    ngrams_list = [(ngram, probability_function(ngram)) for ngram in ngrams if ("<s>" in ngram) and (not "<unk>" in ngram) and (not  "</s>" in ngram) ]
    
    # Inicializar el tweet con un n-grama inicial
    s = list(take_ngram(ngrams_list))
    
    #Generaramos palabras hasta alcanzar el límite o encontrar </s>
    while len(s) < num_palabras and not "</s>" in s: 
       
        #Lista de n-gramas para generar texto que NO contengan "<s>" ni "<unk>"
        ngrams_list_1 = [
            (ngram, probability_function(ngram))
            for ngram in ngrams
            if  (not "<s>" in ngram) and (not "<unk>" in ngram)
        ]

        
        if len(s) < (0.6 * num_palabras):
            # Evitar que el tweet termine demasiado pronto
            ngrams_list_1 = [(ngram, weight) for ngram, weight in ngrams_list_1 if ngram[-1] != "</s>"]
        
        if len(s)%10==0:
            s += '\n'
        #agregamos el n-grama
        s += list(take_ngram(ngrams_list_1))
    
    return " ".join(s)

In [56]:
ngrams_amlo = list(trigram_model_amlo.ngram_counts.keys())
#Definimos la función de probabilidad para el tuit
def probability_function_amlo(ngram: Tuple[str, ...]) -> float:
    return interpolated_model_amlo.interpolated_probability(ngram)
generated_conf=dar_conferencia(ngrams=ngrams_amlo,probability_function=probability_function_amlo)
print(generated_conf)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107eb1c90>>
Traceback (most recent call last):
  File "/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/.conda/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [1]:
print(generated_conf)

NameError: name 'generated_conf' is not defined

<font size=3 color='lightblue'>

3.4 $\textit{Estimado}$

In [53]:
comparative=["sino gano me voy a la chingada", "ya se va a acabar la corrupción"]

for s in comparative:
    print("\n\t",s)
    print("- Tweet model:\tPerplexity= ",  interpolated_model.perplexity(s) )
    print("- AMLO model:\tPerplexity= ", interpolated_model_amlo.perplexity(s) )
    


	 sino gano me voy a la chingada
- Tweet model:	Perplexity=  613.7462434187453
- AMLO model:	Perplexity=  31.33267213333864

	 ya se va a acabar la corrupción
- Tweet model:	Perplexity=  814.914496317666
- AMLO model:	Perplexity=  32.80928114683462


<font size=3 color='lightblue'>

3.5 $\textit{Permutaciones}$

In [54]:
import itertools

permutation_tweet = []
permutation_amlo = []

for s in comparative:
    s = s.split(" ")
    for ss in itertools.permutations(s):
        ss = " ".join(ss)
        tweet_p = interpolated_model_amlo.interpolated_probability(ss)
        amlo_p = interpolated_model.interpolated_probability(ss)
        permutation_tweet.append( [ss, tweet_p] )
        permutation_amlo.append( [ss, amlo_p] )

permutation_tweet = sorted(permutation_tweet, key=lambda x: x[1], reverse=True)
permutation_amlo = sorted(permutation_amlo, key=lambda x: x[1], reverse=True)

print("-Modelo Tweet")
print("Mas probale\t\t\t\tMenos probable")
for i in range(3):
    print( permutation_tweet[i][0],"\t",permutation_tweet[-(i+1)][0] )

print("\n-Modelo AMLO")
print("Mas probale\t\t\t\tMenos probable")
for i in range(3):
    print( permutation_amlo[i][0],"\t",permutation_amlo[-(i+1)][0] )


-Modelo Tweet
Mas probale				Menos probable
a acabar ya se va la corrupción 	 corrupción la acabar a va se ya
a acabar ya se va corrupción la 	 corrupción la acabar a va ya se
a acabar ya se la va corrupción 	 corrupción la acabar a se va ya

-Modelo AMLO
Mas probale				Menos probable
a acabar ya se va la corrupción 	 corrupción la acabar a va se ya
a acabar ya se va corrupción la 	 corrupción la acabar a va ya se
a acabar ya se la va corrupción 	 corrupción la acabar a se va ya
