## Tarea 3
## Isaac Rodríguez Bribiesca

Bibliotecas usadas

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import bigrams
from nltk import ngrams
import preprocessor as p
import numpy as np

Se lee archivo de entrenamiento de tweets, así como las etiquetas

In [67]:
with open('mex_train.txt', 'r') as f:
    corpus = f.readlines()

In [68]:
with open('mex_train_labels.txt', 'r') as f:
    labels = f.readlines()

labels = [int(lab.strip('\n')) for lab in labels]

Se separan tweets agresivos y no agresivos

In [69]:
tweets_agg = [tw for tw, lab in zip(corpus, labels) if lab == 1]

In [70]:
tweets_noagg = [tw for tw, lab in zip(corpus, labels) if lab == 0]

Funciones para preprocesar los tweets

In [71]:
def process_word(w, punct):
    is_punct = True if w in punct else False
    is_digit = w.isnumeric()
    is_stopword = w in stopwords.words('spanish')

    return "" if is_punct or is_digit or is_stopword else w.lower()

def process_sentence(sent, punct):
    s = []
    for w in sent:

        is_punct = True if w in punct else False
        is_digit = w.isnumeric()
        is_stopword = w in stopwords.words('spanish')

        if not(is_punct or is_digit or is_stopword):
            s.append(w.lower())

    return " ".join(s)

def num_tokens(tweets):
    tk = TweetTokenizer()
    tokens = [process_word(w, punct) for sent in tweets for w in tk.tokenize(sent)]
    tokens = list(filter(None, tokens))
    
    return len(tokens)

Simbolos a filtrar

In [72]:
punct = set(['.', ',', ';', ':', '-', '!', '¡', '¿', '?', '"', '\'', '...', '<url>', '*', '@usuario'])

Función para convertir una lista de tokens a ngramas de tamaño n

In [73]:
def words_to_ngrams(words, n, sep=" "):
    if n > 1:
        return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
    else:
        return words

Función para calcular tabla de frecuencias de ngramas con ayuda de TweetTokenizer y FreqDist

In [74]:
def build_ngrams(tweets, punct, n):
    tk = TweetTokenizer()
    tokens = [process_word(w, punct) for sent in tweets for w in tk.tokenize(sent)]
    tokens = list(filter(None, tokens))
    tw_trigrams = words_to_ngrams(tokens, n)
    tw_trigrams = FreqDist(tw_trigrams)
    
    return tw_trigrams

### Ejercicio 1. Conteos de unigramas sin suavizado

Funcion que genera unigramas

In [75]:
def build_unigram(tweets, punct):
    return build_ngrams(tweets, punct, 1)

### Ejercicio 1. Conteos de bigramas sin suavizado

Funcion que genera bigramas

In [76]:
def build_bigram(tweets, punct):
    return build_ngrams(tweets, punct, 2)

### Ejercicio 2. Comparación unigramas y bigramas para clases tweets agresivos y no agresivos

#### Unigramas y bigramas más comunes en tweets no agresivos

Unigramas tweeets no agresivos

In [77]:
tw_unigrams = build_unigram(tweets_noagg, punct)

In [78]:
tw_unigrams.most_common(10)

[('verga', 864),
 ('madre', 675),
 ('putas', 547),
 ('loca', 542),
 ('si', 421),
 ('putos', 359),
 ('😂', 259),
 ('bien', 155),
 ('…', 155),
 ('vale', 127)]

Bigramas tweeets no agresivos

In [158]:
tw_bigrams = build_bigram(tweets_noagg, punct)

In [160]:
tw_bigrams.most_common(10)

[(('😂', '😂'), 106),
 (('vale', 'verga'), 77),
 (('puta', 'madre'), 70),
 (('❤', '️'), 38),
 (('🏻', '\u200d'), 31),
 (('valer', 'verga'), 30),
 (('vale', 'madre'), 30),
 (('mamá', 'luchona'), 29),
 (('😭', '😭'), 29),
 (('😡', '😡'), 25)]

#### Unigramas y bigramas más comunes en tweets agresivos

Unigramas tweeets no agresivos

In [161]:
tw_unigrams = build_unigram(tweets_agg, punct)

In [162]:
tw_unigrams.most_common(10)

[(('putos',), 472),
 (('madre',), 404),
 (('putas',), 348),
 (('verga',), 284),
 (('si',), 242),
 (('hdp',), 215),
 (('puta',), 171),
 (('pinche',), 171),
 (('😂',), 118),
 (('puto',), 114)]

Bigramas tweeets no agresivos

In [163]:
tw_bigrams = build_bigram(tweets_agg, punct)

In [164]:
tw_bigrams.most_common(10)

[(('puta', 'madre'), 90),
 (('😂', '😂'), 59),
 (('chingar', 'madre'), 32),
 (('mil', 'putas'), 32),
 (('chinguen', 'madre'), 32),
 (('hijo', 'puta'), 28),
 (('hijos', 'puta'), 27),
 (('chinga', 'madre'), 27),
 (('chingas', 'madre'), 27),
 (('putas', 'madres'), 25)]

Para el caso de los unigramas no se observa mucha diferencia en el tipo de palabras más frecuentes entre tweets agresivos y no agresivos, variando sólamente las frecuencias en que aparecen las palabras. En el caso de bigramas se hace más notoria la diferencia entre tweets agresivos y no agresivos, ya que en los tweets agresivos aparecen más groserías como "hijo puta" o "chingas madre", que en los bigramas de tweeets no agresivos no son tan frecuentes.

### Ejercicio 3. Bigramas y Trigramas con Add-one Smoothing

Se obtienen unigramas y bigramas para construir las tablas

In [14]:
tw_unigrams = build_unigram(corpus, punct)
tw_bigrams = build_bigram(corpus, punct)

Tabla de bigramas

In [15]:
bigram_table = {}

bigram_list = list(tw_bigrams.keys())
vocab_size = len(list(tw_unigrams))

for v in bigram_list:
    if v[0] not in bigram_table:
        bigram_table[v[0]] = {}
    bigram_table[v[0]][v[1]] = (tw_bigrams[(v[0], v[1])] + 1)/(tw_unigrams[v[0]] + vocab_size)

for v in bigram_list:
    if v[1] not in bigram_table:
        bigram_table[v[1]] = {}
    bigram_table[v[1]][v[0]] = 1/(tw_unigrams[v[1]] + vocab_size)

In [None]:
bigram_table = {}

bigram_list = list(tw_bigrams.keys())
vocab_size = len(list(tw_unigrams))

unigram_vocab = list(tw_unigrams.keys())
    
for v1 in unigram_vocab:
    for v2 in unigram_vocab:
        
        if (v1, v2) in tw_bigrams:
            if v1 not in bigram_table:
                bigram_table[v1] = {}
            bigram_table[v1][v2] = (tw_bigrams[(v1, v2)] + 1)/(tw_unigrams[v1] + vocab_size)
            
        else:
            if v1 not in bigram_table:
                bigram_table[v1] = {}
            bigram_table[v1][v2] = 1/(tw_unigrams[v1] + vocab_size)

Obteniendo trigramas

In [17]:
tw_trigrams = build_ngrams(corpus, punct, 3)

Tabla de trigramas

In [18]:
trigram_table = {}

trigram_list = list(tw_trigrams.keys())
vocab_size = len(list(tw_bigrams))

for v in trigram_list:
    if v[0] not in trigram_table:
        trigram_table[v[0]] = {}

    if v[1] not in trigram_table[v[0]]:
        trigram_table[v[0]][v[1]] = {}
    
    trigram_table[v[0]][v[1]][v[2]] = (tw_trigrams[(v[0], v[1], v[2])] + 1)/(tw_bigrams[(v[0], v[1])] + vocab_size)

### Ejercicio 4. Bigramas y Trigramas con Good-Turing Disccount

Para los valores de $N_{c+1}$ que no existan, se ajustará un modelo de ley de potencia: $N_{c+1} = a*(c+1)^{b}$ con $b < -1$

In [87]:
def power_law(coeffs, x):
    return np.exp(coeffs[1])*(x**(coeffs[0]))

Tabla de bigramas

In [92]:
bigram_table = {}

bigram_list = list(tw_bigrams.keys())
N = len(bigram_list)

# Calcula conteo N_c
limit = 20  # A paritr de este valor se usa el modelo de ley de potencia
Nk = {}
for f in set(tw_bigrams.values()):
    if f >= 20:
        break
    Nk[f] = len([w for w in tw_bigrams.keys() if tw_bigrams[w] == f])

# Ajusta modelo de ley de potencia
Nk_log = []
k = []
for f in set(tw_bigrams.values()):
    Nk_log.append(np.log(len([w for w in tw_bigrams.keys() if tw_bigrams[w] == f])))
    k.append(np.log(f))
    
Nk_log = np.array(Nk_log)
k = np.array(k)
z = np.polyfit(k, Nk_log, 1)

# Calcula tabla de bigramas
for v in bigram_list:
    
    if v[0] not in bigram_table:
        bigram_table[v[0]] = {}
        
    if tw_bigrams[(v[0], v[1])] > limit or (tw_bigrams[(v[0], v[1])]+1) not in Nk:
        c = power_law(z, tw_bigrams[(v[0], v[1])]+1)
    else:
        c = (tw_bigrams[(v[0], v[1])] + 1)*(Nk[tw_bigrams[(v[0], v[1])]+1]/Nk[tw_bigrams[(v[0], v[1])]])

    bigram_table[v[0]][v[1]] = c/N

Tabla de trigramas

In [93]:
trigram_table = {}

trigram_list = list(tw_trigrams.keys())
N = len(trigram_list)

# Calcula conteo N_c
limit = 20  # A paritr de este valor se usa el modelo de ley de potencia
Nk = {}
for f in set(tw_trigrams.values()):
    if f >= 20:
        break
    Nk[f] = len([w for w in tw_trigrams.keys() if tw_trigrams[w] == f])

# Ajusta modelo de ley de potencia
Nk_log = []
k = []
for f in set(tw_trigrams.values()):
    Nk_log.append(np.log(len([w for w in tw_trigrams.keys() if tw_trigrams[w] == f])))
    k.append(np.log(f))
    
Nk_log = np.array(Nk_log)
k = np.array(k)
z = np.polyfit(k, Nk_log, 1)

# Calcula tabla de bigramas
for v in trigram_list:
    if v[0] not in trigram_table:
        trigram_table[v[0]] = {}

    if v[1] not in trigram_table[v[0]]:
        trigram_table[v[0]][v[1]] = {}
        
    if tw_trigrams[(v[0], v[1], v[2])] > limit or (tw_trigrams[(v[0], v[1], v[2])]+1) not in Nk:
        c = power_law(z, tw_trigrams[(v[0], v[1], v[2])]+1)
    else:
        c = (tw_trigrams[(v[0], v[1], v[2])] + 1)*(Nk[tw_trigrams[(v[0], v[1], v[2])]+1]/Nk[tw_trigrams[(v[0], v[1], v[2])]])

    bigram_table[v[0]][v[1], v[2]] = c/N

### Ejercicio 5. Modelo lenguaje con Add-one Smoothing en tweets agresivos

In [138]:
class Ngram():
    
    def __init__(self, corpus, punct):
        self.corpus = corpus
        self.punct = punct
        
        self.unigrams = self.build_ngrams(1)
        self.bigrams = self.build_ngrams(2)
        self.trigrams = self.build_ngrams(3)
        
        self.unigram_vocab = set(self.unigrams.keys())
        self.bigram_vocab = set(self.bigrams.keys())
        self.trigram_vocab = set(self.trigrams.keys())
        self.unigram_size = len(self.unigram_vocab)
        self.bigram_size = len(self.bigram_vocab)
        self.trigram_size = len(self.trigram_vocab)

        self.bigrams_table = self.build_bigram_table()
        self.trigrams_table = self.build_trigram_table()
        
    def process_word(self, w):
        is_punct = True if w in self.punct else False
        is_digit = w.isnumeric()
        is_stopword = w in stopwords.words('spanish')

        return "" if is_punct or is_digit else w.lower()
        
    def build_ngrams(self, n):
        tk = TweetTokenizer()
        tokens = [self.process_word(w) for sent in self.corpus for w in tk.tokenize(sent)]
        tokens = list(filter(None, tokens))
        tw_ngrams = words_to_ngrams(tokens, n)
        tw_ngrams = FreqDist(tw_ngrams)

        return tw_ngrams
    
    def build_bigram_table(self):
        bigram_table = {}

        for v in self.bigram_vocab:
            if v[0] not in bigram_table:
                bigram_table[v[0]] = {}
            bigram_table[v[0]][v[1]] = (self.bigrams[(v[0], v[1])] + 1)/(self.unigrams[v[0]] + self.bigram_size)
        
        return bigram_table
    
    def build_trigram_table(self):
        trigram_table = {}

        for v in self.trigram_vocab:
            if v[0] not in trigram_table:
                trigram_table[v[0]] = {}

            if v[1] not in trigram_table[v[0]]:
                trigram_table[v[0]][v[1]] = {}

            trigram_table[v[0]][v[1]][v[2]] = (self.trigrams[(v[0], v[1], v[2])] + 1)/(self.bigrams[(v[0], v[1])] + self.trigram_size)
        return trigram_table
    
    def prob_sentence_bigram(self, s):
        words_split = s.split()
        words = []
        for w in words_split:
            if w in self.unigram_vocab:
                words.append(w)
                
        if len(words) == 0:
            return 0
        
        prob = 0
        prob += np.log(self.unigrams[words[0]]/self.unigram_size)
        
        if len(words) == 1:
            return -prob
        elif len(words) == 2:
            prob += np.log(self.prob_bigram(words[1], words[0]))
            return -prob
        
        for i in range(1, len(words)-1):
            prob += np.log(self.prob_bigram(words[i+1], words[i]))
        
        return -prob
    
    def prob_bigram(self, w2, w1):
        if w1 in self.bigrams_table and w2 in self.bigrams_table[w1]:
            return self.bigrams_table[w1][w2]
        else:
            return 1/(self.unigrams[w1] + self.bigram_size)
        
    def prob_sentence_trigram(self, s):
        words_split = s.split()
        words = []
        for w in words_split:
            if w in self.unigram_vocab:
                words.append(w)
                
        if len(words) == 0:
            return 0
        
        prob = 0
        prob += self.unigrams[words[0]]/self.unigram_size
        
        if len(words) == 1:
            return -prob
        elif len(words) == 2:
            prob += np.log(self.prob_bigram(words[1], words[0]))
            return -prob
        elif len(words) == 3:
            prob += np.log(self.prob_trigram(words[2], words[0], words[1]))
            return -prob
        
        for i in range(1, len(words)-2):
            prob += np.log(self.prob_trigram(words[i+2], words[i], words[i+1]))
        
        return -prob
    
    def prob_trigram(self, w3, w1, w2):
        if w1 in self.trigrams_table and w2 in self.trigrams_table[w1] and w3 in self.trigrams_table[w1][w2]:
            return self.trigrams_table[w1][w2][w3]
        else:
            return 1/(self.bigrams[(w1, w2)] + self.trigram_size)

In [151]:
model = Ngram(tweets_agg, punct)

In [143]:
model.prob_sentence_bigram("hijo de puta")

1.2884328463463779e-05

In [144]:
model.prob_sentence_bigram("de puta hijo")

8.962210844651776e-06

In [145]:
model.prob_sentence_trigram("hijo de puta")

4.241810949130512e-06

In [146]:
model.prob_sentence_trigram("de puta hijo")

6.643867211531108e-06

### Ejercicios 6 y 7. Perplejidad para modelo de bigramas y trigramas en tweets agresivos de conjunto de datos test

Se leen los datos del conjunto test de la clase 1 (agresivo)

In [24]:
with open('mex_test.txt', 'r') as f:
    corpus_test = f.readlines()
    
with open('mex_test_labels.txt', 'r') as f:
    labels_test = f.readlines()

labels_test = [int(lab.strip('\n')) for lab in labels_test]
tweets_agg_test = [tw for tw, lab in zip(corpus_test, labels_test) if lab == 1]

In [25]:
sentence_test = " ".join([s.strip('\n') for s in tweets_agg_test])

Se crean los modelos de bigramas y trigramas con el conjunto de training de la clase 1 (agresivo)

In [26]:
with open('mex_train.txt', 'r') as f:
    corpus_train = f.readlines()
    
with open('mex_train_labels.txt', 'r') as f:
    labels_train = f.readlines()

labels_train = [int(lab.strip('\n')) for lab in labels_train]
tweets_agg_train = [tw for tw, lab in zip(corpus_train, labels_train) if lab == 1]

In [29]:
model = Ngram(tweets_agg_train, punct)

In [36]:
N = num_tokens(tweets_agg_test)

#### Se calcula perplejidad del modelo de bigramas

In [39]:
np.power(1./model.prob_sentence_bigram(sentence_test), -1./N)

1.002027839277331

#### Se calcula perplejidad del modelo de trigramas

In [40]:
np.power(1./model.prob_sentence_trigram(sentence_test), -1./N)

1.0020350653129673

### Ejercicio 8. Modelo combinado (unigramas, bigramas y trigramas)

In [137]:
class TrigramInterpolation():
    
    def __init__(self, corpus, punct):
        self.corpus = corpus
        self.punct = punct
        
        self.unigrams = self.build_ngrams(1)
        self.bigrams = self.build_ngrams(2)
        self.trigrams = self.build_ngrams(3)
        
        self.unigram_vocab = set(self.unigrams.keys())
        self.bigram_vocab = set(self.bigrams.keys())
        self.trigram_vocab = set(self.trigrams.keys())
        self.unigram_size = len(self.unigram_vocab)
        self.bigram_size = len(self.bigram_vocab)
        self.trigram_size = len(self.trigram_vocab)

        self.bigrams_table = self.build_bigram_table()
        self.trigrams_table = self.build_trigram_table()
        
        self.lambda1 = 0.6
        self.lambda2 = 0.3
        self.lambda3 = 0.1
        
    def process_word(self, w):
        is_punct = True if w in self.punct else False
        is_digit = w.isnumeric()
        is_stopword = w in stopwords.words('spanish')

        return "" if is_punct or is_digit or is_stopword else w.lower()
        
    def build_ngrams(self, n):
        tk = TweetTokenizer()
        tokens = [self.process_word(w) for sent in self.corpus for w in tk.tokenize(sent)]
        tokens = list(filter(None, tokens))
        tw_ngrams = words_to_ngrams(tokens, n)
        tw_ngrams = FreqDist(tw_ngrams)

        return tw_ngrams
    
    def build_bigram_table(self):
        bigram_table = {}

        for v in self.bigram_vocab:
            if v[0] not in bigram_table:
                bigram_table[v[0]] = {}
            bigram_table[v[0]][v[1]] = (self.bigrams[(v[0], v[1])] + 1)/(self.unigrams[v[0]] + self.bigram_size)
        
        return bigram_table
    
    def build_trigram_table(self):
        trigram_table = {}

        for v in self.trigram_vocab:
            if v[0] not in trigram_table:
                trigram_table[v[0]] = {}

            if v[1] not in trigram_table[v[0]]:
                trigram_table[v[0]][v[1]] = {}

            trigram_table[v[0]][v[1]][v[2]] = (self.trigrams[(v[0], v[1], v[2])] + 1)/(self.bigrams[(v[0], v[1])] + self.trigram_size)
        return trigram_table
    
    def prob_sentence_bigram(self, s):
        words_split = s.split()
        words = []
        for w in words_split:
            if w in self.unigram_vocab:
                words.append(w)
                
        if len(words) == 0:
            return 0
        
        prob = 0
        prob += np.log(self.unigrams[words[0]]/self.unigram_size)
        
        if len(words) == 1:
            return -prob
        elif len(words) == 2:
            prob += np.log(self.prob_bigram(words[1], words[0]))
            return -prob
        
        for i in range(1, len(words)-1):
            prob += np.log(self.prob_bigram(words[i+1], words[i]))
        
        return -prob
    
    def prob_bigram(self, w2, w1):
        if w1 in self.bigrams_table and w2 in self.bigrams_table[w1]:
            return self.bigrams_table[w1][w2]
        else:
            return 1./(self.unigrams[w1] + self.bigram_size)
        
    def prob_sentence_trigram(self, s):
        words_split = s.split()
        words = []
        for w in words_split:
            if w in self.unigram_vocab:
                words.append(w)
                
        if len(words) == 0:
            return 0
        
        prob = 0
        prob += np.log(self.unigrams[words[0]]/self.unigram_size)
        
        if len(words) == 1:
            return -prob
        elif len(words) == 2:
            prob += np.log(self.prob_bigram(words[1], words[0]))
            return -prob
        elif len(words) == 3:
            prob += np.log(self.prob_trigram(words[2], words[0], words[1]))
            return -prob
        
        for i in range(1, len(words)-2):
            prob += np.log(self.prob_trigram(words[i+2], words[i], words[i+1]))
        
        return -prob
    
    def prob_trigram(self, w3, w1, w2):
        if w1 in self.trigrams_table and w2 in self.trigrams_table[w1] and w3 in self.trigrams_table[w1][w2]:
            return self.trigrams_table[w1][w2][w3]
        else:
            return 1./(self.bigrams[(w1, w2)] + self.trigram_size)
        
    def prob_sentence_trigram_interpol(self, s):
        words_split = s.split()
        words = []
        for w in words_split:
            if w in self.unigram_vocab:
                words.append(w)
                
        if len(words) == 0:
            return 0
        
        prob = 0
        prob += np.log(self.unigrams[words[0]]/self.unigram_size)
        
        if len(words) == 1:
            return -prob
        elif len(words) == 2:
            prob += np.log(self.prob_bigram(words[1], words[0]))
            return -prob
        elif len(words) == 3:
            prob += np.log(self.prob_trigram_interpol(words[2], words[0], words[1]))
            return -prob
        
        for i in range(1, len(words)-2):
            prob += np.log(self.prob_trigram_interpol(words[i+2], words[i], words[i+1]))
        
        return -prob
    
    def prob_trigram_interpol(self, w3, w1, w2):
        prob = 0
        if w1 in self.trigrams_table and w2 in self.trigrams_table[w1] and w3 in self.trigrams_table[w1][w2]:
            prob += self.lambda1*self.trigrams_table[w1][w2][w3]
        else:
            prob += self.lambda1*(1./(self.bigrams[(w1, w2)] + self.trigram_size))
            
        if w2 in self.bigrams_table and w3 in self.bigrams_table[w2]:
            prob += self.lambda2*self.bigrams_table[w2][w3]
        else:
            prob += self.lambda2*(1./(self.unigrams[w2] + self.bigram_size))
            
        if w3 in self.unigrams:
            prob += self.lambda3*self.unigrams[w3]
        else:
            prob += self.lambda3*(1./self.unigram_size)
            
        return prob

Se crea modelo combinado mediante interpolación

In [58]:
model = TrigramInterpolation(tweets_agg_train, punct)

### Ejercico 9. Perplejidad modelo combinado

In [63]:
np.power(-model.prob_sentence_trigram_interpol(sentence_test), -1./N)

0.9985662601545036

### Ejercicio 10. Función agredir() para generar oraciones a partir de modelo de trigramas add-one

In [207]:
def agredir(model):
    
    vocab = list(model.unigrams.keys())
    
    # Get random firts word
    index = np.random.randint(0,len(vocab),1)[0]
    first_word = vocab[index]
    
    # Get second word according to bigram probabilities
    max_prob = max(model.bigrams_table[first_word].values())
    max_prob_words = [k for k, v in model.bigrams_table[first_word].items() if v == max_prob]
    
    if len(max_prob_words) == 1:
        next_word = max_prob_words[0]
    else:
        index = np.random.randint(0,len(max_prob_words),1)[0]
        next_word = max_prob_words[index]
        
    sentence = [first_word, next_word]
    max_len = 30
    curr_len = 2
    
    # Get the next words according to trigram probabilities, until '</s>' is obtained or max length is reached
    while next_word != "</s>" and curr_len < max_len:
        
        max_prob = max(model.trigrams_table[sentence[-2]][sentence[-1]].values())
        max_prob_words = [k for k, v in model.trigrams_table[sentence[-2]][sentence[-1]].items() if v == max_prob]

        if len(max_prob_words) == 1:
            next_word = max_prob_words[0]
        else:
            index = np.random.randint(0,len(max_prob_words),1)[0]
            next_word = max_prob_words[index]
            
        sentence.append(next_word)
        curr_len += 1

    return " ".join(sentence[:-1]).strip(" ")

Se añade símbolo '</s>' al final de cada tweet para incluirlo en las probabilidades

In [84]:
with open('mex_train.txt', 'r') as f:
    corpus_train = f.readlines()
    
with open('mex_train_labels.txt', 'r') as f:
    labels_train = f.readlines()

labels_train = [int(lab.strip('\n')) for lab in labels_train]
tweets_agg_train = [tw.strip('\n')+' </s>' for tw, lab in zip(corpus_train, labels_train) if lab == 1]

Se crea modelo de trigramas con dataset de entrenamiento en tweets agresivos

In [139]:
model = Ngram(tweets_agg_train, punct)

Ejemplos de tweets generados a partir de trigramas

In [214]:
for i in range(5):
    print("Tweet {0}: {1} \n".format(i+1, agredir(model)))

Tweet 1: jaja 

Tweet 2: #amlo no confronte a los putos de los putos gringos de mierda 

Tweet 3: golf los encinos s a 

Tweet 4: csm el mal parido que gusto saber de ti y eso le enseñas a tus putos amigos mamones piensen 🙂 

Tweet 5: fama y la gente que se vaya a chingar a su madre 



### Ejercicio 11. Modelo de clasificación a partir de bigramas

Se leen tweets de conjunto training no agresivos

In [215]:
with open('mex_train.txt', 'r') as f:
    corpus_train = f.readlines()
    
with open('mex_train_labels.txt', 'r') as f:
    labels_train = f.readlines()

labels_train = [int(lab.strip('\n')) for lab in labels_train]
tweets_noagg_train = [tw.strip('\n') for tw, lab in zip(corpus_train, labels_train) if lab == 0]
tweets_agg_train = [tw.strip('\n') for tw, lab in zip(corpus_train, labels_train) if lab == 1]

Se crean los dos modelos de bigramas, para tweets agresivos y no agresivos

In [216]:
model_noagg = Ngram(tweets_noagg_train, punct)
model_agg = Ngram(tweets_agg_train, punct)

Se cargan datos de test para evaluar el modelo

In [219]:
with open('mex_test.txt', 'r') as f:
    corpus_test = f.readlines()
    
with open('mex_test_labels.txt', 'r') as f:
    labels_test = f.readlines()

labels_test = [int(lab.strip('\n')) for lab in labels_test]
tweets_test = [tw.strip('\n') for tw in corpus_test]

Realiza predicciones en datos de conjunto test

In [220]:
y_pred = []

for tweet in tweets_test:
    if model_noagg.prob_sentence_bigram(tweet) > model_agg.prob_sentence_bigram(tweet):
        y_pred.append(0)
    else:
        y_pred.append(1)

### Ejercicio 12. Evaluación de modelo con métricas: Accuracy y F-score en clase positiva, negativa y Macro F-score

In [222]:
labels_test = np.array(labels_test)
y_pred = np.array(y_pred)

Accuracy

In [224]:
accuracy = np.mean(labels_test == y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.5688311688311688


F1-score para clase tweets agresivos (1)

In [269]:
TP = y_pred[labels_test == y_pred]

precision_1 = np.sum(TP==1)/np.sum(y_pred==1)
recall_1 =  np.sum(TP==1)/np.sum(labels_test==1)

f1_score_1 = 2*((precision_1*recall_1)/(precision_1+recall_1))

print("F1-Score clase 1: ", f1_score_1)

F1-Score clase 1:  0.2766884531590414


F1-score para clase tweets no agresivos (0)

In [265]:
TP = y_pred[labels_test == y_pred]

precision_0 = np.sum(TP==0)/np.sum(y_pred==0)
recall_0 =  np.sum(TP==0)/np.sum(labels_test==0)

f1_score_0 = 2*((precision_0*recall_0)/(precision_0+recall_0))

print("F1-Score clase 0: ", f1_score_0)

F1-Score clase 0:  0.6928769657724329


F1-score macro

In [270]:
f1_score_macro = (f1_score_0 + f1_score_1)/2

print("F1-Score macro: ", f1_score_macro)

F1-Score macro:  0.4847827094657371
