## Tarea 3
## Isaac Rodríguez Bribiesca

Bibliotecas usadas

In [85]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import bigrams
from nltk import ngrams
import preprocessor as p
import numpy as np

Se lee archivo de entrenamiento de tweets, así como las etiquetas

In [4]:
with open('mex_train.txt', 'r') as f:
    corpus = f.readlines()

In [5]:
with open('mex_train_labels.txt', 'r') as f:
    labels = f.readlines()

labels = [int(lab.strip('\n')) for lab in labels]

Se separan tweets agresivos y no agresivos

In [6]:
tweets_agg = [tw for tw, lab in zip(corpus, labels) if lab == 1]

In [7]:
tweets_noagg = [tw for tw, lab in zip(corpus, labels) if lab == 0]

Funciones para preprocesar los tweets

In [8]:
def process_word(w, punct):
    is_punct = True if w in punct else False
    is_digit = w.isnumeric()
    is_stopword = w in stopwords.words('spanish')

    return "" if is_punct or is_digit or is_stopword else w.lower()

def process_sentence(sent, punct):
    s = []
    for w in sent:

        is_punct = True if w in punct else False
        is_digit = w.isnumeric()
        is_stopword = w in stopwords.words('spanish')

        if not(is_punct or is_digit or is_stopword):
            s.append(w.lower())

    return " ".join(s)

Simbolos a filtrar

In [9]:
punct = set(['.', ',', ';', ':', '-', '!', '¡', '¿', '?', '"', '\'', '...', '<url>', '*', '@usuario'])

Función para convertir una lista de tokens a ngramas de tamaño n

In [29]:
def words_to_ngrams(words, n, sep=" "):
    if n > 1:
        return [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
    else:
        return words

Función para calcular tabla de frecuencias de ngramas con ayuda de TweetTokenizer y FreqDist

In [30]:
def build_ngrams(tweets, punct, n):
    tk = TweetTokenizer()
    tokens = [process_word(w, punct) for sent in tweets for w in tk.tokenize(sent)]
    tokens = list(filter(None, tokens))
    tw_trigrams = words_to_ngrams(tokens, n)
    tw_trigrams = FreqDist(tw_trigrams)
    
    return tw_trigrams

### Ejercicio 1. Conteos de unigramas sin suavizado

Funcion que genera unigramas

In [31]:
def build_unigram(tweets, punct):
    return build_ngrams(tweets, punct, 1)

### Ejercicio 1. Conteos de bigramas sin suavizado

Funcion que genera bigramas

In [13]:
def build_bigram(tweets, punct):
    return build_ngrams(tweets, punct, 2)

### Ejercicio 2. Comparación unigramas y bigramas para clases tweets agresivos y no agresivos

#### Unigramas y bigramas más comunes en tweets no agresivos

Unigramas tweeets no agresivos

In [32]:
tw_unigrams = build_unigram(tweets_noagg, punct)

In [33]:
tw_unigrams.most_common(10)

[('verga', 864),
 ('madre', 675),
 ('putas', 547),
 ('loca', 542),
 ('si', 421),
 ('putos', 359),
 ('😂', 259),
 ('bien', 155),
 ('…', 155),
 ('vale', 127)]

Bigramas tweeets no agresivos

In [158]:
tw_bigrams = build_bigram(tweets_noagg, punct)

In [160]:
tw_bigrams.most_common(10)

[(('😂', '😂'), 106),
 (('vale', 'verga'), 77),
 (('puta', 'madre'), 70),
 (('❤', '️'), 38),
 (('🏻', '\u200d'), 31),
 (('valer', 'verga'), 30),
 (('vale', 'madre'), 30),
 (('mamá', 'luchona'), 29),
 (('😭', '😭'), 29),
 (('😡', '😡'), 25)]

#### Unigramas y bigramas más comunes en tweets agresivos

Unigramas tweeets no agresivos

In [161]:
tw_unigrams = build_unigram(tweets_agg, punct)

In [162]:
tw_unigrams.most_common(10)

[(('putos',), 472),
 (('madre',), 404),
 (('putas',), 348),
 (('verga',), 284),
 (('si',), 242),
 (('hdp',), 215),
 (('puta',), 171),
 (('pinche',), 171),
 (('😂',), 118),
 (('puto',), 114)]

Bigramas tweeets no agresivos

In [163]:
tw_bigrams = build_bigram(tweets_agg, punct)

In [164]:
tw_bigrams.most_common(10)

[(('puta', 'madre'), 90),
 (('😂', '😂'), 59),
 (('chingar', 'madre'), 32),
 (('mil', 'putas'), 32),
 (('chinguen', 'madre'), 32),
 (('hijo', 'puta'), 28),
 (('hijos', 'puta'), 27),
 (('chinga', 'madre'), 27),
 (('chingas', 'madre'), 27),
 (('putas', 'madres'), 25)]

Para el caso de los unigramas no se observa mucha diferencia en el tipo de palabras más frecuentes entre tweets agresivos y no agresivos, variando sólamente las frecuencias en que aparecen las palabras. En el caso de bigramas se hace más notoria la diferencia entre tweets agresivos y no agresivos, ya que en los tweets agresivos aparecen más groserías como "hijo puta" o "chingas madre", que en los bigramas de tweeets no agresivos no son tan frecuentes.

### Ejercicio 3. Bigramas y Trigramas con Add-one Smoothing

Se obtienen unigramas y bigramas para construir las tablas

In [14]:
tw_unigrams = build_unigram(corpus, punct)
tw_bigrams = build_bigram(corpus, punct)

Tabla de bigramas

In [15]:
bigram_table = {}

bigram_list = list(tw_bigrams.keys())
vocab_size = len(list(tw_unigrams))

for v in bigram_list:
    if v[0] not in bigram_table:
        bigram_table[v[0]] = {}
    bigram_table[v[0]][v[1]] = (tw_bigrams[(v[0], v[1])] + 1)/(tw_unigrams[v[0]] + vocab_size)

for v in bigram_list:
    if v[1] not in bigram_table:
        bigram_table[v[1]] = {}
    bigram_table[v[1]][v[0]] = 1/(tw_unigrams[v[1]] + vocab_size)

In [None]:
bigram_table = {}

bigram_list = list(tw_bigrams.keys())
vocab_size = len(list(tw_unigrams))

unigram_vocab = list(tw_unigrams.keys())
    
for v1 in unigram_vocab:
    for v2 in unigram_vocab:
        
        if (v1, v2) in tw_bigrams:
            if v1 not in bigram_table:
                bigram_table[v1] = {}
            bigram_table[v1][v2] = (tw_bigrams[(v1, v2)] + 1)/(tw_unigrams[v1] + vocab_size)
            
        else:
            if v1 not in bigram_table:
                bigram_table[v1] = {}
            bigram_table[v1][v2] = 1/(tw_unigrams[v1] + vocab_size)

Obteniendo trigramas

In [17]:
tw_trigrams = build_ngrams(corpus, punct, 3)

Tabla de trigramas

In [18]:
trigram_table = {}

trigram_list = list(tw_trigrams.keys())
vocab_size = len(list(tw_bigrams))

for v in trigram_list:
    if v[0] not in trigram_table:
        trigram_table[v[0]] = {}

    if v[1] not in trigram_table[v[0]]:
        trigram_table[v[0]][v[1]] = {}
    
    trigram_table[v[0]][v[1]][v[2]] = (tw_trigrams[(v[0], v[1], v[2])] + 1)/(tw_bigrams[(v[0], v[1])] + vocab_size)

### Ejercicio 4. Bigramas y Trigramas con Good-Turing Disccount

Para los valores de $N_{c+1}$ que no existan, se ajustará un modelo de ley de potencia: $N_{c+1} = a*(c+1)^{b}$ con $b < -1$

In [87]:
def power_law(coeffs, x):
    return np.exp(coeffs[1])*(x**(coeffs[0]))

Tabla de bigramas

In [92]:
bigram_table = {}

bigram_list = list(tw_bigrams.keys())
N = len(bigram_list)

# Calcula conteo N_c
limit = 20  # A paritr de este valor se usa el modelo de ley de potencia
Nk = {}
for f in set(tw_bigrams.values()):
    if f >= 20:
        break
    Nk[f] = len([w for w in tw_bigrams.keys() if tw_bigrams[w] == f])

# Ajusta modelo de ley de potencia
Nk_log = []
k = []
for f in set(tw_bigrams.values()):
    Nk_log.append(np.log(len([w for w in tw_bigrams.keys() if tw_bigrams[w] == f])))
    k.append(np.log(f))
    
Nk_log = np.array(Nk_log)
k = np.array(k)
z = np.polyfit(k, Nk_log, 1)

# Calcula tabla de bigramas
for v in bigram_list:
    
    if v[0] not in bigram_table:
        bigram_table[v[0]] = {}
        
    if tw_bigrams[(v[0], v[1])] > limit or (tw_bigrams[(v[0], v[1])]+1) not in Nk:
        c = power_law(z, tw_bigrams[(v[0], v[1])]+1)
    else:
        c = (tw_bigrams[(v[0], v[1])] + 1)*(Nk[tw_bigrams[(v[0], v[1])]+1]/Nk[tw_bigrams[(v[0], v[1])]])

    bigram_table[v[0]][v[1]] = c/N

Tabla de trigramas

In [93]:
trigram_table = {}

trigram_list = list(tw_trigrams.keys())
N = len(trigram_list)

# Calcula conteo N_c
limit = 20  # A paritr de este valor se usa el modelo de ley de potencia
Nk = {}
for f in set(tw_trigrams.values()):
    if f >= 20:
        break
    Nk[f] = len([w for w in tw_trigrams.keys() if tw_trigrams[w] == f])

# Ajusta modelo de ley de potencia
Nk_log = []
k = []
for f in set(tw_trigrams.values()):
    Nk_log.append(np.log(len([w for w in tw_trigrams.keys() if tw_trigrams[w] == f])))
    k.append(np.log(f))
    
Nk_log = np.array(Nk_log)
k = np.array(k)
z = np.polyfit(k, Nk_log, 1)

# Calcula tabla de bigramas
for v in trigram_list:
    if v[0] not in trigram_table:
        trigram_table[v[0]] = {}

    if v[1] not in trigram_table[v[0]]:
        trigram_table[v[0]][v[1]] = {}
        
    if tw_trigrams[(v[0], v[1], v[2])] > limit or (tw_trigrams[(v[0], v[1], v[2])]+1) not in Nk:
        c = power_law(z, tw_trigrams[(v[0], v[1], v[2])]+1)
    else:
        c = (tw_trigrams[(v[0], v[1], v[2])] + 1)*(Nk[tw_trigrams[(v[0], v[1], v[2])]+1]/Nk[tw_trigrams[(v[0], v[1], v[2])]])

    bigram_table[v[0]][v[1], v[2]] = c/N

### Ejercicio 5. Modelo lenguaje con Add-one Smoothing en tweets agresivos