# Luis Eduardo Robles Jiménez

### Natural Language Processing

### Práctica 3: BoW y esquemas de pesado

In [None]:
def get_texts_from_file(path_corpus, path_truth):
    """Load a corpus and its labels from two parallel text files.

    Args:
        path_corpus: Path to a text file with one document (tweet) per line.
        path_truth: Path to a text file with one integer label per line.

    Returns:
        A tuple ``(txt, y)``. ``txt`` is the list of raw lines of the corpus
        file (trailing newlines are kept) and ``y`` is the list of labels
        converted to ``int``.

    Raises:
        ValueError: If a line of the labels file cannot be parsed as an
            integer (for example a blank line).
    """
    # Decode explicitly as UTF-8 so accented characters and emoji are read the
    # same way on every platform instead of depending on the locale default.
    # NOTE(review): assumes the data files are UTF-8 encoded - confirm.
    with open(path_corpus, "r", encoding="utf-8") as f_corpus, \
            open(path_truth, "r", encoding="utf-8") as f_truth:
        txt = list(f_corpus)
        y = [int(label) for label in f_truth]
    return txt, y

#### Load datasets

In [None]:
tr_txt, tr_y = get_texts_from_file("../data/agresividad/mex_train.txt", "../data/agresividad/mex_train_labels.txt")

In [None]:
len(tr_txt)

In [None]:
len(tr_y)

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Inspect the class balance of the training labels

print(Counter(tr_y))

# Histogram with one bin per distinct label value
plt.hist(tr_y, bins=len(set(tr_y)))
plt.ylabel('Users')
plt.xlabel('Class')

In [None]:
tr_txt[:10]

In [None]:
# Split doesn't quite work because of punctuation

set(tr_txt[5].split()) 

In [None]:
import nltk 
from nltk.tokenize import TweetTokenizer # Tokenizer for social networks

In [None]:
tokenizer = TweetTokenizer()

## Tokenizar un tweet

In [None]:
tokenizer.tokenize(tr_txt[5])

In [None]:
tokenizer.tokenize("Hola @nick como estas #felizdia bye!!! hola@")

In [None]:
tokenizer.tokenize("https://www.youtube.com/watch?v=dhhS_g78X2E @")

In [None]:
# Flatten the tokens of every training document into one single list
corpus_palabras = [token for doc in tr_txt for token in tokenizer.tokenize(doc)]

In [None]:
len(set(corpus_palabras))

In [None]:
fdist = nltk.FreqDist(corpus_palabras) # Frequency of each word
fdist

In [None]:
def sortFreqDist(freqdict):
    """Return ``(frequency, key)`` pairs sorted from most to least frequent.

    Pairs that share the same frequency are ordered by key, descending.
    """
    pairs = ((count, token) for token, count in freqdict.items())
    return sorted(pairs, reverse=True)

In [None]:
# Vocabulary: the 5000 most frequent tokens as (frequency, token) pairs
V = sortFreqDist(fdist)[:5000]
V[:10]

In [None]:
# Map every vocabulary token to its position in V (0 = most frequent)
dict_indices = {token: position for position, (_, token) in enumerate(V)}
print(len(dict_indices))
list(dict_indices)[:10]

In [None]:
val_txt, val_y = get_texts_from_file("../data/agresividad/mex_val.txt", "../data/agresividad/mex_val_labels.txt")

In [None]:
# Inspect the class balance of the validation labels
print(Counter(val_y))

# Histogram with one bin per distinct label value
plt.hist(val_y, bins=len(set(val_y)))
plt.ylabel('Users')
plt.xlabel('Class')

## Bag of Words

In [None]:
import numpy as np

### Binary bag of words

In [None]:
def build_binary_bow(tr_txt, V, dict_indices):
    """Build a binary bag-of-words matrix for a list of documents.

    Each document is tokenized with the module-level ``tokenizer``; a cell is
    set to 1 when the token of that column appears in the document.

    Args:
        tr_txt: List of raw documents (strings).
        V: Vocabulary; only its length is used, as the number of columns.
        dict_indices: Mapping from token to its column index.

    Returns:
        Integer array of shape ``(len(tr_txt), len(V))`` holding 0/1 values.
        Tokens missing from ``dict_indices`` are ignored.
    """
    bow = np.zeros((len(tr_txt), len(V)), dtype=int)
    for row, text in enumerate(tr_txt):
        # Presence is all that matters, so duplicates are collapsed up front
        for token in set(tokenizer.tokenize(text)):
            column = dict_indices.get(token)
            if column is not None:
                bow[row, column] = 1
    return bow

### TCOR

In [None]:
def TCOR(BOW):
    """Build the term co-occurrence (TCOR) matrix from a bag-of-words matrix.

    For every ordered pair of terms ``(t_k, t_j)`` the weight is::

        tcor[k, j] = tff(t_k, t_j) * log(|V| / T_k)

    where ``tff`` is ``1 + log(n)`` when both terms appear together in
    ``n > 0`` documents and ``0`` otherwise, ``|V|`` is the vocabulary size
    and ``T_k`` is the number of distinct terms that share at least one
    document with ``t_k`` (``t_k`` itself included). The same formula is
    applied to the diagonal.

    Args:
        BOW: Array of shape ``(n_docs, n_terms)``. Only whether an entry is
            zero or not is used, so counts are treated as binary.

    Returns:
        Float array of shape ``(n_terms, n_terms)``; row ``k`` is the
        representation of term ``t_k``. Rows of terms that occur in no
        document are all zeros.
    """
    # Terms on the rows, documents on the columns. Float64 keeps the counts
    # exact while letting the matrix product below use the fast BLAS path.
    presence = (np.asarray(BOW).T != 0).astype(np.float64)
    vocab_size = presence.shape[0]

    # weights[k, j] starts as the number of documents containing both terms
    weights = presence @ presence.T
    together = weights > 0

    # T_k: how many distinct terms co-occur with t_k in at least one document
    n_neighbours = np.count_nonzero(together, axis=1)

    # Turn the counts into tff in place; pairs that never co-occur stay at 0
    np.log(weights, out=weights, where=together)
    np.add(weights, 1.0, out=weights, where=together)

    # Terms without any occurrence get a zero factor instead of log(V / 0)
    idf = np.zeros(vocab_size)
    seen = n_neighbours > 0
    idf[seen] = np.log(vocab_size / n_neighbours[seen])

    weights *= idf[:, np.newaxis]
    return weights

## Experiments

### TCOR - BOC

In [None]:
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
BOW_tr = build_binary_bow(tr_txt, V, dict_indices)

In [None]:
base = TCOR(BOW_tr)

In [None]:
base = preprocessing.normalize(base, norm = "l2")

In [None]:
# Score every BoW column against the labels with chi-squared; keep the 1000 best
feats = SelectKBest(chi2, k = 1000)
# Smaller alternative: feats = SelectKBest(chi2, k = 50)
feats.fit(BOW_tr, tr_y)

In [None]:
best = feats.get_support(indices = True)
print(best.shape)

In [None]:
# Reverse lookup: column index -> vocabulary token

dict_indice_invertido = {index: token for token, index in dict_indices.items()}

In [None]:
t_words = [dict_indice_invertido[index] for index in best]

In [None]:
# Gather the normalized TCOR row of every selected word,
# e.g. dict_indices['palabra'] = 201 picks base[201]
selected_rows = [dict_indices[word] for word in t_words]
target_matrix = np.array([base[row] for row in selected_rows])
target_matrix.shape

In [None]:
from tsne import tsne

In [None]:
reduced_matrix = tsne(target_matrix, 2)

In [None]:
reduced_matrix.shape

In [None]:
max_x = np.amax(reduced_matrix, axis=0)[0]
max_y = np.amax(reduced_matrix, axis=0)[1]

import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords

sw = set(stopwords.words("spanish"))

plt.figure(figsize=(40, 40), dpi=100)
plt.xlim((-max_x,max_x))
plt.ylim((-max_y,max_y))
plt.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], 20, color="black");

for idx, word in enumerate(t_words[:]):
    x = reduced_matrix[idx, 0]
    y = reduced_matrix[idx, 1]  
    if word in sw:
        plt.annotate(word, (x,y), color="red")
    else: 
        plt.annotate(word, (x,y), color="black")

## Conclusion

Tras llevar a cabo la implementación del sistema de representación TCOR, puedo intuir que tiene mayor potencial para agrupar las palabras y representar un corpus, ya que es de tipo palabras-palabras. Me queda bastante claro que el cálculo es demandante en cuestión de tiempo y de complejidad de implementación, pero es asequible y útil. 
Otra cosa que es importante notar es que este tipo de representación es de naturaleza distribucional y vectorial. Su motivación es pensar que la semántica de una palabra puede ser explicada por las palabras con las que esta co-ocurre; en términos coloquiales, "dime con quién te juntas y te diré quién eres".
Con base en la definición del pesado, se puede decir lo siguiente:

- Entre más documentos haya en los que t_k co-ocurra con t_j, más explicará t_k la semántica de t_j.
- Por otro lado, entre más palabras co-ocurran con t_k, menos servirá t_k para explicar la semántica de t_j.

Diferencias entre el sistema de pesado TCOR y el sistema de pesado DOR:

- DOR genera una matriz de tamaño -> Palabras x Documentos; TCOR genera una matriz de tamaño -> Palabras x Palabras.
- Aunque parten de premisas similares, ambos sistemas de pesado ofrecen ventajas diferentes; sin embargo, con base en experimentos extrínsecos, TCOR parece ser capaz de capturar un poco más de semántica y explicar mejor el corpus a través de term clustering.
Sin duda, esta ha sido una actividad retadora, pero ha dejado un buen aprendizaje, reforzado aún más por la comparación con DOR.
