# Luis Eduardo Robles Jiménez

### Natural Language Processing

### Práctica 3: BoW y esquemas de pesado

In [None]:
def get_texts_from_file(path_corpus, path_truth, encoding="utf-8"):
    """Load a corpus file and its label file, one example per line.

    Args:
        path_corpus: Path to the text file holding one document per line.
        path_truth: Path to the text file holding one integer label per line.
        encoding: Text encoding used to decode both files. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        A ``(txt, y)`` tuple. ``txt`` is the list of raw lines read from
        ``path_corpus`` (the trailing newline of each line is kept) and ``y``
        is the list of labels read from ``path_truth``, converted with
        ``int``.

    Raises:
        ValueError: If a line of ``path_truth`` cannot be parsed as an integer
            (this includes blank lines).
    """
    with open(path_corpus, "r", encoding=encoding) as f_corpus, \
            open(path_truth, "r", encoding=encoding) as f_truth:
        txt = list(f_corpus)
        # int() tolerates surrounding whitespace, so the newline needs no strip.
        y = [int(label) for label in f_truth]
    return txt, y

#### BOW train

In [None]:
tr_txt, tr_y = get_texts_from_file("../data/agresividad/mex_train.txt", "../data/agresividad/mex_train_labels.txt")

In [None]:
len(tr_txt)

In [None]:
len(tr_y)

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Inspect the class distribution of the training labels.

# Number of training examples per label value.
print(Counter(tr_y))

# Histogram of the labels, using one bin per distinct label value.
plt.hist(tr_y, bins=len(set(tr_y)))
# NOTE(review): the y axis counts entries of tr_y (one per line of the label
# file), so 'Users' may not be the right axis name — confirm.
plt.ylabel('Users')
plt.xlabel('Class')

In [None]:
tr_txt[:10]

In [None]:
# Plain str.split() is not enough here: punctuation stays attached to the words

set(tr_txt[5].split()) 

In [None]:
import nltk 
from nltk.tokenize import TweetTokenizer # Tokenizer for social networks

In [None]:
tokenizer = TweetTokenizer()

## Tokenizar un tweet

In [None]:
tokenizer.tokenize(tr_txt[5])

In [None]:
tokenizer.tokenize("Hola @nick como estas #felizdia bye!!! hola@")

In [None]:
tokenizer.tokenize("https://www.youtube.com/watch?v=dhhS_g78X2E @")

In [None]:
# Collect the tokens of every training document into one flat list
# (a single list of tokens, not one sub-list per document).
corpus_palabras = []
for doc in tr_txt:
    corpus_palabras.extend(tokenizer.tokenize(doc))

In [None]:
len(set(corpus_palabras))

In [None]:
fdist = nltk.FreqDist(corpus_palabras) # Frequency of each word
fdist

In [None]:
def sortFreqDist(freqdict):
    """Return the ``(frequency, key)`` pairs of *freqdict*, largest first.

    Pairs are ordered by descending frequency; pairs sharing a frequency are
    ordered by descending key.
    """
    pairs = ((freqdict[item], item) for item in freqdict)
    return sorted(pairs, reverse=True)

In [None]:
V = sortFreqDist(fdist) # Vocabulario
V = V[:5000]
V[:10]

In [None]:
# Map every vocabulary word to its position in V, i.e. its rank in the
# frequency-sorted vocabulary. This position is later used as the column
# index of the word in the bag-of-words matrices.
dict_indices = dict()
cont = 0  # next index to hand out
for weight, word in V:  # weight (the word frequency) is not needed here
    dict_indices[word] = cont
    cont += 1
# Peek at the first ten words that received an index.
list(dict_indices)[:10]

In [None]:
val_txt, val_y = get_texts_from_file("../data/agresividad/mex_val.txt", "../data/agresividad/mex_val_labels.txt")

In [None]:
# Inspect the class distribution of the validation labels.
# Number of validation examples per label value.
print(Counter(val_y))

# Histogram of the labels, using one bin per distinct label value.
plt.hist(val_y, bins=len(set(val_y)))
# NOTE(review): the y axis counts entries of val_y (one per line of the label
# file), so 'Users' may not be the right axis name — confirm.
plt.ylabel('Users')
plt.xlabel('Class')

## Bag of Words

In [None]:
import numpy as np

### Binary bag of words

In [None]:
def my_build_binary_bow(tr_txt, V, dict_indices, w_bound=5000):
    """Build a binary bag-of-words matrix limited to ``w_bound`` columns.

    Each document is tokenized with the module-level ``tokenizer`` and a cell
    is set to 1 when the corresponding vocabulary word occurs as a token of
    the document. Matching whole tokens (instead of testing ``word in text``
    on the raw string) avoids false positives such as a short word matching
    inside a longer one.

    Args:
        tr_txt: Sequence of raw documents (strings).
        V: Vocabulary; only its length is used, to size the matrix.
        dict_indices: Mapping from vocabulary word to its column index.
        w_bound: Maximum number of columns of the matrix.

    Returns:
        Integer array of shape ``(len(tr_txt), min(len(V), w_bound))`` holding
        1 where the word is present in the document and 0 elsewhere.
    """
    n_cols = min(len(V), w_bound)
    BOW = np.zeros((len(tr_txt), n_cols), dtype=int)
    for i, t in enumerate(tr_txt):
        for w in set(tokenizer.tokenize(t)):
            j = dict_indices.get(w)
            # Skip out-of-vocabulary tokens and words whose column falls
            # outside the matrix (beyond w_bound or beyond len(V)).
            if j is not None and j < n_cols:
                BOW[i, j] = 1
    return BOW
def build_binary_bow(tr_txt, V, dict_indices):
    """Build a binary (presence/absence) bag-of-words matrix.

    Documents are tokenized with the module-level ``tokenizer``.

    Args:
        tr_txt: Sequence of raw documents (strings).
        V: Vocabulary; only its length is used, as the number of columns.
        dict_indices: Mapping from vocabulary word to its column index.

    Returns:
        Integer array of shape ``(len(tr_txt), len(V))`` with a 1 in every
        cell whose word appears among the tokens of the document.
    """
    bow = np.zeros((len(tr_txt), len(V)), dtype=int)
    for row, document in enumerate(tr_txt):
        # Presence is all that matters, so the distinct tokens are enough.
        for token in set(tokenizer.tokenize(document)):
            if token in dict_indices:
                bow[row, dict_indices[token]] = 1
    return bow

### Frequency bag of words

In [None]:
def build_freq_bow(txt, V, dict_indices):
    """Build a term-frequency bag-of-words matrix.

    Documents are tokenized with the module-level ``tokenizer``.

    Args:
        txt: Sequence of raw documents (strings).
        V: Vocabulary; only its length is used, as the number of columns.
        dict_indices: Mapping from vocabulary word to its column index.

    Returns:
        Integer array of shape ``(len(txt), len(V))`` where each cell holds
        how many times the word occurs among the tokens of the document.
    """
    bow = np.zeros((len(txt), len(V)), dtype=int)
    for row, tweet in enumerate(txt):
        # Count one occurrence at a time; out-of-vocabulary tokens are ignored.
        for token in tokenizer.tokenize(tweet):
            if token in dict_indices:
                bow[row, dict_indices[token]] += 1
    return bow

### TF-IDF Bag of Words

## Classification

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support

### Define model

In [None]:
# Candidate values for LinearSVC's regularization parameter C.
parameters = {'C': [0.05, 0.25, 1, 2]}
# class_weight = 'balanced' reweights the classes to compensate for the
# unbalanced dataset.
svr = svm.LinearSVC(class_weight = 'balanced')
# Grid search over `parameters`: every candidate C is scored by macro-averaged
# F1 using 5-fold cross-validation (cv = 5); n_jobs = -1 runs the fits on all
# available processors.
grid = GridSearchCV(estimator = svr, param_grid = parameters, n_jobs = -1, scoring = "f1_macro", cv = 5)

In [None]:
def evaluatePrediction(BOW, y_true, y_pred):
    """Print the confusion matrix and the classification report of a prediction.

    Args:
        BOW: Feature matrix the predictions were computed from. Not used by
            the evaluation; kept so existing calls keep working.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Score against the labels passed by the caller rather than the
    # notebook-level `val_y`, so the function works for any data split.
    print(confusion_matrix(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))

## Experiments

### Binary BOW

In [None]:
# Encode the training and validation tweets as binary bag-of-words matrices,
# both using the vocabulary and word indices built from the training corpus.
BOW_tr = build_binary_bow(tr_txt, V, dict_indices)
BOW_val = build_binary_bow(val_txt, V, dict_indices)
# Both matrices should have the same number of columns.
print("Training BOW:", BOW_tr.shape)
print("Validation BOW:", BOW_val.shape)

In [None]:
grid.fit(BOW_tr, tr_y)

In [None]:
# Pretty good results for such a simple model and a binary BOW
evaluatePrediction(BOW_val, val_y, grid.predict(BOW_val))

### Frequency BOW

In [None]:
# Encode the training and validation tweets as term-frequency bag-of-words
# matrices, both using the vocabulary and word indices built from the
# training corpus. This overwrites the matrices of the previous experiment.
BOW_tr = build_freq_bow(tr_txt, V, dict_indices)
BOW_val = build_freq_bow(val_txt, V, dict_indices)
# Both matrices should have the same number of columns.
print("Training BOW:", BOW_tr.shape)
print("Validation BOW:", BOW_val.shape)

In [None]:
grid.fit(BOW_tr, tr_y)

In [None]:

evaluatePrediction(BOW_val, val_y, grid.predict(BOW_val))

### TF-IDF BOW