# DOR

Document Occurrence Representation

## Import libraries

In [2]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from nltk.tokenize import TweetTokenizer 
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
%matplotlib inline
import numpy as np
import nltk 

## Define utils

In [3]:
def get_texts_from_file(path_corpus, path_truth):
    """Read a corpus file and its label file, one entry per line.

    Args:
        path_corpus: Path to the text file holding one document per line.
        path_truth: Path to the text file holding one integer label per line.

    Returns:
        A ``(documents, labels)`` pair. ``documents`` is the list of raw
        lines (line terminators are kept); ``labels`` is the list of label
        lines converted with ``int``.

    Raises:
        ValueError: If a line of the label file cannot be parsed as an int.
    """
    with open(path_corpus, "r") as corpus_file, open(path_truth, "r") as truth_file:
        documents = list(corpus_file)
        raw_labels = list(truth_file)
    # int() tolerates the trailing newline left on every label line.
    labels = [int(line) for line in raw_labels]
    return documents, labels

In [4]:
def DOR(tweets, V, dict_indices, tokenize=None):
    """Build a document-by-document matrix from DOR-style term weights.

    Each tweet is first turned into a weight vector over the vocabulary:

        weight[i, j] = (1 + ln(count_ij)) * ln(len(V) / n_i)

    where ``count_ij`` is how often vocabulary term ``j`` occurs in tweet
    ``i`` and ``n_i`` is the number of distinct vocabulary terms found in
    tweet ``i``. Terms outside ``dict_indices`` are ignored. The result is
    the matrix of dot products between every pair of those weight vectors.

    Args:
        tweets: Sequence of raw document strings.
        V: The vocabulary; only its length is used.
        dict_indices: Mapping from token to its column index in ``[0, len(V))``.
        tokenize: Optional callable mapping a string to an iterable of
            tokens. When omitted, the notebook-level ``tokenizer.tokenize``
            is used.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(tweets), len(tweets))``.
    """
    if tokenize is None:
        # Default to the module-level tokenizer so existing callers keep working.
        tokenize = tokenizer.tokenize

    n_docs, n_terms = len(tweets), len(V)
    # documents x vocabulary; float so the log weights are not truncated.
    weights = np.zeros((n_docs, n_terms), dtype=float)
    for row, tweet in enumerate(tweets):
        counts = Counter(tokenize(tweet))
        # (column, frequency) for every distinct in-vocabulary token.
        known = [
            (dict_indices[token], freq)
            for token, freq in counts.items()
            if token in dict_indices
        ]
        if not known:
            # No vocabulary term in this tweet: its row stays all zeros.
            continue
        # Normalise by the number of distinct terms, not the character count.
        doc_factor = np.log(n_terms / len(known))
        for col, freq in known:
            # The column is the term's vocabulary index, not its frequency.
            weights[row, col] = (1 + np.log(freq)) * doc_factor

    # documents x documents: all pairwise dot products in one product.
    return weights @ weights.T

In [5]:
def evaluatePrediction(y_true, y_pred):
    """Print the confusion matrix and the classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. Both summaries are written to standard output.
    """
    # The macro precision/recall/F1 values that used to be computed here were
    # never used, so that dead call has been dropped.
    print(confusion_matrix(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))

## Set up corpus

In [6]:
# Load the training and validation splits (raw tweet lines and their labels).
tr_txt, tr_y = get_texts_from_file(
    "../data/agresividad/mex_train.txt",
    "../data/agresividad/mex_train_labels.txt",
)
val_txt, val_y = get_texts_from_file(
    "../data/agresividad/mex_val.txt",
    "../data/agresividad/mex_val_labels.txt",
)

In [7]:
# Tokenize every training document and pool all tokens into one flat list.
tokenizer = TweetTokenizer()
corpus_palabras = []
for doc in tr_txt:
    corpus_palabras.extend(tokenizer.tokenize(doc))

In [8]:
# Keep the `vocab_size` most frequent tokens as (count, token) pairs, sorted
# in descending order (ties on count fall back to descending token order).
vocab_size = 5000
fdist = nltk.FreqDist(corpus_palabras)
vocab = sorted(
    ((count, token) for token, count in fdist.items()),
    reverse=True,
)[:vocab_size]

In [9]:
# Map each vocabulary token to its position in `vocab` (0 = first entry).
indices = {word: position for position, (_, word) in enumerate(vocab)}
print(len(indices))
list(indices)[:10]

5000


['que', 'de', '.', 'a', 'la', 'y', 'no', 'me', '!', 'el']

## Model

In [10]:
# Grid-search the C parameter of a class-balanced linear SVM, scoring each
# candidate by macro-F1 with cv=5.
parameters = {'C': [0.05, 0.25, 0.5, 1, 2]}
svr = svm.LinearSVC(class_weight='balanced')
grid = GridSearchCV(
    estimator=svr,
    param_grid=parameters,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
)

## Experiments

In [11]:
# Build the DOR matrices for the training and validation splits.
# NOTE(review): each call returns a square (documents x documents) matrix for
# its own split, so dor_val has len(val_txt) columns while the classifier is
# fitted on len(tr_txt) columns -- confirm the validation matrix should not be
# computed against the training documents instead.
dor_tr = DOR(tweets=tr_txt, V=vocab, dict_indices=indices)
dor_val = DOR(tweets=val_txt, V=vocab, dict_indices=indices)
dor_tr

In [None]:
# Run the cross-validated grid search on the training representation.
grid.fit(X=dor_tr, y=tr_y)

In [None]:
# Predict the validation labels with the fitted grid search and report metrics.
preds = grid.predict(X=dor_val)
evaluatePrediction(y_true=val_y, y_pred=preds)