# TCOR

Term Co-occurrence Representation (TCOR)

## Definition
$$
w_{k, j} = tff(t_k, t_j) \cdot \log\left(\frac{|T|}{T_k}\right)
$$

$$
tff(t_k, t_j) = 
\begin{cases}
1 + \log(\#(t_k, t_j)) &\quad \text{if } \#(t_k, t_j) > 0 \\
0 &\quad \text{otherwise}
\end{cases}
$$

Where:
- $T_k$ is the number of terms the $k$-th word co-occurs with
- $|T|$ is the size of the vocabulary
- $\#(t_k, t_j)$ is the number of documents (tweets) in which $t_k$ and $t_j$ co-occur
- $t$ is a word

## Import libraries

In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from nltk.tokenize import TweetTokenizer 
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
%matplotlib inline
import torch
import nltk 

In [38]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device", device)

Device cpu


## Define utils

In [39]:
def get_texts_from_file(path_corpus, path_truth):
    """Load a corpus and its labels from two parallel, line-oriented files.

    Args:
        path_corpus: Path to a text file holding one document (tweet) per line.
        path_truth: Path to a text file holding one integer label per line.

    Returns:
        A ``(txt, y)`` tuple. ``txt`` is the list of lines read from
        ``path_corpus`` (trailing newlines are kept) and ``y`` is the list of
        labels read from ``path_truth``, converted to ``int``.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a label line is not a valid integer, or if a file is
            not valid UTF-8 (``UnicodeDecodeError`` is a ``ValueError``).
    """
    # Decode explicitly as UTF-8: without an encoding, open() falls back to the
    # platform locale, which mis-decodes accented characters on some systems.
    with open(path_corpus, "r", encoding="utf-8") as f_corpus, \
            open(path_truth, "r", encoding="utf-8") as f_truth:
        txt = list(f_corpus)
        # int() ignores surrounding whitespace, so the newline needs no strip.
        y = [int(label) for label in f_truth]
    return txt, y

In [40]:
def TCOR(tweets, V, dict_indices):
    """Placeholder for the Term Co-occurrence Representation (TCOR) builder.

    NOTE(review): the body is only ``pass``, so every call returns ``None``.
    The experiments cell calls ``TCOR(tr_txt, vocab, indices)`` and then reads
    ``.shape`` on the result, which cannot work on ``None`` -- presumably the
    real implementation is missing from this copy; TODO restore it.

    Args:
        tweets: Presumably the raw tweet strings of one data split (callers
            pass ``tr_txt`` / ``val_txt``); unused by this stub.
        V: Presumably the vocabulary as ``(frequency, word)`` pairs (callers
            pass ``vocab``); unused by this stub.
        dict_indices: Presumably the word -> index mapping (callers pass
            ``indices``); unused by this stub.

    Returns:
        None, because the function has no implementation.
    """
    pass

## Set up corpus

In [41]:
# Training split: one tweet per line plus the matching integer labels.
tr_txt, tr_y = get_texts_from_file(
    "../data/agresividad/mex_train.txt",
    "../data/agresividad/mex_train_labels.txt",
)
# Validation split, read the same way.
val_txt, val_y = get_texts_from_file(
    "../data/agresividad/mex_val.txt",
    "../data/agresividad/mex_val_labels.txt",
)

In [42]:
# Tokenize every training tweet and pool all tokens into one flat list,
# which is what the frequency count over the whole corpus needs.
tokenizer = TweetTokenizer()
corpus_palabras = []
for doc in tr_txt:
    corpus_palabras.extend(tokenizer.tokenize(doc))

In [43]:
# Keep only the `vocab_size` most frequent tokens of the training corpus.
vocab_size = 5000
fdist = nltk.FreqDist(corpus_palabras)
# (count, token) pairs from most to least frequent; whole tuples are compared,
# so equal counts are ordered by token in descending order.
vocab = sorted(
    ((count, token) for token, count in fdist.items()),
    reverse=True,
)[:vocab_size]

In [44]:
# Map every vocabulary word to its position in `vocab` (0 = most frequent).
# A comprehension keeps the loop temporaries out of the notebook's global
# namespace; in particular, assigning to `_` at top level would shadow
# IPython's "last output" variable for the rest of the session.
indices = {word: rank for rank, (_, word) in enumerate(vocab)}
print(len(indices))
# Last expression of the cell: show the first ten words of the mapping.
list(indices)[:10]

5000


['que', 'de', '.', 'a', 'la', 'y', 'no', 'me', '!', 'el']

## Experiments

In [46]:
# Build the TCOR representation for the training and validation splits,
# reusing the vocabulary and the word -> index mapping built from the
# training corpus.
TCOR_tr = TCOR(tr_txt, vocab, indices)
TCOR_val = TCOR(val_txt, vocab, indices)
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed, so this shape lookup shows nothing -- wrap it in print() if the
# shape is wanted in the output.
TCOR_tr.shape
# Last expression of the cell: the notebook displays the training tensor.
TCOR_tr

tensor([[10.7602, 10.3036, 10.1593,  ...,  4.4397,  5.1499,  4.5469],
        [10.7602, 10.3036, 10.1593,  ...,  4.4397,  5.1499,  4.5469],
        [10.7602, 10.3036, 10.1593,  ...,  4.4397,  5.1499,  4.5469],
        ...,
        [ 3.9954,  3.8258,  3.7723,  ...,  4.4397,  5.1499,  4.5469],
        [ 3.9954,  3.8258,  3.7723,  ...,  4.4397,  5.1499,  4.5469],
        [ 3.9954,  3.8258,  3.7723,  ...,  4.4397,  5.1499,  4.5469]])