In [5]:
import sys, os
# Extend the module search path: keep the part of sys.path[0] that precedes the first
# 'Documents' and append '<prefix>/Documents/BecaNLP/Utils' to sys.path. If 'Documents'
# does not occur in sys.path[0], the whole string is used as the prefix.
# NOTE(review): presumably this is where the NLPUtils package imported below lives — confirm.
sys.path.append(os.path.join(sys.path[0].split('Documents')[0],'Documents/BecaNLP/Utils'))

import NLPUtils as nlp

import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
from NLPUtils.classifiers import SVCClassifier as svm
from NLPUtils.classifiers import MultinomialNB
from NLPUtils.datasets import imdb

def label_fn(labels):
    """Binarize numeric ratings into two classes.

    Ratings below 5 become 0 and ratings above 6 become 1; ratings of exactly
    5 or 6 are returned unchanged.

    The input is never modified: the result is built on a copy and both masks
    are computed from the original ratings. Writing into the caller's labels
    made the function non-idempotent, because a second pass over labels that
    were already binarized turns every 1 into 0 (1 < 5).

    Parameters
    ----------
    labels : numpy.ndarray or pandas.Series
        Numeric ratings. Must support ``.copy()``, element-wise comparison and
        boolean-mask assignment.

    Returns
    -------
    Same type as ``labels``
        A new container holding the binarized ratings.
    """
    binarized = labels.copy()
    # Masks come from the untouched input, so the two assignments cannot
    # interfere with each other regardless of their order.
    binarized[labels < 5] = 0
    binarized[labels > 6] = 1
    return binarized

# Build the three objects that the validation cells below pass to
# nlp.train_dev_validation / nlp.k_fold_validation: data, vectorizer and classifier.
train_dataset = imdb.get_train_dataframe()
# token_pattern r'\b\w+\b' matches maximal runs of word characters, so one-character
# tokens are kept. label_fn is handed over as the label transform.
# NOTE(review): presumably the vectorizer applies label_fn to the label column — confirm in NLPUtils.
vectorizer = nlp.BagOfNgramsVectorizer(label_fn=label_fn,token_pattern=r'\b\w+\b')
# Disabled earlier attempts that applied the vectorizer by hand:
#X_train, y_train = vectorizer.fit_transform(train_dataset)
#train_dataset = vectorizer.fit_transform(train_dataset)
#dev_dataset = vectorizer.transform(dev_dataset)
classifier = MultinomialNB()

In [4]:
# Train/dev validation of the classifier with accuracy as the metric; dev_size=.1 and
# random_state=0 are forwarded to nlp.train_dev_validation.
# NOTE(review): presumably dev_size is the fraction of train_dataset held out for dev — confirm.
# NOTE(review): the recorded output (13/20) has execution count 4, lower than the setup
# cell's 8, so it may have been produced with a different train_dataset.
nlp.train_dev_validation(classifier,train_dataset,vectorizer,dev_size=.1,random_state=0,metric='accuracy')

Total accuracy: 13/20 (65.00%)


0.65

In [9]:
# 5-fold validation (k_folds=5, random_state=0) with accuracy as the metric. The recorded
# run printed one accuracy line per fold and displayed the list of the five scores.
nlp.k_fold_validation(classifier,train_dataset,vectorizer,k_folds=5,random_state=0,metric='accuracy')

Total accuracy: 4235/5000 (84.70%)
Total accuracy: 4259/5000 (85.18%)
Total accuracy: 4177/5000 (83.54%)
Total accuracy: 4204/5000 (84.08%)
Total accuracy: 4228/5000 (84.56%)


[0.847, 0.8518, 0.8354, 0.8408, 0.8456]

In [30]:
from NLPUtils.datasets import tass
from NLPUtils.classifiers import MultinomialNB, BernoulliNB, SVCClassifier as SVM, LinearSVCClassifier as LinearSVM

def label_fn(labels):
    """Encode labels as integer class indices.

    The distinct values of ``labels`` are taken in the (sorted) order returned
    by ``np.unique``, and every label is replaced by the position of its value
    in that ordering.

    The mapping is rebuilt from whatever ``labels`` is passed in, so two calls
    only agree on the encoding when their inputs contain the same set of values.

    Returns a NumPy array with one index per input label.
    """
    classes = np.unique(labels)
    index_of = dict(zip(classes, range(len(classes))))
    encoded = []
    for label in labels:
        encoded.append(index_of[label])
    return np.array(encoded)

# TASS train/dev dataframes and the model used by the vocabulary-size sweeps below.
# This rebinds train_dataset and classifier, which earlier cells bound to the IMDB objects.
train_dataset = tass.get_train_dataframe()
dev_dataset = tass.get_dev_dataframe()
classifier = MultinomialNB()
# Alternative classifiers (imported above); uncomment exactly one to swap it in.
#classifier = BernoulliNB()
#classifier = SVM()
#classifier = LinearSVM()

# Sweep the vectorizer's max_features cap and report dev-set accuracy for each value.
# The vectorizer uses nlp.tokenize_characters with n-gram lengths from 1 to 20.
for vocab_size in (10000, 20000, 50000, 80000, 100000):
    vectorizer = nlp.BagOfNgramsVectorizer(
        label_fn=label_fn,
        tokenizer=nlp.tokenize_characters,
        ngram_range=(1, 20),
        max_features=vocab_size,
    )
    # end=' ' keeps the accuracy line printed during validation on the same row.
    print('Vocabulary size: {}.'.format(vocab_size), end=' ')
    nlp.train_dev_validation(
        classifier,
        train_dataset,
        vectorizer,
        dev_dataset=dev_dataset,
        metric='accuracy',
    )

Vocabulary size: 10000. Total accuracy: 296/581 (50.95%)
Vocabulary size: 20000. Total accuracy: 309/581 (53.18%)
Vocabulary size: 50000. Total accuracy: 322/581 (55.42%)
Vocabulary size: 80000. Total accuracy: 328/581 (56.45%)
Vocabulary size: 100000. Total accuracy: 331/581 (56.97%)


In [32]:
# Second sweep of the max_features cap, covering larger vocabularies than the first
# sweep; apart from the sizes, the vectorizer and validation settings are identical.
for vocab_size in (120000, 150000, 200000, 500000):
    vectorizer = nlp.BagOfNgramsVectorizer(
        label_fn=label_fn,
        tokenizer=nlp.tokenize_characters,
        ngram_range=(1, 20),
        max_features=vocab_size,
    )
    # end=' ' keeps the accuracy line printed during validation on the same row.
    print('Vocabulary size: {}.'.format(vocab_size), end=' ')
    nlp.train_dev_validation(
        classifier,
        train_dataset,
        vectorizer,
        dev_dataset=dev_dataset,
        metric='accuracy',
    )

Vocabulary size: 120000. Total accuracy: 329/581 (56.63%)
Vocabulary size: 150000. Total accuracy: 324/581 (55.77%)
Vocabulary size: 200000. Total accuracy: 322/581 (55.42%)
Vocabulary size: 500000. Total accuracy: 311/581 (53.53%)


## Implementación para test

In [10]:
from NLPUtils.datasets import tass2018 as tass
from NLPUtils.classifiers import MultinomialNB

# Train/dev/test dataframes from the tass2018 module (imported as `tass`), restricted
# with lang=['es'].
train_df = tass.get_train_dataframe(lang=['es'])
dev_df = tass.get_dev_dataframe(lang=['es'])
test_df = tass.get_test_dataframe(lang=['es'])
# Fill the test 'label' column with the constant 'N'.
# NOTE(review): presumably a placeholder so that the vectorizer, which is later given the
# ['text','label'] columns, has a label to read — confirm.
test_df['label'] = 'N'

def label_fn(labels):
    """Encode labels as integer class indices.

    The distinct values of ``labels`` are taken in the (sorted) order returned
    by ``np.unique``, and every label is replaced by the position of its value
    in that ordering.

    The mapping is rebuilt from whatever ``labels`` is passed in, so two calls
    only agree on the encoding when their inputs contain the same set of values.

    Returns a NumPy array with one index per input label.
    """
    classes = np.unique(labels)
    index_of = dict(zip(classes, range(len(classes))))
    encoded = []
    for label in labels:
        encoded.append(index_of[label])
    return np.array(encoded)

# Vectorizer with tokenizer=nlp.tokenize_characters, n-gram lengths 1 to 20 and at most
# 100000 features (the highest-accuracy setting in the recorded sweeps above, which were
# run on the `tass` dataset).
vectorizer = nlp.BagOfNgramsVectorizer(label_fn=label_fn,tokenizer=nlp.tokenize_characters,
                                       ngram_range=(1,20),max_features=100000)
classifier = MultinomialNB()

# Fit the vectorizer on the training dataframe, then apply the fitted vectorizer to the
# 'text' and 'label' columns of the test dataframe.
train_dataset = vectorizer.fit_transform(train_df)
test_dataset = vectorizer.transform(test_df.loc[:,['text','label']])

# Train, predict on the test set (the first value returned by predict is discarded) and
# print how many predictions equal each class index 0..3 (range(4) is hard-coded).
classifier.train(train_dataset)
_, y_predict = classifier.predict(test_dataset)
for i in range(4):
    print((y_predict == i).sum())


In [24]:
# Take the first (index, row) pair yielded by train_df.iterrows() and unpack the row into
# text and label; the two-name unpacking only works if the row holds exactly two values.
# NOTE(review): the recorded output is a tweet text although this cell has no trailing
# expression, so the output may be stale.
idx, (text, label) = next(train_df.iterrows())

'-Me caes muy bien \n-Tienes que jugar más partidas al lol con Russel y conmigo\n-Por qué tan Otako, deja de ser otako\n-Haber si me muero'

In [28]:
# Convert predicted class indices back to label strings and write them to disk.
# labels_to_idx maps each distinct training label to its position in np.unique order
# (the same construction label_fn uses); idx_to_labels is the inverse mapping.
labels_to_idx = {label: idx for idx, label in enumerate(np.unique(train_df['label']))}
idx_to_labels = {idx:label for label, idx in labels_to_idx.items()}
# y_predict is not defined in this cell; it must already exist from a predict cell.
labels_predict = [idx_to_labels[idx] for idx in y_predict]
# Overwrites the test 'label' column in place with the predicted label strings.
test_df['label'] = labels_predict
# Tab-separated output with neither index nor header row: tweet_id and label per line.
# NOTE(review): the file is named 'text_results.tsv' — possibly 'test_results.tsv' was meant; confirm.
test_df.loc[:,['tweet_id','label']].to_csv('./text_results.tsv',sep='\t',index=False,header=False)

In [12]:
# Validate the classifier/vectorizer pair with train_df as training data and dev_df passed
# as dev_dataset. No metric argument is given, so the function's default applies (the
# recorded run reported accuracy).
nlp.train_dev_validation(classifier,train_df,vectorizer,dev_dataset=dev_df)

Total accuracy: 288/506 (56.92%)


0.5691699604743083

In [8]:
# Retrain on train_dataset, predict on test_dataset and print the number of predictions
# equal to each class index 0..3 (range(4) is hard-coded). Unlike the earlier
# train/predict cell, the first value returned by predict is kept, as y_test.
# train_dataset and test_dataset are not defined in this cell and must already exist.
classifier.train(train_dataset)
y_test, y_predict = classifier.predict(test_dataset)
for i in range(4):
    print((y_predict == i).sum())

1261
1
2
635


In [48]:
# Set the whole test 'label' column to NaN in place, then preview the first rows.
# NOTE(review): this discards whatever was stored in test_df['label'] before (for example
# predictions written by another cell); the execution count (48) suggests it ran last.
test_df['label'] = np.nan
test_df.head()

Unnamed: 0,tweet_id,text,label
0,769930322721013760,@JPelirrojo me encantan los VAPES gracias por ...,
1,769934500159688705,@yddeon la Universidad es fácil porque estudia...,
2,769937561590652928,@_cuteresa Son dos frases. La una complementa ...,
3,769985604725637121,@ratgull @Vespacityman @Kellydeharo @ELpuebloM...,
4,769993102442524674,@ImFebrer @MalagaCF Ninguno de los clubes lo h...,
