In [1]:
import numpy as np
import pandas as pd

# Seed passed as `random_state` to the validation helpers in later cells.
random_seed = 12738

# IPython autoreload extension, mode 2: re-import modules before running each
# cell, so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from utils import Melisa2Dataset

# Load only the review text and its label from the training split.
# For quick experiments, a subsample can be taken by appending
# `.sample(n=10000,random_state=random_seed).reset_index(drop=True)`.
df_all = (
    Melisa2Dataset()
    .get_train_dataframe(usecols=['review_content','review_rate'])
)
df_all

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


Unnamed: 0,review_content,review_rate
0,"Medio berreta, no justifica el gasto, ya se sa...",0
2,Excelente !! superó mi expectativas. Lo único ...,1
3,Es una buena relación calidad/precio. La cámar...,1
4,"Muy poca voluntad para ayudarme con cambiarlo,...",0
5,Nunca me respondieron por el manual de uso. En...,0
...,...,...
461870,Todavía no me lo entraron que paso?.,0
461871,Excelente muy bueno el producto.,0
461872,"Muy delgado el armazón son muy frágiles, no du...",0
461873,"La verdad un kit excelente, simple de instalar...",1


# Clasificación de polaridad con un modelo Softmax

En este caso vamos a representar cada documento mediante los word embeddings de sus palabras, donde cada embedding está formado por dos features: la cantidad de documentos de polaridad positiva en los que apareció la palabra y la cantidad de documentos de polaridad negativa en los que apareció. Luego, los embeddings de las palabras de cada documento se suman (CBOW) y el vector resultante ingresa a un clasificador Softmax.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

class SoftMaxClassifier(object):
    """
    Classifier built from word-by-category embeddings + CBOW + softmax.

    Each vocabulary term is embedded as a vector with one component per
    class column: the number of training documents of that class in which
    the term appears. A document is represented by the sum of the embeddings
    of its terms, weighted by how often each term occurs in the document,
    and that vector is fed to a scikit-learn ``LogisticRegression``.

    Attributes
    ----------
    vec : CountVectorizer
        Turns raw documents into a sparse document-term count matrix.
    clf : LogisticRegression
        Classifier fitted on the summed embeddings.
    W : ndarray of shape (n_terms, n_columns)
        Term embeddings; only available after ``train`` has been called.
    """

    def __init__(self,tokenizer=None,vocab=None,max_features=None,ngram_range=(1,1)):
        """
        Parameters
        ----------
        tokenizer : callable or None
            Forwarded to ``CountVectorizer(tokenizer=...)``.
        vocab : mapping, iterable or None
            Forwarded to ``CountVectorizer(vocabulary=...)``.
        max_features : int or None
            Forwarded to ``CountVectorizer(max_features=...)``.
        ngram_range : tuple (min_n, max_n)
            Forwarded to ``CountVectorizer(ngram_range=...)``.
        """
        self.vec = CountVectorizer(tokenizer=tokenizer,vocabulary=vocab,
                                   max_features=max_features,ngram_range=ngram_range,
                                   lowercase=True)
        self.clf = LogisticRegression()

    def train(self,ds_train,y_train):
        """
        Fit the vectorizer, build the term embeddings and fit the classifier.

        Parameters
        ----------
        ds_train : iterable of str
            Training documents.
        y_train : array-like of shape (n_documents,)
            Class labels; they are cast to ``int``. Any array-like is
            accepted (list, ndarray, pandas Series).
        """
        X = self.vec.fit_transform(ds_train)
        y_train = np.asarray(y_train).astype(int)

        # Map every label to a column of the one-hot matrix. Non-negative
        # labels keep the column equal to the label. When negative labels are
        # present (e.g. a -1/+1 polarity encoding) all labels are shifted so
        # the smallest one lands on column 0; indexing with the raw negative
        # value would wrap around and merge distinct classes into one column.
        offset = min(int(y_train.min()),0)
        columns = y_train - offset
        n_columns = int(y_train.max()) - offset + 1

        y_one_hot = np.zeros((y_train.size,n_columns),dtype=float)
        y_one_hot[np.arange(y_train.size),columns] = 1.

        # X.minimum(1) binarises the counts, so W[t, c] is the number of
        # training documents assigned to column c that contain term t.
        self.W = X.minimum(1).T.dot(y_one_hot)

        # CBOW step: count-weighted sum of the embeddings of each document.
        X_train = X.dot(self.W)
        self.clf.fit(X_train,y_train)

    def predict(self,ds_val):
        """
        Predict the class label of each document in ``ds_val``.

        ``train`` must have been called first, because this method relies on
        the fitted vectorizer, on ``self.W`` and on the fitted classifier.

        Parameters
        ----------
        ds_val : iterable of str
            Documents to classify.

        Returns
        -------
        The output of ``self.clf.predict`` on the summed embeddings.
        """
        X_val = self.vec.transform(ds_val).astype(float)
        X_val = X_val.dot(self.W)
        y_pred = self.clf.predict(X_val)
        return y_pred

In [32]:
from utils.evaluation import train_dev_validation
import re

# Experiment configuration forwarded to the vectorizer inside the model.
ngram_range = (1,1)
max_features = 10000

# A token is either a run of word characters or one of the listed
# punctuation marks.
token_pattern = re.compile(r'[\w]+|[!¡¿\?\.,\'"]')

def tokenizer(x):
    """Return the list of tokens that ``token_pattern`` finds in ``x``."""
    return token_pattern.findall(x)

model = SoftMaxClassifier(
    tokenizer=tokenizer,
    max_features=max_features,
    ngram_range=ngram_range,
)

# Presumably holds out a fraction `dev_size` of df_all for validation and,
# with compute_train=True, also scores the training part -- verify against
# utils.evaluation.train_dev_validation.
score = train_dev_validation(
    model,
    df_all,
    random_state=random_seed,
    metrics='accuracy',
    dev_size=.05,
    compute_train=True,
)

print('Accuracy obtenida con ngram_range={}, max_features={}:'.format(ngram_range,max_features))
try:
    # Preferred report: separate train / validation figures.
    print('Train accuracy: {:.2f}%'.format(score['train_accuracy']*100))
    print('Validation accuracy: {:.2f}%'.format(score['validation_accuracy']*100))
except KeyError:
    # Fall back to the single 'accuracy' entry when the split keys are absent.
    print('Accuracy: {:.2f}%'.format(score['accuracy']*100))

Accuracy obtenida con ngram_range=(1, 1), max_features=10000:
Train accuracy: 74.94%
Validation accuracy: 74.77%


In [None]:
from utils.evaluation import k_fold_validation
import re

# Experiment configuration forwarded to the vectorizer inside the model.
ngram_range = (1,2)
max_features = 100000

# A token is either a run of word characters or one of the listed
# punctuation marks.
token_pattern = re.compile(r'[\w]+|[!¡¿\?\.,\'"]')

def tokenizer(x):
    """Return the list of tokens that ``token_pattern`` finds in ``x``."""
    return token_pattern.findall(x)

model = SoftMaxClassifier(
    tokenizer=tokenizer,
    max_features=max_features,
    ngram_range=ngram_range,
)

# 5-fold validation over the whole training frame.
score = k_fold_validation(
    model,
    df_all,
    5,
    random_state=random_seed,
    metrics='accuracy',
)

print('Accuracy obtenida con ngram_range={}, max_features={}:'.format(ngram_range,max_features))
print('Accuracy: {:.2f}%'.format(score['accuracy']*100))

Fold 1
Fold 2
Fold 3


In [None]:
from utils.evaluation import get_score
import re

# NOTE(review): the "test" frame is loaded with get_train_dataframe(), i.e. it
# looks like the same split the model is trained on below. If Melisa2Dataset
# exposes a dedicated test-split loader it should be used here -- confirm
# against utils.Melisa2Dataset before trusting the reported test accuracy.
# For quick experiments, a subsample can be taken by appending
# `.sample(n=10000,random_state=random_seed).reset_index(drop=True)`.
df_test = Melisa2Dataset().get_train_dataframe(usecols=['review_content','review_rate'])

# Configuration of the final model.
ngram_range = (1,3)
max_features = 100000

# A token is either a run of word characters or one of the listed
# punctuation marks.
token_pattern = re.compile(r'[\w]+|[!¡¿\?.,\'"]')
tokenizer = lambda x: token_pattern.findall(x)

# Use the classifier defined in this notebook. The previous code instantiated
# TfIdfBOWNaiveBayesClassifier, which is neither defined nor imported here and
# raised NameError on a fresh kernel; its `alpha` argument has no counterpart
# in SoftMaxClassifier and is therefore dropped.
model = SoftMaxClassifier(tokenizer=tokenizer,
                          max_features=max_features,
                          ngram_range=ngram_range)

# Train on the full training frame, then score the predictions on df_test.
model.train(df_all['review_content'],df_all['review_rate'].values)
y_predict = model.predict(df_test['review_content'])
score = get_score(df_test['review_rate'].values,y_predict,'accuracy')

print('Accuracy obtenida con ngram_range={}, max_features={}:'.format(ngram_range,max_features))
print('Accuracy: {:.2f}%'.format(score['accuracy']*100))

# Resultados en validación (5-fold)

```
Accuracy obtenida con ngram_range=(1, 1), max_features=10000:
Accuracy: 74.93%

Accuracy obtenida con ngram_range=(1, 1), max_features=100000:
Accuracy: 74.93%

Accuracy obtenida con ngram_range=(1, 2), max_features=100000:
Accuracy: 77.01%

Accuracy obtenida con ngram_range=(1, 2), max_features=500000:
Accuracy: 77.01%

Accuracy obtenida con ngram_range=(1, 3), max_features=100000:
Accuracy: 77.42%
```

# Resultados para test

```
Accuracy obtenida con ngram_range=(1, 3), max_features=100000:
Accuracy: 77.43%
```