In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from nltk.corpus import stopwords
import nltk

import pickle
import spacy

In [26]:
# Read the tweet CSV (text in TEXTO, sentiment tag in LABEL) and preview the first rows.
DATA_PATH = 'tweets_limpios_2500 - tweets_limpios_2500.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,TEXTO,LABEL
0,caso padre terminar matar mulder nava creer pa...,NEGATIVO
1,ratero inútil llenar partir bajar puerta escor...,NEGATIVO
2,mija delincuente color político ciego delincue...,NEGATIVO
3,pana opini acordar solo chismoso,NEUTRO
4,nuevo ministro gran amigar hombre trabajador,POSITIVO


In [27]:
# Print the class balance, remove the rows tagged with the stray 'NEU' label,
# then print the balance again to confirm those rows are gone.
print(df['LABEL'].value_counts())
neu_index = df.loc[df['LABEL'] == 'NEU'].index
df = df.drop(index=neu_index)
print(df['LABEL'].value_counts())

NEUTRO      1020
NEGATIVO     431
POSITIVO     402
NEU            3
Name: LABEL, dtype: int64
NEUTRO      1020
NEGATIVO     431
POSITIVO     402
Name: LABEL, dtype: int64


In [28]:
# Replace the string labels with integer category codes. Categories inferred
# from strings are kept in sorted order, so NEGATIVO -> 0, NEUTRO -> 1,
# POSITIVO -> 2.
df['LABEL'] = df['LABEL'].astype('category').cat.codes

In [29]:
# Class counts after encoding; the integer codes stand for:
df['LABEL'].value_counts()
# 1 neutral
# 0 negative
# 2 positive

1    1020
0     431
2     402
Name: LABEL, dtype: int64

In [30]:
# Look at the text of the first row to see what a sample looks like.
df['TEXTO'].iloc[0]

'caso padre terminar matar mulder nava creer partir enterrar nuevo camada agruparse cambiarse nombrar hacer político'

In [31]:
# (rows, columns) of the frame after the 'NEU' rows were dropped.
df.shape

(1853, 2)

In [32]:
# Spanish stop-word list from NLTK; normalize() uses it to discard lemmas.
# NOTE(review): this presumably needs the NLTK 'stopwords' corpus to be
# downloaded beforehand - confirm for a fresh environment.
stops = stopwords.words("spanish")
# Small Spanish spaCy pipeline; normalize() reads its POS tags and lemmas.
nlp = spacy.load("es_core_news_sm")

In [33]:
def normalize(comment):
    """Reduce a raw comment to a space-separated string of content-word lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. A token contributes its lemma only when it is tagged as a noun,
    verb or adjective, is purely alphabetic, and its (stripped) lemma is
    neither empty nor contained in the module-level stop-word collection
    ``stops``.

    Args:
        comment: The text to normalise (a ``str``).

    Returns:
        The kept lemmas joined by single spaces, in their original order;
        an empty string when no token qualifies.
    """
    content_pos = ('NOUN', 'VERB', 'ADJ')
    lemmatized = []
    for word in nlp(comment.lower()):
        # Only alphabetic content words carry signal for the bag-of-words model.
        if word.pos_ not in content_pos or not word.is_alpha:
            continue
        # str.strip() returns a new string, so its result has to be kept;
        # calling it as a bare statement leaves the lemma untouched.
        lemma = word.lemma_.strip()
        if lemma and lemma not in stops:
            lemmatized.append(lemma)
    return " ".join(lemmatized)

In [34]:
# Target: the encoded label. Features: every text passed through normalize().
y = df['LABEL']
X = df['TEXTO'].map(normalize)

In [35]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [36]:
# Bag-of-words over unigrams, bigrams and trigrams; n-grams appearing in fewer
# than two training documents are discarded (min_df=2).
count_vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 3))
# The vocabulary is learned from the training split only and then reused,
# unchanged, to encode the test split.
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [37]:
# Dense, column-labelled copies of the count matrices (one column per n-gram).
# - .toarray() is the documented way to densify a sparse matrix; the `.A`
#   shorthand is deprecated in recent SciPy releases.
# - get_feature_names() was removed in scikit-learn 1.2 in favour of
#   get_feature_names_out(); use whichever this installation provides.
_get_feature_names = (
    getattr(count_vectorizer, 'get_feature_names_out', None)
    or count_vectorizer.get_feature_names
)
# Computed once and shared so both frames are guaranteed identical columns.
feature_names = list(_get_feature_names())
bow_df_train = pd.DataFrame(X_train_count.toarray(), columns=feature_names)
bow_df_test = pd.DataFrame(X_test_count.toarray(), columns=feature_names)

In [38]:
# Preview the training bag-of-words frame: one row per text, one column per n-gram.
bow_df_train.head()

Unnamed: 0,abandonar,abastecimiento,abogar,abrazar,abrazar fuerte,abrir,absoluto,abusar,acabar,acabar año,...,árbol,área,épico,ético,éxito,íntegro,último,último hora,único,útil
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Train a multinomial Naive Bayes model on the n-gram counts and predict the
# held-out split.
nb_classifier = MultinomialNB()
nb_classifier.fit(bow_df_train, y_train)
pred = nb_classifier.predict(bow_df_test)

# Metric notes:
# - precision: of the samples predicted as a class, the share that truly
#   belong to it
# - recall: of the samples that truly belong to a class, the share that was found
# - f1: harmonic mean of precision and recall
# Macro averaging computes F1 per class and takes the unweighted mean, so each
# class counts equally regardless of its size.
score = metrics.f1_score(y_true=y_test, y_pred=pred, average='macro')
score

0.5752773916304091

In [40]:
# Confusion matrix on the held-out split: rows are the true classes, columns
# the predicted classes, both in ascending label-code order.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred)
cm

array([[ 62,  62,  10],
       [ 33, 263,  42],
       [  5,  74,  61]], dtype=int64)

In [41]:
# Rank the vocabulary by log P(n-gram | class 0) and show the 20 most likely
# entries (class 0 is the negative label in this notebook's encoding).
# - feature_log_prob_ is read directly: MultinomialNB.coef_, which mirrored it
#   for models with more than two classes, was deprecated in scikit-learn 0.24
#   and removed in 1.1.
# - get_feature_names() was removed in scikit-learn 1.2 in favour of
#   get_feature_names_out(); use whichever this installation provides.
_get_feature_names = (
    getattr(count_vectorizer, 'get_feature_names_out', None)
    or count_vectorizer.get_feature_names
)
weights = sorted(
    zip(nb_classifier.feature_log_prob_[0], _get_feature_names()),
    reverse=True,
)
weights[:20]

[(-4.768495242091391, 'ser'),
 (-4.768495242091391, 'ir'),
 (-4.870277936401333, 'hacer'),
 (-4.906645580572208, 'decir'),
 (-5.206750173022547, 'gente'),
 (-5.369269102520321, 'ver'),
 (-5.369269102520321, 'dar'),
 (-5.369269102520321, 'año'),
 (-5.429893724336756, 'pasar'),
 (-5.494432245474327, 'político'),
 (-5.494432245474327, 'dejar'),
 (-5.637533089115001, 'vacunar'),
 (-5.637533089115001, 'malo'),
 (-5.7175757967885374, 'querer'),
 (-5.7175757967885374, 'país'),
 (-5.7175757967885374, 'creer'),
 (-5.7175757967885374, 'corrupción'),
 (-5.804587173778167, 'poner'),
 (-5.804587173778167, 'personar'),
 (-5.804587173778167, 'peor')]

In [42]:
# Persist the fitted vectorizer and classifier together as a single pickled
# tuple, so both are restored by one load.
filename = 'modelo_transformador_proyecto.sav'

fitted_pipeline = (count_vectorizer, nb_classifier)
with open(filename, 'wb') as model_file:
    pickle.dump(fitted_pipeline, model_file)

In [43]:
# Smoke test on a digits-only string. None of its tokens are expected to
# survive normalisation or match the vocabulary, so the prediction should be
# driven by the class priors alone.
sample_text = '6565 656565 65656 656'
# Run the same normalize() step that was applied to the training texts, and
# hand the vectorizer a plain list of documents rather than a one-row
# DataFrame slice.
x = count_vectorizer.transform([normalize(sample_text)])

print(nb_classifier.predict_proba(x))
print(nb_classifier.predict(x))

[[0.23932313 0.54955681 0.21112006]]
[1]
