# Training the Models to Classify the Tweets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib

## Import the data used to train the classifier models

In [2]:
# Directory that holds the processed data files (relative path).
data_processed = '../data/processed/'

In [3]:
# Read only the text and label columns, both as strings.
df = pd.read_csv(
    data_processed + 'to_train_models.csv',
    usecols=['full_text', 'sentiment_tag'],
    dtype=str,
)
# Preview the first rows (head() defaults to 5).
df.head()

Unnamed: 0,full_text,sentiment_tag
0,lelo andrés arauz dicho q gestionado gobierno ...,Negativo
1,edad enterar jorge glas andrés arauz primo,Negativo
2,extraordinario caravana lojo recibir andrés ar...,Positivo
3,david villamar mildeunar propuesta andrés arau...,Negativo
4,andrés arauz ganador debatepresidencial según ...,Negativo


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC

## Eliminar los valores nulos en el texto

In [5]:
# Count missing values per column before cleaning.
df.isnull().sum()

full_text        2
sentiment_tag    0
dtype: int64

In [6]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis='index', how='any', inplace=True)

In [7]:
# Re-check missing values per column after dropping incomplete rows.
df.isna().sum(axis='index')

full_text        0
sentiment_tag    0
dtype: int64

In [8]:
# Number of rows per sentiment label (shows the class balance).
df.groupby(by='sentiment_tag').count()

Unnamed: 0_level_0,full_text
sentiment_tag,Unnamed: 1_level_1
Negativo,32804
Neutral,2977
Positivo,4643


## División de los datos en entrenamiento y validación

##### Nota: `random_state` se fija para que cada partición pueda reproducirse siempre con los mismos valores; la partición se realiza con 80 % para entrenamiento y 20 % para testing.

In [9]:
# Inputs are the text column; targets are the sentiment labels.
X = df['full_text']
y = df['sentiment_tag']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the label counts are
# imbalanced -- consider whether stratify=y is wanted (it would change results).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Report how many samples ended up in each partition.
print(
    'La división para entrenamiento:',
    len(X_train),
    ' y para testing es:',
    len(X_test),
    sep='',
)

La división para entrenamiento:32339 y para testing es:8085


# Representación TFIDF

## Unigramas y bigramas

In [11]:
# TF-IDF vectorizer configured for unigrams and bigrams (ngram_range=(1, 2)).
tfidf_vectorizer_bigram = TfidfVectorizer(
    ngram_range=(1, 2),
    use_idf=True,
)

# Fit the vectorizer on the training texts only and vectorize them in one step.
X_train_tf_bigram = tfidf_vectorizer_bigram.fit_transform(X_train)
print("n_samples: %d, n_features: %d" % X_train_tf_bigram.shape)

n_samples: 32339, n_features: 163603


In [12]:
# Vectorize the test texts with the already-fitted vectorizer (no re-fitting),
# so the test matrix shares the training feature space.
X_test_tf_bigram = tfidf_vectorizer_bigram.transform(X_test)
print("n_samples: %d, n_features: %d" % X_test_tf_bigram.shape)

n_samples: 8085, n_features: 163603


# Entrenando los modelos

In [197]:
import os

# Directory where fitted models are persisted (relative path).
data_models = '../models/'


def save_model(name_file, model):
    """Serialize ``model`` with joblib into the ``data_models`` directory.

    Parameters
    ----------
    name_file : str
        File name (not a full path) to create inside ``data_models``.
    model : object
        Object to persist; handed unchanged to ``joblib.dump``.

    Returns
    -------
    bool
        Always ``True`` once ``joblib.dump`` has returned without raising.
    """
    # Create the target directory on first use so a fresh checkout does not
    # fail with FileNotFoundError when the folder is missing.
    os.makedirs(data_models, exist_ok=True)
    # os.path.join works whether or not ``data_models`` ends with a slash.
    joblib.dump(model, os.path.join(data_models, name_file))
    return True


## Random Forest

In [198]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [199]:
# Random forest with a fixed seed; hyperparameters are hard-coded here.
rf_clf = RandomForestClassifier(
    n_estimators=20,
    max_depth=1000,
    max_leaf_nodes=3000,
    min_samples_split=20,
    random_state=42,
)

In [200]:
# Train the forest on the TF-IDF training matrix; the trailing ';' hides the repr.
rf_clf.fit(X=X_train_tf_bigram, y=y_train);

In [201]:
# Predict labels for the held-out test matrix.
rf_pred = rf_clf.predict(X=X_test_tf_bigram)

# Micro-averaged precision on the test set.
# NOTE(review): for single-label multiclass data micro-averaged precision
# coincides with accuracy -- confirm this is the intended metric given the
# class imbalance.
rf_score = metrics.precision_score(
    y_true=y_test,
    y_pred=rf_pred,
    average='micro',
)
print(rf_score)

0.8713667285095856


#### Guardando el modelo

In [202]:
# Persist the fitted random forest.
save_model(name_file="Random_Forest.sav", model=rf_clf)

True

## SVC

In [203]:
# Support vector classifier with a linear kernel and a fixed seed.
# NOTE(review): gamma is presumably unused with kernel='linear' -- verify
# against the scikit-learn SVC documentation.
svc_clf = SVC(
    kernel='linear',
    C=1,
    gamma=0.01,
    random_state=42,
)

In [204]:
# Train the SVC on the TF-IDF training matrix; the trailing ';' hides the repr.
svc_clf.fit(X=X_train_tf_bigram, y=y_train);

In [206]:
# Predict labels for the held-out test matrix.
svc_pred = svc_clf.predict(X=X_test_tf_bigram)

# Micro-averaged precision on the test set.
# NOTE(review): for single-label multiclass data micro-averaged precision
# coincides with accuracy -- confirm this is the intended metric.
svc_score = metrics.precision_score(
    y_true=y_test,
    y_pred=svc_pred,
    average='micro',
)

print(svc_score)

0.8818800247371676


#### Guardando el modelo

In [207]:
# Persist the fitted SVC.
save_model(name_file="SVC.sav", model=svc_clf)

True

## Naive Bayes

In [208]:
# Multinomial Naive Bayes on the TF-IDF features, with class priors learned
# from the training labels (fit_prior=True).
#
# alpha is the additive smoothing term and should be strictly positive.
# alpha=0 is not a stable setting: older scikit-learn releases warn and clip
# it to 1e-10, while newer releases may use 0 as given, which yields
# log(0) = -inf feature log-probabilities. Passing 1e-10 explicitly keeps the
# "almost no smoothing" intent and behaves the same across versions.
nb_clf = MultinomialNB(alpha=1e-10, fit_prior=True)

# The trailing ';' hides the estimator repr in the cell output.
nb_clf.fit(X_train_tf_bigram, y_train);



In [210]:
# Predict labels for the held-out test matrix.
nb_pred = nb_clf.predict(X=X_test_tf_bigram)

# Micro-averaged precision on the test set.
# NOTE(review): for single-label multiclass data micro-averaged precision
# coincides with accuracy -- confirm this is the intended metric.
nb_score = metrics.precision_score(
    y_true=y_test,
    y_pred=nb_pred,
    average='micro',
)
print(nb_score)

0.8711193568336425


#### Guardando el modelo

In [61]:
# Persist the fitted Naive Bayes classifier.
save_model(name_file='Naive_Bayes.sav', model=nb_clf)

True