# Training Random Forest, SVC and Naive Bayes models to classify the Tweets

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib

## Import the data used to train the classifier models

In [2]:
# Relative path of the directory that contains the processed data.
data_processed = "../data/processed/"

In [3]:
# Load only the tweet text and its sentiment label, reading both columns as strings.
df = pd.read_csv(
    f"{data_processed}to_train_models.csv",
    usecols=["full_text", "sentiment_tag"],
    dtype=str,
)
df.head(n=5)

Unnamed: 0,full_text,sentiment_tag
0,lelo andrés arauz dicho q gestionado gobierno ...,Negativo
1,edad enterar jorge glas andrés arauz primo,Negativo
2,extraordinario caravana lojo recibir andrés ar...,Positivo
3,david villamar mildeunar propuesta andrés arau...,Negativo
4,andrés arauz ganador debatepresidencial según ...,Negativo


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC

## Eliminar los valores nulos en el texto

In [5]:
# Count the missing values in each column before cleaning.
df.isnull().sum()

full_text        2
sentiment_tag    0
dtype: int64

In [6]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(axis="index", how="any", inplace=True)

In [7]:
# Re-count the missing values per column after dropping the incomplete rows.
df.isnull().sum()

full_text        0
sentiment_tag    0
dtype: int64

In [8]:
# Number of non-null tweets available for each sentiment label.
df.groupby(by="sentiment_tag").count()

Unnamed: 0_level_0,full_text
sentiment_tag,Unnamed: 1_level_1
Negativo,32804
Neutral,2977
Positivo,4643


## División de los datos en entrenamiento y prueba

##### Nota: `random_state` se fija para que siempre se puedan reproducir los mismos valores en cada partición

In [8]:
# Features are the tweet texts; the target is the sentiment label.
X, y = df["full_text"], df["sentiment_tag"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Report how many samples ended up in each partition.
print(f"La cantidad de datos usados para entrenar el modelo es:{len(X_train)}")
print(f"La cantidad de datos usados para probar el modelo es:{len(X_test)}")

La cantidad de datos usados para entrenar el modelo es:32339
La cantidad de datos usados para probar el modelo es:8085


# Representación TFIDF

## Unigramas y bigramas

In [10]:
# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
tfidf_vectorizer_bigram = TfidfVectorizer(use_idf=True, ngram_range=(1, 2))

# Fit the vectorizer on the training texts and turn them into a TF-IDF matrix.
X_train_tf_bigram = tfidf_vectorizer_bigram.fit_transform(X_train)
print("n_samples: {}, n_features: {}".format(*X_train_tf_bigram.shape))

n_samples: 32339, n_features: 163603


In [11]:
# Vectorize the test texts with the vectorizer already fitted on the training set.
X_test_tf_bigram = tfidf_vectorizer_bigram.transform(X_test)
print("n_samples: {}, n_features: {}".format(*X_test_tf_bigram.shape))

n_samples: 8085, n_features: 163603


# Entrenando los modelos

In [None]:
# Accumulator for the model comparison table: one independent, initially
# empty list per column.
pp_acc = {
    column: []
    for column in ("Modelo", "Precision", "Recall", "F1_Score")
}

In [None]:
def save_model(name_file, model):
    """Persist a model object to disk with joblib.

    Parameters
    ----------
    name_file : str or path-like
        Destination path of the serialized model.
    model : object
        The object (e.g. a fitted estimator) to store.
    """
    # joblib.dump needs the target file as its second argument; the original
    # call omitted it, so every invocation failed and `name_file` was unused.
    joblib.dump(model, name_file)

## Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [15]:
# Random forest hyper-parameters.
# "sqrt" is what max_features="auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=80,
    max_features="sqrt",
    max_leaf_nodes=100,
    min_samples_split=4,
    n_jobs=None,
    random_state=42,
)

In [None]:
# Fit the random forest on the TF-IDF training matrix (the trailing `;` hides the estimator repr).
rf_clf.fit(X=X_train_tf_bigram, y=y_train);

In [None]:
# Predict the sentiment label of the held-out tweets.
rf_pred = rf_clf.predict(X_test_tf_bigram)

# Accuracy of the predictions against the true labels.
rf_score = metrics.accuracy_score(y_true=y_test, y_pred=rf_pred)

# Record the result; note that the "Precision" column holds the accuracy value.
pp_acc["Modelo"] += ["Random Forest"]
pp_acc["Precision"] += [rf_score]

## SVC

In [None]:
# Support vector classifier with a linear kernel and a fixed seed.
# NOTE(review): per the scikit-learn docs `gamma` only applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so it should have no effect here — confirm
# and consider dropping it.
svc_clf = SVC(kernel="linear", C=1, gamma=0.01, random_state=42)

In [None]:
# Fit the SVC on the TF-IDF training matrix (the trailing `;` hides the estimator repr).
svc_clf.fit(X=X_train_tf_bigram, y=y_train);

In [None]:
# Predict on the TF-IDF test matrix built earlier in this notebook.
# (`X_test_tf` was never defined and raised a NameError on a fresh kernel.)
svc_pred = svc_clf.predict(X_test_tf_bigram)

# compute the performance measure (accuracy)
svc_score = metrics.accuracy_score(y_test, svc_pred)

# save to dict; the "Precision" column holds the accuracy value
pp_acc["Modelo"].append("SVM")
pp_acc["Precision"].append(svc_score)

## Naive Bayes

### Bigramas

In [None]:
# Multinomial Naive Bayes with additive smoothing alpha=1.0 and without
# learning class priors from the data (fit_prior=False).
nb_clf = MultinomialNB(fit_prior=False, alpha=1.0)
nb_clf.fit(X=X_train_tf_bigram, y=y_train);

In [None]:
# Predict on the TF-IDF test matrix built earlier in this notebook.
# (`X_test_tf` was never defined and raised a NameError on a fresh kernel.)
nb_pred = nb_clf.predict(X_test_tf_bigram)

# compute the performance measure
# Accuracy is used so the value is comparable with the other models stored in
# `pp_acc`; `precision_score` without an `average=` argument also raises a
# ValueError on this three-class target.
nb_score = metrics.accuracy_score(y_test, nb_pred)

# save to dict; the "Precision" column holds the accuracy value
pp_acc["Modelo"].append("Naive Bayes")
pp_acc["Precision"].append(nb_score)

# Comparando modelos

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# The lists in `pp_acc` can have different lengths ("Recall" and "F1_Score"
# are never filled in this notebook), which makes `pd.DataFrame(pp_acc)` raise
# a ValueError. Wrapping each list in a Series pads the short columns with
# NaN; columns that end up entirely empty are then dropped.
up_df = pd.DataFrame(
    {
        column: pd.Series(values, dtype=None if values else "object")
        for column, values in pp_acc.items()
    }
).dropna(axis="columns", how="all")

In [None]:
sns.set(font_scale=1)

# Create a single figure/axes pair (the original opened an extra, empty
# figure before the sized one).
fig, ax = plt.subplots(figsize=(9, 6))

# seaborn >= 0.12 no longer accepts x/y as positional arguments, so the
# columns are passed by keyword.
sns.barplot(data=up_df, x="Modelo", y="Precision", alpha=1, ax=ax)
ax.set_title('Accuracy de modelos usando Unigramas y Bigramas', fontsize=18)
ax.set_ylabel('Accuracy', fontsize=12)

# Write each bar's height (rounded to 3 decimals) just above it, centred on the bar.
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_y() + p.get_height() + 0.009,
            str(round(p.get_height(), 3)), fontsize=12, ha="center")
plt.show()