# Training Random Forest and Naive Bayes Models to Classify the Tweets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Import the data used to train the classifier models

In [2]:
# Directory that contains the processed data files.
data_processed = "../data/processed/"

In [3]:
# Read only the tweet text and its sentiment label, keeping both columns as strings.
df = pd.read_csv(
    data_processed + 'to_train_models.csv',
    usecols=['full_text', 'sentiment_tag'],
    dtype=str,
)
# Preview the first five rows.
df.head(5)

Unnamed: 0,full_text,sentiment_tag
0,lelo andrés arauz dicho q gestionado gobierno ...,Negativo
1,edad enterar jorge glas andrés arauz primo,Negativo
2,extraordinario caravana lojo recibir andrés ar...,Positivo
3,david villamar mildeunar propuesta andrés arau...,Negativo
4,andrés arauz ganador debatepresidencial según ...,Negativo


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

## Eliminar los valores nulos en el texto

In [5]:
# Count the missing values in each column.
df.isna().sum()

full_text        2
sentiment_tag    0
dtype: int64

In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0)

In [7]:
# Re-count the missing values per column to confirm none remain.
df.isna().sum()

full_text        0
sentiment_tag    0
dtype: int64

## División de los datos en entrenamiento y prueba

##### Nota: `random_state` se fija para que siempre se puedan reproducir los mismos valores en cada partición

In [8]:

# Features are the tweet texts; targets are their sentiment labels.
X = df['full_text']
y = df['sentiment_tag']

# Hold out 20% of the rows for testing; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Report how many samples ended up in each partition.
print("La cantidad de datos usados para entrenar el modelo es:", len(X_train), sep="")
print("La cantidad de datos usados para probar el modelo es:", len(X_test), sep="")

La cantidad de datos usados para entrenar el modelo es:32339
La cantidad de datos usados para probar el modelo es:8085


# Representación TFIDF

## Bigramas

In [10]:
# TF-IDF vectorizer configured for unigrams and bigrams.
tfidf_vectorizer_bigram = TfidfVectorizer(
    ngram_range=(1, 2),
    use_idf=True,
)

# Fit the vectorizer on the training texts and turn them into a TF-IDF matrix.
X_train_tf_bigram = tfidf_vectorizer_bigram.fit_transform(X_train)
print("n_samples: %d, n_features: %d" % X_train_tf_bigram.shape)

n_samples: 32339, n_features: 163603


In [11]:
# Vectorize the test texts with the vectorizer already fitted on the training data
# (transform only, no re-fitting).
X_test_tf_bigram = tfidf_vectorizer_bigram.transform(X_test)
print("n_samples: %d, n_features: %d" % X_test_tf_bigram.shape)

n_samples: 8085, n_features: 163603


## Trigramas

In [15]:
# TF-IDF vectorizer configured for unigrams, bigrams and trigrams.
tfidf_vectorizer_trigram = TfidfVectorizer(
    ngram_range=(1, 3),
    use_idf=True,
)

# Fit the vectorizer on the training texts and turn them into a TF-IDF matrix.
X_train_tf_trigram = tfidf_vectorizer_trigram.fit_transform(X_train)
print("n_samples: %d, n_features: %d" % X_train_tf_trigram.shape)

n_samples: 32339, n_features: 364803


In [16]:
# Vectorize the test texts with the vectorizer already fitted on the training data
# (transform only, no re-fitting).
X_test_tf_trigram = tfidf_vectorizer_trigram.transform(X_test)
print("n_samples: %d, n_features: %d" % X_test_tf_trigram.shape)

n_samples: 8085, n_features: 364803


# Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [13]:
# Base random forest classifier used as the estimator for the grid searches;
# random_state is fixed so repeated runs build the same forest.
rf_clf = RandomForestClassifier(random_state=42)
# Candidate hyperparameter values explored for the random forest.
param_grid = {
    "max_depth": [5, 10, 15, 25, 30],
    "min_samples_split": [2, 5, 10, 15, 50, 100],
    "n_estimators": [10, 50, 100, 200, 400, 600, 800],
    "min_samples_leaf": [1, 2, 5, 10],
}

### Bigramas

In [None]:
# 5-fold cross-validated grid search over the random forest hyperparameters,
# scored by accuracy, on the bigram TF-IDF features.
gs_clf_b = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
# fit() returns the search object itself, so re-assigning the name is unnecessary.
gs_clf_b.fit(X_train_tf_bigram, y_train);

In [None]:
# best_score_ is the mean cross-validated score of the best parameter combination
# (measured on the held-out folds), not a score on the training data, so it is
# labelled accordingly.
print("Best mean cross-validation score:", gs_clf_b.best_score_)
# Estimator configured with the best parameters found by the search.
print(gs_clf_b.best_estimator_)

In [None]:
# Show the hyperparameter combination selected by the bigram random forest search.
print(gs_clf_b.best_params_)

### Trigramas

In [None]:
# 5-fold cross-validated grid search over the random forest hyperparameters,
# scored by accuracy, on the trigram TF-IDF features.
gs_clf_t = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
# fit() returns the search object itself, so re-assigning the name is unnecessary.
gs_clf_t.fit(X_train_tf_trigram, y_train);

In [None]:
# best_score_ is the mean cross-validated score of the best parameter combination
# (measured on the held-out folds), not a score on the training data, so it is
# labelled accordingly.
print("Best mean cross-validation score:", gs_clf_t.best_score_)
# Estimator configured with the best parameters found by the search.
print(gs_clf_t.best_estimator_)

In [None]:
# Show the hyperparameter combination selected by the trigram random forest search.
print(gs_clf_t.best_params_)

# Naive Bayes

### Bigramas

In [None]:
# Hyperparameter grid for the Multinomial Naive Bayes searches.
#   fit_prior: whether to learn class prior probabilities or use a uniform prior.
#   alpha:     additive smoothing strength.
# alpha = 0.0 was removed from the candidates: with no smoothing, any n-gram that
# never occurs in a class gets a zero probability, and scikit-learn either clips
# the value with a warning or produces log(0) terms, depending on its version.
# Very light smoothing is still covered by the 1e-3 candidate.
param_grid_naiveBayes = {
    'fit_prior': (True, False),
    'alpha': (30, 20, 10, 1.0, 0.1, 1e-2, 1e-3),
}

In [None]:
# 5-fold cross-validated grid search for Multinomial Naive Bayes on the bigram
# TF-IDF features. No `scoring` is passed, so the estimator's own score method is used.
naive_bayes_b = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid_naiveBayes,
    cv=5,
    n_jobs=-1,
)
# fit() returns the search object itself, so re-assigning the name is unnecessary.
naive_bayes_b.fit(X_train_tf_bigram, y_train);

In [None]:
# best_score_ is the mean cross-validated score of the best parameter combination
# (measured on the held-out folds), not a score on the training data, so it is
# labelled accordingly.
print("Best mean cross-validation score:", naive_bayes_b.best_score_)
# Estimator configured with the best parameters found by the search.
print(naive_bayes_b.best_estimator_)

In [None]:
# Show the hyperparameter combination selected by the bigram Naive Bayes search.
print(naive_bayes_b.best_params_)

### Trigramas

In [None]:
# 5-fold cross-validated grid search for Multinomial Naive Bayes on the trigram
# TF-IDF features. No `scoring` is passed, so the estimator's own score method is used.
naive_bayes_t = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid_naiveBayes,
    cv=5,
    n_jobs=-1,
)
# fit() returns the search object itself, so re-assigning the name is unnecessary.
naive_bayes_t.fit(X_train_tf_trigram, y_train);

In [None]:
# best_score_ is the mean cross-validated score of the best parameter combination
# (measured on the held-out folds), not a score on the training data, so it is
# labelled accordingly.
print("Best mean cross-validation score:", naive_bayes_t.best_score_)
# Estimator configured with the best parameters found by the search.
print(naive_bayes_t.best_estimator_)

In [None]:
# Show the hyperparameter combination selected by the trigram Naive Bayes search.
print(naive_bayes_t.best_params_)