# Modelo Random Forest

Implementado por Jesús David Barrios (j.barrios) - 201921887

### Importar librerías y cargar datos

In [10]:
# Pandas
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Numero máximo de filas a mostar

# Numpy
import numpy as np
np.random.seed(3301)

# Seaborn
import seaborn as sns 

# Matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# Vectorización de texto
from sklearn.feature_extraction.text import TfidfVectorizer

# Modelos
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
# Import SGD Classifier
from sklearn.linear_model import SGDClassifier

# Métricas
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay

# Optimización modelo
from sklearn.model_selection import GridSearchCV

# Guardar modelo
import joblib, pickle


In [11]:
# Load the preprocessed dataset from its relative path
db_route = '../data/processed_data.csv'
df = pd.read_csv(db_route)

# Report the dataset dimensions: rows first, then columns
n_rows, n_cols = df.shape
print('Número de filas: ', n_rows)
print('Número de columnas: ', n_cols)

# Remove the cap on how many columns pandas displays
pd.set_option('display.max_columns', None)

# Preview five randomly chosen rows
df.sample(n=5)

Número de filas:  195700
Número de columnas:  2


Unnamed: 0,class,tokens
21738,0,aaa i am lit stupidest person ex lik ev comp l...
26627,0,is sound rain on calm sound ear Is sound rain ...
135198,1,how clos amam tir liv tir easy target men tran...
140848,0,guess kiss girl not meampxbbtw i saw post said...
79670,0,should i delet reddit account i mean i that is...


In [14]:
# Number of missing values in each column
df.isna().sum()

class      0
tokens    59
dtype: int64

In [15]:
# Discard, in place, every row that has a missing value in any column
df.dropna(axis=0, how='any', inplace=True)

### Separación de datos

In [16]:
# Separate the independent variable (token text) from the dependent one (class label)
X, Y = df['tokens'], df['class']

In [17]:
# Turn the token strings into a TF-IDF document-term matrix.
# NOTE(review): an integer max_df is an absolute document count, so max_df=12
# discards every term that occurs in more than 12 documents — confirm this is
# intended (rather than min_df=12 or a float proportion).
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test documents influence the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_df=12)
X_count = vectorizer.fit_transform(raw_documents=X)

# Shape of the resulting matrix: (documents, vocabulary terms)
print(X_count.shape)

In [21]:
# Hold out 20% of the vectorized samples for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X_count,
    Y,
    test_size=0.2,
    random_state=42,
)

### Modelo

In [19]:
# Hyperparameter grid for GridSearchCV: 3 forest sizes x 3 split criteria = 9 candidates
parameters = dict(
    n_estimators=[80, 100, 120],
    criterion=["gini", "entropy", "log_loss"],
)

In [22]:
# Base estimator; the fixed seed makes the forests reproducible
model = RandomForestClassifier(random_state=42)

# Grid search over `parameters` with 5-fold cross-validation, using all available cores
b_model = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split; the fitted search object is the cell's output
b_model.fit(X_train, Y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


### Evaluación

In [None]:
# Predict labels for the held-out test split
Y_pred = b_model.predict(X_test)

# Headline metrics; F1, precision and recall use the support-weighted average over classes
print('Accuracy: ', accuracy_score(Y_test, Y_pred))
print('F1: ', f1_score(Y_test, Y_pred, average='weighted'))
print('Precision: ', precision_score(Y_test, Y_pred, average='weighted'))
print('Recall: ', recall_score(Y_test, Y_pred, average='weighted'))

# Per-class report. It must be given Y_pred: the previous call passed the
# undefined name `y`, which raised NameError.
print(classification_report(Y_test, Y_pred))

In [None]:
# Confusion matrix on the test split, normalized over the true labels (each row sums to 1).
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
ConfusionMatrixDisplay.from_estimator(
    b_model,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
    normalize='true',
);