## Taller Random Forest y Naive Bayes
##### Monitor: Juan Nicolas Piedrahita Salas

#### Introducción a la inteligencia artificial 2023-02

Este dataset contiene información sobre el clima en algunas ciudades de Australia. El objetivo es predecir si lloverá o no al día siguiente.

# 1. Importar librerías

In [None]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

# 2. importar dataset

In [None]:
# Load the raw weather data, discard the 'Date' column and every row that
# has a missing value, then renumber the index from zero.
data = (
    pd.read_csv('AusDataForRainPred.csv')
    .drop(columns=['Date'])
    .dropna()
    .reset_index(drop=True)
)

# Separate the target column from the predictors.
y = data['RainTomorrow']
X = data.drop(columns='RainTomorrow')

display(X, y)

In [None]:
# Print a summary of the cleaned frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Draw one KDE plot per numeric column, split by the 'RainTomorrow' class.
cols = data.select_dtypes(include=np.number).columns

# Size the grid from the number of numeric columns (4 plots per row) so the
# loop below can never index past the available axes.
n_plot_cols = 4
n_plot_rows = max(1, -(-len(cols) // n_plot_cols))  # ceiling division

_, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(20, 5 * n_plot_rows),
    squeeze=False,  # always a 2-D array, even for a single row
)

axes = axes.flatten()
for i, col in enumerate(cols):
    sns.kdeplot(data=data, x=col, hue="RainTomorrow", ax=axes[i])
    axes[i].set_title(col)

# Hide the grid cells that did not receive a plot.
for unused_ax in axes[len(cols):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# Bar chart of how many rows fall in each 'RainTomorrow' class.
class_counts = data['RainTomorrow'].value_counts()
ax = class_counts.plot.bar()

ax.set_ylabel('Count')
ax.set_xlabel('RainTomorrow')
plt.show()

# 3. Preprocesamiento de datos

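Estandarización (standard scaling): a cada variable numérica se le resta su media y se divide por su desviación estándar, $z = (x - \mu) / \sigma$.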

<img src="https://cdn-images-1.medium.com/max/370/1*Nlgc_wq2b-VfdawWX9MLWA.png"/>

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler

# Column groups, selected by dtype, for each preprocessing step.
numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Standardise the numeric columns and integer-encode the object columns;
# any column in neither group is passed through unchanged.
# NOTE(review): the transformer is fitted on the full X before the
# train/test split, so statistics from the test rows leak into the scaling.
ColTransformer = ColumnTransformer(
    transformers=[
        ('StandardScaler', StandardScaler(), numeric_cols),
        ('OrdinalEncoder', OrdinalEncoder(), categorical_cols),
    ],
    remainder='passthrough',
    n_jobs=-1,
    verbose_feature_names_out=False,
)

# Replace the predictors with the transformed values and the labels with
# their integer codes.
X = ColTransformer.fit_transform(X)
y = LabelEncoder().fit_transform(y)

# 4. Dividir el dataset en entrenamiento y prueba

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 5. Buscar los mejores hiperparámetros tanto para el modelo de árbol como para el modelo de bosque

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the decision tree. Each max_depth value appears
# exactly once so that no parameter combination is fitted twice.
grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": list(range(2, 9 + 1)),
    "min_samples_split": range(2, 6 + 1, 2),
    "min_samples_leaf": range(2, 6 + 1, 2),
}

# Exhaustive cross-validated search over the grid, using every CPU core.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(class_weight="balanced", random_state=42),
    param_grid=grid,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Show the winning combination and its mean cross-validated score.
display(
    "mejores parametros",
    grid_search.best_params_,
    "",
    "mejor accurracy",
    grid_search.best_score_
)

'mejores parametros'

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 2}

''

'mejor accurracy'

0.7822994210090986

In [None]:
# Hyperparameter grid for the random forest.
split_and_leaf_sizes = range(2, 6 + 1, 2)
grid = dict(
    criterion=["gini", "entropy"],
    max_depth=[12, 13, 14, 15],
    min_samples_split=split_and_leaf_sizes,
    min_samples_leaf=split_and_leaf_sizes,
)

# Exhaustive cross-validated search over the grid, using every CPU core.
forest = RandomForestClassifier(class_weight="balanced", random_state=42)
grid_search = GridSearchCV(estimator=forest, param_grid=grid, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Show the winning combination and its mean cross-validated score.
display(
    "mejores parametros",
    grid_search.best_params_,
    "",
    "mejor accurracy",
    grid_search.best_score_,
)

'mejores parametros'

{'criterion': 'entropy',
 'max_depth': 15,
 'min_samples_leaf': 2,
 'min_samples_split': 2}

''

'mejor accurracy'

0.8539761313954862

# 6. entrenar modelos

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Decision tree configured with the best hyperparameters reported by its
# grid search.
modelDT = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=42,
    class_weight="balanced",
)

# Random forest configured with the best hyperparameters reported by its
# grid search (max_depth=15, matching the recorded search result).
modelRF = RandomForestClassifier(
    criterion='entropy',
    max_depth=15,
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,
)

# Fit both models on the training partition.
modelDT.fit(X_train, y_train)
modelRF.fit(X_train, y_train)

# 7. Análisis de métricas

In [None]:
# Accuracy of each tree-based model on the training and test partitions.
# The frame is built directly from the scores so its columns are numeric
# rather than object-dtype placeholders filled in afterwards.
res = pd.DataFrame(
    {
        "DecisionTree": [
            modelDT.score(X_train, y_train),
            modelDT.score(X_test, y_test),
        ],
        "RandomForest": [
            modelRF.score(X_train, y_train),
            modelRF.score(X_test, y_test),
        ],
    },
    index=["Entrenamiento", "Prueba"],
)

# Grouped bars: one group per partition, one bar per model.
res.plot.bar(rot=0, figsize=(10, 5))

plt.title("metricas DT vs RF")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Print train/test accuracy for each tree-based model, with a blank line
# between the two reports.
for position, (label, model) in enumerate(
    [('Decision Tree', modelDT), ('Random Forest', modelRF)]
):
    if position > 0:
        print()
    print(label)
    print('Train score: ', model.score(X_train, y_train))
    print('Test score: ', model.score(X_test, y_test))

#### Matriz de confusión del Decision Tree

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Side-by-side confusion matrices of the decision tree: training data on
# the left axes, test data on the right axes.
_, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(17, 7))

cmTrain = ConfusionMatrixDisplay.from_estimator(
    modelDT, X_train, y_train, cmap="Blues", ax=ax1
)
cmTest = ConfusionMatrixDisplay.from_estimator(
    modelDT, X_test, y_test, cmap="Blues", ax=ax2
)

# Each display draws on the axes it was given, so title those axes directly.
ax1.set_title("matriz de confusion para los datos de entrenamiento")
ax2.set_title("matriz de confusion para los datos de prueba")

plt.show()

#### Matriz de confusión del Random Forest

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Side-by-side confusion matrices of the random forest: training data on
# the left axes, test data on the right axes.
_, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(17, 7))

cmTrain = ConfusionMatrixDisplay.from_estimator(
    modelRF, X_train, y_train, cmap="Blues", ax=ax1
)
cmTest = ConfusionMatrixDisplay.from_estimator(
    modelRF, X_test, y_test, cmap="Blues", ax=ax2
)

# Each display draws on the axes it was given, so title those axes directly.
ax1.set_title("matriz de confusion para los datos de entrenamiento")
ax2.set_title("matriz de confusion para los datos de prueba")

plt.show()

Vemos que el modelo Random Forest alcanza una mayor exactitud que el modelo Decision Tree, tanto en entrenamiento como en prueba.

## Clasificación bayesiana

In [None]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier and fit it on the training
# partition in one step.
modeloNB = GaussianNB().fit(X_train, y_train)

#### exactitud del modelo

In [None]:
# Accuracy of the Naive Bayes model on each partition.
train_accuracy = modeloNB.score(X_train, y_train)
test_accuracy = modeloNB.score(X_test, y_test)

print("exactitud para el entrenamiento =", train_accuracy)
print("exactitud para prueba =", test_accuracy)

#### Matrices de confusión

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Side-by-side confusion matrices of the Naive Bayes model: training data
# on the left axes, test data on the right axes.
_, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(17, 7))

cmTrain = ConfusionMatrixDisplay.from_estimator(
    modeloNB, X_train, y_train, cmap="Blues", ax=ax1
)
cmTest = ConfusionMatrixDisplay.from_estimator(
    modeloNB, X_test, y_test, cmap="Blues", ax=ax2
)

# Each display draws on the axes it was given, so title those axes directly.
ax1.set_title("matriz de confusion para los datos de entrenamiento")
ax2.set_title("matriz de confusion para los datos de prueba")

plt.show()

### Comparación final de los tres modelos

In [None]:
# Train/test accuracy of the three fitted models, one column per model.
fitted_models = {
    "Decision tree": modelDT,
    "Random forest": modelRF,
    "Naive bayes": modeloNB,
}
res = pd.DataFrame(
    {
        label: [model.score(X_train, y_train), model.score(X_test, y_test)]
        for label, model in fitted_models.items()
    },
    index=["Entrenamiento", "Prueba"],
)

# Grouped bars: one group per partition, one bar per model.
res.plot.bar(rot=0, figsize=(10, 5))

plt.title("metricas DT vs RF vs NB")
plt.ylabel("Accuracy")
plt.show()