# Explore here

It's recommended to use this notebook for exploration purposes.

For example: 

1. You could import the CSV generated by python into your notebook and explore it.
2. You could connect to your database using `pandas.read_sql` from this notebook and explore it.

In [1]:
!pip install -r ../requirements.txt

Collecting sqlalchemy==1.4.37
  Downloading SQLAlchemy-1.4.37-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting pymysql==1.0.2
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.4.2
  Downloading pandas-1.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m140.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hCollecting python-dotenv==0.20.0
  Downloading python_dotenv-0.20.0-py3-none-any.whl (17 kB)
Collecting psycopg2-binary==2.9.3
  Downloading psycopg2_binary-

In [2]:
import matplotlib.pyplot as plt

import numpy as np 
import pandas as pd
import seaborn as sns
import pickle
import xgboost as xgb

from xgboost import XGBClassifier
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from urllib.request import urlretrieve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV



In [3]:
df_raw = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/random-forest-project-tutorial/main/titanic_train.csv') 
print(df_raw)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [4]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df_raw.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S
342,343,0,2,"Collander, Mr. Erik Gustaf",male,28.0,0,0,248740,13.0,,S
87,88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S
624,625,0,3,"Bowen, Mr. David John ""Dai""",male,21.0,0,0,54636,16.1,,S
862,863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Ba...",female,48.0,0,0,17466,25.9292,D17,S
477,478,0,3,"Braund, Mr. Lewis Richard",male,29.0,1,0,3460,7.0458,,S
714,715,0,2,"Greenberg, Mr. Samuel",male,52.0,0,0,250647,13.0,,S
303,304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S


In [6]:
df_raw['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
df_raw['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
df_raw.info

<bound method DataFrame.info of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                   

Borramos las columnas que no vamos a precisar

In [9]:
df_transform=df_raw.drop(['Cabin','PassengerId','Ticket','Name'],axis=1)

In [10]:
# dos variables nuevas
df_transform['Sex_encoded']=df_transform['Sex'].apply(lambda x: 1 if x=="female" else 0)

In [11]:
df_transform = df_transform.drop(['Sex'],axis=1)

In [12]:
df_transform['Embarked_S']=df_transform['Embarked'].apply(lambda x: 1 if x=="S" else 0)

In [13]:
df_transform['Embarked_C']=df_transform['Embarked'].apply(lambda x: 1 if x=="C" else 0)

In [14]:
df_transform['Age_clean']=df_transform['Age'].fillna(30)

In [15]:
df_transform=df_transform.drop(['Embarked'],axis=1)
df_transform=df_transform.drop(['Age'],axis=1)

In [16]:
df=df_transform.copy()

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Survived     891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   SibSp        891 non-null    int64  
 3   Parch        891 non-null    int64  
 4   Fare         891 non-null    float64
 5   Sex_encoded  891 non-null    int64  
 6   Embarked_S   891 non-null    int64  
 7   Embarked_C   891 non-null    int64  
 8   Age_clean    891 non-null    float64
dtypes: float64(2), int64(7)
memory usage: 62.8 KB


In [18]:
X=df.drop(['Survived'],axis=1)

In [19]:
y=df['Survived']

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=70)

In [21]:
# Fit a Decision Tree model as comparison

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7892376681614349

In [22]:
# Fit a Random Forest model

clf = RandomForestClassifier(n_estimators=100, max_features="auto",random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

  warn(


0.7982062780269058

In [23]:
# Step 6: Fit a Gradient Boosting model

clf = GradientBoostingClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)



0.8340807174887892

Por ahora el GradientBoosting es el mejor resultado con 0.83 de accurancy_score

In [24]:
D_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
D_test = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

In [25]:
D_train

<xgboost.core.DMatrix at 0x7f81e172b970>

Parametros a seleccionar

booster: El tipo de modelo de clasificación usado, por defecto gbtree.
objective: El tipo de tarea de clasificación que realizaremos. Para clasificación binaria, nuestro caso, especificamos binary:logistic.
max_depth: “Profundidad” o número de nodos de bifurcación de los árboles de decisión usados en el entrenamiento. Aunque una mayor profundidad puede devolver mejores resultados, también puede resultar en overfitting (sobre ajuste).
eta: La tasa de aprendizaje del modelo. Un mayor valor llega más rápidamente al mínimo de la función objetivo, es decir, a un “mejor modelo”, pero puede “pasarse” de su valor óptimo. En cambio, un valor pequeño puede nunca llegar al valor óptimo de la función objetivo, incluso después de muchas iteraciones. En ambos casos, esto afecta el desempeño de nuestros modelos con nuevos.
nthread: El número de hilos computacionales que serán usados en el proceso de entrenamiento. Generalmente se refiere a los núcleos del procesador de tu equipo de cómputo, local o remoto, pero también pueden ser los núcleos de un GPU.
nround: El número de iteraciones que se realizarán antes de detener el proceso de ajuste. Un mayor número de iteraciones generalmente devuelve mejores resultados de predicción, pero necesita más tiempo de entrenamiento y conlleva un riesgo de sobre ajuste si son demasiadas rondas.

In [26]:
parametros = {"booster":"gbtree", "max_depth": 2, "eta": 0.3, "objective": "binary:logistic", "nthread":2}
rondas = 10

In [27]:
evaluacion = [(D_test, "eval"), (D_train, "train")]

Con estas variables definidas, continuamos con el entrenamiento del modelo. Notarás que se irán mostrando los resultados de evaluación de cada iteración del modelo, hasta que alcance el número de rondas que hemos definido.

In [28]:
modelo = xgb.train(parametros, D_train, rondas, evaluacion)

[0]	eval-logloss:0.58920	train-logloss:0.58220
[1]	eval-logloss:0.53747	train-logloss:0.52306
[2]	eval-logloss:0.49385	train-logloss:0.48806
[3]	eval-logloss:0.48303	train-logloss:0.46617
[4]	eval-logloss:0.46340	train-logloss:0.45297
[5]	eval-logloss:0.45382	train-logloss:0.44076
[6]	eval-logloss:0.44218	train-logloss:0.43088
[7]	eval-logloss:0.44012	train-logloss:0.42523
[8]	eval-logloss:0.43525	train-logloss:0.41942
[9]	eval-logloss:0.43272	train-logloss:0.41407




Generacion de Predicciones

In [29]:
prediccion = modelo.predict(D_test)

In [30]:
prediccion

array([0.9250106 , 0.95114714, 0.8693489 , 0.17490175, 0.17490175,
       0.18023612, 0.8970491 , 0.8785872 , 0.21051596, 0.17490175,
       0.54657185, 0.17490175, 0.17490175, 0.14877917, 0.12675628,
       0.18130095, 0.3768865 , 0.14877917, 0.14877917, 0.14877917,
       0.9034693 , 0.10812957, 0.30748942, 0.8970491 , 0.8745533 ,
       0.2813915 , 0.14877917, 0.32292524, 0.9034693 , 0.14877917,
       0.95114714, 0.8410089 , 0.18130095, 0.9034693 , 0.8467681 ,
       0.14877917, 0.12675628, 0.14877917, 0.93659914, 0.11522276,
       0.9034693 , 0.51526344, 0.20244259, 0.2460344 , 0.70040876,
       0.12675628, 0.30748942, 0.11522276, 0.32453743, 0.63643867,
       0.14877917, 0.8970491 , 0.570457  , 0.20244259, 0.30748942,
       0.54657185, 0.9250106 , 0.47641724, 0.6025949 , 0.14877917,
       0.23125269, 0.14877917, 0.1929879 , 0.58501273, 0.26940903,
       0.54657185, 0.9250106 , 0.12675628, 0.14877917, 0.14877917,
       0.9034693 , 0.14877917, 0.18130095, 0.95114714, 0.21051

In [31]:
prediccion = [1 if i > .7 else 0 for i in prediccion]
prediccion[:10]

[1, 1, 1, 0, 0, 0, 1, 1, 0, 0]

Ahora vamos a ver que tan acertadas estan nuestras predicciones

In [32]:
def metricas(objetivo, prediccion):
    matriz_conf = confusion_matrix(objetivo, prediccion)
    score = accuracy_score(objetivo, prediccion)
    reporte = classification_report(objetivo, prediccion)
    metricas = [matriz_conf, score, reporte]
    return(metricas)

In [33]:
metricas_1 = metricas(y_test, prediccion)
[print(i) for i in metricas_1]

[[128   3]
 [ 42  50]]
0.7982062780269058
              precision    recall  f1-score   support

           0       0.75      0.98      0.85       131
           1       0.94      0.54      0.69        92

    accuracy                           0.80       223
   macro avg       0.85      0.76      0.77       223
weighted avg       0.83      0.80      0.78       223



[None, None, None]

Probamos con un segundo modelo

In [34]:
parametros_02 = {"booster":"gbtree", "max_depth": 4, "eta": .3, "objective": "binary:logistic", "nthread":2}
rondas_02 = 100

In [35]:
modelo_02 = xgb.train(parametros_02, D_test, rondas_02, evaluacion, early_stopping_rounds=10)

[0]	eval-logloss:0.55395	train-logloss:0.59087
[1]	eval-logloss:0.46874	train-logloss:0.53625
[2]	eval-logloss:0.41092	train-logloss:0.51072
[3]	eval-logloss:0.37318	train-logloss:0.49780
[4]	eval-logloss:0.34640	train-logloss:0.49443
[5]	eval-logloss:0.32051	train-logloss:0.48429
[6]	eval-logloss:0.30145	train-logloss:0.47721
[7]	eval-logloss:0.28405	train-logloss:0.48033
[8]	eval-logloss:0.27007	train-logloss:0.48526
[9]	eval-logloss:0.25582	train-logloss:0.48358
[10]	eval-logloss:0.24276	train-logloss:0.48885
[11]	eval-logloss:0.23610	train-logloss:0.49754
[12]	eval-logloss:0.22674	train-logloss:0.49899
[13]	eval-logloss:0.21934	train-logloss:0.49925
[14]	eval-logloss:0.21273	train-logloss:0.50252
[15]	eval-logloss:0.20598	train-logloss:0.51731
[16]	eval-logloss:0.20037	train-logloss:0.52150




In [36]:
prediccion_02 = modelo_02.predict(D_test)
prediccion_02 = [1 if i > .7 else 0 for i in prediccion_02]

In [37]:
prediccion_02[:10]

[1, 1, 1, 0, 0, 0, 1, 1, 0, 0]

In [38]:
metricas_02 = metricas(y_test, prediccion_02)

[print(i) for i in metricas_02]

[[130   1]
 [ 23  69]]
0.8923766816143498
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       131
           1       0.99      0.75      0.85        92

    accuracy                           0.89       223
   macro avg       0.92      0.87      0.88       223
weighted avg       0.91      0.89      0.89       223



[None, None, None]

Este modelo mejora mucho por lo cual lo elijo como mi resultado y procedo a guardarlo

In [39]:
# Guardar modelo
filename = '../models/modelo_boosting.sav'
pickle.dump(modelo_02, open(filename, 'wb'))

In [40]:
# Si queremos cargar el archivo guardado en la carpeta models

load_model = pickle.load(open('../models/modelo_boosting.sav', 'rb'))