In [None]:
#importamos librerias

# Visualizacion y tratado de datos
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold

In [None]:
# Load the modelling dataframe and the sample-submission template from the working directory.
# NOTE(review): the frame used for training is read from 'test.csv', yet a later cell
# takes the 'SeriousDlqin2yrs' target from it - confirm this is the intended file.
df = pd.read_csv(filepath_or_buffer='test.csv')
df_sample = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [None]:
# Clean the dataframe
from sklearn.impute import KNNImputer

# Rows with no dependents count are discarded. reset_index() keeps the previous
# row labels in a new 'index' column, which a later cell drops again.
df = df.dropna(subset=['NumberOfDependents']).reset_index()

# Fill the missing MonthlyIncome values with a 2-neighbour KNN imputer that is
# fitted on that single column only.
# NOTE(review): with one column there is no other feature to measure distance on,
# so this presumably falls back to the column mean - confirm.
income_cols = ['MonthlyIncome']
imputer = KNNImputer(n_neighbors=2)
imputer.fit(df[income_cols])
df[income_cols] = imputer.transform(df[income_cols])

In [None]:
# Correlation heatmap of the columns currently in the dataframe
corr_matrix = df.corr()
heat_palette = sns.diverging_palette(148, 280, s=85, l=25, n=7)

plt.figure(figsize=(7, 7))
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    cmap=heat_palette,
    square=True,
    linewidths=.5,
);

In [None]:
# Post-visualisation preparation: separate the predictors from the target.
# errors='ignore' turns the drop into a no-op when the helper 'index' column is
# already gone, so re-running this cell no longer raises a KeyError.
df.drop(columns=['index'], inplace=True, errors='ignore')

# X holds every remaining column except the target; y is the target column.
X = df.drop(columns=['SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed, so the split is reproducible).
# The evaluation split gets its own names: the cells below read X_test / y_test,
# and leaving X / y untouched keeps this cell safe to re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Baseline model: a random forest with default hyper-parameters
from sklearn.ensemble import RandomForestClassifier

# Seed fixed (same value as the train/test split) so the fitted forest, and the
# metrics computed from it, are reproducible across runs.
rand_forest = RandomForestClassifier(random_state=42)

rand_forest.fit(X_train, y_train)

RandomForestClassifier()

In [None]:
# Class labels predicted by the random forest for the X_test features
preds = rand_forest.predict(X_test)

In [None]:
# Evaluate the baseline random forest
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,\
                            roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix

# ROC AUC is a ranking metric, so it is computed from the predicted probability
# of the positive class rather than from hard 0/1 labels.
# Column 1 of predict_proba is the class with the larger label (assumes a binary target).
positive_scores = rand_forest.predict_proba(X_test)[:, 1]

# scikit-learn metrics take (y_true, y_pred) in that order; passing them reversed
# swaps precision and recall.
print("Score del modelo (accuracy):", round(rand_forest.score(X_test, y_test), 3))
print("Accuracy score:", round(accuracy_score(y_test, preds), 3))
print("Recall score:", round(recall_score(y_test, preds), 3))
print("Precision score:", round(precision_score(y_test, preds), 3))
print("F1 score:", round(f1_score(y_test, preds), 3))
print("AUC:", round(roc_auc_score(y_test, positive_scores), 3))

Score del modelo (accuracy): 0.935
Accuracy score: 0.935
Recall score: 0.538
Precision score: 0.155
F1 score: 0.241
AUC: 0.74


In [None]:
# Grid search over several model families to find the best classifier for this project

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# The pipeline needs some estimator in its 'classifier' slot; every grid below
# swaps in its own estimator, so the RandomForestClassifier here is a placeholder.
pipe = Pipeline(steps=[
    ('classifier', RandomForestClassifier())
])


# liblinear supports both the 'l1' and the 'l2' penalty. The default solver
# rejects 'l1', which made those candidates fail and score nan.
logistic_params = {
    'classifier': [LogisticRegression(solver='liblinear')],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.01, 0.1, 0.5, 1]
}


dtc_params = {
    'classifier': [DecisionTreeClassifier()],
    'classifier__max_depth': [10]
}

# Random forest is searched with its default hyper-parameters only.
random_forest_params = {
    'classifier': [RandomForestClassifier()]
}


# Defined but NOT added to search_space below, so this grid is never evaluated here.
# NOTE(review): 'deviance' was renamed 'log_loss' in newer scikit-learn releases -
# confirm against the installed version before enabling this grid.
gbc_params = {
    'classifier': [GradientBoostingClassifier()],
    'classifier__n_estimators': [10, 100, 1000],
    'classifier__loss': ('deviance', 'exponential')
}

search_space = [
    logistic_params,
    dtc_params,
    random_forest_params
]

# Run the search: 10-fold cross-validation, candidates ranked by ROC AUC,
# using all available cores.
clf = GridSearchCV(estimator=pipe,
                   scoring='roc_auc',
                   param_grid=search_space,
                   cv=10,
                   verbose=1,
                   n_jobs=-1)

clf.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


        nan 0.64615918 0.80139333 0.84475947]


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': [0.01, 0.1, 0.5, 1],
                          'classifier__penalty': ['l1', 'l2']},
                         {'classifier': [DecisionTreeClassifier()],
                          'classifier__max_depth': [10]},
                         {'classifier': [RandomForestClassifier()]}],
             scoring='roc_auc', verbose=1)

In [None]:
# Inspect the winner of the search: refitted pipeline, its parameters and its mean CV score
for fitted_attr in ('best_estimator_', 'best_params_', 'best_score_'):
    print(getattr(clf, fitted_attr))

Pipeline(steps=[('classifier', RandomForestClassifier())])
{'classifier': RandomForestClassifier()}
0.8447594690992556


In [None]:
# Second attempt: search a gradient boosting classifier on its own to see whether it improves the score

from sklearn.ensemble import GradientBoostingClassifier

# Placeholder estimator; the 'classifier' entry of the grid replaces it.
pipe = Pipeline(steps=[('classifier', RandomForestClassifier())])

# Exponential loss only, two learning rates -> two candidates.
gbc_params = dict([
    ('classifier', [GradientBoostingClassifier()]),
    ('classifier__loss', ['exponential']),
    ('classifier__learning_rate', [0.1, 0.5]),
])
search_space = [gbc_params]

# Same search settings as before: 10-fold CV, ROC AUC scoring, all cores.
cv_settings = dict(scoring='roc_auc', cv=10, verbose=1, n_jobs=-1)
clf = GridSearchCV(estimator=pipe, param_grid=search_space, **cv_settings)

# Registry of searches by model name (only gradient boosting in this run).
grids = dict(gbc=clf)

# Run the search with gradient boosting only
clf.fit(X_train, y_train)

Fitting 10 folds for each of 2 candidates, totalling 20 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'classifier': [GradientBoostingClassifier(loss='exponential')],
                          'classifier__learning_rate': [0.1, 0.5],
                          'classifier__loss': ['exponential']}],
             scoring='roc_auc', verbose=1)

In [None]:
# Inspect the winner of the gradient boosting search, one item per line
print(clf.best_estimator_, clf.best_params_, clf.best_score_, sep='\n')

Pipeline(steps=[('classifier', GradientBoostingClassifier(loss='exponential'))])
{'classifier': GradientBoostingClassifier(loss='exponential'), 'classifier__learning_rate': 0.1, 'classifier__loss': 'exponential'}
0.8627666362137341


In [None]:
# Gradient boosting was the best model, so we keep it
clf.best_estimator_

Pipeline(steps=[('classifier', GradientBoostingClassifier(loss='exponential'))])