In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling

from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgbm
import xgboost as xgb

from sklearn.ensemble import VotingClassifier

In [25]:
def plotRoc(predictions):
    predictions = predictions[['label','probability']]

    y = predictions['label']
    scores = predictions['probability']
    fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=1)
    
        # La guardo
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
            lw=lw, label='Curva ROC')
    plt.plot([0, 1], [0, 1], color = 'navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Ratio Falsos Positivos')
    plt.ylabel('Ratio Verdaderos Positivos')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.show()

In [26]:
def precissionRecall(y_test, predictions):
    df = pd.DataFrame({'label':y_test, 'prediction': predictions})
    verdaderos_positivos = len(df[(df['label'] == 1) & (df['prediction'] == 1)])
    falsos_positivos = len(df[(df['label'] == 0) & (df['prediction'] == 1)])
    falsos_negativos = len(df[(df['label'] == 1) & (df['prediction'] == 0)])
    precission = "\nPrecission: {}".format(verdaderos_positivos/(verdaderos_positivos+falsos_positivos))
    recall = "\nRecall: {}".format(verdaderos_positivos/(verdaderos_positivos+falsos_negativos))
    
    return str(precission+recall)

In [27]:
def resultados_modelo(model, X_test, y_test):
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)
    preds = pd.DataFrame({'label': list(y_test), 'probability': list(probabilities)})
    preds['probability'] = preds['probability'].apply(lambda x: x[1])

    plotRoc(preds)
    info = ""
    info = info + ('Nº Variables: {}'.format(len(X_test.columns)))
    info = info + ("\nScore: {}".format(model.score(X_test, y_test)))

    from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, f1_score

    info = info + ('\nConfusion Matrix\n'+str(confusion_matrix(y_test, predictions)))

    fpr, tpr, thresholds = roc_curve(y_test, predictions)
    info = info + ('\nROC : {}'.format(roc_auc_score(y_test, preds['probability'] )))
    info = info + ('\nF1 : {}'.format(f1_score(y_test, predictions)))
    info = info + (precissionRecall(y_test, predictions))
#     info = info + ('\noob_decision_function_: {}'.format(model.oob_decision_function_))
    # info = info + ('\estimators_: {}'.format(model.estimators_))
    # info = info + ('\estimator_params: {}'.format(model.estimator_params))
    print(info)

In [28]:
def metricas(x_train, y_train,x_test, y_test, model, label_modelo = 'modelo'):
    
    # Train
    y_pred_train = model.predict(x_train)
    
    # Test
    y_pred_test = model.predict(x_test)
    
    resultados = pd.DataFrame({'Modelo': [label_modelo + '_train', label_modelo + '_test'],
                               'Precision' : [metrics.precision_score(y_train, y_pred_train), metrics.precision_score(y_test, y_pred_test)], 
                               'Recall':[metrics.recall_score(y_train, y_pred_train), metrics.recall_score(y_test, y_pred_test)], 
                               'F1': [metrics.f1_score(y_train, y_pred_train),metrics.f1_score(y_test, y_pred_test)],
                               'AUC': [roc_auc_score(y_train, y_pred_train), roc_auc_score(y_test, y_pred_test)]}, 
                              index = [0,1])
    return resultados

# metricas(x_test, y_test, rf, 'rf')

# Lectura del dataset y split train/test

In [29]:
path_lectura = r'C:\Users\admin_eci2019\notebooks\Proyecto_ECI\Data\05_con_variables_continuas_categorizadas'
path_guardado = r'C:\Users\admin_eci2019\notebooks\Proyecto_ECI\Data\05_con_variables_continuas_categorizadas'

In [30]:
df = pd.read_pickle(path_lectura + r'\df_train_hogar.pkl')
df = pd.get_dummies(df)

In [31]:
X = df.drop(['target'],axis=1).copy()
y = df['target'].copy()

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 22)

In [39]:
pandas_profiling.ProfileReport(df)

  variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)


0,1
Number of variables,64
Number of observations,502725
Total Missing (%),0.0%
Total size in memory,68.1 MiB
Average record size in memory,142.0 B

0,1
Numeric,2
Categorical,0
Boolean,59
Date,0
Text (Unique),0
Rejected,3
Unsupported,0

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.25606

0,1
0.0,373996
1.0,128729

Value,Count,Frequency (%),Unnamed: 3
0.0,373996,74.4%,
1.0,128729,25.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.097863

0,1
0.0,453527
1.0,49198

Value,Count,Frequency (%),Unnamed: 3
0.0,453527,90.2%,
1.0,49198,9.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.32797

0,1
0.0,337846
1.0,164879

Value,Count,Frequency (%),Unnamed: 3
0.0,337846,67.2%,
1.0,164879,32.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.16601

0,1
0.0,419270
1.0,83455

Value,Count,Frequency (%),Unnamed: 3
0.0,419270,83.4%,
1.0,83455,16.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.15758

0,1
0.0,423508
1.0,79217

Value,Count,Frequency (%),Unnamed: 3
0.0,423508,84.2%,
1.0,79217,15.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.49097

0,1
0,255900
1,246825

Value,Count,Frequency (%),Unnamed: 3
0,255900,50.9%,
1,246825,49.1%,

0,1
Distinct count,4736
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,64.451
Minimum,-9
Maximum,13465
Zeros (%),58.0%

0,1
Minimum,-9
5-th percentile,-9
Q1,-9
Median,0
Q3,0
95-th percentile,0
Maximum,13465
Range,13474
Interquartile range,9

0,1
Standard deviation,612.22
Coef of variation,9.499
Kurtosis,187.92
Mean,64.451
MAD,132.23
Skewness,12.869
Sum,32401000
Variance,374820
Memory size,7.7 MiB

Value,Count,Frequency (%),Unnamed: 3
0.0,291603,58.0%,
-9.0,196609,39.1%,
683.0,30,0.0%,
849.0,25,0.0%,
632.0,25,0.0%,
472.0,24,0.0%,
688.0,23,0.0%,
700.0,22,0.0%,
471.0,22,0.0%,
713.0,21,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-9.0,196609,39.1%,
0.0,291603,58.0%,
1.0,2,0.0%,
2.0,1,0.0%,
3.0,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
13460.0,3,0.0%,
13461.0,3,0.0%,
13462.0,1,0.0%,
13464.0,3,0.0%,
13465.0,3,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.087245

0,1
0,458865
1,43860

Value,Count,Frequency (%),Unnamed: 3
0,458865,91.3%,
1,43860,8.7%,

0,1
Distinct count,502725
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,36228000
Minimum,34
Maximum,94307097
Zeros (%),0.0%

0,1
Minimum,34
5-th percentile,3692700
Q1,20238000
Median,35153000
Q3,48551000
95-th percentile,90039000
Maximum,94307097
Range,94307063
Interquartile range,28313000

0,1
Standard deviation,22285000
Coef of variation,0.61513
Kurtosis,0.52521
Mean,36228000
MAD,17088000
Skewness,0.73817
Sum,18212820876192
Variance,496620000000000
Memory size,27.7 MiB

Value,Count,Frequency (%),Unnamed: 3
31567928,1,0.0%,
31534688,1,0.0%,
13411640,1,0.0%,
18477869,1,0.0%,
13417787,1,0.0%,
36472124,1,0.0%,
12358974,1,0.0%,
29320512,1,0.0%,
32470338,1,0.0%,
36666691,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
34,1,0.0%,
240,1,0.0%,
307,1,0.0%,
356,1,0.0%,
414,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
94306727,1,0.0%,
94306834,1,0.0%,
94306859,1,0.0%,
94306909,1,0.0%,
94307097,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.36165

0,1
0,320913
1,181812

Value,Count,Frequency (%),Unnamed: 3
0,320913,63.8%,
1,181812,36.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.12463

0,1
0,440068
1,62657

Value,Count,Frequency (%),Unnamed: 3
0,440068,87.5%,
1,62657,12.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.28175

0,1
0,361082
1,141643

Value,Count,Frequency (%),Unnamed: 3
0,361082,71.8%,
1,141643,28.2%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.23634

0,1
0,383912
1,118813

Value,Count,Frequency (%),Unnamed: 3
0,383912,76.4%,
1,118813,23.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.35728

0,1
0,323113
1,179612

Value,Count,Frequency (%),Unnamed: 3
0,323113,64.3%,
1,179612,35.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.63238

0,1
1,317914
0,184811

Value,Count,Frequency (%),Unnamed: 3
1,317914,63.2%,
0,184811,36.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.056132

0,1
0,474506
1,28219

Value,Count,Frequency (%),Unnamed: 3
0,474506,94.4%,
1,28219,5.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.034834

0,1
0,485213
1,17512

Value,Count,Frequency (%),Unnamed: 3
0,485213,96.5%,
1,17512,3.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.24597

0,1
0,379070
1,123655

Value,Count,Frequency (%),Unnamed: 3
0,379070,75.4%,
1,123655,24.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.030683

0,1
0,487300
1,15425

Value,Count,Frequency (%),Unnamed: 3
0,487300,96.9%,
1,15425,3.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.16738

0,1
0,418580
1,84145

Value,Count,Frequency (%),Unnamed: 3
0,418580,83.3%,
1,84145,16.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.16647

0,1
0,419036
1,83689

Value,Count,Frequency (%),Unnamed: 3
0,419036,83.4%,
1,83689,16.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.16666

0,1
0,418942
1,83783

Value,Count,Frequency (%),Unnamed: 3
0,418942,83.3%,
1,83783,16.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.16652

0,1
0,419012
1,83713

Value,Count,Frequency (%),Unnamed: 3
0,419012,83.3%,
1,83713,16.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.16635

0,1
0,419097
1,83628

Value,Count,Frequency (%),Unnamed: 3
0,419097,83.4%,
1,83628,16.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.16663

0,1
0,418958
1,83767

Value,Count,Frequency (%),Unnamed: 3
0,418958,83.3%,
1,83767,16.7%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.50018

0,1
1,251453
0,251272

Value,Count,Frequency (%),Unnamed: 3
1,251453,50.0%,
0,251272,50.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.49982

0,1
0,251453
1,251272

Value,Count,Frequency (%),Unnamed: 3
0,251453,50.0%,
1,251272,50.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.11432

0,1
0,445255
1,57470

Value,Count,Frequency (%),Unnamed: 3
0,445255,88.6%,
1,57470,11.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.11763

0,1
0,443588
1,59137

Value,Count,Frequency (%),Unnamed: 3
0,443588,88.2%,
1,59137,11.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.10478

0,1
0,450049
1,52676

Value,Count,Frequency (%),Unnamed: 3
0,450049,89.5%,
1,52676,10.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.10818

0,1
0,448341
1,54384

Value,Count,Frequency (%),Unnamed: 3
0,448341,89.2%,
1,54384,10.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.13787

0,1
0,433416
1,69309

Value,Count,Frequency (%),Unnamed: 3
0,433416,86.2%,
1,69309,13.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.1039

0,1
0,450493
1,52232

Value,Count,Frequency (%),Unnamed: 3
0,450493,89.6%,
1,52232,10.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.094427

0,1
0,455254
1,47471

Value,Count,Frequency (%),Unnamed: 3
0,455254,90.6%,
1,47471,9.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.11855

0,1
0,443127
1,59598

Value,Count,Frequency (%),Unnamed: 3
0,443127,88.1%,
1,59598,11.9%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.10035

0,1
0,452277
1,50448

Value,Count,Frequency (%),Unnamed: 3
0,452277,90.0%,
1,50448,10.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.90214

0,1
1,453529
0,49196

Value,Count,Frequency (%),Unnamed: 3
1,453529,90.2%,
0,49196,9.8%,

0,1
Correlation,0.99998

0,1
Correlation,0.98318

0,1
Correlation,0.98318

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.50028

0,1
1,251503
0,251222

Value,Count,Frequency (%),Unnamed: 3
1,251503,50.0%,
0,251222,50.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.49972

0,1
0,251503
1,251222

Value,Count,Frequency (%),Unnamed: 3
0,251503,50.0%,
1,251222,50.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.58005

0,1
1,291608
0,211117

Value,Count,Frequency (%),Unnamed: 3
1,291608,58.0%,
0,211117,42.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.41995

0,1
0,291608
1,211117

Value,Count,Frequency (%),Unnamed: 3
0,291608,58.0%,
1,211117,42.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.82462

0,1
1,414555
0,88170

Value,Count,Frequency (%),Unnamed: 3
1,414555,82.5%,
0,88170,17.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.17538

0,1
0,414555
1,88170

Value,Count,Frequency (%),Unnamed: 3
0,414555,82.5%,
1,88170,17.5%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.70571

0,1
1,354778
0,147947

Value,Count,Frequency (%),Unnamed: 3
1,354778,70.6%,
0,147947,29.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.29429

0,1
0,354778
1,147947

Value,Count,Frequency (%),Unnamed: 3
0,354778,70.6%,
1,147947,29.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.25054

0,1
0,376772
1,125953

Value,Count,Frequency (%),Unnamed: 3
0,376772,74.9%,
1,125953,25.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.24934

0,1
0,377374
1,125351

Value,Count,Frequency (%),Unnamed: 3
0,377374,75.1%,
1,125351,24.9%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.24998

0,1
0,377052
1,125673

Value,Count,Frequency (%),Unnamed: 3
0,377052,75.0%,
1,125673,25.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.25013

0,1
0,376977
1,125748

Value,Count,Frequency (%),Unnamed: 3
0,376977,75.0%,
1,125748,25.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.64018

0,1
1,321832
0,180893

Value,Count,Frequency (%),Unnamed: 3
1,321832,64.0%,
0,180893,36.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.35982

0,1
0,321832
1,180893

Value,Count,Frequency (%),Unnamed: 3
0,321832,64.0%,
1,180893,36.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.60369

0,1
1,303489
0,199236

Value,Count,Frequency (%),Unnamed: 3
1,303489,60.4%,
0,199236,39.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.39631

0,1
0,303489
1,199236

Value,Count,Frequency (%),Unnamed: 3
0,303489,60.4%,
1,199236,39.6%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.59644

0,1
1,299847
0,202878

Value,Count,Frequency (%),Unnamed: 3
1,299847,59.6%,
0,202878,40.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.40356

0,1
0,299847
1,202878

Value,Count,Frequency (%),Unnamed: 3
0,299847,59.6%,
1,202878,40.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.88945

0,1
1,447149
0,55576

Value,Count,Frequency (%),Unnamed: 3
1,447149,88.9%,
0,55576,11.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.11055

0,1
0,447149
1,55576

Value,Count,Frequency (%),Unnamed: 3
0,447149,88.9%,
1,55576,11.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.78247

0,1
1,393365
0,109360

Value,Count,Frequency (%),Unnamed: 3
1,393365,78.2%,
0,109360,21.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.21753

0,1
0,393365
1,109360

Value,Count,Frequency (%),Unnamed: 3
0,393365,78.2%,
1,109360,21.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.88174

0,1
1,443274
0,59451

Value,Count,Frequency (%),Unnamed: 3
1,443274,88.2%,
0,59451,11.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.11826

0,1
0,443274
1,59451

Value,Count,Frequency (%),Unnamed: 3
0,443274,88.2%,
1,59451,11.8%,

Unnamed: 0,label3,v8,label2,label5,label4,v39,v16,v40,id,target,v45_Alto,v45_Bajo,v45_Medio,v45_NI,v33_C,v33_D,v33_NI,v33_S,v33_V,v3_disc_v3_-9.0_2.0,v3_disc_v3_136.0_362.0,v3_disc_v3_1709.0_17422.0,v3_disc_v3_3.0_135.0,v3_disc_v3_363.0_758.0,v3_disc_v3_759.0_1708.0,v24_disc_v24_-9.0_762.0,v24_disc_v24_763.0_17453.0,v43_disc_v43_0.0_35.0,v43_disc_v43_36.0_41.0,v43_disc_v43_42.0_45.0,v43_disc_v43_46.0_49.0,v43_disc_v43_50.0_54.0,v43_disc_v43_55.0_58.0,v43_disc_v43_59.0_62.0,v43_disc_v43_63.0_69.0,v43_disc_v43_70.0_97.0,v14_disc_v14_-9.0_0.0,v14_disc_v14_1.0_13120.0,v20_disc_v20_-9.0_0.0,v20_disc_v20_1.0_17.0,v46_disc_v46_0.0_118.0,v46_disc_v46_119.0_999.0,v23_disc_v23_0.0_1.0,v23_disc_v23_2.0_17453.0,v2_disc_v2_0.0_1.0,v2_disc_v2_2.0_14.0,v53_disc_v53_0.0_1.0,v53_disc_v53_1.02_7370.89,v44_disc_v44_10024.0_11666.0,v44_disc_v44_11668.0_13900.0,v44_disc_v44_13909.0_40326.0,v44_disc_v44_5307.0_10022.0,v58_disc_v58_0.0_1.0,v58_disc_v58_1.01_6259.0,v28_disc_v28_-9.0_0.0,v28_disc_v28_1.0_12356.0,v51_disc_v51_0.0_1.0,v51_disc_v51_1.05_7000.0,v4_disc_v4_0.0_1.0,v4_disc_v4_2.0_4.0,v52_disc_v52_0.0_1.0,v52_disc_v52_2.0_5821.65,v1_disc_v1_0.0_1.0,v1_disc_v1_2.0_27.0
0,1.0,0.0,0.0,1.0,0.0,1,0.0,1,34,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,0,0,0,1,0,1,0,1,0,1,0,1,1,0
1,0.0,0.0,0.0,1.0,0.0,1,-9.0,0,240,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,1,0
2,1.0,0.0,1.0,0.0,1.0,1,-9.0,1,307,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1
3,0.0,0.0,0.0,1.0,0.0,1,0.0,0,356,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0
4,0.0,0.0,0.0,0.0,1.0,1,-9.0,0,414,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,1,0,1,1,0,0,1,1,0


In [45]:
# profile = df.profile_report(title = 'hola')
# profile.to_file(outputfile="Report_ECI-Hogar.html") 


# Modelo Regresión Logística

In [21]:
from sklearn.linear_model import LogisticRegression

model_logistic = LogisticRegression(class_weight='balanced',
                                    solver = 'sag', # For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones
                                    random_state=22,
                                    n_jobs=-1)

model_logistic.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=22, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
datos_metricas = metricas(X_train, y_train,X_test, y_test, model_logistic, 'logistica')
datos_metricas

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Modelo,Precision,Recall,F1,AUC
0,logistica_train,0.0,0.0,0.0,0.499996
1,logistica_test,0.0,0.0,0.0,0.5


# Modelo BaggingClassifier con DecisionTree

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

In [None]:
model_bagging = BaggingClassifier(DecisionTreeClassifier(criterion = 'gini',
                                                       max_depth = 17,
                                                       max_features = 'auto',
                                                       random_state=1))

model_baggin.fit(X_train, y_train)

In [None]:
# Poner la optimización de los hiperparámetros
max_depth = [10, 13, 15, 20]
# min_weight_fraction_leaf = [0.003]
criterion = ['gini', 'entropy']
min_samples_split = [100, 150, 200]
min_samples_leaf = [15, 20, 25] 
max_leaf_nodes = [2**8, 2**12, 2**14, 2**16]
max_features = ['auto']
class_weight = [{1:4},{1:10},'balanced']

hyperF = dict(
              criterion = criterion, 
              max_depth = max_depth,  
              min_samples_split = min_samples_split, 
              min_samples_leaf = min_samples_leaf, 
              min_weight_fraction_leaf = min_weight_fraction_leaf,
              max_features = max_features,
#               max_leaf_nodes = max_leaf_nodes,
              class_weight = class_weight
)

model_dt = BaggingClassifier(DecisionTreeRegressor(criterion = 'gini',
                                 max_depth = 17,
                                 max_features = 'auto',
                                 random_state=1))

gridF = GridSearchCV(clf_over, hyperF, cv = 3, scoring = 'f1', verbose = 1, n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

In [None]:
dtm = bestF.best_estimator_.fit(X_train,y_train)

# Modelo RandomForest Classifier


https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
model_rfc = RandomForestClassifier(n_estimators=200,
                                   criterion='gini',
                                   max_depth=17, 
                                   min_samples_split=100,
                                   min_samples_leaf=1, 
                                   min_weight_fraction_leaf=0.05,
                                   max_features='auto', n_jobs=None,
                                   random_state=1, verbose=0, 
                                   class_weight=[{1:2},'balanced'])

model_rfc.fit(X_train, y_train)

# Modelo XGBoost

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [None]:
import sklearn.ensemble

model_xgb = GradientBoostingClassifier(loss='deviance', # 'deviance', 'exponential'
                                       # learning_rate=0.1,
                                       n_estimators=200, 
                                       criterion='friedman_mse', # 'mse', 'friedman_mse', 'mae'
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.0,
                                       max_depth=3, 
                                       init=None,
                                       random_state=None,
                                       max_features=None,
                                       verbose=0, 
                                       max_leaf_nodes=None,
                                       warm_start=False,
                                       presort='auto',
                                       n_iter_no_change=None,
                                       tol=0.0001)

In [None]:
best_xgb = model_xgb.fit(X_train, y_train)


# Modelo LightGBM

https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMModel.html#lightgbm.LGBMModel

In [None]:
import lightgbm as lgb

params = {}

model_lgb = lgb.LGBMModel(boosting_type='gbdt', # ‘gbdt’, ‘dart’, ‘goss’, ‘rf’.
                          num_leaves=31, # minimo de hojas del arbol
                          max_depth=20, 
                          n_estimators=200, 
                          objective=None,
                          class_weight=[{1:2},'balanced'],
                          colsample_bytree=1.0,
                          reg_alpha=0.0, 
                          reg_lambda=0.0, 
                          random_state=1,
                          n_jobs=-1, 
                          silent=False, 
                          importance_type='split' # 'split','gain'
                         )

In [None]:
best_lgb = model_lgb.fit(X_train, y_train)

In [2]:
model_gbm()

# Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
# model1 = LogisticRegression(random_state=1)
# model2 = tree.DecisionTreeClassifier(random_state=1)
# model = VotingClassifier(estimators=[('lr', model1), ('dt', model2)], voting='hard')
model.fit(X_train,y_train)
model.score(X_test,y_test)