In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import model_selection

In [24]:
data_path = "../data/uci_data.xls"

In [25]:
df = pd.read_excel(data_path, header=0, index_col=0, skiprows=1)

In [26]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
X_train = train.drop(columns=("default payment next month"))
y_train = train[["default payment next month"]]
X_test = test.drop(columns=("default payment next month"))
y_test = test[["default payment next month"]]

In [27]:
train.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21754,80000,2,2,2,24,0,0,0,0,0,...,73731,39643,39457,3503,5001,2092,1218,1445,878,0
252,30000,1,2,2,28,0,0,0,0,0,...,25255,22001,0,5006,1244,851,955,0,0,0
22942,180000,2,5,1,44,0,0,-1,-1,-1,...,0,6881,10340,0,850,0,6881,10340,182,0
619,60000,1,1,2,25,0,0,0,0,0,...,39639,39619,39140,2018,1900,2000,1500,1900,2000,0
17091,130000,2,2,2,25,0,0,0,0,0,...,117823,120854,123904,4100,4200,5000,5000,5000,10700,0


In [39]:
numeric_features = ["LIMIT_BAL", "AGE", 
                    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", 
                    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]

categorical_features = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]

In [98]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='saga', max_iter=3000))])

clf.fit(X_train, y_train.values.ravel());

C(0,0) True negatives
C(1,0) False negatives
C(1,1) True positives
C(0,1) False positives

In [99]:
metrics.confusion_matrix(y_test, clf.predict(X_test))

array([[4461,  226],
       [ 861,  452]])

In [67]:
metrics.accuracy_score(y_test, clf.predict(X_test))

0.8188333333333333

In [68]:
metrics.recall_score(y_test, clf.predict(X_test))

0.34424980959634427

In [69]:
metrics.roc_auc_score(y_test, clf.predict(X_test))

0.6480156664794181

In [100]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [121]:
param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [8, 10, 15]
}

model_lr = model_selection.GridSearchCV(clf, param_grid=param_grid, cv=5)

In [122]:
model_lr = model_lr.fit(X_train, y_train.values.ravel())

In [192]:
metrics.confusion_matrix(y_train, model_lr.predict(X_train))

array([[31775,  1555],
       [ 2598,  1426]])

In [123]:
metrics.confusion_matrix(y_test, model_lr.predict(X_test))

array([[4462,  225],
       [ 861,  452]])

In [190]:
metrics.accuracy_score(y_train, model_lr.predict(X_train)), metrics.recall_score(y_train, model_lr.predict(X_train))

(0.8888204743802538, 0.3543737574552684)

In [191]:
metrics.accuracy_score(y_test, model_lr.predict(X_test)), metrics.recall_score(y_test, model_lr.predict(X_test))

(0.819, 0.34424980959634427)

In [126]:
model_lr.best_params_

{'classifier__C': 8, 'classifier__penalty': 'l2'}

In [127]:
param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [6, 8, 9]
}

model_lr = model_selection.GridSearchCV(clf, param_grid=param_grid, cv=5)

model_lr = model_lr.fit(X_train, y_train.values.ravel())

In [None]:
metrics.confusion_matrix(y_train, model_lr.predict(X_train))

In [128]:
metrics.confusion_matrix(y_test, model_lr.predict(X_test))

array([[4462,  225],
       [ 861,  452]])

In [None]:
metrics.accuracy_score(y_train, model_lr.predict(X_train)), metrics.recall_score(y_train, model_lr.predict(X_train))

In [151]:
metrics.accuracy_score(y_test, model_lr.predict(X_test)), metrics.recall_score(y_test, model_lr.predict(X_test))

(0.819, 0.34424980959634427)

In [129]:
model_lr.best_params_

{'classifier__C': 8, 'classifier__penalty': 'l2'}

### Generar más muestras negativas para conseguir una muestra balanceada

In [130]:
from sklearn.utils import resample

In [134]:
train_ok = train[train['default payment next month'] == 0]
train_im = train[train['default payment next month'] == 1]

In [139]:
train_ok.shape, train_im.shape

((18677, 24), (5323, 24))

In [140]:
# sample with replacement to match majority class
train_im_resampled = resample(train, replace=True, n_samples=18677, random_state=42) 

In [141]:
train_im_resampled.shape

(18677, 24)

In [142]:
train_balanced = pd.concat([train_ok, train_im_resampled])

In [144]:
X_train_bl = train_balanced.drop(columns=("default payment next month"))
y_train_bl = train_balanced[["default payment next month"]]

In [153]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf_bl = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='saga', max_iter=3000))])

clf_bl.fit(X_train_bl, y_train_bl.values.ravel());

In [193]:
metrics.confusion_matrix(y_train, clf_bl.predict(X_train))

array([[32819,   511],
       [ 3356,   668]])

In [194]:
metrics.confusion_matrix(y_test, clf_bl.predict(X_test))

array([[4617,   70],
       [1099,  214]])

In [195]:
metrics.accuracy_score(y_train, clf_bl.predict(X_train)), metrics.recall_score(y_train, clf_bl.predict(X_train))

(0.8964769502596777, 0.16600397614314116)

In [196]:
metrics.accuracy_score(y_test, clf_bl.predict(X_test)), metrics.recall_score(y_test, clf_bl.predict(X_test))

(0.8051666666666667, 0.16298552932216298)

In [158]:
### Usamos ahora un decision tree

In [159]:
from sklearn.tree import DecisionTreeClassifier

In [171]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf_dt = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier(random_state=42))])

clf_dt.fit(X_train, y_train.values.ravel());

In [197]:
metrics.confusion_matrix(y_train, clf_dt.predict(X_train))

array([[33330,     0],
       [    7,  4017]])

In [198]:
metrics.confusion_matrix(y_test, clf_dt.predict(X_test))

array([[4068,  619],
       [ 920,  393]])

In [199]:
metrics.accuracy_score(y_train, clf_dt.predict(X_train)), metrics.recall_score(y_train, clf_dt.predict(X_train))

(0.9998126037372169, 0.9982604373757455)

In [200]:
metrics.accuracy_score(y_test, clf_dt.predict(X_test)), metrics.recall_score(y_test, clf_dt.predict(X_test))

(0.7435, 0.2993145468392993)

In [183]:
param_grid = {
    'classifier__max_depth': [10, 20, 50, 100],
    'classifier__min_samples_split': [2, 3],
    'classifier__min_samples_leaf': [2, 3]
}

model_dt = model_selection.GridSearchCV(clf_dt, param_grid=param_grid, cv=5)

model_dt = model_dt.fit(X_train, y_train.values.ravel())

In [201]:
metrics.confusion_matrix(y_train, model_dt.predict(X_train))

array([[33255,    75],
       [  534,  3490]])

In [184]:
metrics.confusion_matrix(y_test, model_dt.predict(X_test))

array([[4220,  467],
       [ 957,  356]])

In [202]:
metrics.accuracy_score(y_train, model_dt.predict(X_train)), metrics.recall_score(y_train, model_dt.predict(X_train))

(0.9836965251378701, 0.867296222664016)

In [203]:
metrics.accuracy_score(y_test, model_dt.predict(X_test)), metrics.recall_score(y_test, model_dt.predict(X_test))

(0.7626666666666667, 0.27113480578827115)

In [186]:
model_dt.best_params_

{'classifier__max_depth': 50,
 'classifier__min_samples_leaf': 2,
 'classifier__min_samples_split': 2}