In [1]:
# Configure IPython so that every top-level expression in a cell has its value
# displayed, not only the last one. Bare expressions anywhere in later cells
# will therefore be echoed to the output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Exercises
- In the exercises, you will still use the same dataset after One-Hot Encoding (which is df2)
- The dependent variable is "default payment next month"

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import numpy as np
from sklearn.metrics import roc_curve, auc, precision_recall_curve

In [3]:
# Download the UCI "default of credit card clients" workbook. The column names
# are taken from the second spreadsheet row (header=1), and the 'ID' column is
# dropped right after loading.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls'
df = pd.read_excel(DATA_URL, header=1).drop('ID', axis=1)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummies are not perfectly collinear.
col = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
df2 = pd.get_dummies(df, columns=col, drop_first=True)
df2.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_5_8,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
0,20000,24,3913,3102,689,0,0,0,0,689,...,0,0,0,0,0,0,0,0,0,0
1,120000,26,2682,1725,2682,3272,3455,3261,0,1000,...,0,0,0,1,0,0,0,0,0,0
2,90000,34,29239,14027,13559,14331,14948,15549,1518,1500,...,0,0,1,0,0,0,0,0,0,0
3,50000,37,46990,48233,49291,28314,28959,29547,2000,2019,...,0,0,1,0,0,0,0,0,0,0
4,50000,57,8617,5670,35835,20940,19146,19131,2000,36681,...,0,0,1,0,0,0,0,0,0,0


In [4]:
from sklearn.model_selection import train_test_split

# Separate the dependent variable from the predictors and cast both to float
# NumPy arrays.
TARGET = 'default payment next month'
y = df2[TARGET].values.astype(float)
x = df2.drop(TARGET, axis=1).values.astype(float)

# Hold out 10% of the rows as the test set; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=0
)

You can then report the final model performance on the test set (skipped). 

## Q1 - Grid-search both C and number of PCs

Next, we use the grid search procedure to search for both the optimal C (the inverse of the L2 regularization strength) and the number of principal components.

- Define a parameter grid such that pca__n_components is 5 to 20, and add C as a parameter that can take the following values: [1000, 100, 10, 1, 0.1, 0.01]. Hint: use `pipe.get_params().keys()` to get the right key for C. 
- Which C and PC combination is the best?

In [5]:
def report(results, n_top=3):
    """Print a summary of the top-ranked candidates from a grid search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', each indexable by candidate position (for example the
        ``cv_results_`` attribute of a fitted ``GridSearchCV``).
    n_top : int, default 3
        Highest rank to report. Ranks 1 through ``n_top`` are printed in
        order; every candidate holding a given rank is printed, so ties
        produce several entries for the same rank.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates that hold exactly this rank.
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][pos]}")
            print("")

In [6]:
# Q1: jointly tune the number of principal components and the logistic
# regression regularization with a cross-validated grid search.
pipe = Pipeline([
    ('standardization', preprocessing.StandardScaler()),
    # Seed PCA so the search is reproducible: with svd_solver='auto',
    # scikit-learn may choose the randomized SVD solver for data of this size,
    # and that solver is stochastic unless random_state is fixed.
    ('pca', PCA(random_state=0)),
    # C=0.01 is only a placeholder; the grid below overrides it via 'lg__C'.
    ('lg', LogisticRegression(C=0.01, solver='lbfgs', max_iter=1000)),
])

# Keys follow the '<step name>__<parameter>' convention of Pipeline.
param_grid = {
    'pca__n_components': list(range(5, 21)),   # 5 through 20 inclusive
    'lg__C': [1000, 100, 10, 1, 0.1, 0.01],
}

# 3-fold cross-validation on the training set, candidates ranked by ROC AUC.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, scoring='roc_auc')
# fit() returns the fitted search object; binding it to a name keeps the
# estimator repr from being echoed (ast_node_interactivity is set to "all").
grid_fit = grid.fit(x_train, y_train)

In [7]:
# Print the candidates ranked 1-3 (report's default n_top) by mean
# cross-validated score.
report(grid.cv_results_)

Model with rank: 1
Mean validation score: 0.762 (std: 0.006)
Parameters: {'lg__C': 100, 'pca__n_components': 19}

Model with rank: 2
Mean validation score: 0.761 (std: 0.005)
Parameters: {'lg__C': 10, 'pca__n_components': 18}

Model with rank: 3
Mean validation score: 0.761 (std: 0.006)
Parameters: {'lg__C': 100, 'pca__n_components': 17}



## Q2 - Provide AUC in the test set
- Based on Exercise 1, use the best chosen model to provide AUC for the test set

In [8]:
# Q2: score the held-out test set with the fitted grid search, which predicts
# using the best parameter combination it found. Keep the second probability
# column (presumably the positive class, label 1.0).
predicted_prob = grid.predict_proba(x_test)[:, 1]

# Area under the ROC curve on the test set.
fpr, tpr, thresholds = roc_curve(y_test, predicted_prob)
test_auc = auc(fpr, tpr)
print(f'AUC: {test_auc:.2f}')

AUC: 0.75


## Q3 - Confusion matrix, precision, recall, F-1
- When decision threshold is 0.5:
    - Provide Confusion matrix, precision, recall, F-1 for the test set

In [9]:
from sklearn.metrics import confusion_matrix, classification_report

# Q3: turn probabilities into class predictions at a 0.5 decision threshold,
# then reuse the same predictions for both summaries.
y_pred = predicted_prob > 0.5

cm = confusion_matrix(y_test, y_pred)
print('confusion matrix:\n', cm)
print('classification report:\n', classification_report(y_test, y_pred))

confusion matrix:
 [[2258  102]
 [ 476  164]]
classification report:
               precision    recall  f1-score   support

         0.0       0.83      0.96      0.89      2360
         1.0       0.62      0.26      0.36       640

   micro avg       0.81      0.81      0.81      3000
   macro avg       0.72      0.61      0.62      3000
weighted avg       0.78      0.81      0.77      3000



## Q4 Optimal decision threshold

If the average cost of a false positive prediction is 1 and the average cost of a false negative prediction is 2, which decision threshold minimizes the total cost on the test set, given your predicted_prob? Report your answer to 2 decimal places. 

In [10]:
# Q4: sweep decision thresholds 0.00, 0.01, ..., 0.99 and record the total
# misclassification cost on the test set at each one.
FP_COST = 1   # cost of one false positive
FN_COST = 2   # cost of one false negative

dt_cost = []
for dt in np.arange(0.0, 1.0, 0.01):
    print(f"decision threshold: {dt:.2f}")
    # Flattened 2x2 confusion matrix, unpacked as tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_prob > dt).ravel()
    cost = FP_COST * fp + FN_COST * fn
    print(f"cost: {cost:.2f}")
    dt_cost.append([dt, cost])

decision threshold: 0.00
cost: 2360.00
decision threshold: 0.01
cost: 2356.00
decision threshold: 0.02
cost: 2349.00
decision threshold: 0.03
cost: 2338.00
decision threshold: 0.04
cost: 2319.00
decision threshold: 0.05
cost: 2299.00
decision threshold: 0.06
cost: 2287.00
decision threshold: 0.07
cost: 2263.00
decision threshold: 0.08
cost: 2218.00
decision threshold: 0.09
cost: 2168.00
decision threshold: 0.10
cost: 2103.00
decision threshold: 0.11
cost: 2003.00
decision threshold: 0.12
cost: 1884.00
decision threshold: 0.13
cost: 1733.00
decision threshold: 0.14
cost: 1583.00
decision threshold: 0.15
cost: 1417.00
decision threshold: 0.16
cost: 1283.00
decision threshold: 0.17
cost: 1218.00
decision threshold: 0.18
cost: 1143.00
decision threshold: 0.19
cost: 1092.00
decision threshold: 0.20
cost: 1042.00
decision threshold: 0.21
cost: 1004.00
decision threshold: 0.22
cost: 990.00
decision threshold: 0.23
cost: 974.00
decision threshold: 0.24
cost: 959.00
decision threshold: 0.25
cos

In [11]:
# Stack the [threshold, cost] pairs into a 2-D array: column 0 holds the
# thresholds, column 1 the costs.
dt_cost = np.array(dt_cost)

# argmin returns the first row attaining the minimum cost, i.e. the lowest
# such threshold when several tie.
best_row = np.argmin(dt_cost[:, 1])

# Format to two decimals, as the question asks. The thresholds come from a
# floating-point np.arange, so an unformatted value can print with rounding
# noise (e.g. 0.5700000000000001).
print(f"optimal decision threshold is :{dt_cost[best_row, 0]:.2f}")

optimal decision threshold is :0.25
