In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# set terminal options to display all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
!pip install -U xlrd

In [None]:
!pip install -U pandas

## 1. Dataset
- We are using "default of credit card clients Data Set" from UCI Machine Learning Repository
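- This dataset contains information on customers' default payments (failure to pay the credit card bill by the due date) in Taiwan; the original study used it to compare the predictive accuracy of the probability of default among six data mining methods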
- Binary dependent variable "default payment next month" (1 = default, 0 = not default)
- Detailed attributes descriptions [here](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients)

In [None]:
# Load the credit-card default dataset into `df` and preview it.
import pandas as pd
from sklearn.metrics import roc_curve, auc

# show up to 100 columns when a DataFrame is displayed, so wide frames are not truncated
pd.set_option('display.max_columns', 100)

# header = 1: the column names are read from the second row of the sheet (row index 1).
# The 'ID' column is dropped right after loading (presumably a row identifier with no
# predictive value -- confirm against the dataset description).
df = pd.read_excel('http://fengmai.net//download/courses/2020S-BIA652NT-Pub/Week10-Classification%20and%20Hyperparameter%20Search%20using%20Python/default%20of%20credit%20card%20clients.xls', header = 1).drop('ID', axis = 1)
df.head()

In [None]:
# basic data exploration
df.shape
df.columns
df.isnull().values.any()

In [None]:
df.describe().T.to_excel("desc_stats.xlsx")

In [None]:
df['default payment next month'].value_counts()
# note that this is an imbalanced dataset, we should look at the confusion matrix rather than overall accuracy (will be discussed later).

In [None]:
import seaborn as sns
sns.set(style = 'whitegrid')
sns.countplot(data = df, x = 'default payment next month')

## 2. Creating Dummy Variables (One-Hot Encoding)
- Many machine learning algorithms cannot work with categorical data directly. Thus, categorical data must be converted to numbers (dummy variables), e.g., "cold": 1, "warm": 2, "hot": 3
- Yet, there may still be problems when there is no ordinal relationship and allowing the representation to lean on any such relationship might be damaging to learning to solve the problem, e.g. "dog": 1, "cat": 2. In these cases, we would like to give the model more expressive power to learn a probability-like number for each possible label value
- One Hot Encoding: a representation of categorical variables as binary vectors
    - first requires that the categorical values be mapped to integer values
    - each integer value is represented as a binary vector that is all zero values except the index of the integer, which is marked with a 1
- More details [here](https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/)

In [None]:
df.apply(lambda col: len(col.unique()))
# in our case, ['SEX', 'EDUCATION', 'MARRIAGE'] are categorical variables that need to be converted to dummy variables

In [None]:
# One-hot encode the categorical columns into `df2`.
# drop_first=True omits the first level of each variable, so the dummies of a
# variable are not perfectly collinear with the intercept of the regression.
col = ['SEX', 'EDUCATION', 'MARRIAGE']
# dtype=int keeps the dummies numeric on every pandas version. pandas >= 2.0
# defaults to bool dummies, and a frame mixing int and bool columns converts to
# an object array, which statsmodels' Logit refuses to fit.
df2 = pd.get_dummies(df, columns = col, drop_first=True, dtype=int)
df2.head()
df2.shape

## 3. Logistic Regression

In [None]:
import statsmodels.api as sm

logit_mod = sm.Logit(df2['default payment next month'], sm.add_constant((df2.drop('default payment next month', axis = 1))))
logit_res = logit_mod.fit()
print(logit_res.summary())

In [None]:
# Flag every pair of predictors whose correlation magnitude is at least 0.80 and
# collect one column of each pair in `to_drop` (a list, because it is later
# concatenated with another list of column names).
to_drop = []
# The intercept column is not added here: a constant has no defined correlation
# (its row/column would be all NaN and could never pass the threshold).
corr_matrix = df2.drop('default payment next month', axis = 1).corr()
for i in range(len(corr_matrix.columns)):
    # j < i walks the lower triangle only: each pair is visited once and the
    # diagonal (a column with itself) is never reached
    for j in range(i):
        # abs(): a strong negative correlation is as collinear as a strong positive one
        if abs(corr_matrix.iloc[i, j]) >= 0.80:
            print(f"{corr_matrix.columns[i]} and {corr_matrix.index[j]} have a correlation of {corr_matrix.iloc[i, j]}")
            # a column can appear in several flagged pairs; record it only once
            if corr_matrix.index[j] not in to_drop:
                to_drop.append(corr_matrix.index[j])

In [None]:
to_drop

In [None]:
import statsmodels.api as sm

logit_mod = sm.Logit(df2['default payment next month'], sm.add_constant((df2.drop(['default payment next month'] + to_drop, axis = 1))))
logit_res = logit_mod.fit(maxiter = 1000)
print(logit_res.summary())

In [None]:
log_sum = logit_res.summary()

In [None]:
log_sum.as_csv()

[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)  

**Make sure that you choose the regularization hyperparameter (C) wisely! The default C is 1.0, which can be very detrimental to model performance. Note that C is the inverse of lambda (the regularization strength); a smaller C means a stronger penalty for complicated models.** In this example, we set C to a large number.

In [None]:
# Separate the target from the predictors and hold out a test set.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

# y: the binary target column; x: every other column of the one-hot encoded frame
y = df2['default payment next month']
x = df2.drop('default payment next month', axis = 1)
# test_size = 0.2 holds out 20% of the rows for testing;
# random_state = 0 fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
# standardization before fitting data so that they have 0-mean and unit-variance: z = (x - u) / s
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)

In [None]:
lg = LogisticRegression(C=100, solver = 'lbfgs', max_iter = 1000)
lg.fit(x_train_s, y_train)

In [None]:
y_predict_test = lg.predict(x_test_s)

In [None]:
y_predict_test[:20]

In [None]:
y_test[:20].values

## 4. Performance evaluation
- Accuracy
- Confusion Matrix
- Precision, Recall, and F1-score
- More details [here](https://towardsdatascience.com/beyond-accuracy-precision-and-recall-3da06bea9f6c)

### 4.1. Accuracy
- Accuracy is often used to measure model performance

In [None]:
lg.score

In [None]:
print('classifier accuracy is {:.2f}'.format(lg.score(x_test_s, y_test)))

In [None]:
from sklearn.metrics import accuracy_score
print('classifier accuracy is {:.2f}'.format(accuracy_score(y_true=y_test, y_pred=y_predict_test)))

- However, for an imbalanced classification problem where **the sample is imbalanced (one category represents the overwhelming majority of the data points)** or **the cost is asymmetric**, accuracy can be a problematic metric
  - prediction of rare diseases
  - mortgage default
  - antibody test

### 4.2. Confusion Matrix
- Two types of prediction errors:
  - False Positive: Predict an event when there was no event
  - False Negative: Predict no event when in fact there was an event

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_test)
print('confusion matrix:\n', cm)

In [None]:
tn, fp, fn, tp = cm.ravel()
print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')

In [None]:
sns.heatmap(pd.DataFrame(cm), annot = True, cmap="YlGnBu")

### 4.3. Precision, Recall, and F1-score
- Precision: percentage of true cases among the predicted true cases
- Recall: percentage of true cases that have been retrieved over the total number of true cases
    - Metrics: <br>
   $$ 
    \begin{align}
     precision~of~positive~class &= \frac{true~positives}{ true~positives + false~positives} \\
     recall~of~positive~class &= \frac{true~positives}{ true~positives + false~negatives}
    \end{align}
    $$
- F-score: $$\frac{2*precision*recall}{precision+recall}$$

In [None]:
# classification report
print('classification report:\n', classification_report(y_test, y_predict_test))

### 4.4 Decision threshold

By default, most classification algorithms can output a predicted probability using `predict_proba()`. The default decision threshold is $p = 0.5$, meaning if $p(y=1|x) > 0.5$ then the predicted class is 1. Sometimes it is necessary to use a different decision threshold.

In [None]:
predict_prob = lg.predict_proba(x_test_s)
# note that the predicted prob has 2 columns and they add up to 1. 
# The columns give the predicted probability of each class.
predict_prob[:5]

When we set the threshold to 0.1, the recall for y = 1 class goes up, and precision goes down (think about why). This is a fundamental tradeoff that you need to make. The optimal threshold depends on 
- the relative cost between false positive and false negative. 
- how unbalanced the classes are in the train set.

In [None]:
print('classification report:\n', classification_report(y_test, predict_prob[:,1] > 0.1))

## 5. ROC curve and Precision-Recall Curve
- ROC curve and AUC
- Precision-Recall Curve
- More details [here](https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/)

### 5.1 ROC curve and AUC
- **Receiver operating characteristic (ROC) curve** plots the true positive rate (TPR) vs the false positive rate (FPR) as a function of the model’s threshold for classifying positives
- Metrics: <br>

$$ 
    \begin{align}
     true~positive~rate~(tpr) &= recall~of~positive~class  \\
     false~positive~rate~(fpr) &= \frac{false~positives}{ false~positives + true~negatives} \\
     & = 1- recall~of~negative~class
    \end{align}
$$ 

- **Area under the curve (AUC)** is a metric to calculate the overall performance of a classification model based on area under the ROC curve 
- **Important**: To generate the ROC curve, you need to use the output from `predict_proba`, not `predict`! The `roc_curve` function automatically varies the decision threshold and computes the TP and FP at any given threshold. 

In [None]:
y_test

In [None]:
predict_prob[:, 1]

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve

fpr, tpr, thresholds = roc_curve(y_test, predict_prob[:, 1])
print('AUC: {:.2f}'.format(auc(fpr, tpr)))

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize = (7, 5))
plt.plot(fpr, tpr, color = 'darkorange', lw = 2, label = 'Logistic Reg')
plt.plot([0, 1], [0, 1], color = 'navy', lw = 2, linestyle = '--', label = 'Random Guess')
plt.title('AUC of Logistic Model')
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate (Recall)' )
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.legend()
plt.show();

In [None]:
# !!!! Warning! Wrong Approach !!!! #######
# Deliberate counter-example: the scores handed to roc_curve are already
# thresholded at 0.5 (booleans), so roc_curve only has two distinct score values
# to sweep over instead of the full range of predicted probabilities.
# NOTE: this rebinds fpr, tpr and thresholds, replacing the values computed
# from the predicted probabilities in the earlier (correct) cell.
fpr, tpr, thresholds = roc_curve(y_test, predict_prob[:, 1] > 0.5)
print('AUC: {:.2f}'.format(auc(fpr, tpr)))

import matplotlib.pyplot as plt
plt.figure(figsize = (7, 5))
plt.plot(fpr, tpr, color = 'darkorange', lw = 2, label = 'Logistic Reg')
# diagonal reference line from (0, 0) to (1, 1), labelled as random guessing
plt.plot([0, 1], [0, 1], color = 'navy', lw = 2, linestyle = '--', label = 'Random Guess')
plt.title('AUC of Logistic Model (Wrong Way)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.legend()
plt.show();

Using the binary classification output instead of predicted probability to construct the ROC curve and to compute AUC is a common mistake. See [SAS's Python API tutorial](https://github.com/sassoftware/saspy-examples/blob/8b5caae91b375276c3d20b2aa62a3fce9ff881f7/SAS_contrib/SASPy%20for%20Machine%20Learning.ipynb)

### 5.2 Precision-Recall Curve
- **Precision-Recall Curve** plots the precision (y-axis) and the recall (x-axis) for different thresholds, much like the ROC curve


In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, predict_prob[:,1] )

plt.figure(figsize = (7, 5))
plt.plot(recall, precision, color='darkorange', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision_Recall_Curve of Logistic Model')
plt.show();

Quiz: When decision threshold increases, what happens to the horizontal axis (recall)?

## 6. PCA
- **Principal Component Analysis (PCA)** is a dimension-reduction tool that can be used to reduce a large set of variables to a small set that still contains most of the information in the large set.
- The main purpose of principal component analysis is to:
    - identify hidden patterns in a data set,
    - reduce the dimensionality of the data by removing the noise and redundancy in the data,
    - deal with multicollinearity

In [None]:
import numpy as np

# correlation matrix of the first 14 columns of df2
corr = df2.iloc[:, :14].corr()

# Generate a mask for the upper triangle (diagonal included), so that each pair
# of variables is drawn only once in the lower triangle.
# The builtin bool is used as dtype: the np.bool alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
# (vmax=.3 caps the color scale at 0.3; center=0 centers the colormap at zero)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show();

In [None]:
# Rebuild the train/test split as float NumPy arrays and standardize it for PCA.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

# .values.astype(float) turns the target and the predictors into plain float arrays
y = df2['default payment next month'].values.astype(float)
x = df2.drop('default payment next month', axis = 1).values.astype(float)
# 20% of the rows are held out; random_state = 0 makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Note that standardization is important in PCA since the latter projects your original data onto directions which maximize the variance
# If the features have different scales, this projection may get skewed
# The scaler is fitted on the training data only and then applied to both sets,
# so no statistics of the test set are used for the transformation.
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)

In [None]:
# find the number of principal components that together explain 80% of the variance of the standardized training data
# (a float n_components between 0 and 1 is interpreted as the share of variance to retain)
pca = PCA(n_components = 0.8)
pca.fit(x_train_s)
print('Variance ratio of each pc:\n', pca.explained_variance_ratio_, '\n')
print('Explained variance of each pc:\n', pca.explained_variance_, '\n')
print('Selected {} pcs'.format(pca.n_components_))
# note: df2 still contains the target column, so the number of features fed to PCA is df2.shape[1] - 1
print('Original dataset shape: ', df2.shape)

In [None]:
x_train_pca = pca.transform(x_train_s)
x_test_pca = pca.transform(x_test_s)

lg = LogisticRegression(C = 1000, solver = 'lbfgs', max_iter = 1000)
lg.fit(x_train_pca, y_train)

predict = lg.predict(x_test_pca)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
print('classifier accuracy is {:.2f}'.format(lg.score(x_test_pca, y_test)))

cm = confusion_matrix(y_test, predict)
print('confusion matrix:\n', cm)

print('classification report:\n', classification_report(y_test, predict))

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_mod = LinearDiscriminantAnalysis()
lda_mod.fit(x_train_pca, y_train)
predict = lda_mod.predict(x_test_pca)
cm = confusion_matrix(y_test, predict)
print('confusion matrix:\n', cm)
print('classification report:\n', classification_report(y_test, predict))

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda_mod = QuadraticDiscriminantAnalysis()
qda_mod.fit(x_train_pca, y_train)
predict = qda_mod.predict(x_test_pca)
cm = confusion_matrix(y_test, predict)
print('confusion matrix:\n', cm)
print('classification report:\n', classification_report(y_test, predict))

In [None]:
qda_mod = QuadraticDiscriminantAnalysis()
qda_mod.fit(x_train_pca, y_train)
predict = qda_mod.predict_proba(x_test_pca)[:, 1] > 0.9
cm = confusion_matrix(y_test, predict)
print('confusion matrix:\n', cm)
print('classification report:\n', classification_report(y_test, predict))

## Deciding the number of principal components by cross validation
- We can use CV on train set to decide the best number of PC (2-20) to maximize AUC
- Steps:
    - build a pipeline which executes necessary steps in a row
    - set the range of parameters to be tuned
    - grid search using cross validation

In [None]:
# Tune the number of principal components with a cross-validated grid search.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

# build a pipeline which executes three steps in a row
# (standardize -> project onto principal components -> logistic regression)
pipe = Pipeline([
    ('standardization', preprocessing.StandardScaler()),
    ('pca', PCA()),    
    ('lg', LogisticRegression(C=0.01, solver = 'lbfgs', max_iter = 1000))
])

# set the range of parameters to be tuned
# '<step name>__<parameter>' addresses a parameter of a pipeline step; here 2 to 20 components are tried
param_grid = {'pca__n_components':  range(2, 21)}
# grid search using cross validation
# 3-fold CV scored by ROC AUC; refit=True refits the best configuration on all data passed to fit()
grid = GridSearchCV(pipe, cv = 3, param_grid = param_grid, scoring = 'roc_auc', refit=True, verbose=1)
# the unscaled x_train is passed on purpose: the pipeline's own scaler (and PCA)
# are fitted inside each cross-validation fit
grid_fit = grid.fit(x_train, y_train)

We can use the following function to help report the results: 

In [None]:
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a hyperparameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each indexable by candidate position
        (e.g. the ``cv_results_`` attribute of a fitted ``GridSearchCV``).
    n_top : int, default 3
        Ranks 1..n_top are reported.

    Returns
    -------
    None. For every candidate on a reported rank, three lines (rank, mean and
    std of the validation score, parameters) followed by a blank line are
    written to stdout.
    """
    for rank in range(1, n_top + 1):
        # candidates can tie, so one rank may match several positions (or none)
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            summary_lines = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score),
                "Parameters: {0}".format(results['params'][pos]),
                "",
            ]
            print("\n".join(summary_lines))
            
report(grid.cv_results_)

Once the grid search is complete, the `GridSearchCV` will automatically refit on the entire train set. You can use the object to predict new observations directly. 

In [None]:
# This is not needed, unless you set refit = False when creating GridSearchCV. 
# final_model = grid.best_estimator_.fit(x_train, y_train)

In [None]:
grid.predict_proba(x_test)[:3]