In [None]:
# import libraries
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

## 1. Introduction to the Data Set

For this dataset, we carry out the analysis in the following order:
1. We check for the meaning of each feature
2. We check for the possible existence of missing values and duplicate terms
3. We check for the unexpected/unexplained values in some variables
4. We check for the distribution of each variable, including both categorical and continuous
5. We check whether the data is unbalanced
6. We do the data preprocessing, including category consolidation (replace the unexplained values with reasonable existing value), data encoding and scaling
7. We split the train and test set
8. We perform feature selection (correlation test, chi-squared test, SelectKBest method, backward feature selection) on the train set
9. We drop the unnecessary features from both train and test set
10. We perform the linear separability check on the train set
11. We apply Synthetic Minority Over-sampling Technique (SMOTE) to train set, and export the train and test set to separate csv files
12. We use different models to fit the train set, then apply the model to the test set


In [None]:
# card_divided uses prof's code to split the data file into train and test set
# the first 22500 rows makes up the train set, the remaining rows are test set
df_card = pd.read_csv('card_divided.csv',skiprows=0)

In [None]:
df_card.head()

## 2. Exploratory Data Analysis

In [None]:
df_card.info()

In [None]:
# see summary statistics
df_card.describe()

In [None]:
# Duplicate and missing data

# Check for missing values
# get the number of missing records in each column
print(df_card.isnull().sum())
print("Presence of missing values: ",  df_card.isnull().values.any())

# Check for duplicates - use ID since each client will have a different ID.
# The unique-ID count alone only reveals duplicates when compared by hand with
# the row count, so the number of repeated IDs is reported explicitly as well.
print("Number of unique IDs : " , df_card['ID'].nunique())
print("Number of duplicated IDs : ", df_card['ID'].duplicated().sum())


From the summary statistics, we can see that `AGE` has a minimum of 21 and a maximum of 79. Since the minimum age to hold a credit card in Taiwan is 18 (Pinzon, 2023), all `AGE` values are valid.

Reference: 
Pinzon, J. L. (2023, January 5). Young Taiwanese adults can open bank accounts themselves│TVBS新聞網. TVBS. Retrieved April 11, 2023, from https://news.tvbs.com.tw/english/2009577

In [None]:
# The summary table shows the PAY_n columns reaching a minimum of -2, a value
# that was not defined; print the minimum of every PAY_n column to see whether
# they all share it.
for pay_col in ('PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'):
    print(df_card[pay_col].min())


In [None]:
# For each categorical column, print how often every code occurs followed by
# the distinct codes present in the column.
categorical_columns = (
    ('SEX', 'Sex column values: '),
    ('EDUCATION', 'Education column values: '),
    ('MARRIAGE', 'Marriage column values: '),
)
for column, caption in categorical_columns:
    print(df_card[column].value_counts())
    print(caption, df_card[column].unique())

In [None]:
# visualise distribution of categorical variables
sex_counts = df_card.SEX.value_counts()
education_counts = df_card.EDUCATION.value_counts()
marriage_counts = df_card.MARRIAGE.value_counts()


def _pie_with_legend(counts, code_names, title, legend_loc, **pie_kwargs):
    """Draw a pie chart of `counts` whose legend is keyed on the category codes.

    value_counts() orders categories by frequency, not by code, so the legend
    text is looked up per wedge from `code_names` instead of being listed in a
    fixed order. Codes missing from `code_names` are shown as 'undocumented'.

    Parameters:
        counts: Series of frequencies indexed by category code.
        code_names: dict mapping a category code to its description.
        title: title of the chart.
        legend_loc: matplotlib legend location string.
        **pie_kwargs: extra keyword arguments forwarded to Axes.pie.

    Returns:
        The (figure, axes) pair that was created.
    """
    figure, axes = plt.subplots()
    # pie() returns the wedge patches first; pass them to legend() explicitly
    # so every wedge is paired with the text built for its own code
    wedges = axes.pie(counts, labels=counts.index, startangle=90, **pie_kwargs)[0]
    legend_text = [f"{code} = {code_names.get(code, 'undocumented')}" for code in counts.index]
    axes.legend(wedges, legend_text, loc=legend_loc)
    axes.set_title(title)
    axes.axis('equal') # to draw a circle
    return figure, axes


# sex pie chart
fig, ax = _pie_with_legend(sex_counts, {1: 'Male', 2: 'Female'},
                           'Distribution of SEX', 'upper right', autopct='%1.1f%%')

# education pie chart
fig1, ax1 = _pie_with_legend(education_counts,
                             {1: 'graduate school', 2: 'university', 3: 'high school',
                              4: 'others', 5: 'unknown', 6: 'unknown'},
                             'Distribution of EDUCATION', 'upper right')

# marriage pie chart
fig2, ax2 = _pie_with_legend(marriage_counts, {1: 'married', 2: 'single', 3: 'others'},
                             'Distribution of MARRIAGE', 'lower right')

# show plot
plt.show()


In [None]:
# Distribution of Continuous Variable
def hist(df, vars, n_r, n_c, n_b):
    """Plot one histogram per column on a grid of subplots and show the figure.

    Parameters:
        df: DataFrame holding the columns to plot.
        vars: iterable of column names in `df`; one subplot is drawn per name.
        n_r: number of rows in the subplot grid.
        n_c: number of columns in the subplot grid.
        n_b: number of bins used for every histogram.
    """
    figure = plt.figure()
    # subplot positions are 1-based, hence start=1
    for position, column in enumerate(vars, start=1):
        axes = figure.add_subplot(n_r, n_c, position)
        df[column].hist(bins=n_b, ax=axes)
        axes.set_title(column, fontsize = 8)
    figure.tight_layout()
    plt.show()

continous_variables = df_card[['LIMIT_BAL','BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                 'PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]
hist(continous_variables, continous_variables.columns, 4, 4, 20)

In [None]:
# check financial stats of clients with high LIMIT_BAL to check for outliers
# sort from highest
df_sorted = df_card.sort_values(by='LIMIT_BAL', ascending=False)
# top 20 highest LIMIT_BAL
top_20 = df_sorted.head(20)
top_20_stats = top_20.loc[:, ['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'BILL_AMT2', 'BILL_AMT3', 
                              'BILL_AMT4', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'default.payment.next.month']]
print(top_20_stats)

In [None]:
# check distribution of target variable
target_counts = df_card['default.payment.next.month'].value_counts()
print(target_counts)

# pie chart
# value_counts() orders the classes by frequency, so the legend text is built
# per wedge from the class code rather than listed in a fixed order
target_names = {0: 'non-default', 1: 'default'}
fig3, ax3 = plt.subplots()
target_wedges = ax3.pie(target_counts, labels=target_counts.index, autopct='%1.1f%%', startangle=90)[0]
ax3.legend(target_wedges,
           [f"{code} = {target_names.get(code, 'undocumented')}" for code in target_counts.index],
           loc='lower right')
ax3.set_title('Distribution of target variable')
ax3.axis('equal')

# show plot
plt.show()

## 3. Data pre-processing

In [None]:
# Category consolidation: fold unexplained codes into existing categories.
# Note that df_card itself is modified by this cell.

# MARRIAGE: value 0 becomes the category "others" represented by 3
df_card.loc[df_card['MARRIAGE'].eq(0), 'MARRIAGE'] = 3

# EDUCATION: values 0, 5, 6 become the category "others" represented by 4
education_others = df_card['EDUCATION'].isin([0, 5, 6])
df_card.loc[education_others, 'EDUCATION'] = 4

# PAY_n: values -2 and -1 are replaced with 0 in every PAY_n column
for pay_col in ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
    paid_duly = df_card[pay_col].isin([-2, -1])
    df_card.loc[paid_duly, pay_col] = 0

In [None]:
# Data Encoding: work on a copy so df_card keeps the un-encoded columns.

# SEX -> single indicator column FEMALE.
# drop_first=True removes the first SEX category (Male), leaving SEX_2.
df_card_encoded = (
    pd.get_dummies(df_card.copy(), columns=['SEX'], drop_first=True)
    .rename(columns={'SEX_2': 'FEMALE'})
)

# MARRIAGE -> indicator columns MARRIED and SINGLE.
# The MARRIAGE_3 ("others") indicator is dropped and acts as the baseline.
df_card_encoded = (
    pd.get_dummies(df_card_encoded, columns=['MARRIAGE'])
    .drop(columns='MARRIAGE_3')
    .rename(columns={'MARRIAGE_1': 'MARRIED', 'MARRIAGE_2': 'SINGLE'})
)

print(df_card_encoded.head())



In [None]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

df_card_encoded_scaled = df_card_encoded.copy()

# Monetary columns that are rescaled; every other column is left as it is
scaled_columns = ['LIMIT_BAL','BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                  'PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

# Learn each column's min/max from the training rows only (the first 22500 rows
# of the file) so that no information from the test rows leaks into the
# transformation; the fitted scaler is then applied to every row.
scaler = MinMaxScaler()
scaler.fit(df_card_encoded_scaled.iloc[:22500][scaled_columns])

# Assign the transformed array directly: wrapping it in a new DataFrame would
# make the assignment depend on the two frames having matching index labels.
df_card_encoded_scaled[scaled_columns] = scaler.transform(df_card_encoded_scaled[scaled_columns])
print(df_card_encoded_scaled.head())

In [None]:
# Split test and train set
# Rows 0-22499 are the train set and rows 22500-29999 the test set. The first
# column is skipped; the target column is deliberately still part of
# X_train/X_test at this stage and is removed in later cells.
X_train = df_card_encoded_scaled.iloc[0:22500,1:]
X_test = df_card_encoded_scaled.iloc[22500:30000,1:]
# Select the target by name instead of by position (-4) so that adding or
# removing encoded columns cannot silently pick a different column.
y_train = df_card_encoded_scaled['default.payment.next.month'].iloc[0:22500]
y_test = df_card_encoded_scaled['default.payment.next.month'].iloc[22500:30000]

## 4. Feature Selection

In [None]:
# Correlation matrix
# X_train still contains the target column at this point, so the heatmap also
# shows how each feature correlates with the target.

X = X_train
y = y_train

import matplotlib.pyplot as plt
import seaborn as sns

corr = X_train.corr()
# hide the upper triangle (and diagonal), which only mirrors the lower triangle
mask = np.triu(np.ones_like(corr))

fig_corr, ax_corr = plt.subplots(figsize=(18,15))
sns.heatmap(corr, annot=True, mask = mask, vmin=-1.0, vmax=1, cmap ='vlag', ax=ax_corr)
ax_corr.set_title('Correlation Matrix')
plt.show()

In [None]:
# Chi-squared test of every feature against the target

# training data without the target column
X_train_only_features = X_train.drop(columns=['default.payment.next.month'])

from sklearn.feature_selection import chi2
# values are clipped into [0, 10000000] before being passed to chi2
chi_scores = chi2(X_train_only_features.clip(lower=0, upper=10000000), y)
print(chi_scores)

# chi_scores[1] holds the p-values; label them with the feature names
p_values = pd.Series(chi_scores[1], index=X_train_only_features.columns)
print(p_values)
p_values = p_values.sort_values(ascending=False)
p_values.plot.bar()


From the Chi-squared test, all bill amounts can be removed. However, bill amount can be useful in determining the target variable. Generally, those with higher outstanding debt in their bill amount have a higher risk of default (Kagan, 2023) since they may struggle to make larger payments.

Reference: Kagan, J. (2023, April 6). Default risk: Definition, types, and ways to measure. Investopedia. Retrieved April 13, 2023, from https://www.investopedia.com/terms/d/defaultrisk.asp  

In [None]:
# SelectKBest
from sklearn.feature_selection import SelectKBest

# for classification, we use these three
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

# this function will take in X, y variables 
# with criteria, and return a dataframe with most important columns based on that criteria
def featureSelect_dataframe(X, y, criteria, k):
    """Return the columns of X that SelectKBest keeps for the given criteria.

    Parameters:
        X: DataFrame of candidate features.
        y: target values aligned with the rows of X.
        criteria: scoring function handed to SelectKBest (e.g. chi2).
        k: number of features to keep.

    Returns:
        A DataFrame containing only the selected columns of X, with their
        original names and order.
    """
    # fit the selector; no transform is needed because only the mask is used
    selector = SelectKBest(criteria, k=k).fit(X, y)

    # boolean mask with one entry per column of X, computed once
    kept_columns = selector.get_support()

    # filter down X based on kept columns
    return X.loc[:, kept_columns]

New_X = featureSelect_dataframe(X_train_only_features, y_train, chi2, 11)

New_X

In [None]:
# Backward feature selection
X_train = X_train_only_features

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
import pandas as pd

# Instantiate the logistic regression model
model = LogisticRegression()

# Instantiate the Recursive Feature Elimination (RFE) object
rfe = RFE(model, n_features_to_select=5)

# Fit the RFE to the training data
rfe.fit(X_train, y_train)

# Create a dataframe to store the feature rankings
feature_ranks = pd.DataFrame({'Feature': X_train.columns, 'Rank': rfe.ranking_})

# Sort the features by rank
feature_ranks = feature_ranks.sort_values(by='Rank')

# Print the rank RFE assigned to each variable. The value comes from
# rfe.ranking_ (all selected features share rank 1), not from the position of
# the row in the sorted table.
for feature, rank in zip(feature_ranks['Feature'], feature_ranks['Rank']):
    print(f"Rank {rank}: {feature}")

In [None]:
# Drop the columns we are not using, in a single call
unused_columns = ['BILL_AMT1', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
                  'PAY_2', 'PAY_3', 'PAY_5', 'AGE']
df_card_encoded_scaled = df_card_encoded_scaled.drop(columns=unused_columns)

print(df_card_encoded_scaled.head())


In [None]:
# Split test and train set again after dropping the features, this time the dataset is without redundant features
# The first column is skipped and the target is removed from the feature
# frames with drop(), which returns new frames instead of deleting a column
# from a slice of df_card_encoded_scaled.
X_train = df_card_encoded_scaled.iloc[0:22500,1:].drop(columns='default.payment.next.month')
X_test = df_card_encoded_scaled.iloc[22500:30000,1:].drop(columns='default.payment.next.month')
# The target is selected by name rather than by position (-4)
y_train = df_card_encoded_scaled['default.payment.next.month'].iloc[0:22500]
y_test = df_card_encoded_scaled['default.payment.next.month'].iloc[22500:30000]

## 5. Linear Separability Check

In [None]:
# linear separability check using SVM
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.svm import SVC

# fit a linear-kernel SVM on the train set and predict the test set
clf_svm = SVC(kernel='linear', random_state=1)
clf_svm.fit(X_train, y_train)
y_predict = clf_svm.predict(X_test)

# number of test rows that were classified correctly
print((y_predict == y_test).sum())

# report each score with five significant digits
score_reports = (
    ('Accuracy Score is {:.5}', accuracy_score),
    ('Recall Score is {:.5}', recall_score),
    ('Precision Score is {:.5}', precision_score),
    ('F1 Score is {:.5}', f1_score),
)
for template, score_function in score_reports:
    print(template.format(score_function(y_test, y_predict)))

sns.set_style('white') 
class_names = ['0','1']

print(pd.DataFrame(confusion_matrix(y_test,y_predict))) # left is actual, top is predicted

#### SMOTE (Data Transformation)

In [None]:
# distribution of target variable is imbalanced
print(df_card_encoded_scaled['default.payment.next.month'].value_counts())

from imblearn.over_sampling import SMOTE
oversample = SMOTE(random_state=1)
np.random.seed(1)

# SMOTE is applied to the train set only; the test set is left untouched
X_train_balanced, y_train_balanced = oversample.fit_resample(X_train, y_train)

# Export to csv
smoteTrain = X_train_balanced.reset_index(drop=True).join(y_train_balanced)
testSet = X_test.join(y_test)

# generate csv files for train and test set each
# The files are written to the working directory because that is where the
# following cells read 'smoteTrain.csv' and 'testSet.csv' from; writing them
# to ~/Downloads made a fresh run fail (or load stale files) unless the
# notebook happened to be started from that folder.
smoteTrain.to_csv('smoteTrain.csv', index=False)
testSet.to_csv('testSet.csv', index=False)

In [None]:
sum((X_train_balanced.MARRIED == 1) & (X_train_balanced.SINGLE == 1))

In [None]:
df_train = pd.read_csv('smoteTrain.csv')
df_test = pd.read_csv('testSet.csv')

In [None]:
print(df_train.shape)
print(df_test.shape)

In [None]:
X_train = df_train.iloc[:,0:-1] # all the variables
y_train = df_train.iloc[:,-1] # labels

X_test = df_test.iloc[:,0:-1] # all the variables
y_test = df_test.iloc[:,-1] # labels

In [None]:
y_train.value_counts()

## 6. Model Selection and Evaluation

### Model 1: SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
model = SVC(random_state=1,C=10,kernel='rbf')
model.fit(X_train, y_train)

y_predict = model.predict(X_test)
print(classification_report(y_test, y_predict))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

In [None]:
from sklearn.model_selection import GridSearchCV
param_space = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

clf = GridSearchCV(model, param_space, n_jobs=-1, cv=10)
clf.fit(X_train, y_train)

print('Best parameters found:\n', clf.best_params_)

In [None]:
# Linear-kernel SVM fitted on the current X_train/y_train and evaluated on the test set.
# NOTE(review): this cell does not use clf.best_params_ from the grid search, and
# 'linear' is not among the kernels listed in that search's param_space - confirm
# which model the "after tuning" figures are meant to come from.
clf_svm = SVC(random_state=1,kernel='linear')
clf_svm.fit(X_train, y_train)
y_predict = clf_svm.predict(X_test)
print(classification_report(y_test, y_predict))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

In [None]:
print("SVM avg class accuracy (before tuning): ", (1/2 * (0.82 + 0.57)))
print("SVM avg class accuracy (after tuning): ", (1/2 * (0.88 + 0.51)))

### Model 2: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

#instantiate the model
log_regression = LogisticRegression(random_state=5)
#fit the model using the training data
log_regression.fit(X_train,y_train)
#use model to make predictions on test data
y_predict = log_regression.predict(X_test)

In [None]:
print("Classification Report for Logistic Regression")
print(classification_report(y_test, y_predict))

# Accuracy score
print("Confusion Matrix for Logistic Regression")
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

# Calcuate accuracy
print("Accuracy for Logistic Regression: ", accuracy_score(y_test,y_predict))

In [None]:
# Calculate evaluation metrics
# The counts are taken from the confusion matrix of the current predictions
# instead of being typed in from an earlier run, so they cannot go stale.
# Rows of the matrix are actual classes and columns predicted classes; this
# cell labels the top-left entry (class 0 predicted as 0) as TP, i.e. class 0
# is treated as the "positive" class.
# ravel() order: top left, top right, bottom left, bottom right
TP_LR, FN_LR, FP_LR, TN_LR = confusion_matrix(y_test, y_predict).ravel().tolist()

specificity_LR = TN_LR / (TN_LR + FP_LR)
print("Specificity: ", specificity_LR )
print("FPR (False Positive Rate): ", 1 - specificity_LR)

In [None]:
# Plot ROC Curve and find AUC
y_pred_proba = log_regression.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)

random_probs = [0 for _ in range(len(y_test))]
random_fpr, random_tpr, _ = metrics.roc_curve(y_test, random_probs)

auc = metrics.roc_auc_score(y_test, y_pred_proba)
random_auc = metrics.roc_auc_score(y_test, random_probs)

#create ROC curve
plt.plot(fpr,tpr,label="LogReg AUC="+str(auc))
plt.plot(random_fpr, random_tpr, label="Random AUC="+str(random_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
# Hyperparameter tuning using GridSearchCV
# example of grid searching key hyperparametres for logistic regression
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# define models and parameters
model = LogisticRegression()
solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train,y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Run Logistic Regression after hyperparameter tuning
#instantiate the model
log_regression = LogisticRegression(C = 100, penalty = 'l2', solver = 'newton-cg', random_state=5)
#fit the model using the training data
log_regression.fit(X_train,y_train)
#use model to make predictions on test data
y_predict = log_regression.predict(X_test)

In [None]:
print("Classification Report for Logistic Regression")
print(classification_report(y_test, y_predict))

# Accuracy score
print("Confusion Matrix for Logistic Regression")
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

# Calcuate accuracy
print("Accuracy for Logistic Regression: ", accuracy_score(y_test,y_predict))

In [None]:
# Calculate evaluation metrics
# The counts are taken from the confusion matrix of the current predictions
# instead of being typed in from an earlier run, so they cannot go stale.
# Rows of the matrix are actual classes and columns predicted classes; this
# cell labels the top-left entry (class 0 predicted as 0) as TP, i.e. class 0
# is treated as the "positive" class.
# ravel() order: top left, top right, bottom left, bottom right
TP_LR, FN_LR, FP_LR, TN_LR = confusion_matrix(y_test, y_predict).ravel().tolist()

precision_LR = TP_LR / (TP_LR + FP_LR)
specificity_LR = TN_LR / (TN_LR + FP_LR)
print("Precision: ", precision_LR) 
print("Specificity: ", specificity_LR )
print("FPR (False Positive Rate): ", 1 - specificity_LR)

In [None]:
# Plot ROC Curve and find AUC
y_pred_proba = log_regression.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)

random_probs = [0 for _ in range(len(y_test))]
random_fpr, random_tpr, _ = metrics.roc_curve(y_test, random_probs)

auc = metrics.roc_auc_score(y_test, y_pred_proba)
random_auc = metrics.roc_auc_score(y_test, random_probs)

#create ROC curve
plt.plot(fpr,tpr,label="LogReg AUC="+str(auc))
plt.plot(random_fpr, random_tpr, label="Random AUC="+str(random_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
print("Logistic regression avg class accuracy (before tuning): ", (1/2 * (0.85 + 0.57)))
print("Logistic regression avg class accuracy (after tuning): ", (1/2 * (0.84 + 0.57)))

### Model 3: Multi-layer Perceptron (MLP)

In [None]:
# Import MLPClassifer 
from sklearn.neural_network import MLPClassifier
# Import accuracy score 
from sklearn.metrics import accuracy_score

# Create model object
clf = MLPClassifier(hidden_layer_sizes=(6,5),
                    random_state=5,
                    verbose=True,
                    learning_rate_init=0.01)

# Fit data onto the model
clf.fit(X_train,y_train)

# Make prediction on test dataset
y_predict = clf.predict(X_test)

In [None]:
print("Classification Report for MLP")
print(classification_report(y_test, y_predict))

print("Confusion Matrix for MLP")
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

# Calcuate accuracy
print("Accuracy for MLP: ", accuracy_score(y_test,y_predict))

In [None]:
# Calculate evaluation metrics
# The counts are taken from the confusion matrix of the current predictions
# instead of being typed in from an earlier run, so they cannot go stale.
# Rows of the matrix are actual classes and columns predicted classes; the
# top-left entry (class 0 predicted as 0) is labelled TP, i.e. class 0 is
# treated as the "positive" class.
# ravel() order: top left, top right, bottom left, bottom right
TP_MLP, FN_MLP, FP_MLP, TN_MLP = confusion_matrix(y_test, y_predict).ravel().tolist()

specificity_MLP = TN_MLP / (TN_MLP + FP_MLP)
print("Specificity: ", specificity_MLP )
print("FPR (False Positive Rate): ", 1 - specificity_MLP)

In [None]:
# Parameter tuning for MLP
mlp_gs = MLPClassifier(max_iter=100,random_state=5)

parameter_space = {
    'hidden_layer_sizes': [(10,50), (10,50,100), (50, 100, 150), (20,20)],
    'activation': ['tanh', 'relu','logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

In [None]:
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# MLP after parameter tuning
from sklearn.neural_network import MLPClassifier
# Import accuracy score 
from sklearn.metrics import accuracy_score

# Create model object
clf = MLPClassifier(hidden_layer_sizes=(50, 100, 150),
                    activation = 'relu', 
                    alpha = 0.0001,
                    random_state=5,
                    verbose=True,
                    learning_rate = 'constant', 
                    solver = 'sgd'
                    )

# Fit data onto the model
clf.fit(X_train,y_train)

# Make prediction on test dataset
y_predict = clf.predict(X_test)

In [None]:
print("Classification Report for MLP")
print(classification_report(y_test, y_predict))

print("Confusion Matrix for MLP")
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

# Calcuate accuracy
print("Accuracy for MLP: ", accuracy_score(y_test,y_predict))

In [None]:
print("MLP avg class accuracy (before tuning): ", (1/2 * (0.81 + 0.59)))
print("MLP avg class accuracy (after tuning): ", (1/2 * (0.83 + 0.57)))

### Model 4: k-nearest neighbors (k-NN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
acc = []
# Will take some time
from sklearn import metrics
# Candidate values of K; acc[j] holds the test-set precision for k_values[j]
k_values = range(1,200)
for i in k_values:
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.precision_score(y_test, yhat))

plt.figure(figsize=(10,6))
plt.plot(k_values,acc,color = 'blue',linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('precision vs. K Value')
plt.xlabel('K')
plt.ylabel('Precision')
# acc is 0-indexed while K starts at 1, so the list position is translated
# back to the K it belongs to before being reported
best_precision = max(acc)
print("Maximum precision: ",best_precision," at K = ",k_values[acc.index(best_precision)])

In [None]:
acc = []
# Will take some time
from sklearn import metrics
# Candidate values of K; acc[j] holds the test-set accuracy for k_values[j]
k_values = range(1,200)
for i in k_values:
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))

plt.figure(figsize=(10,6))
plt.plot(k_values,acc,color = 'blue',linestyle='dashed', 
         marker='o',markerfacecolor='red', markersize=10)
plt.title('accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
# acc is 0-indexed while K starts at 1, so the list position is translated
# back to the K it belongs to before being reported
best_accuracy = max(acc)
print("Maximum accuracy: ",best_accuracy," at K = ",k_values[acc.index(best_accuracy)])

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=180) 
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
print(classification_report(y_test, y_predict))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))

In [None]:
print("k-NN avg class accuracy: ", (1/2 * (0.78 + 0.61)))

### Model 5: Decision Tree and Random Forest

#### Decision Tree

In [None]:
#decision tree and random forest
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_auc_score,roc_curve

# Baseline decision tree: depth limited to 5, every other parameter at its default
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

# Class predictions and class-1 probabilities (used for the ROC/AUC) on the test set
y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:, 1]

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
for caption, value in (('Accuracy:', accuracy), ('Precision:', precision),
                       ('Recall:', recall), ('F1-score:', f1),
                       ('Confusion Matrix:\n', cm)):
    print(caption, value)

# ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
print("AUC:", auc)

# Plot ROC curve together with the chance diagonal
fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc)
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")
plt.show()

# Print AUC
print('AUC:', auc)

In [None]:
#Tune parameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the range of values to test for the parameters.
# min_samples_split starts at 2: the integer 1 is not a valid value for
# DecisionTreeClassifier, so every candidate using it would fail to fit.
param_grid = {'max_depth': range(1, 10),
              'min_samples_split': range(2,10),
              'min_samples_leaf': range(1,10)}

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Use GridSearchCV to search for the best combination of parameters
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding mean cross-validation score
print('Best parameters:', grid_search.best_params_)
print('Best cross-validation score:', grid_search.best_score_)


# build the decision tree with the parameters the search actually selected,
# so the evaluated model always matches the printed best parameters
dt2 = DecisionTreeClassifier(random_state=42, **grid_search.best_params_)
dt2.fit(X_train, y_train)


y_pred = dt2.predict(X_test)

# Get the probability predictions for AUC
y_prob = dt2.predict_proba(X_test)[:, 1]

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
print('Confusion Matrix:\n', cm)

# Compute ROC curve and AUC (once; the value is reused for the plot label)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
print("AUC:", auc)

# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Print AUC
print('AUC:', auc)


In [None]:
print("Decision Tree avg class accuracy (before tuning): ", (1/2 * (0.78 + 0.61)))
print("Decision Tree avg class accuracy (after tuning): ", (1/2 * (0.74 + 0.61)))

#### Random Forest

In [None]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with default hyperparameters; the seed is
# fixed so the reported scores are reproducible between runs
rf_best = RandomForestClassifier(random_state=42)

# Fit the classifier to the training data
rf_best.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_best.predict(X_test)

# Evaluate the performance of the random forest classifier on the test set
accuracy = rf_best.score(X_test, y_test)
print("Test accuracy:", accuracy)
from sklearn.metrics import recall_score, f1_score, roc_auc_score, roc_curve

# Predict the class probabilities for the test set
y_prob = rf_best.predict_proba(X_test)[:, 1]

# Calculate the recall score
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)

# Compute the ROC curve and AUC from this model's probabilities. Without this
# step the plot below would reuse the fpr/tpr/auc values left over from the
# decision tree cell.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Print AUC
print('AUC:', auc)

In [None]:
# Pair every training column with the importance the fitted forest assigned to it
importances = rf_best.feature_importances_
feature_names = X_train.columns.tolist()

# one row per feature, most important first
df_importances = pd.DataFrame(
    {'feature_names': feature_names, 'importance': importances}
).sort_values('importance', ascending=False)

# bar chart of the importances in descending order
fig_importance, ax_importance = plt.subplots()
ax_importance.bar(df_importances['feature_names'], df_importances['importance'])
plt.xticks(rotation=90)
ax_importance.set_xlabel('Features')
ax_importance.set_ylabel('Importance')
ax_importance.set_title('Gini Importance of Features')
plt.show()

Although the model's performance was not outstanding, we can still draw insights from the Gini importance of its features. "Gini importance" is the total decrease in node impurity, weighted by the probability of reaching that node (approximated by the proportion of samples reaching it), averaged over all trees of the ensemble.

Reference: 
Gordon, A. D., Breiman, L., Friedman, J. H., Olshen, R. A., & Stone, C. J. (1984). Classification and Regression Trees. Biometrics, 40(3), 874. https://doi.org/10.2307/2530946


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50,100,150],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Create a random forest classifier
rf = RandomForestClassifier(random_state=42)

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Train the random forest classifier with the best hyperparameters
rf_best = RandomForestClassifier(**grid_search.best_params_, random_state=42)
rf_best.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_best.predict(X_test)


# Evaluate the performance of the random forest classifier on the test set
accuracy = rf_best.score(X_test, y_test)
print("Test accuracy:", accuracy)
from sklearn.metrics import recall_score, f1_score, roc_auc_score, roc_curve

# Predict the class probabilities for the test set
y_prob = rf_best.predict_proba(X_test)[:, 1]

# Calculate the recall score
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)

# Compute the ROC curve and AUC from this model's probabilities. Without this
# step the plot below would reuse fpr/tpr/auc values left over from an
# earlier cell and would not describe the tuned forest.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Print AUC
print('AUC:', auc)

In [None]:
print("Random Forest avg class accuracy (before tuning): ", (1/2 * (0.87 + 0.47)))
print("Random Forest avg class accuracy (after tuning): ", (1/2 * (0.83 + 0.57)))