In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, train_test_split, validation_curve
from sklearn.pipeline import make_pipeline

In [None]:
import os

# Directory that holds the data file. Set the ML_DATA_DIR environment variable
# to point somewhere else; when it is unset the original location is used.
DATA_DIR = os.environ.get(
    'ML_DATA_DIR',
    '/mnt/d/Sajjad/08-2023/Python Code/Introduction to Machine Learning/',
)

print(os.getcwd())
# Change working directory to the location of the data file. A missing
# directory is reported instead of raising, so the notebook still runs when it
# is launched from a folder that already contains the data.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)
else:
    print("Data directory not found, staying in current directory:", DATA_DIR)
print(os.getcwd())

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw table, cast the three demographic code columns to strings
# (so pd.get_dummies later treats them as categories), and discard the ID
# column -- all in one chain so the cell can be re-run safely.
df1 = (
    pd.read_csv("default_cc.csv")
    .astype({'Gender': 'str', 'EDUCATION': 'str', 'MARRIAGE': 'str'})
    .drop(columns=['ID'])
)

In [None]:
# One-hot encode the string-typed columns of df1.
df_onehot = pd.get_dummies(df1)

# Target stays a single-column DataFrame; every other column is a feature.
y = df_onehot[['default payment next month']]
X = df_onehot.drop(columns='default payment next month')

In [None]:
# Hold out 30% of the one-hot encoded rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

### Learning Curve for Training and Testing/Validation

In [None]:
# Single-step pipeline wrapping a shallow, entropy-based decision tree.
pipe_lr = make_pipeline(
    DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
)

# Evaluate the estimator on ten growing fractions (10% ... 100%) of the
# training split, with 10-fold cross-validation at every size.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1,
)

# Aggregate across axis 1 to get one mean / spread value per training size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# Each curve is a line plus a +/- one standard deviation band in the same colour.
curves = (
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='Training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5,
          label='Validation accuracy')),
)
for curve_mean, curve_std, line_kwargs in curves:
    plt.plot(train_sizes, curve_mean, **line_kwargs)
    plt.fill_between(train_sizes,
                     curve_mean + curve_std,
                     curve_mean - curve_std,
                     alpha=0.15, color=line_kwargs['color'])

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='upper right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
# To export the figure: plt.savefig('figures/06_05.png', dpi=300)
plt.show()

### Bagging and Boosting

In [None]:

import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_curve, roc_auc_score
import warnings
# Suppress every warning from here on (no category or module filter is given),
# so deprecation and data-conversion warnings from the libraries are hidden too.
warnings.filterwarnings('ignore')

In [None]:
# Split used by fit_model (which reads these names). The arguments match the
# earlier X_train/X_test split, so the partition is the same.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [None]:
def fit_model(model, model_name, train_X=None, train_y=None, test_X=None, test_y=None):
    """Fit ``model``, print its test-set ROC AUC and return that AUC.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict_proba``.
    model_name
        Label printed in front of the score.
    train_X, train_y, test_X, test_y : optional
        Data to fit and score on. Any argument left as ``None`` falls back to
        the notebook-level ``trainX`` / ``trainy`` / ``testX`` / ``testy``,
        looked up when the function is called. Those names are rebound later
        in the notebook, so pass the data explicitly to pin what a model sees.

    Returns
    -------
    The ROC AUC computed from the positive-class probabilities.
    """
    # Resolve the fallbacks at call time so existing two-argument calls keep
    # using whatever split is currently bound to the notebook-level names.
    train_X = trainX if train_X is None else train_X
    train_y = trainy if train_y is None else train_y
    test_X = testX if test_X is None else test_X
    test_y = testy if test_y is None else test_y

    model.fit(train_X, train_y)
    # Column 1 of predict_proba is the probability of the second class.
    md_probs = model.predict_proba(test_X)[:, 1]
    md_auc = roc_auc_score(test_y, md_probs)
    print(model_name, " : ", md_auc)
    return md_auc

In [None]:
# Number of base learners / boosting iterations shared by the ensemble models below.
num_of_models = 300
# Tree depth shared by the boosting models below (the random forests set their own depth).
depth_level = 3

Random Forest and Gradient Boosting

In [None]:
# --- Random forest: time construction, fitting and scoring together ---
start_time = time.time()
rf = RandomForestClassifier(n_estimators=num_of_models, max_depth=10)
fit_model(rf, "Random Forest")
end_time = time.time()
total_time = end_time - start_time
print("Total time RF: ", total_time)

# --- Gradient boosting: same measurement with the shared depth setting ---
start_time = time.time()
gb = GradientBoostingClassifier(n_estimators=num_of_models, max_depth=depth_level)
fit_model(gb, "Graident Boosting")
end_time = time.time()
total_time = end_time - start_time
print("Total time GB: ", total_time)



Random Forest with n_jobs=-1

In [None]:
# Same forest as before, but n_jobs=-1 asks scikit-learn to use all available
# cores; the wall-clock time is printed for comparison.
start_time = time.time()
rf = RandomForestClassifier(n_estimators=num_of_models, max_depth=10, n_jobs=-1)
fit_model(rf, "Random Forest")
end_time = time.time()
total_time = end_time - start_time
print("Total time RF: ", total_time)


AdaBoost Classifier

In [None]:
# AdaBoost with its default base learner: time construction, fit and scoring.
start_time = time.time()
ab = AdaBoostClassifier(n_estimators=num_of_models)
fit_model(ab, "Adaptive Boosting")
end_time = time.time()
total_time = end_time - start_time
print("Total time AB: ", total_time)

Bagging using Categorical Naive Bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder

In [None]:
def convert_categorical(df1):
    """Return an all-categorical version of ``df1`` for categorical models.

    Every column other than Gender, EDUCATION, MARRIAGE and the target is cut
    into at most five quantile bins (duplicate bin edges are dropped), the
    bins are label-encoded, and the codes are stored as strings. The three
    demographic columns are appended unchanged; the target is left out.
    """
    passthrough_cols = ['Gender', 'EDUCATION', 'MARRIAGE']
    excluded = passthrough_cols + ['default payment next month']
    label_encoder = LabelEncoder()

    def _bin_and_encode(series):
        # Quantile-bin one column, then map each bin to its encoded label.
        quintiles = pd.qcut(series, 5, duplicates='drop')
        codes = label_encoder.fit_transform(quintiles)
        return pd.Series(codes, index=quintiles.index).astype('str')

    df_q = pd.DataFrame(
        {col: _bin_and_encode(df1[col]) for col in df1 if col not in excluded}
    )
    return pd.concat([df_q, df1[passthrough_cols]], axis=1)

 
# Build the binned, label-encoded feature frame once (the original ran the
# same conversion twice and discarded a mid-cell .head() preview).
X_cat = convert_categorical(df1)
# Kept as an independent copy because the original cell defined this name too.
temp_df1 = X_cat.copy()

# NOTE: this rebinds trainX/testX/trainy/testy -- the names fit_model reads --
# from the one-hot encoded frame to the string-coded categorical frame.
trainX, testX, trainy, testy = train_test_split(X_cat, y, test_size=0.3, random_state=2)


In [None]:
#record the start time
start_time = time.time()
nb_c = CategoricalNB()
# The base learner is passed positionally: scikit-learn renamed this first
# parameter from `base_estimator` to `estimator`, so the positional form works
# across versions.
bg_c = BaggingClassifier(nb_c, n_estimators=num_of_models, n_jobs=-1)
# Fit and score the bagging ensemble itself (the original passed `ab`, the
# AdaBoost model from the previous section, so bagging was never evaluated).
fit_model(bg_c, "Bagging Classifier using NB")
#record the end time
end_time = time.time()
#calculate the total time
total_time = end_time - start_time
print("Total time Bagging NB: ", total_time)

XGBoost

In [None]:
#use xgboost
xgb_model = xgb.XGBClassifier(max_depth=depth_level, n_estimators=num_of_models, learning_rate=0.1)
start_time = time.time()
# Fit on the one-hot encoded split (X_train / X_test). By this point
# trainX / testX have been rebound to the string-typed binned frame built for
# CategoricalNB, and XGBoost does not accept object-dtype columns.
xgb_model.fit(X_train, y_train)
# Keep only the positive-class probability column for the AUC.
md_probs = xgb_model.predict_proba(X_test)[:, 1]
md_auc = roc_auc_score(y_test, md_probs)
print("XG Boost", " : ", md_auc)
#record the end time
end_time = time.time()
#calculate the total time (fit + predict + scoring)
total_time = end_time - start_time
print("Total time XGB: ", total_time)

LightGBM

In [None]:
#use LightGBM
lgb_model = lgb.LGBMClassifier(max_depth=depth_level, n_estimators=num_of_models, learning_rate=0.1)
start_time = time.time()
# Fit lgb_model on the one-hot encoded split (X_train / X_test). By this point
# trainX / testX have been rebound to the string-typed binned frame built for
# CategoricalNB, and LightGBM does not accept object-dtype columns.
lgb_model.fit(X_train, y_train)
# Keep only the positive-class probability column for the AUC.
md_probs = lgb_model.predict_proba(X_test)[:, 1]
md_auc = roc_auc_score(y_test, md_probs)
print("LG Boost", " : ", md_auc)
#record the end time
end_time = time.time()
#calculate the total time (fit + predict + scoring)
total_time = end_time - start_time
print("Total time LGB: ", total_time)

In [None]:
# Feature/target split taken from df1 directly (no one-hot encoding), held out
# with the same test fraction and seed as the other splits.
y2 = df1[['default payment next month']]
X2 = df1.drop(columns='default payment next month')
trainX2, testX2, trainy2, testy2 = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=2,
)

CatBoost

In [None]:
# NOTE(review): Gender / EDUCATION / MARRIAGE are strings in df1, but no
# cat_features argument is given here -- confirm whether they should be
# declared as categorical features.
cb = CatBoostClassifier(
    iterations=num_of_models,
    depth=depth_level,
    learning_rate=0.1,
    loss_function='Logloss',
    verbose=False,
)
# Time fitting, prediction and scoring together (construction is excluded).
start_time = time.time()
cb.fit(trainX2, trainy2)
md_probs = cb.predict_proba(testX2)[:, 1]
md_auc = roc_auc_score(testy2, md_probs)
print("Cat Boost", " : ", md_auc)
end_time = time.time()
total_time = end_time - start_time
print("Total time CB: ", total_time)

### Feature Importance

In [None]:
from lightgbm import plot_importance 

In [None]:
# Call LightGBM's plotter through the `lgb` module alias: the bare name
# `plot_importance` is rebound to XGBoost's function further down, which would
# break this cell when it is re-run afterwards.
lgb.plot_importance(lgb_model, figsize=(10, 9))

In [None]:
# Feature importances of the fitted CatBoost model, shown as the cell output.
cb.get_feature_importance()

In [None]:
# Same importances requested in CatBoost's "prettified" form.
cb.get_feature_importance(prettified=True)

In [None]:
from xgboost import plot_importance

In [None]:
# Call XGBoost's plotter through the `xgb` module alias so the result does not
# depend on which library's `plot_importance` was imported last.
xgb.plot_importance(xgb_model)