In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('kick.csv')

df.info()

In [None]:
df_original = df.copy()

In [None]:
df_original.info()

In [None]:
# ---- Categorical columns and odometer clean-up ----

# Auction: imputation (NaN -> OTHER)
df['Auction'] = df['Auction'].fillna('OTHER')

# VehYear: imputation (NaN -> column median)
df['VehYear'] = df['VehYear'].fillna(df['VehYear'].median())

# Transmission: upper-case, then imputation ('?', NaN -> UNKNOWN)
df['Transmission'] = (
    df['Transmission']
    .str.upper()
    .replace('?', np.nan)
    .fillna('UNKNOWN')
)

# Make: imputation (NaN -> UNKNOWN)
df['Make'] = df['Make'].fillna('UNKNOWN')

# VehOdo: readings outside [lower_bound, upper_bound] are treated as invalid
# and replaced, together with missing readings, by the median.
lower_bound = 1000
upper_bound = 400000

# Series.mask blanks the invalid readings on a new Series instead of assigning
# NaN into the existing column through .loc, which pandas deprecates when the
# column holds integers.
vehodo_valid = df['VehOdo'].mask(
    (df['VehOdo'] < lower_bound) | (df['VehOdo'] > upper_bound)
)
# The median is taken from the valid readings only, so the fill value is not
# influenced by the outliers it replaces (same order as the MMR outlier step).
vehodo_median = vehodo_valid.median()
df['VehOdo'] = vehodo_valid.fillna(vehodo_median)

# Nationality: upper-case/strip, USA -> AMERICAN, then ('?', NaN -> UNKNOWN)
df['Nationality'] = (
    df['Nationality']
    .str.upper()
    .str.strip()
    .replace('USA', 'AMERICAN')
    .replace('?', np.nan)
    .fillna('UNKNOWN')
)

# TopThreeAmericanName: imputation ('?', NaN -> UNKNOWN)
df['TopThreeAmericanName'] = (
    df['TopThreeAmericanName']
    .replace('?', np.nan)
    .fillna('UNKNOWN')
)

# ForSale: upper-case/strip, '?' and '0' -> NaN, YES/NO -> numeric code,
# everything left unmapped -> UNKNOWN.
# NOTE(review): the resulting column mixes numeric codes with the string
# 'UNKNOWN', and YES is coded 0 / NO is coded 1 - confirm this is intended.
for_sale_map = {'YES': 0, 'NO': 1}
df['ForSale'] = (
    df['ForSale']
    .str.upper()
    .str.strip()
    .replace('?', np.nan)
    .replace('0', np.nan)
    .map(for_sale_map)
    .fillna('UNKNOWN')
)

# MMR prices
# Step 1: parse text to numbers ('?' and other non-numeric text -> NaN),
#         treat 0 as missing, fill missing values with the column median.
# Step 2: blank values outside the 1st-99th percentile band and fill them
#         with the median of what remains.
mmr_current_cols = ['MMRCurrentAuctionAveragePrice','MMRCurrentAuctionCleanPrice', 
                    'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice']

mmr_acquisition_cols = ['MMRAcquisitionAuctionAveragePrice','MMRAcquisitionAuctionCleanPrice', 
                    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice']

# Both groups get exactly the same treatment, so they share one loop.
mmr_price_cols = mmr_current_cols + mmr_acquisition_cols

for col in mmr_price_cols:
    # errors='coerce' turns '?' (and the 'nan' text produced by astype(str))
    # into NaN, so no separate replace('?') pass is needed.
    prices = pd.to_numeric(df[col].astype(str).str.strip(), errors='coerce')
    prices = prices.replace(0, np.nan)
    df[col] = prices.fillna(prices.median())

# outlier: quantile 1% and 99% -> NaN -> median value
for col in mmr_price_cols:
    lower_limit = df[col].quantile(0.01)
    upper_limit = df[col].quantile(0.99)
    # mask() builds a new Series rather than writing NaN into the frame via
    # .loc, which pandas deprecates when the column dtype is integer.
    trimmed = df[col].mask((df[col] < lower_limit) | (df[col] > upper_limit))
    df[col] = trimmed.fillna(trimmed.median())

In [None]:
print(df['ForSale'].describe())
print(df['ForSale'].unique())
print(df['ForSale'].value_counts())
print(df['VehOdo'].value_counts(bins=10)) #only numeric data

In [None]:
selected_cols = ['Auction', 'VehYear', 'Make', 'Transmission', 'VehOdo',
                 'Nationality', 'TopThreeAmericanName',
                 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
                 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
                 'MMRCurrentRetailRatio', 'ForSale', 'IsBadBuy']
df_selected = df[selected_cols].copy()
print(df_selected.describe())
print("================================================================")
for col in df_selected.columns:
    print(df_selected[col].unique())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(6,4))
sns.countplot(data=df, x='IsBadBuy')
plt.title('Distribution of IsBadBuy')
plt.show()

In [None]:
plt.figure(figsize=(8,5))
sns.histplot(df['VehOdo'], kde=True, bins=50)
plt.title('Distribution of Vehicle Odometer (VehOdo)')
plt.xlabel('VehOdo')
plt.ylabel('Count')
plt.show()

In [None]:
ax = sns.boxplot(x="IsBadBuy", y="VehOdo", data=df)
plt.show()

In [None]:
correlation_matrix = df[mmr_current_cols + mmr_acquisition_cols].corr()
correlation_matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

correlation_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", 
            annot_kws={'size': 8}, cmap='coolwarm', 
            center=0, mask=correlation_mask, square=True)
plt.title('Correlation Matrix of MMR Variables')
plt.show()
# we use one value of MMR variables

In [None]:
# Compare every selected column before and after preprocessing:
# numeric columns via describe(), all others via value_counts (NaN included).
df_selected_original = df_original[selected_cols].copy()
df_selected_processed = df[selected_cols].copy()

for col in selected_cols:
    print(f"===== Column: {col}========")
    # is_numeric_dtype covers every numeric dtype, not only the literal
    # 'int64' / 'float64' names.
    if pd.api.types.is_numeric_dtype(df_selected_processed[col]):
        before = df_selected_original[col].describe()
        after = df_selected_processed[col].describe()
    else:
        before = df_selected_original[col].value_counts(dropna=False)
        after = df_selected_processed[col].value_counts(dropna=False)
    print("Before preprocessing:")
    print(before)
    print("====================================")
    print("After preprocessing:")
    print(after)
    print("\n")

In [None]:
categorical_cols = ['Auction', 'Make', 'Transmission', 'Nationality', 
                    'TopThreeAmericanName', 'ForSale']
for col in categorical_cols:
    print(f"=== {col} ===")
    print("*** Before preprocessing ***")
    print(df_original.groupby('IsBadBuy')[col].value_counts(normalize=True))
    print("-----------------------------")
    print("*** After preprocessing ***")
    print(df.groupby('IsBadBuy')[col].value_counts(normalize=True))
    print("======================================\n")

# the proportion of values of the target variable 

In [None]:
numeric_cols = ['VehYear', 'VehOdo', 'MMRCurrentAuctionAveragePrice','MMRCurrentAuctionCleanPrice', 
                'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice']
for col in numeric_cols:
    print(f"=== {col} ===")
    print("*** Before preprocessing ***")
    print(df_original.groupby('IsBadBuy')[col].describe())
    print("-----------------------------")
    print("*** After preprocessing ***")
    print(df.groupby('IsBadBuy')[col].describe())
    print("======================================\n")

In [None]:
# Columns removed from the frame before encoding and modelling.
drop_cols = [
    'PurchaseID', 'PurchaseTimestamp', 'PurchaseDate', 'Make',
    'Color', 'WheelTypeID', 'WheelType', 'Size', 'TopThreeAmericanName', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 
    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailCleanPrice', 
    'MMRCurrentRetailRatio', 'WarrantyCost', 'ForSale',
    'PRIMEUNIT', 'AUCGUART', 'VNST', 'VehBCost', 'IsOnlineSale'
]
# `columns=` already names the axis, so `axis=1` is redundant; rebinding the
# result avoids mutating the frame in place.
df = df.drop(columns=drop_cols)


In [None]:
print(df.columns)
print(df.isnull().sum())

In [None]:
df = pd.get_dummies(df)

In [None]:
y = df['IsBadBuy'].values
X = df.drop('IsBadBuy', axis=1)
feature_names =X.columns
X = X.values

In [None]:
from sklearn.model_selection import train_test_split
random_state = 10
test_set_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, stratify=y, random_state=random_state)

print("Size of training set:", len(X_train))
print("Size of testing set:", len(X_test))

In [None]:
from sklearn.tree import DecisionTreeClassifier

#simple decision tree training
model = DecisionTreeClassifier(random_state=random_state)
model.fit(X_train, y_train)

In [None]:
print('****Model parameters*****\n', model.get_params(deep=True))
print('Number of leaves in the trained model:', model.get_n_leaves())
print("Number of nodes:", model.tree_.node_count)

In [None]:
print("Training set accuracy:", model.score(X_train, y_train)) #overfitting
print("Testing set accuracy:", model.score(X_test, y_test))

In [None]:
y_pred = model.predict(X_test)
print(y_pred)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
def display_feature_importances(model, feature_names, features_to_display=20):
    """Print the highest-importance features of a fitted model, largest first.

    Parameters:
        model: object exposing `feature_importances_` and `get_n_leaves()`.
        feature_names: sequence indexable by integer position, aligned with
            `model.feature_importances_`.
        features_to_display: maximum number of features to print.

    Prints one "<name> : <importance>" line per feature, followed by the
    model's leaf count. Returns None.
    """
    importances = model.feature_importances_
    # argsort is ascending; reversing it puts the largest importance first.
    ranked = np.argsort(importances)[::-1]
    for idx in ranked[:features_to_display]:
        print(feature_names[idx], ':', importances[idx])
    print("Number of leaves:", model.get_n_leaves())
    
display_feature_importances(model, feature_names)

In [None]:
from IPython.display import Image, display
from io import StringIO
from sklearn.tree import export_graphviz
import pydot

def visualize_model(model):
    """Display a fitted tree inline as a PNG image.

    The tree is exported to Graphviz DOT text in memory, parsed with pydot,
    and the first parsed graph is rendered to PNG. Node labels use the
    module-level `feature_names`.
    """
    dot_buffer = StringIO()
    export_graphviz(model, out_file=dot_buffer, feature_names=feature_names)

    graphs = pydot.graph_from_dot_data(dot_buffer.getvalue())
    png_bytes = graphs[0].create_png()
    display(Image(png_bytes))

visualize_model(model)

In [None]:
model_small = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=5,
    random_state=random_state
) 

model_small.fit(X_train, y_train)

y_pred =model_small.predict(X_test)
print(classification_report(y_test, y_pred))

# Node, leaves, depth
print("Nodes:", model_small.tree_.node_count)
print("Leaves:", model_small.get_n_leaves())
print("Depth:", model_small.get_depth())

In [None]:
model_small = DecisionTreeClassifier(
    max_depth=3, 
    class_weight='balanced',
    random_state=random_state
) 
#'class_weight'can balance the class

model_small.fit(X_train, y_train)

y_pred =model_small.predict(X_test)
print(classification_report(y_test, y_pred))

# Node, leaves, depth
print("Nodes:", model_small.tree_.node_count)
print("Leaves:", model_small.get_n_leaves())
print("Depth:", model_small.get_depth())

In [None]:
display_feature_importances(model_small, feature_names)
visualize_model(model_small)

In [None]:
import matplotlib.pyplot as plt

# Train/test accuracy of a class-balanced tree for max_depth 2..20.
depth_grid = range(2, 21)
train_score, test_score = [], []
for max_depth in depth_grid:
    temp_model = DecisionTreeClassifier(
        max_depth=max_depth, class_weight="balanced", random_state=random_state
    ).fit(X_train, y_train)
    train_score.append(temp_model.score(X_train, y_train))
    test_score.append(temp_model.score(X_test, y_test))

plt.plot(depth_grid, train_score, 'b', depth_grid, test_score, 'r')
plt.xlabel('max_depth\nBlue = training acc. Red = test acc.')
plt.ylabel('accuracy')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Zoom in: train/test accuracy of a class-balanced tree for max_depth 10..14.
depth_grid = range(10, 15)
train_score, test_score = [], []
for max_depth in depth_grid:
    temp_model = DecisionTreeClassifier(
        max_depth=max_depth, class_weight="balanced", random_state=random_state
    ).fit(X_train, y_train)
    train_score.append(temp_model.score(X_train, y_train))
    test_score.append(temp_model.score(X_test, y_test))

plt.plot(depth_grid, train_score, 'b', depth_grid, test_score, 'r')
plt.xlabel('max_depth\nBlue = training acc. Red = test acc.')
plt.ylabel('accuracy')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Zoom in: train/test accuracy of a class-balanced tree for max_depth 5..9.
depth_grid = range(5, 10)
train_score, test_score = [], []
for max_depth in depth_grid:
    temp_model = DecisionTreeClassifier(
        max_depth=max_depth, class_weight="balanced", random_state=random_state
    ).fit(X_train, y_train)
    train_score.append(temp_model.score(X_train, y_train))
    test_score.append(temp_model.score(X_test, y_test))

plt.plot(depth_grid, train_score, 'b', depth_grid, test_score, 'r')
plt.xlabel('max_depth\nBlue = training acc. Red = test acc.')
plt.ylabel('accuracy')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Zoom in: train/test accuracy of a class-balanced tree for max_depth 2..4.
depth_grid = range(2, 5)
train_score, test_score = [], []
for max_depth in depth_grid:
    temp_model = DecisionTreeClassifier(
        max_depth=max_depth, class_weight="balanced", random_state=random_state
    ).fit(X_train, y_train)
    train_score.append(temp_model.score(X_train, y_train))
    test_score.append(temp_model.score(X_test, y_test))

plt.plot(depth_grid, train_score, 'b', depth_grid, test_score, 'r')
plt.xlabel('max_depth\nBlue = training acc. Red = test acc.')
plt.ylabel('accuracy')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(10, 15),
    'min_samples_leaf': range(20, 60, 10)
}

def perform_grid_search(X_train, y_train, X_test, y_test, params, num_folds=10):
    """Grid-search a class-balanced decision tree and print its evaluation.

    Parameters:
        X_train, y_train: data the search is fitted on.
        X_test, y_test: held-out data used for the printed test score and
            classification report.
        params: parameter grid passed to GridSearchCV as `param_grid`.
        num_folds: value passed to GridSearchCV as `cv`.

    Returns:
        The fitted GridSearchCV object.

    The tree's seed comes from the module-level `random_state`.
    """
    base_tree = DecisionTreeClassifier(random_state=random_state, class_weight='balanced')
    search = GridSearchCV(
        estimator=base_tree,
        param_grid=params,
        cv=num_folds,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    print("Train accuracy:", search.score(X_train, y_train))
    print("Test accuracy:", search.score(X_test, y_test))

    test_predictions = search.predict(X_test)
    print(classification_report(y_test, test_predictions))

    print(search.best_params_)
    return search
    
cv = perform_grid_search(X_train, y_train, X_test, y_test, params)

In [None]:
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2,5),
    'min_samples_leaf': range(5, 20, 10)
}
cv= perform_grid_search(X_train, y_train, X_test, y_test, params)

In [None]:
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(5,8),
    'min_samples_leaf': range(5, 10)
}
cv = perform_grid_search(X_train, y_train, X_test, y_test, params)

In [None]:
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 4),
    'min_samples_leaf': range(5, 50, 10)
}
cv= perform_grid_search(X_train, y_train, X_test, y_test, params)

In [None]:
model_cv= cv.best_estimator_
visualize_model(model_cv)

In [None]:
display_feature_importances(model_cv, feature_names)
visualize_model(model_cv)

In [None]:
print("Nodes:", model_cv.tree_.node_count)
print("Leaves:", model_cv.get_n_leaves())
print("Depth:", model_cv.get_depth())

In [None]:
y_pred = model_cv.predict(X_test)
y_pred_proba_dt = model_cv.predict_proba(X_test)

print("Probability produced by decision tree for each class vs actual prediction on Target (0 = clean, 1 = BadBuy).")
print("You should be able to see the default threshold of 0.5.")
print("(Probs on zero)  (probs on one)  (prediction made)  (label)")

for i in range(20):
    print(f"{y_pred_proba_dt[i][0]:.13f}  {y_pred_proba_dt[i][1]:.13f}  {y_pred[i]:<10d}  {y_test[i]:10d}")

In [None]:
from sklearn.metrics import roc_auc_score

y_pred_proba_dt = model.predict_proba(X_test)
y_pred_proba_dt_small = model_small.predict_proba(X_test)
y_pred_proba_dt_cv = model_cv.predict_proba(X_test)

roc_index_dt = roc_auc_score(y_test, y_pred_proba_dt[:, 1])
roc_index_dt_small = roc_auc_score(y_test, y_pred_proba_dt_small[:, 1])
roc_index_dt_cv = roc_auc_score(y_test, y_pred_proba_dt_cv[:, 1])

print("ROC index on test for default model:", roc_index_dt)
print("ROC index on test for small model:", roc_index_dt_small)
print("ROC index on test for grid search model:", roc_index_dt_cv)

In [None]:
from sklearn.metrics import roc_curve
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, y_pred_proba_dt[:,1])
fpr_dt_small, tpr_dt_small, thresholds_dt_small = roc_curve(y_test, y_pred_proba_dt_small[:,1])
fpr_dt_cv, tpr_dt_cv, thresholds_dt_cv = roc_curve(y_test, y_pred_proba_dt_cv[:,1])

plt.plot(fpr_dt, tpr_dt, label='ROC Curve for default tree {:.3f}'.format(roc_index_dt), color='red', lw=0.5)
plt.plot(fpr_dt_small, tpr_dt_small, label='ROC Curve for small tree{:.3f}'.format(roc_index_dt_small), color='green', lw=0.5)
plt.plot(fpr_dt_cv, tpr_dt_cv, label='ROC Curve for grid search{:.3f}'.format(roc_index_dt_cv), color='blue', lw=0.5)
plt.plot([0, 1], [0, 1], color='navy', lw=0.5, label='Baseline', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
import pickle

with open('decision_tree_model.pickle', 'wb') as f:
    pickle.dump([model_cv, roc_index_dt_cv, fpr_dt_cv, tpr_dt_cv], f)

In [None]:
random_state = 10
test_set_size = 0.3 # 30%
print("Size of training set:", len(X_train))
print("Size of testing set:", len(X_test))

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

print("Before scaling\n-------------")
for i in range(5):
    col = X_train[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".format(i, min(col), max(col), np.mean(col), np.std(col)))

X_train = scaler.fit_transform(X_train, y_train)

print("After scaling\n-------------")
for i in range(5):
    col = X_train[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".format(i, min(col), max(col), np.mean(col), np.std(col)))

X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=random_state)

model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

print("Training accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
print(model.coef_)

In [None]:
coef = model.coef_[0]

coef = coef[:20]
for i in range(len(coef)):
    print(feature_names[i], ':', coef[i])

In [None]:
coef = model.coef_[0]

indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)

indices = indices[:20]
for i in indices:
    print(feature_names[i], ':', coef[i])

In [None]:
from sklearn.model_selection import GridSearchCV
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

cv = GridSearchCV(param_grid=params,
                  estimator=LogisticRegression(random_state=random_state, class_weight='balanced'),
                  cv=10, n_jobs=-1)
cv.fit(X_train, y_train)

print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))
y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))

print(cv.best_params_)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_skewed_columns(df):
    """Draw KDE plots of four numeric columns of `df` on a 2x2 grid.

    The columns are VehYear, VehOdo, MMRCurrentAuctionAveragePrice and
    MMRCurrentRetailAveragePrice; missing values are dropped before plotting.
    """
    panel_columns = [
        'VehYear', 'VehOdo',
        'MMRCurrentAuctionAveragePrice', 'MMRCurrentRetailAveragePrice',
    ]
    fig, axes = plt.subplots(2, 2, figsize=(12, 10), sharex=False)

    # axes.flat walks the grid row by row, matching the column order above.
    for panel, column in zip(axes.flat, panel_columns):
        sns.kdeplot(df[column].dropna(), ax=panel)
    plt.show()
plot_skewed_columns(df)

In [None]:
# Columns that receive a log(1 + x) transform.
columns_to_transform = [
    'VehYear', 'VehOdo', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentRetailAveragePrice'
]

# Work on a copy so the untransformed frame stays available.
df_log = df.copy()

# np.log1p(x) computes log(1 + x) for all selected columns in one vectorised
# call, replacing two element-wise .apply passes per column.
df_log[columns_to_transform] = np.log1p(df_log[columns_to_transform])

plot_skewed_columns(df_log)

In [None]:
y_log = df_log['IsBadBuy']
X_log = df_log.drop(['IsBadBuy'], axis=1)
X_mat_log = X_log.values
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_mat_log, y_log,test_size=0.3, stratify=y_log, random_state=random_state)

scaler_log = StandardScaler()
X_train_log = scaler_log.fit_transform(X_train_log, y_train_log)
X_test_log = scaler_log.transform(X_test_log)

In [None]:
# Grid search over C on the log-transformed features. The result is bound to
# its own name so the earlier `cv` (the search fitted on the untransformed
# X_train) is not overwritten; a later cell scores `cv` on X_test.
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

cv_log = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=random_state, class_weight='balanced'), 
                  cv=10, n_jobs=-1)
cv_log.fit(X_train_log, y_train_log)

print("Train accuracy:", cv_log.score(X_train_log, y_train_log))
print("Test accuracy:", cv_log.score(X_test_log, y_test_log))

y_pred = cv_log.predict(X_test_log)
print(classification_report(y_test_log, y_pred))

print(cv_log.best_params_)

In [None]:
from sklearn.feature_selection import RFECV
rfe = RFECV(estimator = LogisticRegression(random_state=random_state, class_weight='balanced'), cv=10)

rfe.fit(X_train, y_train)

print("Original feature set", X_train.shape[1])
print("Number of features after elimination", rfe.n_features_)

In [None]:
selected_features = feature_names[rfe.support_]
print("Selected features: ", selected_features)

In [None]:
X_train_sel = rfe.transform(X_train)
X_test_sel = rfe.transform(X_test)

In [None]:
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

rfe_cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=random_state, class_weight='balanced'), 
                  cv=10, n_jobs=-1)
rfe_cv.fit(X_train_sel, y_train)

print("Train accuracy:", rfe_cv.score(X_train_sel, y_train))
print("Test accuracy:", rfe_cv.score(X_test_sel, y_test))

y_pred = rfe_cv.predict(X_test_sel)
print(classification_report(y_test, y_pred))

print(rfe_cv.best_params_)

In [None]:
# RFE + log transformation: select features with RFECV on the log-transformed
# training data, then grid-search C on the selected features.
rfe_log = RFECV(estimator = LogisticRegression(random_state=random_state, class_weight='balanced'), 
                cv=5)
rfe_log.fit(X_train_log, y_train_log)

print("Original feature set", X_train_log.shape[1])
# Report the selector fitted in this cell, not the earlier `rfe`.
print("Number of features after elimination", rfe_log.n_features_)

X_train_sel_log = rfe_log.transform(X_train_log)
X_test_sel_log = rfe_log.transform(X_test_log)

params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
rfe_log_cv = GridSearchCV(param_grid=params,
                          estimator=LogisticRegression(random_state=random_state, class_weight='balanced'), 
                          cv=10, n_jobs=-1)
rfe_log_cv.fit(X_train_sel_log, y_train_log)


print("Train accuracy:", rfe_log_cv.score(X_train_sel_log, y_train_log))
print("Test accuracy:", rfe_log_cv.score(X_test_sel_log, y_test_log))

y_pred_log = rfe_log_cv.predict(X_test_sel_log)
print(classification_report(y_test_log, y_pred_log))

print("Best parameters:", rfe_log_cv.best_params_)

In [None]:
import pickle
with open('decision_tree_model.pickle','rb') as f:
    dt_best, roc_index_dt_cv, fpr_dt_cv, tpr_dt_cv = pickle.load(f)

def display_feature_importances(model, feature_names, features_to_display=20):
    """Print a model's top feature importances in descending order.

    This re-declares the helper defined earlier in the notebook with the same
    behaviour.

    Parameters:
        model: object exposing `feature_importances_` and `get_n_leaves()`.
        feature_names: sequence indexable by integer position, aligned with
            `model.feature_importances_`.
        features_to_display: maximum number of features to print.

    Prints one "<name> : <importance>" line per feature, followed by the
    model's leaf count. Returns None.
    """
    importances = model.feature_importances_

    # Descending order of importance, truncated to the requested count.
    top_positions = np.argsort(importances)[::-1][:features_to_display]
    for position in top_positions:
        print(feature_names[position], ':', importances[position])
    print("Number of leaves:", model.get_n_leaves())

display_feature_importances(dt_best, feature_names)

In [None]:
from sklearn.feature_selection import SelectFromModel

selectmodel = SelectFromModel(dt_best, prefit=True)
X_train_sel_model = selectmodel.transform(X_train)
X_test_sel_model = selectmodel.transform(X_test)
print(X_train_sel_model.shape)

In [None]:
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

cv_sel_model = GridSearchCV(param_grid=params,
                            estimator=LogisticRegression(random_state=random_state, class_weight='balanced'), cv=10)
cv_sel_model.fit(X_train_sel_model, y_train)
print("Train accuracy:", cv_sel_model.score(X_train_sel_model, y_train))
print("Test accuracy:", cv_sel_model.score(X_test_sel_model, y_test))

y_pred = cv_sel_model.predict(X_test_sel_model)
print(classification_report(y_test, y_pred))

print(cv_sel_model.best_params_)

In [None]:
from sklearn.metrics import roc_auc_score

# Class probabilities on the test split for each logistic-regression variant;
# each model is given the test matrix that matches its own feature set.
y_pred_proba_lr = model.predict_proba(X_test)
y_pred_proba_lr_cv = cv.predict_proba(X_test)
y_pred_proba_rfe_cv = rfe_cv.predict_proba(X_test_sel)
y_pred_proba_rfe_log_cv = rfe_log_cv.predict_proba(X_test_sel_log)
y_pred_proba_cv_sel_model = cv_sel_model.predict_proba(X_test_sel_model)

# ROC AUC from the probability of class 1.
# NOTE(review): the log-feature model is scored against y_test rather than
# y_test_log; both splits use the same test_size/stratify/random_state, so
# the row order should match - confirm.
roc_index_lr = roc_auc_score(y_test, y_pred_proba_lr[:, 1])
roc_index_lr_cv = roc_auc_score(y_test, y_pred_proba_lr_cv[:, 1])
roc_index_rfe_cv = roc_auc_score(y_test, y_pred_proba_rfe_cv[:, 1])
roc_index_rfe_log_cv = roc_auc_score(y_test, y_pred_proba_rfe_log_cv[:, 1])
roc_index_cv_sel_model = roc_auc_score(y_test, y_pred_proba_cv_sel_model[:, 1])


print("ROC index on test for `model`:", roc_index_lr)
print("ROC index on test for `cv`:", roc_index_lr_cv)
print("ROC index on test for `rfe_cv`:", roc_index_rfe_cv)
# Print the log-feature model's own score (previously repeated rfe_cv's).
print("ROC index on test for `rfe_log_cv`:", roc_index_rfe_log_cv)
print("ROC index on test for `cv_sel_model`:", roc_index_cv_sel_model)

In [None]:
from sklearn.metrics import roc_curve
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_pred_proba_lr[:,1])
fpr_lr_cv, tpr_lr_cv, thresholds_lr_cv = roc_curve(y_test, y_pred_proba_lr_cv[:,1])
fpr_rfe_cv, tpr_rfe_cv, thresholds_rfe_cv = roc_curve(y_test, y_pred_proba_rfe_cv[:,1])
fpr_rfe_log_cv, tpr_rfe_log_cv, thresholds_rfe_log_cv = roc_curve(y_test, y_pred_proba_rfe_log_cv[:,1])
fpr_cv_sel_model, tpr_cv_sel_model, thresholds_cv_sel_model = roc_curve(y_test, y_pred_proba_cv_sel_model[:,1])

In [None]:
import matplotlib.pyplot as plt
plt.plot(fpr_lr, tpr_lr, label=f'ROC Curve for `model` {roc_index_lr:.3f}', color='red',lw=0.5)
plt.plot(fpr_lr_cv, tpr_lr_cv, label=f'ROC Curve for `cv` {roc_index_lr_cv:.3f}',color='green', lw=0.5)
plt.plot(fpr_rfe_cv, tpr_rfe_cv, label=f'ROC Curve for `rfe_cv` {roc_index_rfe_cv:.3f}',color='blue', lw=0.5)
plt.plot(fpr_rfe_log_cv, tpr_rfe_log_cv, label=f'ROC Curve for `rfe_log_cv` {roc_index_rfe_log_cv:.3f}', color='purple', lw=0.5)
plt.plot(fpr_cv_sel_model, tpr_cv_sel_model, label=f'ROC Curve for `cv_sel_model`{roc_index_cv_sel_model:.3f}', color='orange', lw=0.5)
plt.plot(fpr_dt_cv, tpr_dt_cv, label=f'ROC Curve for `dt_cv` {roc_index_dt_cv:.3f}', color='brown', lw=0.5)
plt.plot([0, 1], [0, 1], color='navy', lw=0.5, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
import matplotlib.pyplot as plt
plt.plot(fpr_rfe_cv, tpr_rfe_cv, label=f'ROC Curve for `rfe_cv` {roc_index_rfe_cv:.3f}', color='blue', lw=0.5)
plt.plot(fpr_cv_sel_model, tpr_cv_sel_model, label=f'ROC Curve for `cv_sel_model` {roc_index_cv_sel_model:.3f}', color='orange', lw=0.5)
plt.plot(fpr_dt_cv, tpr_dt_cv, label=f'ROC Curve for `dt_cv` {roc_index_dt_cv:.3f}', color='brown', lw=0.5)
plt.plot([0, 1], [0, 1], color='navy', lw=0.5, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
import pickle
lr_best = cv_sel_model
roc_index_lr_best = roc_index_cv_sel_model
tpr_lr_best = tpr_cv_sel_model
fpr_lr_best = fpr_cv_sel_model
with open('LR.pickle', 'wb') as f:
    pickle.dump([lr_best,roc_index_lr_best, fpr_lr_best, tpr_lr_best], f)

In [None]:
from sklearn.model_selection import train_test_split
random_state = 10
test_set_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, stratify=y, random_state=random_state)

print("Size of training set:", len(X_train))
print("Size of testing set:", len(X_test))

from sklearn.preprocessing import StandardScaler
random_seed = 10

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

model = MLPClassifier(random_state=random_state)
model.fit(X_train, y_train)

In [None]:
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split with SMOTE to address the low precision,
# recall and f1-score seen on the unbalanced data.
# X_train was already standardised by `scaler`, and X_test was transformed
# with that same fit. The scaler is therefore NOT re-fitted on the resampled
# matrix: doing so would put the training features on a different scale from
# X_test and overwrite the fitted `scaler`.
smote = SMOTE(random_state=random_state)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:
model = MLPClassifier(random_state=random_state)
model.fit(X_train_res, y_train_res)

In [None]:
model = MLPClassifier(max_iter=500, random_state=random_state)
model.fit(X_train_res, y_train_res)

In [None]:
print("Train accuracy:", model.score(X_train_res, y_train_res))
print("Test accuracy:", model.score(X_test, y_test))

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
print(X_train.shape)
print(X_train_res.shape)

In [None]:
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

import os
os.environ["PYTHONWARNINGS"] = "ignore"
params = {'hidden_layer_sizes': [(x,) for x in range(16, 257, 16)]} 
cv_1 = GridSearchCV(param_grid=params,estimator=MLPClassifier(random_state=random_state), 
                    return_train_score=True, cv=10, n_jobs=-1)

cv_1.fit(X_train_res, y_train_res)

In [None]:
result_set = cv_1.cv_results_
print(result_set)

In [None]:
import matplotlib.pyplot as plt
train_result = result_set['split0_train_score']
test_result = result_set['split0_test_score']
print("Total number of models: ", len(test_result))

plt.plot(range(0, len(train_result)), train_result, 'b', range(0,len(test_result)),test_result, 'r')
plt.xlabel('Hyperparameter Hidden_layers\nBlue = training acc. Red = test acc.')
plt.xticks(range(0, len(train_result)),range(16, 257, 16))
plt.ylabel('score')
plt.show()

In [None]:
train_result = result_set['mean_train_score']
test_result = result_set['mean_test_score']
print("Total number of models: ", len(test_result))

plt.plot(range(0, len(train_result)), train_result, 'b', range(0,len(test_result)),test_result, 'r')
plt.xlabel('Hyperparameter Hidden_layers\nBlue = training acc. Red = test acc.')
plt.xticks(range(0, len(train_result)),range(16, 257, 16))
plt.ylabel('score')
plt.show()

In [None]:
print("Train accuracy:", cv_1.score(X_train_res, y_train_res))
print("Test accuracy:", cv_1.score(X_test, y_test))

y_pred = cv_1.predict(X_test)
print(classification_report(y_test, y_pred))
print(cv_1.best_params_)

In [None]:
# new parameters
params = {'hidden_layer_sizes': [(12,), (16,), (24,), (32,), (40,), (48,), (56,), (64,)]}
cv_2 = GridSearchCV(param_grid=params,
                    estimator=MLPClassifier(random_state=random_state), cv=10, n_jobs=-1)
cv_2.fit(X_train_res, y_train_res)

print("Train accuracy:", cv_2.score(X_train_res, y_train_res))
print("Test accuracy:", cv_2.score(X_test, y_test))

y_pred = cv_2.predict(X_test)
print(classification_report(y_test, y_pred))
print(cv_2.best_params_)

In [None]:
params = {'hidden_layer_sizes': [(12,), (16,), (24,), (32,), (40,), (48,), (56,), (64,)], 
          'alpha': [0.01, 0.001, 0.0001, 0.00001]}
cv_3 = GridSearchCV(param_grid=params,
                    estimator=MLPClassifier(random_state=random_state), cv=10, n_jobs=-1)
cv_3.fit(X_train_res, y_train_res)

print("Train accuracy:", cv_3.score(X_train_res, y_train_res))
print("Test accuracy:", cv_3.score(X_test, y_test))

y_pred = cv_3.predict(X_test)
print(classification_report(y_test, y_pred))
print(cv_3.best_params_)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

rfe = RFECV(estimator = LogisticRegression(random_state=random_state), cv=10)
rfe.fit(X_train_res, y_train_res)
print(rfe.n_features_)

In [None]:
X_train_rfe = rfe.transform(X_train_res)
X_test_rfe = rfe.transform(X_test)

params = {'hidden_layer_sizes': [(12,), (16,), (24,), (32,), (40,), (48,), (56,), (64,)], 
          'alpha': [0.01,0.001, 0.0001, 0.00001]}
rfe_cv = GridSearchCV(param_grid=params,
                      estimator=MLPClassifier(random_state=random_state), cv=10, n_jobs=-1)
rfe_cv.fit(X_train_rfe, y_train_res)

print("Train accuracy:", rfe_cv.score(X_train_rfe, y_train_res))
print("Test accuracy:", rfe_cv.score(X_test_rfe, y_test))

y_pred = rfe_cv.predict(X_test_rfe)
print(classification_report(y_test, y_pred))
print(rfe_cv.best_params_)

In [None]:
import pickle
from sklearn.feature_selection import SelectFromModel

# Restore the decision-tree artefacts (model, ROC index, ROC curve points)
# from disk; presumably written by the decision-tree part of this project.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('decision_tree_model.pickle', 'rb') as pickle_file:
    dt_best, roc_index_dt_cv, fpr_dt_cv, tpr_dt_cv = pickle.load(pickle_file)

# Use the already-fitted tree (prefit=True, so it is not refitted here) to
# choose which columns to keep, then reduce both splits.
selectmodel = SelectFromModel(estimator=dt_best, prefit=True)
X_train_sel_model = selectmodel.transform(X_train_res)
X_test_sel_model = selectmodel.transform(X_test)

# (rows, selected features) of the reduced training matrix.
print(X_train_sel_model.shape)

In [None]:
# Hidden-layer-width x alpha grid search on the features kept by SelectFromModel.
params = {
    'hidden_layer_sizes': [(width,) for width in (12, 16, 24, 32, 40, 48, 56, 64)],
    'alpha': [1e-2, 1e-3, 1e-4, 1e-5],
}
cv_sel_model = GridSearchCV(
    estimator=MLPClassifier(random_state=random_state),
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
cv_sel_model.fit(X_train_sel_model, y_train_res)

# Score the fitted search on the data it was trained on and on the test split.
print("Train accuracy:", cv_sel_model.score(X_train_sel_model, y_train_res))
print("Test accuracy:", cv_sel_model.score(X_test_sel_model, y_test))

# Per-class precision/recall on the test split, then the winning settings.
y_pred = cv_sel_model.predict(X_test_sel_model)
print(classification_report(y_test, y_pred))
print(cv_sel_model.best_params_)

In [None]:
# Raw cross-validation results of the SelectFromModel grid search, kept under
# the original name as the dict returned by GridSearchCV.
result_set = cv_sel_model.cv_results_

# Printing the raw dict dumps one array per key and is unreadable; show the
# ten best parameter combinations as a ranked table instead.
pd.DataFrame(result_set)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
].sort_values('rank_test_score').head(10)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# For every neural-network variant: class-1 probabilities on the test split,
# the ROC index (AUC) and the points of the ROC curve.

# Default neural network (`model` must still be the scikit-learn classifier here).
y_pred_proba_nn = model.predict_proba(X_test)
roc_index_nn = roc_auc_score(y_test, y_pred_proba_nn[:, 1])
fpr_nn, tpr_nn, thresholds_nn = roc_curve(y_test, y_pred_proba_nn[:, 1])

# Grid search 1.
y_pred_proba_cv_1 = cv_1.predict_proba(X_test)
roc_index_cv_1 = roc_auc_score(y_test, y_pred_proba_cv_1[:, 1])
fpr_cv_1, tpr_cv_1, thresholds_cv_1 = roc_curve(y_test, y_pred_proba_cv_1[:, 1])

# Grid search 2.
y_pred_proba_cv_2 = cv_2.predict_proba(X_test)
roc_index_cv_2 = roc_auc_score(y_test, y_pred_proba_cv_2[:, 1])
fpr_cv_2, tpr_cv_2, thresholds_cv_2 = roc_curve(y_test, y_pred_proba_cv_2[:, 1])

# Grid search 3.
y_pred_proba_cv_3 = cv_3.predict_proba(X_test)
roc_index_cv_3 = roc_auc_score(y_test, y_pred_proba_cv_3[:, 1])
fpr_cv_3, tpr_cv_3, thresholds_cv_3 = roc_curve(y_test, y_pred_proba_cv_3[:, 1])

# Grid search on the RFECV-selected features.
y_pred_proba_rfe_cv = rfe_cv.predict_proba(X_test_rfe)
roc_index_rfe_cv = roc_auc_score(y_test, y_pred_proba_rfe_cv[:, 1])
fpr_rfe_cv, tpr_rfe_cv, thresholds_rfe_cv = roc_curve(y_test, y_pred_proba_rfe_cv[:, 1])

# Grid search on the SelectFromModel-selected features.
y_pred_proba_cv_sel_model = cv_sel_model.predict_proba(X_test_sel_model)
roc_index_cv_sel_model = roc_auc_score(y_test, y_pred_proba_cv_sel_model[:, 1])
fpr_cv_sel_model, tpr_cv_sel_model, thresholds_cv_sel_model = roc_curve(
    y_test, y_pred_proba_cv_sel_model[:, 1]
)

# Report the ROC index of each variant.
print("ROC index on test for NN_default:", roc_index_nn)
print("ROC index on test for NN with gridsearch 1:", roc_index_cv_1)
print("ROC index on test for NN with gridsearch 2:", roc_index_cv_2)
print("ROC index on test for NN with gridsearch 3:", roc_index_cv_3)
print("ROC index on test for NN with feature selection and gridsearch:", roc_index_rfe_cv)
print("ROC index on test for NN with feature selection (model selection) and gridsearch:", roc_index_cv_sel_model)

# Overlay all ROC curves; the dashed diagonal is the no-skill reference line.
plt.plot(fpr_nn, tpr_nn, label=f'NN_default {roc_index_nn:.3f}', color='gray', linewidth=0.5)
plt.plot(fpr_cv_1, tpr_cv_1, label=f'NN cv_1 {roc_index_cv_1:.3f}', color='cyan', linewidth=0.5)
plt.plot(fpr_cv_2, tpr_cv_2, label=f'NN cv_2 {roc_index_cv_2:.3f}', color='yellow', linewidth=0.5)
plt.plot(fpr_cv_3, tpr_cv_3, label=f'NN cv_3 {roc_index_cv_3:.3f}', color='blue', linewidth=0.5)
plt.plot(fpr_rfe_cv, tpr_rfe_cv, label=f'NN rfe_cv {roc_index_rfe_cv:.3f}', color='black', linewidth=0.5)
plt.plot(
    fpr_cv_sel_model,
    tpr_cv_sel_model,
    label=f'NN with cv_sel_model {roc_index_cv_sel_model:.3f}',
    color='red',
    linewidth=0.5,
)
plt.plot([0, 1], [0, 1], color='navy', linewidth=0.5, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
import pickle

# Restore the saved decision-tree and LR artefacts (model, ROC index, ROC
# curve points) so they can be compared with the best neural network.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('decision_tree_model.pickle', 'rb') as f:
    dt_best, roc_index_dt_cv, fpr_dt_cv, tpr_dt_cv = pickle.load(f)
with open('LR.pickle', 'rb') as f:
    lr_best, roc_index_lr_cv, fpr_lr_cv, tpr_lr_cv = pickle.load(f)

# The LR model is a classifier with a ROC curve, i.e. logistic regression;
# the label previously read "linear regression".
print("ROC index on test for decision tree:", roc_index_dt_cv)
print("ROC index on test for logistic regression:", roc_index_lr_cv)
print("ROC index on test for NN with feature selection (model selection) and gridsearch:", roc_index_cv_sel_model)

# Overlay the three ROC curves; the dashed diagonal is the no-skill reference line.
plt.plot(fpr_dt_cv, tpr_dt_cv, label='DT {:.3f}'.format(roc_index_dt_cv), color='red', lw=0.5)
plt.plot(fpr_lr_cv, tpr_lr_cv, label='LR {:.3f}'.format(roc_index_lr_cv), color='green', lw=0.5)
plt.plot(fpr_cv_sel_model, tpr_cv_sel_model, label='NN with cv_sel_model {:.3f}'.format(roc_index_cv_sel_model), color='blue', lw=0.5)
plt.plot([0, 1], [0, 1], color='navy', lw=0.5, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset as TorchDataset, DataLoader
from sklearn.metrics import accuracy_score

class MyNet(nn.Module):
    """Single-hidden-layer binary classifier: Linear -> ReLU -> Linear -> sigmoid.

    Args:
        in_features: Number of input features per sample.
        hidden_layer_size: Width of the hidden layer. Defaults to 100, the
            value that used to be hard-coded.
    """

    def __init__(self, in_features, hidden_layer_size=100):
        super().__init__()
        output_size = 1  # one probability per sample
        self.fc1 = nn.Linear(in_features, hidden_layer_size)
        self.fc2 = nn.Linear(hidden_layer_size, output_size)

    def forward(self, data):
        """Return a 1-D tensor holding one sigmoid output in (0, 1) per input row.

        Args:
            data: Float tensor whose last dimension equals ``in_features``.
        """
        data = F.relu(self.fc1(data))
        data = torch.sigmoid(self.fc2(data))
        # Collapse the trailing size-1 output dimension so the result lines up
        # with a 1-D label tensor.
        return torch.flatten(data)

In [None]:
# Custom Dataset class that inherits from PyTorch's Dataset class
class MyDataset(TorchDataset):
    """Map-style dataset pairing feature rows with labels, both stored as float32.

    Inputs are converted to NumPy arrays up front so that ``dataset[i]`` is
    always a positional lookup. Keeping a pandas DataFrame/Series here would
    make ``self.X[idx]`` select a column and ``self.y[idx]`` look up an index
    label, which fails (or returns the wrong row) whenever the index is not
    0..n-1, e.g. after a train/test split.

    Args:
        X: Array-like of shape (n_samples, n_features).
        y: Array-like with n_samples entries.
    """

    def __init__(self, X, y):
        assert (len(X) == len(y))
        self.X = np.asarray(X, dtype="float32")
        self.y = np.asarray(y, dtype="float32")

    def __len__(self):
        """Number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return (self.X[idx], self.y[idx])

In [None]:
# High-level model class that handles training and prediction
class MyModel:
    """Train and query a ``MyNet`` binary classifier with Adam and BCE loss.

    Args:
        in_features: Number of input features per sample.
        batch_size: Mini-batch size used for both training and prediction.
        lr: Adam learning rate. Defaults to 0.0001, the value that used to be
            hard-coded.
    """

    def __init__(self, in_features, batch_size=200, lr=0.0001):
        self.model = MyNet(in_features=in_features)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.BCELoss()
        self.batch_size = batch_size
        count = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"The model has {count:,} trainable parameters")

    def train_one_epoch(self, loader):
        """Run one optimisation pass over every ``(X, y)`` batch in ``loader``."""
        self.model.train()
        for X, y in loader:
            self.optimizer.zero_grad()
            out = self.model(X)
            loss = self.criterion(out, y)
            loss.backward()
            self.optimizer.step()

    def train(self, X_train, y_train, X_test, y_test, num_epochs=100):
        """Train for ``num_epochs`` epochs, logging accuracy after each one.

        Args:
            X_train, y_train: Training features and 0/1 labels.
            X_test, y_test: Held-out features and labels, used for monitoring only.
            num_epochs: Number of full passes over the training data.

        Returns:
            ``(train_acc_history, test_acc_history)``: two lists holding one
            accuracy value per epoch.
        """
        # Reshuffle every epoch: the training rows may be grouped by class
        # (e.g. rows appended by an oversampler), and unshuffled mini-batches
        # would then be nearly single-class.
        train_loader = DataLoader(
            MyDataset(X_train, y_train),
            batch_size=self.batch_size,
            shuffle=True,
        )
        train_acc_history = []
        test_acc_history = []

        for i in range(num_epochs):
            self.train_one_epoch(train_loader)
            train_pred = self.predict(X_train)
            train_acc = accuracy_score(y_train, train_pred)
            test_pred = self.predict(X_test)
            test_acc = accuracy_score(y_test, test_pred)
            print(f"Epoch {i+1}: Train Accuracy: {train_acc} Test Accuracy: {test_acc}")
            train_acc_history.append(train_acc)
            test_acc_history.append(test_acc)
        return (train_acc_history, test_acc_history)

    def predict(self, X):
        """Return a list of 0/1 predictions for ``X``, in input order.

        A sample is labelled 1 when the network output is >= 0.5.
        """
        # Convert once to a float32 tensor; no dummy labels are needed for inference.
        features = torch.as_tensor(np.asarray(X, dtype="float32"))
        self.model.eval()
        results = []
        # no_grad: inference must not build autograd graphs (saves memory and time).
        with torch.no_grad():
            for X_batch in torch.split(features, self.batch_size):
                out = self.model(X_batch)
                results.extend((out >= 0.5).int().tolist())
        return results

In [None]:
# Seed PyTorch so weight initialisation (and any batch shuffling) is repeatable,
# in line with the random_state passed to the scikit-learn models.
# Assumes random_state is an int -- TODO confirm.
torch.manual_seed(random_state)

# Take the input width from the array that is actually used for training.
num_features = X_train_res.shape[1]

# NOTE(review): this rebinds `model`, which the ROC comparison cell uses as the
# default scikit-learn network (`model.predict_proba`). MyModel has no
# predict_proba, so that cell fails if it is re-run after this one without
# restarting the kernel.
model = MyModel(num_features)

train_acc_list, test_acc_list = model.train(
    X_train_res, y_train_res,
    X_test, y_test,
    num_epochs=200
)

In [None]:
import matplotlib.pyplot as plt
# Per-epoch training vs. testing accuracy of the PyTorch model
# (x axis is the 0-based position in the history lists).
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xlabel("Epoch", size=12)
ax.set_ylabel("Accuracy", size=12)
ax.plot(train_acc_list, label="Training Accuracy")
ax.plot(test_acc_list, label="Testing Accuracy")
ax.grid()
ax.legend(fontsize=10)
fig.tight_layout()
plt.show()

In [None]:
# np.argmax gives the 0-based position of the first maximum of the test
# accuracy history; epochs are reported 1-based, hence the +1.
best_epoch_index = np.argmax(test_acc_list)
epoch_with_highest_accuracy = best_epoch_index + 1
print(f"Epoch with highest accuracy: {epoch_with_highest_accuracy}")