In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import matthews_corrcoef
import time

In [105]:
df = pd.read_csv('base.csv')

In [106]:
# df.info()

In [107]:
print(df['device_fraud_count'].value_counts())

0    1000000
Name: device_fraud_count, dtype: int64


In [108]:
# check attribute: "device_fraud_count", just has one value, so drop this column
df = df.drop(['device_fraud_count'], axis=1, errors='ignore') 

In [109]:
# check the imbalanced data classes, so this is a highly imbalanced dataset
df['fraud_bool'].value_counts()

0    988971
1     11029
Name: fraud_bool, dtype: int64

In [110]:
# before any pre-processing technologies, I do train-test split. I did splitting after imputation in the old version
# Should do spliting before any pre-processing to prevent from any info leakage

In [111]:
### split data into target y and attributes X
y = df['fraud_bool']
X = df.drop(['fraud_bool'], axis=1)

In [112]:
# train-test split by "month", month 0-5 are training, month 6 and 7 are test
y_train = y[X['month']<6]
y_test = y[X['month']>=6]
X_train = X[X['month']<6]
X_test = X[X['month']>=6]

In [113]:
train_count = X_train['month'].value_counts().sum()
train_count

794989

In [114]:
test_count = X_test['month'].value_counts().sum()
test_count

205011

In [115]:
# X_train['velocity_6h'].isnull().sum()

In [116]:
# Do pre-processing on training set

In [117]:
# 1. Data cleaning: Dealing with missing value

In [118]:
# check attribute: 'prev_address_months_count'
X_train['prev_address_months_count'].value_counts()[-1] / train_count

0.7150073774605686

In [119]:
# -1 means a missing value and there are about 71% missing, so I drop this feature
X_train = X_train.drop(['prev_address_months_count'], axis=1, errors='ignore')

In [120]:
# check attribute: 'intended_balcon_amount', negatives are missing values
(X_train['intended_balcon_amount'] < 0).sum() / train_count

0.7370013924720971

In [121]:
# because negatives are about 74%, so I drop this feature
X_train = X_train.drop(['intended_balcon_amount'], axis=1, errors='ignore')

In [122]:
# check attribute: "current_address_months_count"
X_train['current_address_months_count'].value_counts()[-1] / train_count

0.0038075998535828798

In [123]:
# handle imputation of missing values in a time series dataset while avoiding future data leakage

In [124]:
# impute "current_address_months_count", -1 means a missing value, 0.4% in total, right skewed so use median
median_value1 = X_train[(X_train['current_address_months_count'] != -1) & (X_train['month'] == 0)]['current_address_months_count'].median()
print(median_value1)
# Replace -1 with the median
X_train.loc[X_train['current_address_months_count'] == -1, 'current_address_months_count'] = median_value1
print(X_train['current_address_months_count'].describe())

49.0
count    794989.000000
mean         88.295121
std          89.754507
min           0.000000
25%          21.000000
50%          53.000000
75%         132.000000
max         425.000000
Name: current_address_months_count, dtype: float64


In [125]:
# check attribute: "bank_months_count"
X_train['bank_months_count'].value_counts()[-1] / train_count

0.2510085045201883

In [126]:
# impute "bank_months_count", -1 means a missing value, 25% in total, right skewed so use median
median_value2 = X_train[(X_train['bank_months_count'] != -1) & (X_train['month'] == 0)]['bank_months_count'].median()
print(median_value2)
# Replace -1 with the median
X_train.loc[df['bank_months_count'] == -1, 'bank_months_count'] = median_value2
print(X_train['bank_months_count'].describe())

15.0
count    794989.000000
mean         14.934764
std           9.973376
min           1.000000
25%           5.000000
50%          15.000000
75%          25.000000
max          32.000000
Name: bank_months_count, dtype: float64


In [127]:
# check attribute: "session_length_in_minutes"
X_train['session_length_in_minutes'].value_counts()[-1] / train_count

0.001903170987271522

In [128]:
# impute "session_length_in_minutes", -1 means a missing value, 0.2% in total, right skewed so use median
median_value3 = X_train[(X_train['session_length_in_minutes'] != -1) & (X_train['month'] == 0)]['session_length_in_minutes'].median()
print(round(median_value3,2))
# Replace -1 with the median
X_train.loc[X_train['session_length_in_minutes'] == -1, 'session_length_in_minutes'] = median_value3
print(X_train['session_length_in_minutes'].describe())

5.44
count    794989.000000
mean          7.742919
std           8.165819
min           0.003262
25%           3.264237
50%           5.246147
75%           9.008757
max          85.899143
Name: session_length_in_minutes, dtype: float64


In [129]:
# check attribute: "device_distinct_emails_8w"
X_train['device_distinct_emails_8w'].value_counts()[-1] / train_count

0.0003886846233092533

In [130]:
# impute "device_distinct_emails_8w", -1 means a missing value, 0.04% in total, because of categorical, so use mode
mode_value1 = X_train[(X_train['month'] == 0)]['device_distinct_emails_8w'].mode()[0]
print(mode_value1)
X_train['device_distinct_emails_8w'].replace(-1, mode_value1, inplace=True)
X_train['device_distinct_emails_8w'].describe()

1


count    794989.000000
mean          1.021575
std           0.186475
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           2.000000
Name: device_distinct_emails_8w, dtype: float64

In [131]:
# X_train.info()

In [132]:
# X_train['income'].describe()

In [133]:
# convert wrong numerical data to categorical data
convert_dtype = ['email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 
                 'keep_alive_session','device_distinct_emails_8w']
X_train[convert_dtype] = X_train[convert_dtype].astype('category')

In [134]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 794989 entries, 96843 to 999999
Data columns (total 28 columns):
 #   Column                            Non-Null Count   Dtype   
---  ------                            --------------   -----   
 0   income                            794989 non-null  float64 
 1   name_email_similarity             794989 non-null  float64 
 2   current_address_months_count      794989 non-null  int64   
 3   customer_age                      794989 non-null  int64   
 4   days_since_request                794989 non-null  float64 
 5   payment_type                      794989 non-null  object  
 6   zip_count_4w                      794989 non-null  int64   
 7   velocity_6h                       794989 non-null  float64 
 8   velocity_24h                      794989 non-null  float64 
 9   velocity_4w                       794989 non-null  float64 
 10  bank_branch_count_8w              794989 non-null  int64   
 11  date_of_birth_distinct_emails_4w  7

In [135]:
X_train['velocity_6h'].describe()

count    794989.000000
mean       6148.453005
std        3013.664048
min        -130.456928
25%        3971.401783
50%        5961.190608
75%        8113.534146
max       16715.565404
Name: velocity_6h, dtype: float64

In [136]:
# Transform numerical attributes

In [137]:
# transform right skewed features: ['income', 'current_address_months_count', 'days_since_request', 'zip_count_4w',
# 'velocity_6h', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w','session_length_in_minutes']

In [138]:
# 'velocity_6h' has significant negative value like -130, some others have negative values as -1
# so Shift and Logarithmic Transformation 'velocity_6h' by add a constant value 130
X_train['velocity_6h'] = X_train['velocity_6h'] + 130
X_train['velocity_6h'].describe()

count    794989.000000
mean       6278.453005
std        3013.664048
min          -0.456928
25%        4101.401783
50%        6091.190608
75%        8243.534146
max       16845.565404
Name: velocity_6h, dtype: float64

In [139]:
# instead of log, use log1p, which stands for log(x+1) 
columns_to_transform = ['income', 'current_address_months_count', 'days_since_request', 'zip_count_4w',
                        'velocity_6h', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w',
                        'session_length_in_minutes']
X_train[columns_to_transform] = np.log1p(X_train[columns_to_transform])

In [140]:
# X_train.isnull().sum()

In [141]:
# Standardization, because of time series, so scale data during cross-validation to prevent leakage

In [142]:
# X_train_num = X_train.select_dtypes(include = [np.number])
# X_train_num.info()

In [143]:
# std_scaler = StandardScaler()
# X_train_num_std_scaler = std_scaler.fit_transform(X_train_num)

In [144]:
# X_train.describe()

In [145]:
# one-hot encoding categorical attributes
# Create a new DataFrame with the encoded categorical column
X_train_encoded = pd.get_dummies(X_train)
X_train_encoded.shape

(794989, 57)

In [146]:
# Oversampling Using SMOTE
smote =SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train_encoded, y_train)
print('Feature/label dataset for training before applying SMOTE: ', X_train_encoded.shape, y_train.shape)
print('Feature/label dataset for training after applying SMOTE: ', X_train_smote.shape, y_train_smote.shape)
print('Distribution of label values after applying SMOTE:\n',pd.Series(y_train_smote).value_counts())

Feature/label dataset for training before applying SMOTE:  (794989, 57) (794989,)
Feature/label dataset for training after applying SMOTE:  (1573676, 57) (1573676,)
Distribution of label values after applying SMOTE:
 1    786838
0    786838
Name: fraud_bool, dtype: int64


In [147]:
# X_train_smote.info()

In [148]:
# feature reduction
# Select only the numerical attributes
numerical_attributes = X_train_smote.select_dtypes(include=['int64', 'float64'])
# Calculate correlation between numerical attributes
corr_matrix = numerical_attributes.corr()
corr_matrix

Unnamed: 0,income,name_email_similarity,current_address_months_count,customer_age,days_since_request,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,credit_risk_score,bank_months_count,proposed_credit_limit,session_length_in_minutes,month
income,1.0,-0.068475,0.037733,0.157972,-0.031571,-0.047536,-0.08284,-0.080052,-0.077734,-0.018908,-0.080154,0.241103,-0.007574,0.200717,-0.069629,0.064746
name_email_similarity,-0.068475,1.0,-0.00327,-0.085073,-0.016031,0.008845,0.027728,0.039679,0.0428,0.006957,0.066775,0.016263,-0.027388,0.038955,0.018655,-0.037697
current_address_months_count,0.037733,-0.00327,1.0,0.237089,-0.093444,0.06206,-0.013057,-0.025247,-0.038702,0.007447,-0.258247,0.182383,0.06996,0.186887,-0.046178,0.026148
customer_age,0.157972,-0.085073,0.237089,1.0,-0.041992,-0.006591,-0.043671,-0.009768,-0.00752,0.000672,-0.551825,0.255398,0.02414,0.224185,0.041593,-0.015371
days_since_request,-0.031571,-0.016031,-0.093444,-0.041992,1.0,-0.00202,0.06409,0.033455,0.025623,-0.064494,0.012005,-0.14833,0.035208,-0.117256,0.079581,-0.044102
zip_count_4w,-0.047536,0.008845,0.06206,-0.006591,-0.00202,1.0,0.064489,0.117849,0.212247,0.009201,0.060313,-0.087482,0.048689,-0.033912,0.048725,-0.1918
velocity_6h,-0.08284,0.027728,-0.013057,-0.043671,0.06409,0.064489,1.0,0.350198,0.286595,0.0241,0.065938,-0.177418,-0.01182,-0.104762,0.047205,-0.275994
velocity_24h,-0.080052,0.039679,-0.025247,-0.009768,0.033455,0.117849,0.350198,1.0,0.39808,0.052537,0.052683,-0.141161,-0.029355,-0.030628,0.072939,-0.403194
velocity_4w,-0.077734,0.0428,-0.038702,-0.00752,0.025623,0.212247,0.286595,0.39808,1.0,0.061417,0.121047,-0.155151,-0.011201,0.00634,0.102265,-0.792102
bank_branch_count_8w,-0.018908,0.006957,0.007447,0.000672,-0.064494,0.009201,0.0241,0.052537,0.061417,1.0,0.028507,-0.069792,0.024132,-0.033048,0.002392,-0.055242


In [149]:
X_train_smote['velocity_4w'].corr(X_train_smote['month'])

-0.7921018166771735

In [150]:
# the correlation is about -0.8, highly negative correlated, so I drop attribute 'velocity_4w'
X_train_smote = X_train_smote.drop(['velocity_4w'], axis=1, errors='ignore')

In [151]:
# print('Distribution of label values after applying SMOTE:\n',pd.Series(y_train_smote).value_counts())

In [152]:
# I want to sample about 20,000 for each class, so set k = 40, use 1-in-20 systematic sampling
# apply sampling before performing time-series validation
# sampling_ratio = 0.02542
# n_samples = int(len(X_train_smote) * sampling_ratio)
indices = list(range(0, len(X_train_smote), 20))
X_train_sampled = X_train_smote.iloc[indices]
y_train_sampled = y_train_smote.iloc[indices]

In [153]:
len(X_train_sampled)

78684

In [154]:
# X_train_sampled.info()

In [155]:
y_train_sampled.value_counts()

1    39343
0    39341
Name: fraud_bool, dtype: int64

# Effectiveness

In [156]:
# Perform time-series validation by iterating over 'month' from 0 to 5
temporal_column = "month"
start_month = 0
end_month = 5

# Initialize 2 lists to store mcc and accuracy scores
mcc_scores = []
accuracy_scores = []
# initial precision
prc = 0  
# initial recall
rec = 0  
# initial f1 score
f1 = 0  
# initial roc_auc
roc_auc = 0  

# Create a StandardScaler instance to do feature scaling in each iteration
scaler = StandardScaler()

# time series cross validation, split the data into training and test based on 'month'
for i in range(start_month, end_month):
    X_train_ts = X_train_sampled[X_train_sampled[temporal_column] <= i]
    y_train_ts = y_train_sampled[X_train_sampled[temporal_column] <= i]
    X_test_ts = X_train_sampled[X_train_sampled[temporal_column] == i + 1]
    y_test_ts = y_train_sampled[X_train_sampled[temporal_column] == i + 1]

    # Scale the cross-validation generated training and test data
    X_train_ts_scaled = scaler.fit_transform(X_train_ts)
    X_test_ts_scaled = scaler.transform(X_test_ts)

    # FROM HERE, I APPLY THREE ML MODELS TO DO COMPARISON
    # Train a logistic regression classifier using the scaled training data
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train_ts_scaled, y_train_ts)

    # Predict class labels for the scaled testing data
    y_pred = clf.predict(X_test_ts_scaled)

    # Calculate the confusion matrix
    confusion = confusion_matrix(y_test_ts, y_pred)
    print('iteration: ', i, '\n', confusion)

    # Calculate evaluation metrics
    prc += precision_score(y_test_ts, y_pred)
    rec += recall_score(y_test_ts, y_pred)
    f1 += f1_score(y_test_ts, y_pred)
    roc_auc += roc_auc_score(y_test_ts, y_pred)
    # Calculate the Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test_ts, y_pred)
    mcc_scores.append(mcc)
    # Calculate the accuracy scores
    accuracy = accuracy_score(y_test_ts, y_pred)

# Compute the average valuation matrics
average_precision = prc / 5
average_recall = rec / 5
average_f1 = f1 / 5
average_roc_auc = roc_auc / 5
average_mcc = sum(mcc_scores) / 5
accuracy_scores.append(accuracy)


print("\nAverage precision:")
print(average_precision)
print("\nAverage recall:")
print(average_recall)
print("\nAverage f1 score:")
print(average_f1)
print("\nAverage roc_auc score:")
print(average_roc_auc)
print("\nAverage MCC:")
print(average_mcc)
print("\nAverage accuracy:")
print(accuracy_scores)

iteration:  0 
 [[6001  320]
 [ 401 7106]]
iteration:  1 
 [[6563  226]
 [ 206 5567]]
iteration:  2 
 [[7197  280]
 [ 338 7848]]
iteration:  3 
 [[6121  191]
 [ 233 6252]]
iteration:  4 
 [[5720  175]
 [ 168 3968]]

Average precision:
0.962312448386875

Average recall:
0.9586123611122387

Average f1 score:
0.9604516176180191

Average roc_auc score:
0.9611753588548051

Average MCC:
0.9220185538041665

Average accuracy:
[0.9658060013956734]


In [157]:
# Predict on the very beginning test data: month 6 and 7

In [158]:
X_test = X_test.drop(['prev_address_months_count'], axis=1, errors='ignore')
X_test = X_test.drop(['intended_balcon_amount'], axis=1, errors='ignore')

In [159]:
# impute "current_address_months_count", -1 means a missing value
median_value1_test = X_test[(X_test['current_address_months_count'] != -1) & (X_test['month'] == 6)]['current_address_months_count'].median()
print(median_value1_test)
# Replace -1 with the median
X_test.loc[X_test['current_address_months_count'] == -1, 'current_address_months_count'] = median_value1_test
print(X_test['current_address_months_count'].describe())

60.0
count    205011.000000
mean         81.070840
std          81.906621
min           0.000000
25%          16.000000
50%          49.000000
75%         124.000000
max         428.000000
Name: current_address_months_count, dtype: float64


In [160]:
# impute "bank_months_count", -1 means a missing value
median_value2_test = X_test[(X_test['bank_months_count'] != -1) & (X_test['month'] == 6)]['bank_months_count'].median()
print(median_value2_test)
# Replace -1 with the median
X_test.loc[df['bank_months_count'] == -1, 'bank_months_count'] = median_value2_test
print(X_test['bank_months_count'].describe())

15.0
count    205011.000000
mean         14.752818
std           9.903537
min           1.000000
25%           5.000000
50%          15.000000
75%          25.000000
max          32.000000
Name: bank_months_count, dtype: float64


In [161]:
# impute "device_distinct_emails_8w", -1 means a missing value, 0.04% in total, because of categorical, so use mode
mode_value1_test = X_test[(X_test['month'] == 6)]['device_distinct_emails_8w'].mode()[0]
print(mode_value1_test)
X_test['device_distinct_emails_8w'].replace(-1, mode_value1_test, inplace=True)
X_test['device_distinct_emails_8w'].describe()

1


count    205011.000000
mean          1.009160
std           0.131458
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           2.000000
Name: device_distinct_emails_8w, dtype: float64

In [162]:
# convert wrong numerical data to categorical data
convert_dtype2 = ['email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 
                 'keep_alive_session','device_distinct_emails_8w']
X_test[convert_dtype2] = X_test[convert_dtype2].astype('category')

In [163]:
X_test['velocity_6h'].describe()

count    205011.000000
mean       3791.718974
std        2130.921646
min        -170.603072
25%        2306.429463
50%        3609.343942
75%        4812.595460
max       16356.364000
Name: velocity_6h, dtype: float64

In [164]:
X_test['velocity_6h'] = X_test['velocity_6h'] + 170
X_test['velocity_6h'].describe()

count    205011.000000
mean       3961.718974
std        2130.921646
min          -0.603072
25%        2476.429463
50%        3779.343942
75%        4982.595460
max       16526.364000
Name: velocity_6h, dtype: float64

In [165]:
X_test['session_length_in_minutes'].describe()

count    205011.000000
mean          6.824736
std           7.417399
min          -1.000000
25%           2.523108
50%           4.651072
75%           8.328739
max          83.376775
Name: session_length_in_minutes, dtype: float64

In [166]:
# impute "session_length_in_minutes", -1 means a missing value
median_value3_test = X_test[(X_test['session_length_in_minutes'] != -1) & (X_test['month'] == 6)]['session_length_in_minutes'].median()
print(round(median_value3_test,2))
# Replace -1 with the median
X_test.loc[X_test['session_length_in_minutes'] == -1, 'session_length_in_minutes'] = median_value3_test
print(X_test['session_length_in_minutes'].describe())

4.98
count    205011.000000
mean          6.839388
std           7.407832
min           0.000872
25%           2.546404
50%           4.670989
75%           8.328739
max          83.376775
Name: session_length_in_minutes, dtype: float64


In [167]:
# instead of log, use log1p, which stands for log(x+1) 
columns_to_transform2 = ['income', 'current_address_months_count', 'days_since_request', 'zip_count_4w',
                        'velocity_6h', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w',
                        'session_length_in_minutes']
X_test[columns_to_transform2] = np.log1p(X_test[columns_to_transform2])

In [168]:
# X_test.isnull().sum()

In [169]:
X_test_encoded = pd.get_dummies(X_test)
X_test_encoded.shape

(205011, 57)

In [170]:
X_test_encoded = X_test_encoded.drop(['velocity_4w'], axis=1, errors='ignore')

In [171]:
X_test_scaled = scaler.transform(X_test_encoded)

In [173]:
y_pred_final = clf.predict(X_test_scaled)

In [174]:
confusion = confusion_matrix(y_test, y_pred_final)
confusion

array([[193988,   8145],
       [  2388,    490]], dtype=int64)

In [175]:
prc_final = precision_score(y_test, y_pred_final)
rec_final = recall_score(y_test, y_pred_final)
f1_final = f1_score(y_test, y_pred_final)
roc_auc_final = roc_auc_score(y_test, y_pred_final)
mcc_final = matthews_corrcoef(y_test, y_pred_final)
accuracy_final = accuracy_score(y_test, y_pred_final)
print("final precision:")
print(prc_final)
print("\nfinal recall:")
print(rec_final)
print("\nfinal f1 score:")
print(f1_final)
print("\nfinal roc_auc score:")
print(roc_auc_final)
print("\nfinal MCC:")
print(mcc_final)
print("\nfinal accuracy:")
print(accuracy_final)

final precision:
0.056745801968731906

final recall:
0.1702571230020848

final f1 score:
0.08512116737600972

final roc_auc score:
0.5649809359277812

final MCC:
0.0761210180268539

final accuracy:
0.9486222690489778


# Efficiency

In [176]:
# time
start_time = time.time()
temporal_column = "month"
start_month = 0
end_month = 5

# Initialize 2 lists to store mcc and accuracy scores
mcc_scores = []
accuracy_scores = []
# initial precision
prc = 0  
# initial recall
rec = 0  
# initial f1 score
f1 = 0  
# initial roc_auc
roc_auc = 0  

# Create a StandardScaler instance to do feature scaling in each iteration
scaler = StandardScaler()

# time series cross validation, split the data into training and test based on 'month'
for i in range(start_month, end_month):
    X_train_ts = X_train_sampled[X_train_sampled[temporal_column] <= i]
    y_train_ts = y_train_sampled[X_train_sampled[temporal_column] <= i]
    X_test_ts = X_train_sampled[X_train_sampled[temporal_column] == i + 1]
    y_test_ts = y_train_sampled[X_train_sampled[temporal_column] == i + 1]

    # Scale the cross-validation generated training and test data
    X_train_ts_scaled = scaler.fit_transform(X_train_ts)
    X_test_ts_scaled = scaler.transform(X_test_ts)

    # FROM HERE, I APPLY THREE ML MODELS TO DO COMPARISON
    # Train a logistic regression classifier using the scaled training data
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train_ts_scaled, y_train_ts)

    # Predict class labels for the scaled testing data
    y_pred = clf.predict(X_test_ts_scaled)

    # Calculate the confusion matrix
    confusion = confusion_matrix(y_test_ts, y_pred)
    print('iteration: ', i, '\n', confusion)

    # Calculate evaluation metrics
    prc += precision_score(y_test_ts, y_pred)
    rec += recall_score(y_test_ts, y_pred)
    f1 += f1_score(y_test_ts, y_pred)
    roc_auc += roc_auc_score(y_test_ts, y_pred)
    # Calculate the Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test_ts, y_pred)
    mcc_scores.append(mcc)
    # Calculate the accuracy scores
    accuracy = accuracy_score(y_test_ts, y_pred)

# Compute the average valuation matrics
average_precision = prc / 5
average_recall = rec / 5
average_f1 = f1 / 5
average_roc_auc = roc_auc / 5
average_mcc = sum(mcc_scores) / 5
accuracy_scores.append(accuracy)

end_time = time.time()
execution_time = end_time - start_time
print("Execution Time:", execution_time)

iteration:  0 
 [[6001  320]
 [ 401 7106]]
iteration:  1 
 [[6563  226]
 [ 206 5567]]
iteration:  2 
 [[7197  280]
 [ 338 7848]]
iteration:  3 
 [[6121  191]
 [ 233 6252]]
iteration:  4 
 [[5720  175]
 [ 168 3968]]
Execution Time: 4.209928750991821


# Stability

In [179]:
# seed = 420
temporal_column = "month"
start_month = 0
end_month = 5

# Initialize 2 lists to store mcc and accuracy scores
mcc_scores = []
accuracy_scores = []
# initial precision
prc = 0  
# initial recall
rec = 0  
# initial f1 score
f1 = 0  
# initial roc_auc
roc_auc = 0  

# Create a StandardScaler instance to do feature scaling in each iteration
scaler = StandardScaler()

# time series cross validation, split the data into training and test based on 'month'
for i in range(start_month, end_month):
    X_train_ts = X_train_sampled[X_train_sampled[temporal_column] <= i]
    y_train_ts = y_train_sampled[X_train_sampled[temporal_column] <= i]
    X_test_ts = X_train_sampled[X_train_sampled[temporal_column] == i + 1]
    y_test_ts = y_train_sampled[X_train_sampled[temporal_column] == i + 1]

    # Scale the cross-validation generated training and test data
    X_train_ts_scaled = scaler.fit_transform(X_train_ts)
    X_test_ts_scaled = scaler.transform(X_test_ts)

    # FROM HERE, I APPLY THREE ML MODELS TO DO COMPARISON
    # Train a logistic regression classifier using the scaled training data
    clf = DecisionTreeClassifier(random_state=420)
    clf.fit(X_train_ts_scaled, y_train_ts)

    # Predict class labels for the scaled testing data
    y_pred = clf.predict(X_test_ts_scaled)

    # Calculate the confusion matrix
    confusion = confusion_matrix(y_test_ts, y_pred)
    print('iteration: ', i, '\n', confusion)

    # Calculate evaluation metrics
    prc += precision_score(y_test_ts, y_pred)
    rec += recall_score(y_test_ts, y_pred)
    f1 += f1_score(y_test_ts, y_pred)
    roc_auc += roc_auc_score(y_test_ts, y_pred)
    # Calculate the Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test_ts, y_pred)
    mcc_scores.append(mcc)
    # Calculate the accuracy scores
    accuracy = accuracy_score(y_test_ts, y_pred)

# Compute the average valuation matrics
average_precision = prc / 5
average_recall = rec / 5
average_f1 = f1 / 5
average_roc_auc = roc_auc / 5
average_mcc = sum(mcc_scores) / 5
accuracy_scores.append(accuracy)


print("\nAverage precision:")
print(average_precision)
print("\nAverage recall:")
print(average_recall)
print("\nAverage f1 score:")
print(average_f1)
print("\nAverage roc_auc score:")
print(average_roc_auc)
print("\nAverage MCC:")
print(average_mcc)
print("\nAverage accuracy:")
print(accuracy_scores)

iteration:  0 
 [[6004  317]
 [ 408 7099]]
iteration:  1 
 [[6572  217]
 [ 216 5557]]
iteration:  2 
 [[7181  296]
 [ 333 7853]]
iteration:  3 
 [[6107  205]
 [ 220 6265]]
iteration:  4 
 [[5746  149]
 [ 167 3969]]

Average precision:
0.9630963065107085

Average recall:
0.958650869076131

Average f1 score:
0.9608643496823281

Average roc_auc score:
0.9613799034942152

Average MCC:
0.9226495846056298

Average accuracy:
[0.9684976572624863]


In [180]:
# seed = 4200
temporal_column = "month"
start_month = 0
end_month = 5

# Initialize 2 lists to store mcc and accuracy scores
mcc_scores = []
accuracy_scores = []
# initial precision
prc = 0  
# initial recall
rec = 0  
# initial f1 score
f1 = 0  
# initial roc_auc
roc_auc = 0  

# Create a StandardScaler instance to do feature scaling in each iteration
scaler = StandardScaler()

# time series cross validation, split the data into training and test based on 'month'
for i in range(start_month, end_month):
    X_train_ts = X_train_sampled[X_train_sampled[temporal_column] <= i]
    y_train_ts = y_train_sampled[X_train_sampled[temporal_column] <= i]
    X_test_ts = X_train_sampled[X_train_sampled[temporal_column] == i + 1]
    y_test_ts = y_train_sampled[X_train_sampled[temporal_column] == i + 1]

    # Scale the cross-validation generated training and test data
    X_train_ts_scaled = scaler.fit_transform(X_train_ts)
    X_test_ts_scaled = scaler.transform(X_test_ts)

    # FROM HERE, I APPLY THREE ML MODELS TO DO COMPARISON
    # Train a logistic regression classifier using the scaled training data
    clf = DecisionTreeClassifier(random_state=4200)
    clf.fit(X_train_ts_scaled, y_train_ts)

    # Predict class labels for the scaled testing data
    y_pred = clf.predict(X_test_ts_scaled)

    # Calculate the confusion matrix
    confusion = confusion_matrix(y_test_ts, y_pred)
    print('iteration: ', i, '\n', confusion)

    # Calculate evaluation metrics
    prc += precision_score(y_test_ts, y_pred)
    rec += recall_score(y_test_ts, y_pred)
    f1 += f1_score(y_test_ts, y_pred)
    roc_auc += roc_auc_score(y_test_ts, y_pred)
    # Calculate the Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test_ts, y_pred)
    mcc_scores.append(mcc)
    # Calculate the accuracy scores
    accuracy = accuracy_score(y_test_ts, y_pred)

# Compute the average valuation matrics
average_precision = prc / 5
average_recall = rec / 5
average_f1 = f1 / 5
average_roc_auc = roc_auc / 5
average_mcc = sum(mcc_scores) / 5
accuracy_scores.append(accuracy)


print("\nAverage precision:")
print(average_precision)
print("\nAverage recall:")
print(average_recall)
print("\nAverage f1 score:")
print(average_f1)
print("\nAverage roc_auc score:")
print(average_roc_auc)
print("\nAverage MCC:")
print(average_mcc)
print("\nAverage accuracy:")
print(accuracy_scores)

iteration:  0 
 [[5989  332]
 [ 417 7090]]
iteration:  1 
 [[6567  222]
 [ 214 5559]]
iteration:  2 
 [[7182  295]
 [ 328 7858]]
iteration:  3 
 [[6118  194]
 [ 228 6257]]
iteration:  4 
 [[5730  165]
 [ 169 3967]]

Average precision:
0.9621356735876828

Average recall:
0.9582591057111476

Average f1 score:
0.9601889727961194

Average roc_auc score:
0.960789298161367

Average MCC:
0.9213664264307964

Average accuracy:
[0.9667032200179444]
