In [24]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostClassifier as ABC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBC
from catboost import CatBoostClassifier as CBC

import joblib

In [3]:
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [4]:
X_train_origin = X_train_origin.to_numpy()
y_train_origin = y_train_origin.to_numpy()
y_train_origin = y_train_origin.ravel()

In [5]:
X_train_origin.shape,y_train_origin.shape

((95512, 48), (95512,))

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1) 

In [7]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((76409, 48), (19103, 48), (76409,), (19103,))

In [22]:
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

def modeling(model_name):
    model=model_name()
    model.fit(X_train,y_train)
    #y_pred = model.predict(X_val)
    
    # keep probabilities for the positive outcome only
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    # define thresholds
    thresholds = np.arange(0, 1, 0.001)
    # evaluate each threshold
    scores = [f1_score(y_val, to_labels(y_pred_proba, t)) for t in thresholds]
    # get best threshold
    ix = np.argmax(scores)
    print('Probability threshold=%.3f, best F1-Score=%.5f' % (thresholds[ix], scores[ix]))
    
    # get y_pred
    y_pred = to_labels(y_pred_proba, thresholds[ix])
    
    print("Confusion_Matrix:")
    print(str(confusion_matrix(y_val,y_pred)))
    print(classification_report(y_val,y_pred))
    print("Accuracy:" + str(accuracy_score(y_val, y_pred)))
    print("AUC_ROC:" + str(roc_auc_score(y_val, y_pred)))

In [23]:
modeling(LR) 

Probability threshold=0.369, best F1-Score=0.77683
Confusion_Matrix:
[[10024  1997]
 [ 1316  5766]]
              precision    recall  f1-score   support

           0       0.88      0.83      0.86     12021
           1       0.74      0.81      0.78      7082

    accuracy                           0.83     19103
   macro avg       0.81      0.82      0.82     19103
weighted avg       0.83      0.83      0.83     19103

Accuracy:0.8265717426582212
AUC_ROC:0.824025419978936


In [25]:
modeling(KNN) 

Probability threshold=0.401, best F1-Score=0.78931
Confusion_Matrix:
[[10696  1325]
 [ 1601  5481]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88     12021
           1       0.81      0.77      0.79      7082

    accuracy                           0.85     19103
   macro avg       0.84      0.83      0.83     19103
weighted avg       0.85      0.85      0.85     19103

Accuracy:0.8468303407841701
AUC_ROC:0.8318550709561479
