In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, recall_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [142]:
df = pd.read_csv('processed_fraud_data.csv')
df.head()

Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,DAY,MONTH,YEAR,HOUR,MINUTE
0,566,5056,20.71,0,4,4,2018,14,34
1,524,1445,13.04,0,2,4,2018,10,46
2,2457,4068,157.29,0,2,4,2018,6,17
3,1636,3895,94.08,0,3,4,2018,0,2
4,4466,6925,24.55,0,4,4,2018,12,59


In [143]:
df.shape

(67000, 9)

In [144]:
x = df.drop(columns=['TX_FRAUD'], axis=1)
y = df['TX_FRAUD']

In [145]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state = 42, stratify=y)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((46900, 8), (20100, 8), (46900,), (20100,))

In [146]:
pipeline = Pipeline([
    #('smote', SMOTE(sampling_strategy=0.12, random_state=42)),   # mild oversampling: fraud -> ~12% (adjustable)
    ('scaler', StandardScaler()),
    ('model', AdaBoostClassifier(
        estimator=DecisionTreeClassifier(random_state=42),
        random_state=42
    ))
])

In [147]:
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'RandomForestClassifier' : RandomForestClassifier(),
    'AdaBoostClassifier' : AdaBoostClassifier(),
    'GradientBoostingClassifier' : GradientBoostingClassifier()
}


In [148]:
def validation(true, predicted):
    auc = accuracy_score(true, predicted)
    f1 = f1_score(true, predicted,  average='weighted')
    recall = recall_score(true, predicted)
    roc = roc_auc_score(true, predicted)
    
    return auc, f1, recall, roc

In [149]:
for ml in range(len(models)):
    model = list(models.values())[ml]
    model_name  = list(models.keys())[ml]
    model.fit(x_train, y_train)
    
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    ml_train_acc, ml_train_f1, ml_train_recall, ml_train_roc = validation(y_train, y_train_pred)
    ml_test_acc, ml_test_f1, ml_test_recall, ml_test_roc = validation(y_test, y_test_pred)
    
    print("----------------------------------------------")
    print("Model Selection for Training Dataset : ", model_name)
    print("Accuracy Score : {:.4f}".format(ml_train_acc))
    print("F1 Score : {:.4f}".format(ml_train_f1))
    print("Recall Score : {:.4f}".format(ml_train_recall))
    print("ROC AUC Score : {:.4f}".format(ml_train_roc))
    
    print("----------------------------------------------")
    print("Model Selection for Test Dataset : ", model_name)
    print("Accuracy Score : {:.4f}".format(ml_test_acc))
    print("F1 Score : {:.4f}".format(ml_test_f1))
    print("Recall Score : {:.4f}".format(ml_test_recall))
    print("ROC AUC Score : {:.4f}".format(ml_test_roc))
    print("----------------------------------------------")
    
    
    

----------------------------------------------
Model Selection for Training Dataset :  LogisticRegression
Accuracy Score : 0.9985
F1 Score : 0.9985
Recall Score : 0.9160
ROC AUC Score : 0.9577
----------------------------------------------
Model Selection for Test Dataset :  LogisticRegression
Accuracy Score : 0.9989
F1 Score : 0.9989
Recall Score : 0.9314
ROC AUC Score : 0.9655
----------------------------------------------
----------------------------------------------
Model Selection for Training Dataset :  DecisionTreeClassifier
Accuracy Score : 1.0000
F1 Score : 1.0000
Recall Score : 1.0000
ROC AUC Score : 1.0000
----------------------------------------------
Model Selection for Test Dataset :  DecisionTreeClassifier
Accuracy Score : 0.9995
F1 Score : 0.9995
Recall Score : 0.9755
ROC AUC Score : 0.9876
----------------------------------------------
----------------------------------------------
Model Selection for Training Dataset :  RandomForestClassifier
Accuracy Score : 1.0000


In [150]:
param_dist = {
    'model__n_estimators': [50, 100, 150, 200, 300],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__algorithm': ['SAMME.R', 'SAMME'],
    
    'model__estimator__max_depth': [1, 2, 3, 4],
    'model__estimator__min_samples_split': [2, 5, 10],
    'model__estimator__min_samples_leaf': [1, 2, 4]
}

In [151]:

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [152]:
    
rnd_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, n_iter=30, scoring='roc_auc', n_jobs=-1, cv=cv, verbose=2, random_state=42)
rnd_search.fit(x_train, y_train)

print("Best params:")
print(rnd_search.best_params_)
print("Best CV ROC AUC: {:.4f}".format(rnd_search.best_score_))

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best params:
{'model__n_estimators': 200, 'model__learning_rate': 0.01, 'model__estimator__min_samples_split': 5, 'model__estimator__min_samples_leaf': 4, 'model__estimator__max_depth': 3, 'model__algorithm': 'SAMME'}
Best CV ROC AUC: 1.0000


In [154]:
base_tree = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=5,
    min_samples_leaf=4,
    random_state=42
)

In [155]:
ada = AdaBoostClassifier(
    estimator=base_tree,
    n_estimators=200,
    learning_rate=0.01,
    algorithm='SAMME',
    random_state=42
)

In [156]:
pipeline = Pipeline([
    #('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('model', ada)
])
    
pipeline.fit(x_train, y_train)
y_train_pred = pipeline.predict(x_train)
y_test_pred = pipeline.predict(x_test)


ml_train_acc, ml_train_f1, ml_train_recall, ml_train_roc = validation(y_train, y_train_pred)
ml_test_acc, ml_test_f1, ml_test_recall, ml_test_roc = validation(y_test, y_test_pred)
    
print("----------------------------------------------")
print("Model Selection for Training Dataset :  AdaBoostClassifier ")
print("Accuracy Score : {:.4f}".format(ml_train_acc))
print("F1 Score : {:.4f}".format(ml_train_f1))
print("Recall Score : {:.4f}".format(ml_train_recall))
print("ROC AUC Score : {:.4f}".format(ml_train_roc))
    
print("----------------------------------------------")
print("Model Selection for Test Dataset :  AdaBoostClassifier")
print("Accuracy Score : {:.4f}".format(ml_test_acc))
print("F1 Score : {:.4f}".format(ml_test_f1))
print("Recall Score : {:.4f}".format(ml_test_recall))
print("ROC AUC Score : {:.4f}".format(ml_test_roc))
print("----------------------------------------------")
    


----------------------------------------------
Model Selection for Training Dataset :  AdaBoostClassifier 
Accuracy Score : 0.9997
F1 Score : 0.9997
Recall Score : 0.9727
ROC AUC Score : 0.9863
----------------------------------------------
Model Selection for Test Dataset :  AdaBoostClassifier
Accuracy Score : 0.9997
F1 Score : 0.9997
Recall Score : 0.9755
ROC AUC Score : 0.9877
----------------------------------------------


In [158]:
joblib.dump(pipeline, 'ada_model.pkl')

['ada_model.pkl']

In [157]:
sample_data = pd.DataFrame([{
    "CUSTOMER_ID": 124,
    "TERMINAL_ID": 568,
    "TX_AMOUNT": 30,
    "DAY": 15,
    "MONTH": 7,
    "YEAR": 2018,
    "HOUR": 14,
    "MINUTE": 35
}])

pred = pipeline.predict(sample_data)[0]
prob = pipeline.predict_proba(sample_data)[0][1]

status = "Fraud" if pred == 1 else "Non-Fraud"
print(f"Sample Prediction: {status} (Fraud Probability: {prob:.2%})")

Sample Prediction: Non-Fraud (Fraud Probability: 32.38%)
