In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from pathlib import Path
from sklearn.preprocessing import OrdinalEncoder

from pyspark.sql.functions import split
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.metrics import balanced_accuracy_score

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_curve
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from xgboost import XGBClassifier

In [None]:
drive.mount('/content/drive')
path = '/content/drive/Shareddrives/DS Capstone/base_baf_final.csv'
unbalanced_df = pd.read_csv(path)

Mounted at /content/drive


# **Data Encoding and Removal of Variables**

In [None]:
#Removing source and device_os variables
unbalanced_df = unbalanced_df.drop(['source', 'device_os'], axis=1)
unbalanced_df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.25,0.986507,0.0,0.060606,0.375,8.6e-05,0.918255,0.0,0.157934,...,1.0,0.30303,0.0,0.685864,0.0,0.198216,1.0,0.666667,0.0,0.0
1,0,0.875,0.617426,0.0,0.20979,0.125,0.000129,0.11426,0.75,0.24735,...,1.0,0.090909,0.0,0.685864,0.0,0.050217,1.0,0.666667,0.0,0.0
2,0,0.875,0.996708,0.026042,0.034965,0.375,0.000157,0.109273,0.25,0.163308,...,1.0,0.939394,0.0,0.005236,0.0,0.273082,0.0,0.666667,0.0,0.0
3,0,0.625,0.4751,0.03125,0.034965,0.25,8.9e-05,0.106372,0.25,0.519779,...,1.0,0.060606,0.0,0.005236,0.0,0.186605,1.0,0.666667,0.0,0.0
4,0,1.0,0.842307,0.0,0.06993,0.375,0.073195,0.487853,0.0,0.349007,...,1.0,0.818182,0.0,0.005236,0.0,0.054581,0.0,0.666667,0.0,0.0


# **Data Balancing**

In [None]:
#Undesampling
X = unbalanced_df.drop(columns='fraud_bool')
y = unbalanced_df['fraud_bool'].astype(int)


undersample = RandomUnderSampler(sampling_strategy='not minority', random_state=42)


X_resampled, y_resampled = undersample.fit_resample(X, y)


undersample_df = pd.concat([X_resampled, y_resampled], axis=1)

print(undersample_df[undersample_df['fraud_bool'] == 0]['fraud_bool'].count())
print(undersample_df[undersample_df['fraud_bool'] == 1]['fraud_bool'].count())



11029
11029


# **Logistic Regression**

In [None]:
#Logistical Regression on unbalanced dataset
X_unbalanced = unbalanced_df.drop(columns=['fraud_bool'])
y_unbalanced = unbalanced_df['fraud_bool']
#Splitting The data set in 70% - 30%
X_unbalanced_train, X_unbalanced_test, y_unbalanced_train, y_unbalanced_test = train_test_split(X_unbalanced, y_unbalanced, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_unbalanced_train = scaler.fit_transform(X_unbalanced_train)
X_unbalanced_test = scaler.transform(X_unbalanced_test)

clf_unbalanced = LogisticRegression(max_iter=10000, random_state=0)
clf_unbalanced .fit(X_unbalanced_train, y_unbalanced_train)

acc = accuracy_score(y_unbalanced_test, clf_unbalanced.predict(X_unbalanced_test)) * 100
print(f"Logistic Regression model accuracy for the unbalanced dataset: {acc:.2f}%")



y_pred_unbalanced = clf_unbalanced.predict(X_unbalanced_test)
print(y_pred_unbalanced)

precision_unbalanced = precision_score(y_unbalanced_test, y_pred_unbalanced, zero_division= 1)
recall_unbalanced = recall_score(y_unbalanced_test, y_pred_unbalanced, zero_division= 1)
f1_unbalanced = f1_score(y_unbalanced_test, y_pred_unbalanced, zero_division= 1)


print(f"Unbalanced Dataset - Precision: {precision_unbalanced:.2f}")
print(f"Unbalanced Dataset - Recall: {recall_unbalanced:.2f}")
print(f"Unbalanced Dataset - F1-score: {f1_unbalanced:.2f}")

# Calculate balanced accuracy for logistic regression
balanced_acc_unbalanced = balanced_accuracy_score(y_unbalanced_test, y_pred_unbalanced) * 100
print(f"Logistic Regression Balanced Accuracy: {balanced_acc_unbalanced:.2f}%")


Logistic Regression model accuracy for the unbalanced dataset: 98.93%
[0 0 0 ... 0 0 0]
Unbalanced Dataset - Precision: 0.71
Unbalanced Dataset - Recall: 0.00
Unbalanced Dataset - F1-score: 0.01
Logistic Regression Balanced Accuracy: 50.19%


In [None]:
# Finding TPR
lr_unbalanced_probs = clf_unbalanced.predict_proba(X_unbalanced_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_unbalanced_test, lr_unbalanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]

# Calculate TPR at that threshold
tpr_at_fpr_5 = tpr[np.where(fpr >= fpr_target)[0][0]]

print(f'(Unbalanced LR) TPR at FPR = 5%: {tpr_at_fpr_5:.4f}')

(Unbalanced LR) TPR at FPR = 5%: 0.4496


In [None]:
#Balanced Data
X_balanced = undersample_df.drop(columns=['fraud_bool'])
y_balanced = undersample_df['fraud_bool']
#Splitting The data 70-30
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = LogisticRegression(max_iter=10000, random_state=0)
clf.fit(X_train, y_train)

acc = accuracy_score(y_test, clf.predict(X_test)) * 100
print(f"Logistic Regression model accuracy for the balanced dataset: {acc:.2f}%")

#Calulate balanced accuracy for balanced dataset
y_pred_balanced = clf.predict(X_test)



precision_balanced = precision_score(y_test, y_pred_balanced)
recall_balanced = recall_score(y_test, y_pred_balanced)
f1_balanced = f1_score(y_test, y_pred_balanced)

print(f"Balanced Dataset - Precision: {precision_balanced:.2f}")
print(f"Balanced Dataset - Recall: {recall_balanced:.2f}")
print(f"Balanced Dataset - F1-score: {f1_balanced:.2f}")

balanced_acc_balanced = balanced_accuracy_score(y_test, y_pred_balanced) * 100
print(f"Logistic Regression Balanced Accuracy: {balanced_acc_balanced:.2f}%")

Logistic Regression model accuracy for the balanced dataset: 76.88%
Balanced Dataset - Precision: 0.77
Balanced Dataset - Recall: 0.76
Balanced Dataset - F1-score: 0.77
Logistic Regression Balanced Accuracy: 76.88%


In [None]:
# Finding TPR
lr_balanced_probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, lr_balanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]

# Calculate TPR at that threshold
tpr_at_fpr_5 = tpr[np.where(fpr >= fpr_target)[0][0]]

print(f'(Balanced LR) TPR at FPR = 5%: {tpr_at_fpr_5:.4f}')

(Balanced LR) TPR at FPR = 5%: 0.4171


In [None]:
#Balanced Model being tested on the unbalanced data
y_predict_unbalanced = clf.predict(X_unbalanced_test)

# Calculate  accuracy for logistic regression
acc = accuracy_score(y_unbalanced_test, y_predict_unbalanced) * 100
print(f"Logistic Regression  Accuracy: {acc:.2f}%")

precision_unbalanced = precision_score(y_unbalanced_test, y_predict_unbalanced, zero_division=1)
recall_unbalanced = recall_score(y_unbalanced_test, y_predict_unbalanced, zero_division=1)
f1_unbalanced = f1_score(y_unbalanced_test, y_predict_unbalanced, zero_division=1)

print(f"Unbalanced Test Set - Precision: {precision_unbalanced:.2f}")
print(f"Unbalanced Test Set - Recall: {recall_unbalanced:.2f}")
print(f"Unbalanced Test Set - F1-score: {f1_unbalanced:.2f}")

balanced_acc_unbalanced = balanced_accuracy_score(y_unbalanced_test, y_predict_unbalanced) * 100
print(f"Logistic Regression Balanced Accuracy: {balanced_acc_unbalanced:.2f}%")

Logistic Regression  Accuracy: 52.18%
Unbalanced Test Set - Precision: 0.02
Unbalanced Test Set - Recall: 0.93
Unbalanced Test Set - F1-score: 0.04
Logistic Regression Balanced Accuracy: 72.31%


In [None]:
#Finding TPR using balanced model on unbalanced data
lr_unbalanced_probs = clf.predict_proba(X_unbalanced_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_unbalanced_test, lr_unbalanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]

tpr_at_fpr_5 = tpr[np.where(fpr >= fpr_target)[0][0]]

print(f'(Balanced LR on unbalanced data) TPR at FPR = 5%: {tpr_at_fpr_5:.4f}')

(Balanced LR on unbalanced data) TPR at FPR = 5%: 0.4396


# **Naive Bayes**

In [None]:
#Unbalanced Data
model = GaussianNB()
model.fit(X_unbalanced_train, y_unbalanced_train)

y_pred_unbalanced = model.predict(X_unbalanced_test)

acc = accuracy_score(y_unbalanced_test, model.predict(X_unbalanced_test)) * 100
print(f"Naive Bayes model accuracy for the unbalanced dataset: {acc:.2f}%")

precision_unbalanced = precision_score(y_unbalanced_test, y_pred_unbalanced)
recall_unbalanced = recall_score(y_unbalanced_test, y_pred_unbalanced)
f1_unbalanced = f1_score(y_unbalanced_test, y_pred_unbalanced)

print(f"Unbalanced Dataset - Precision: {precision_unbalanced:.2f}")
print(f"Unbalanced Dataset - Recall: {recall_unbalanced:.2f}")
print(f"Unbalanced Dataset - F1-score: {f1_unbalanced:.2f}")

balanced_acc_unbalanced = balanced_accuracy_score(y_unbalanced_test, y_pred_unbalanced) * 100
print(f"Naive Bayes Balanced Accuracy: {balanced_acc_unbalanced:.2f}%")

Naive Bayes model accuracy for the unbalanced dataset: 93.19%
Unbalanced Dataset - Precision: 0.05
Unbalanced Dataset - Recall: 0.32
Unbalanced Dataset - F1-score: 0.09
Naive Bayes Balanced Accuracy: 62.80%


In [None]:
#TPR
nb_unbalanced_probs = model.predict_proba(X_unbalanced_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_unbalanced_test, nb_unbalanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]

tpr_at_fpr_5 = tpr[np.where(fpr >= fpr_target)[0][0]]

print(f'(Unbalanced NB) TPR at FPR = 5%: {tpr_at_fpr_5:.4f}')

(Unbalanced NB) TPR at FPR = 5%: 0.2514


In [None]:
#Balanced Data
model = GaussianNB()
model.fit(X_train, y_train)

y_pred_balanced = model.predict(X_test)

acc = accuracy_score(y_test, model.predict(X_test)) * 100
print(f"Naive Bayes model accuracy for the balanced dataset: {acc:.2f}%")

precision_balanced = precision_score(y_test, y_pred_balanced)
recall_balanced = recall_score(y_test, y_pred_balanced)
f1_balanced = f1_score(y_test, y_pred_balanced)

print(f"Balanced Dataset - Precision: {precision_balanced:.2f}")
print(f"Balanced Dataset - Recall: {recall_balanced:.2f}")
print(f"Balanced Dataset - F1-score: {f1_balanced:.2f}")

balanced_acc_balanced = balanced_accuracy_score(y_test, y_pred_balanced) * 100
print(f"Naive Bayes Balanced Accuracy: {balanced_acc_balanced:.2f}%")

Naive Bayes model accuracy for the balanced dataset: 73.89%
Balanced Dataset - Precision: 0.75
Balanced Dataset - Recall: 0.70
Balanced Dataset - F1-score: 0.73
Naive Bayes Balanced Accuracy: 73.85%


In [None]:
#Finding TPR
nb_balanced_probs = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, nb_balanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]

tpr_at_fpr_5 = tpr[np.where(fpr >= fpr_target)[0][0]]
print(f'(Balanced NB) TPR at FPR = 5%: {tpr_at_fpr_5:.4f}')

(Balanced NB) TPR at FPR = 5%: 0.2272


In [None]:
#Balanced model on unbalanced data for naive bayes
y_predict_unbalanced = model.predict(X_unbalanced_test)

acc = accuracy_score(y_unbalanced_test, y_predict_unbalanced) * 100
print(f"Naive Bayes model accuracy for the unbalanced dataset: {acc:.2f}%")

precision_unbalanced = precision_score(y_unbalanced_test, y_predict_unbalanced)
recall_unbalanced = recall_score(y_unbalanced_test, y_predict_unbalanced)
f1_unbalanced = f1_score(y_unbalanced_test, y_predict_unbalanced)

print(f"Unbalanced Dataset - Precision: {precision_unbalanced:.2f}")
print(f"Unbalanced Dataset - Recall: {recall_unbalanced:.2f}")
print(f"Unbalanced Dataset - F1-score: {f1_unbalanced:.2f}")

balanced_acc_unbalanced = balanced_accuracy_score(y_unbalanced_test, y_predict_unbalanced) * 100
print(f"Naive Bayes Balanced Accuracy: {balanced_acc_unbalanced:.2f}%")


Naive Bayes model accuracy for the unbalanced dataset: 59.66%
Unbalanced Dataset - Precision: 0.02
Unbalanced Dataset - Recall: 0.86
Unbalanced Dataset - F1-score: 0.04
Naive Bayes Balanced Accuracy: 72.56%


# **XGBoost**

In [None]:
#Xgboost on unbalanced data
xgbmodel = XGBClassifier()
xgbmodel.fit(X_unbalanced_train, y_unbalanced_train)
xgbpred_unbalanced = xgbmodel.predict(X_unbalanced_test)
xgbacc = accuracy_score(y_unbalanced_test, xgbpred_unbalanced) * 100
print(f"XGBoost model accuracy for the unbalanced dataset: {xgbacc:.2f}%")

precision_unbalanced = precision_score(y_unbalanced_test, xgbpred_unbalanced)
recall_unbalanced = recall_score(y_unbalanced_test, xgbpred_unbalanced)
f1_unbalanced = f1_score(y_unbalanced_test, xgbpred_unbalanced)

print(f"Unbalanced Dataset - Precision: {precision_unbalanced:.2f}")
print(f"Unbalanced Dataset - Recall: {recall_unbalanced:.2f}")
print(f"Unbalanced Dataset - F1-score: {f1_unbalanced:.2f}")

#Balanced Acc
balanced_acc_unbalanced = balanced_accuracy_score(y_unbalanced_test, xgbpred_unbalanced) * 100
print(f"XGBoost Balanced Accuracy: {balanced_acc_unbalanced:.2f}%")

XGBoost model accuracy for the unbalanced dataset: 98.91%
Unbalanced Dataset - Precision: 0.41
Unbalanced Dataset - Recall: 0.04
Unbalanced Dataset - F1-score: 0.07
XGBoost Balanced Accuracy: 51.91%


In [None]:
#Finding TPR
xgb_unbalanced_probs = xgbmodel.predict_proba(X_unbalanced_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_unbalanced_test, xgb_unbalanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]
print(f'(Unbalanced XGB) TPR at FPR = 5%: {tpr[np.where(fpr >= fpr_target)[0][0]]:.4f}')

(Unbalanced XGB) TPR at FPR = 5%: 0.4969


In [None]:
#XGboost on balanced data
xgbmodel = XGBClassifier()
xgbmodel.fit(X_train, y_train)
xgbpred_balanced = xgbmodel.predict(X_test)
xgbacc = accuracy_score(y_test, xgbpred_balanced) * 100
print(f"XGBoost model accuracy for the balanced dataset: {xgbacc:.2f}%")

precision_balanced = precision_score(y_test, xgbpred_balanced)
recall_balanced = recall_score(y_test, xgbpred_balanced)
f1_balanced = f1_score(y_test, xgbpred_balanced)

print(f"Balanced Dataset - Precision: {precision_balanced:.2f}")
print(f"Balanced Dataset - Recall: {recall_balanced:.2f}")
print(f"Balanced Dataset - F1-score: {f1_balanced:.2f}")

balanced_acc_balanced = balanced_accuracy_score(y_test, xgbpred_balanced) * 100
print(f"XGBoost Balanced Accuracy: {balanced_acc_balanced:.2f}%")

XGBoost model accuracy for the balanced dataset: 78.91%
Balanced Dataset - Precision: 0.79
Balanced Dataset - Recall: 0.78
Balanced Dataset - F1-score: 0.79
XGBoost Balanced Accuracy: 78.90%


In [None]:
#Finding TRP
xgb_balanced_probs = xgbmodel.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, xgb_balanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]
print(f'(Balanced XGB) TPR at FPR = 5%: {tpr[np.where(fpr >= fpr_target)[0][0]]:.4f}')

(Balanced XGB) TPR at FPR = 5%: 0.4534


In [None]:
#Balanced Model on unbalanced data
xgbpred_unbalanced = xgbmodel.predict(X_unbalanced_test)
xgbacc = accuracy_score(y_unbalanced_test, xgbpred_unbalanced) * 100
print(f"XGBoost model accuracy for the unbalanced dataset: {xgbacc:.2f}%")

precision_unbalanced = precision_score(y_unbalanced_test, xgbpred_unbalanced)
recall_unbalanced = recall_score(y_unbalanced_test, xgbpred_unbalanced)
f1_unbalanced = f1_score(y_unbalanced_test, xgbpred_unbalanced)

print(f"Precision: {precision_unbalanced:.2f}")
print(f"Recall: {recall_unbalanced:.2f}")
print(f"F1-score: {f1_unbalanced:.2f}")

balanced_acc_unbalanced = balanced_accuracy_score(y_unbalanced_test, xgbpred_unbalanced) * 100
print(f"XGBoost Balanced Accuracy: {balanced_acc_unbalanced:.2f}%")

XGBoost model accuracy for the unbalanced dataset: 35.47%
Precision: 0.02
Recall: 0.96
F1-score: 0.03
XGBoost Balanced Accuracy: 65.48%


In [None]:
#TRP
xgb_unbalanced_probs = xgbmodel.predict_proba(X_unbalanced_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_unbalanced_test, xgb_unbalanced_probs)

fpr_target = 0.05
threshold_at_fpr_5 = thresholds[np.where(fpr >= fpr_target)[0][0]]
print(f'(Unbalanced XGB) TPR at FPR = 5%: {tpr[np.where(fpr >= fpr_target)[0][0]]:.4f}')
#

(Unbalanced XGB) TPR at FPR = 5%: 0.3111
