In [42]:
## for data query
import cx_Oracle
## Data loading, processing and for more
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

## Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# set seaborn style because it prettier
sns.set()

## Metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn import metrics
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, f1_score

#encoding & preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

## Models
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

## Some other libraries
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [43]:
%%time
data = pd.read_excel(r'C:\Users\SamadovIAz\Desktop\snd\machine learning models\fraud_detection\df_last.xlsx')
data

Wall time: 57.8 s


Unnamed: 0,CUSTOMER,AGE,GENDER,MCC_GROUP,AMOUNT,EXPERIENCE_BY_MONTH,Fraud
0,4JZ9Z87,33,M,AFFILIATES,16.00,60,0
1,3JHUMJD,40,M,AFFILIATES,258.00,82,0
2,1E6R28G,39,F,AFFILIATES,10.00,192,0
3,6BVAVXV,25,F,AFFILIATES,327.00,7,0
4,5HV1XKM,30,M,AFFILIATES,198.00,74,0
...,...,...,...,...,...,...,...
747931,28GM43V,36,M,WHOLESALE SUPPLIERS AND MANUFACTURERS,580.00,153,1
747932,5J3258D,30,M,WHOLESALE SUPPLIERS AND MANUFACTURERS,3.90,67,1
747933,5J3258D,30,M,WHOLESALE SUPPLIERS AND MANUFACTURERS,28.33,67,1
747934,5VJ1EWF,28,M,WHOLESALE SUPPLIERS AND MANUFACTURERS,530.00,43,1


In [44]:
features=['AGE','GENDER','MCC_GROUP','AMOUNT','EXPERIENCE_BY_MONTH']
target=['Fraud']
X = data[features]
y = data[target]

In [45]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42,shuffle=True)
features_to_encode = X_train.columns[X_train.dtypes==object].tolist()  
col_trans = make_column_transformer((OneHotEncoder(drop='first'),features_to_encode),remainder = "passthrough")

In [29]:
rf_classifier = RandomForestClassifier(
                      min_samples_leaf=50,
                      n_estimators=150,
                      bootstrap=True,
                      oob_score=True,
                      n_jobs=-1,
                      random_state=47,
                      max_features='sqrt')

In [30]:
pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train,np.ravel(y_train))

In [31]:
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is {round(accuracy_score(y_test,y_pred),3)*100} %")

The accuracy of the model is 99.0 %


In [32]:
print("Classification Report for Random Forest Classifier: \n", classification_report(y_test, y_pred))

Classification Report for Random Forest Classifier: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    217958
           1       0.90      0.74      0.81      6423

    accuracy                           0.99    224381
   macro avg       0.95      0.87      0.90    224381
weighted avg       0.99      0.99      0.99    224381



In [33]:
train_probs = pipe.predict_proba(X_train)[:,1] 
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, probs)}')

Train ROC AUC Score: 0.9977980029702094
Test ROC AUC  Score: 0.9974972353077315


In [34]:
deploy = pd.read_excel(r'C:\Users\SamadovIAz\Desktop\snd\machine learning models\fraud_detection\deploy_data.xlsx')
deploy

Unnamed: 0,Unnamed: 1,CUSTOMER,AGE,GENDER,MCC_GROUP,AMOUNT,EXPERIENCE_BY_MONTH,FRAUD
0,1,0Y52MRJ,42,M,PERSONAL SERVICES,30.00,259,0
1,2,5UNXDMB,29,M,PERSONAL SERVICES,100.00,53,0
2,3,6AAWL0A,25,M,PERSONAL SERVICES,5.00,42,0
3,4,6FE9BGD,24,M,PERSONAL SERVICES,5.00,10,0
4,5,166KB3F,39,M,PERSONAL SERVICES,0.87,171,0
...,...,...,...,...,...,...,...,...
208780,208781,63003KY,26,M,GOVERNMENT SERVICES,11.50,8,0
208781,208782,4SKPEVY,33,M,GOVERNMENT SERVICES,3.00,71,0
208782,208783,5LE54Q8,30,M,GOVERNMENT SERVICES,85.39,37,0
208783,208784,5FV3N60,31,M,GOVERNMENT SERVICES,55.00,5,0


In [35]:
test = deploy.copy()
test_withoutID = test.copy().drop('CUSTOMER', axis = 1)
final_y = pipe.predict(test_withoutID)
#pipe model only takes in dataframe without ID column.
final_report = test
final_report['Fraud_prediction'] = final_y
final_report = final_report.loc[:,['CUSTOMER','Fraud_prediction']]
# also we can replace 1-0 with Yes-No to make it interpretable
# final_report= final_report.replace(1, 'Yes')
# final_report= final_report.replace(0, 'No')
final_report

Unnamed: 0,CUSTOMER,Fraud_prediction
0,0Y52MRJ,0
1,5UNXDMB,0
2,6AAWL0A,0
3,6FE9BGD,0
4,166KB3F,0
...,...,...
208780,63003KY,0
208781,4SKPEVY,0
208782,5LE54Q8,0
208783,5FV3N60,0


In [41]:
pwd

'C:\\Users\\SamadovIAz\\Desktop\\snd\\machine learning models\\fraud_detection'

In [38]:
final_report.to_excel('deploy_prediction.xlsx', engine='xlsxwriter')