In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.combine import SMOTEENN
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder


In [40]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.0-cp311-cp311-win_amd64.whl (9.2 MB)
     ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
     - -------------------------------------- 0.3/9.2 MB 8.9 MB/s eta 0:00:01
     --- ------------------------------------ 0.7/9.2 MB 9.2 MB/s eta 0:00:01
     ----- ---------------------------------- 1.3/9.2 MB 10.1 MB/s eta 0:00:01
     ------ --------------------------------- 1.6/9.2 MB 10.2 MB/s eta 0:00:01
     ---------- ----------------------------- 2.4/9.2 MB 10.9 MB/s eta 0:00:01
     ------------ --------------------------- 2.9/9.2 MB 11.0 MB/s eta 0:00:01
     -------------- ------------------------- 3.4/9.2 MB 10.9 MB/s eta 0:00:01
     ----------------- ---------------------- 4.0/9.2 MB 11.1 MB/s eta 0:00:01
     ------------------ --------------------- 4.2/9.2 MB 11.1 MB/s eta

In [3]:
pd.options.display.max_rows=999
pd.options.display.max_columns=999

In [3]:
df=pd.read_excel('fraudTrain-Final.xlsx')

In [4]:
df.columns

Index(['S.no', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'Avg. Transaction Freq. Per day',
       'Avg. Transaction Freq. Per week', 'Avg. Transaction Freq. Per month',
       'Transaction velocity', 'is_fraud'],
      dtype='object')

In [5]:
df.drop(columns=['S.no','merchant','first','last','street','cc_num','zip','lat','long','job','trans_num','unix_time','merch_lat','merch_long'],inplace=True)

In [7]:
df.columns

Index(['trans_date_trans_time', 'category', 'amt', 'gender', 'city', 'state',
       'city_pop', 'dob', 'Avg. Transaction Freq. Per day',
       'Avg. Transaction Freq. Per week', 'Avg. Transaction Freq. Per month',
       'Transaction velocity', 'is_fraud'],
      dtype='object')

In [8]:
df['age'] = dt.date.today().year - pd.to_datetime(df['dob']).dt.year
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d-%m-%Y %H:%M')
df['hour']=pd.to_datetime(df['trans_date_trans_time']).dt.hour
df['day']=pd.to_datetime(df['trans_date_trans_time']).dt.dayofweek
df['month']=pd.to_datetime(df['trans_date_trans_time']).dt.month

In [30]:
df.drop(columns=['trans_date_trans_time','dob'],inplace=True)

In [31]:
numerical_columns = df.select_dtypes(include=[int, float]).columns
num=list(numerical_columns)
num.remove('is_fraud')
num

['amt',
 'city_pop',
 'Avg. Transaction Freq. Per day',
 'Avg. Transaction Freq. Per week',
 'Avg. Transaction Freq. Per month',
 'age',
 'hour',
 'day',
 'month']

In [32]:
cat_columns = df.select_dtypes(include='object').columns
cat=list(cat_columns)
cat

['category', 'gender', 'city', 'state', 'Transaction velocity']

In [33]:
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), num),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), cat)
    ])),
    ('classifier', RandomForestClassifier())
])

In [34]:
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), num),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target_encoder', TargetEncoder())  # Using TargetEncoder from category_encoders
    ]), cat)
])

# Combine the updated 'preprocessor' with the rest of the pipeline
pipeline_1 = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

In [35]:
df_majority=df[df['is_fraud']==0]
df_minority=df[df['is_fraud']==1]
df_majority_downsampled = resample(df_majority,replace=False,n_samples=len(df_minority),random_state=42)

In [36]:
df_down=pd.concat([df_majority_downsampled,df_minority])

In [45]:
df_down

Unnamed: 0,category,amt,gender,city,state,city_pop,Avg. Transaction Freq. Per day,Avg. Transaction Freq. Per week,Avg. Transaction Freq. Per month,Transaction velocity,is_fraud,age,hour,day,month
669418,shopping_pos,7.53,F,Kilgore,TX,24536,2.788306,19.518139,83.649166,Medium,0,40,18,5,10
32567,travel,3.79,F,Washington Court House,OH,22305,2.845820,19.920741,85.374603,Medium,0,84,13,6,1
156587,entertainment,59.07,F,Preston,CT,4720,0.982349,6.876440,29.470457,Low,0,46,18,6,3
1020243,personal_care,25.58,M,Kirk,CO,207,1.904882,13.334174,57.146460,Medium,0,68,15,1,2
116272,personal_care,84.96,F,Baroda,MI,3104,2.841219,19.888533,85.236568,Medium,0,42,23,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1047089,misc_net,690.49,F,Spencer,SD,343,4.746101,33.222707,142.383028,High,1,51,3,1,3
1047157,grocery_pos,324.74,M,Aledo,TX,13602,1.895680,13.269758,56.870390,Medium,1,39,4,1,3
1047208,grocery_pos,331.33,F,Spencer,SD,343,4.746101,33.222707,142.383028,High,1,51,4,1,3
1047521,grocery_pos,356.20,F,Spencer,SD,343,4.746101,33.222707,142.383028,High,1,51,8,1,3


In [37]:
x_down=df_down.drop('is_fraud',axis=1)
y_down=df_down['is_fraud']

In [46]:
x_down.columns

Index(['category', 'amt', 'gender', 'city', 'state', 'city_pop',
       'Avg. Transaction Freq. Per day', 'Avg. Transaction Freq. Per week',
       'Avg. Transaction Freq. Per month', 'Transaction velocity', 'age',
       'hour', 'day', 'month'],
      dtype='object')

In [38]:
x_train_downsampled, x_test_downsampled, y_train_downsampled, y_test_downsampled = train_test_split(x_down, y_down, test_size=0.2, random_state=42)

In [39]:
pipeline.fit(x_train_downsampled, y_train_downsampled)
pipeline_1.fit(x_train_downsampled, y_train_downsampled)


In [40]:
y_pred_downsampled = pipeline.predict(x_test_downsampled)
y_pred_downsampled_1 = pipeline_1.predict(x_test_downsampled)

In [41]:
accuracy = accuracy_score(y_test_downsampled, y_pred_downsampled)
print(f"Accuracy: {accuracy:.2f}")
# Displaying additional metrics
print("Classification Report:")
print(classification_report(y_test_downsampled, y_pred_downsampled))

print("Confusion Matrix:")
print(confusion_matrix(y_test_downsampled, y_pred_downsampled))

Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1220
           1       0.97      0.94      0.96      1183

    accuracy                           0.96      2403
   macro avg       0.96      0.96      0.96      2403
weighted avg       0.96      0.96      0.96      2403

Confusion Matrix:
[[1183   37]
 [  66 1117]]


In [42]:
accuracy_1 = accuracy_score(y_test_downsampled, y_pred_downsampled_1)
print(f"Accuracy: {accuracy_1:.2f}")
# Displaying additional metrics
print("Classification Report:")
print(classification_report(y_test_downsampled, y_pred_downsampled_1))

print("Confusion Matrix:")
print(confusion_matrix(y_test_downsampled, y_pred_downsampled_1))

Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1220
           1       0.96      0.96      0.96      1183

    accuracy                           0.96      2403
   macro avg       0.96      0.96      0.96      2403
weighted avg       0.96      0.96      0.96      2403

Confusion Matrix:
[[1174   46]
 [  47 1136]]


In [43]:
joblib.dump(pipeline_1, 'random_forest_pipeline.pkl')

['random_forest_pipeline.pkl']

In [44]:
df_down.sample(10)

Unnamed: 0,category,amt,gender,city,state,city_pop,Avg. Transaction Freq. Per day,Avg. Transaction Freq. Per week,Avg. Transaction Freq. Per month,Transaction velocity,is_fraud,age,hour,day,month
138324,shopping_net,1204.44,M,Payson,IL,1656,2.774502,19.421514,83.235062,Medium,1,54,22,5,3
837715,kids_pets,17.42,F,Dongola,IL,2263,1.918685,13.430798,57.560565,Medium,1,39,23,3,12
227941,home,166.49,M,Burke,VA,43102,2.758398,19.308786,82.751939,Medium,0,73,12,3,4
765039,misc_net,756.9,M,West Palm Beach,FL,459921,0.95014,6.650983,28.504213,Low,1,54,22,5,11
536083,personal_care,15.79,F,Superior,AZ,2872,5.802068,40.614477,174.062044,High,0,36,12,6,8
286266,travel,6.52,F,Veedersburg,IN,4049,1.865772,13.060405,55.973163,Medium,0,64,18,0,5
954701,shopping_pos,606.65,M,Apison,TN,3730,0.98695,6.908648,29.608492,Low,1,32,19,5,1
477762,home,32.7,M,Methuen,MA,47249,3.812065,26.684452,114.361938,High,0,43,19,6,7
312277,gas_transport,22.92,M,Lohrville,IA,695,3.922493,27.457448,117.674776,High,0,69,8,4,5
1046415,travel,11.56,F,Winnsboro,SC,14267,0.025306,0.177145,0.759192,Low,1,60,22,0,3
