In [1]:
# Load necessary packages
import numpy as np
import pandas as pd
import matplotlib as mpl
from plotnine import *

# Visualization modules
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Modeling and evaluation modules 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import warnings
# NOTE(review): called without a category or module filter, this suppresses
# every warning raised from this point on, including library deprecation and
# solver-convergence notices. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Module for saving model
import joblib 

# Fixed seed so stochastic steps can be made repeatable
# (pass it as random_state=SEED wherever an estimator or splitter accepts one).
SEED: int = 42

In [2]:
# Read the modelling table from disk, report its dimensions, then preview
# the first rows as the cell output.
df = pd.read_csv('df_model.csv')
print(f"Original Shape: {df.shape}")
df.head(n=5)

Original Shape: (20000, 7)


Unnamed: 0,type,amount,oldbalanceOrig,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,PAYMENT,600.0,170136.0,160296.36,0.0,0.0,0
1,PAYMENT,10000.0,21249.0,19384.72,0.0,0.0,0
2,PAYMENT,1000.0,181.0,0.0,0.0,0.0,0
3,PAYMENT,700.0,181.0,0.0,21182.0,0.0,0
4,PAYMENT,1000.0,41554.0,29885.86,0.0,0.0,0


In [10]:
# Separate the predictors from the target column "isFraud".
X = df.drop(columns="isFraud")
y = df["isFraud"]

In [11]:
# Split into train and test data (70% / 30%).
# stratify=y preserves the class proportions of the target in both partitions.
# random_state=SEED pins the shuffle so that Restart & Run All reproduces the
# same split, and therefore the same metrics and the same saved model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)

In [12]:
# Column groups: which features are scaled and which are one-hot encoded.
numeric = [
    "amount",
    "oldbalanceOrig",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]
categorical = ["type"]

In [13]:
# Per-column preprocessing:
#   "num": standardise the numeric columns
#   "cat": one-hot encode the categorical columns, omitting the first
#          category of each feature (drop="first")
# Columns not listed in either group are discarded (remainder="drop").
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(drop="first"), categorical),
    ],
    remainder="drop",
)

In [14]:
# Model pipeline: preprocessing followed by a class-weighted logistic
# regression, so both steps are fitted and applied as one estimator.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

In [15]:
# Fit the preprocessing and the classifier on the training partition only;
# the fitted pipeline is the cell's displayed value.
pipeline.fit(X_train, y=y_train)

0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [17]:
# Predict class labels for the held-out test rows.
y_pred = pipeline.predict(X=X_test)

In [18]:
# Per-class precision / recall / F1 on the test partition.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      5880
           1       0.29      0.64      0.40       120

    accuracy                           0.96      6000
   macro avg       0.64      0.81      0.69      6000
weighted avg       0.98      0.96      0.97      6000



In [19]:
# Confusion matrix on the test partition (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5694,  186],
       [  43,   77]])

In [20]:
# Test-set score of the fitted pipeline, expressed as a percentage.
# NOTE(review): for a classifier this is plain accuracy, which is weak
# evidence on an imbalanced target. Read it alongside the per-class
# precision/recall in the classification report rather than on its own.
100 * pipeline.score(X_test, y_test)

96.18333333333334

In [21]:
# Persist the whole fitted pipeline (preprocessing + classifier) so it can be
# reloaded later without refitting.
joblib.dump(pipeline, filename="Logistic_fraud_model.pkl")

['Logistic_fraud_model.pkl']