Importing Libraries

In [1]:
import os

import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

Loading Dataset

In [2]:
# Location of the transactions CSV. Set the FRAUD_CSV_PATH environment
# variable to point at the file on another machine; when it is unset the
# original hard-coded location is used, so existing runs are unaffected.
DATA_PATH = os.environ.get('FRAUD_CSV_PATH', r'C:\Users\hp\Downloads\Fraud.csv')
data = pd.read_csv(DATA_PATH)

Convert necessary columns to appropriate data types

In [3]:
# Cast the three string-like columns to pandas' categorical dtype in one call.
# NOTE(review): 'nameOrig' and 'nameDest' are dropped before modelling in the
# feature/target cell below, so their casts look unnecessary -- confirm no
# other cell relies on them before removing.
data = data.astype({
    'type': 'category',
    'nameOrig': 'category',
    'nameDest': 'category',
})

# One-hot encode 'type'; drop_first=True omits the first category's column.
data = pd.get_dummies(data, columns=['type'], drop_first=True)

Feature engineering: creating new features

In [4]:
# Balance-consistency features:
#   errorOrig = oldbalanceOrg - newbalanceOrig - amount
#   errorDest = oldbalanceDest + amount - newbalanceDest
# Both columns are appended to the frame in this order.
data = data.assign(
    errorOrig=lambda frame: frame['oldbalanceOrg'] - frame['newbalanceOrig'] - frame['amount'],
    errorDest=lambda frame: frame['oldbalanceDest'] + frame['amount'] - frame['newbalanceDest'],
)

Removing outliers using IQR method

In [5]:
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the column the fences are computed from and applied to.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels kept) whose value in
        ``column`` is within the fences. Rows where the value is missing
        fail the range check and are therefore excluded.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    # Series.between is inclusive on both ends by default.
    inside_fences = values.between(
        first_quartile - 1.5 * spread,
        third_quartile + 1.5 * spread,
    )
    return df[inside_fences]

# Amount and balance columns that are put through the IQR filter.
numerical_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]

# The filters are applied one after another, so each column's quartiles are
# computed on the rows that survived the filters of the columns before it.
# NOTE(review): this runs on the whole dataset before the train/test split and
# may discard extreme-but-genuine transactions (possibly fraud cases) --
# compare the isFraud counts before and after to confirm this is intended.
for col in numerical_columns:
    data = remove_outliers(data, col)

Splitting the data into features and target variable

In [6]:
# Target: the fraud label.
y = data['isFraud']

# Features: every remaining column except the label, the isFlaggedFraud flag
# and the two name columns.
X = data.drop(columns=['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'])

Splitting the data into training and testing sets

In [7]:
# 80/20 split, stratified on the label so both parts keep the same fraud
# ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Feature scaling: standardizing the features

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test-set information is used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Defining and training the models

In [10]:
# Three baseline classifiers, all seeded with the same random_state so the
# run is repeatable.
logreg = LogisticRegression(random_state=42)
# n_jobs=-1 lets the forest build (and later query) its trees on all available
# cores instead of one; it only changes how the work is scheduled, not the
# hyperparameters or the seed.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# NOTE(review): newer XGBoost releases deprecate `use_label_encoder`; check it
# against the installed version before relying on it.
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Fit every model on the standardized training split.
logreg.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)
xgb.fit(X_train_scaled, y_train)

# Hard class predictions on the standardized test split, one array per model.
y_pred_logreg = logreg.predict(X_test_scaled)
y_pred_rf = rf.predict(X_test_scaled)
y_pred_xgb = xgb.predict(X_test_scaled)

Evaluating the model performance

In [12]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a classification summary for one model.

    Writes, in order: a header line with ``model_name``, then accuracy,
    precision, recall and F1 score, then the confusion matrix and the
    classification report, all computed from ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    model_name : str
        Name shown in the header line.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    print(f"{model_name} Performance:")

    # (label, function) pairs, listed in the order they are printed.
    report_sections = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    for label, compute in report_sections:
        print(label, compute(y_true, y_pred))

# Report every model against the same held-out labels, with a blank line
# between consecutive reports.
predictions_by_model = (
    ("Logistic Regression", y_pred_logreg),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
)
for position, (display_name, predicted_labels) in enumerate(predictions_by_model):
    if position > 0:
        print()
    evaluate_model(y_test, predicted_labels, display_name)

Logistic Regression Performance:
Accuracy: 0.999449469614721
Precision: 0.9215017064846417
Recall: 0.4462809917355372
F1 Score: 0.601336302895323
Confusion Matrix:
 [[649654     23]
 [   335    270]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    649677
           1       0.92      0.45      0.60       605

    accuracy                           1.00    650282
   macro avg       0.96      0.72      0.80    650282
weighted avg       1.00      1.00      1.00    650282


Random Forest Performance:
Accuracy: 0.9999969244112554
Precision: 1.0
Recall: 0.996694214876033
F1 Score: 0.9983443708609271
Confusion Matrix:
 [[649677      0]
 [     2    603]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    649677
           1       1.00      1.00      1.00       605

    accuracy                           1.00    650282
   macro avg       1.00   