# This block of code imports the necessary libraries:
## .pandas for data manipulation.
## .numpy for numerical operations.
## .train_test_split from sklearn.model_selection for splitting the dataset.
## .StandardScaler from sklearn.preprocessing for feature scaling.
## .Classification_report, confusion_matrix, and roc_auc_score from sklearn.metrics for model evaluation.
## .xgboost for the XGBoost classifier.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb

## This line reads the dataset from a CSV file into a pandas DataFrame. Replace 'path_to_your_dataset.csv' with the actual path to your dataset.

In [2]:
# Load Data
data = pd.read_csv('Fraud (1).csv')

In [3]:
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


## This line drops the nameOrig and nameDest columns because they are not useful for the model (they are likely just IDs and do not contain useful information for prediction).

In [4]:
# Data Preprocessing
# Drop columns that are not useful for the model
data = data.drop(['nameOrig', 'nameDest'], axis=1)

## This line converts the categorical type column into dummy/one-hot encoded variables. drop_first=True avoids the dummy variable trap by dropping the first category.

In [5]:
# Handle categorical data
data = pd.get_dummies(data, columns=['type'], drop_first=True)

## This line fills any missing values in the dataset with 0. The inplace=True parameter ensures that the changes are made directly to the DataFrame.

In [6]:
# Handle missing values if any
data.fillna(0, inplace=True)

# These lines create two new features:

## errorBalanceOrig: Measures the difference between the new and old balances of the origin account after accounting for the transaction amount.
## errorBalanceDest: Measures the difference between the new and old balances of the destination account after accounting for the transaction amount.

In [7]:
# Feature Engineering
# Create new features based on existing data if necessary
data['errorBalanceOrig'] = data['newbalanceOrig'] + data['amount'] - data['oldbalanceOrg']
data['errorBalanceDest'] = data['oldbalanceDest'] + data['amount'] - data['newbalanceDest']

# This block defines the features (X) and the target variable (y):

## X includes all columns except isFraud and isFlaggedFraud.
## y is the isFraud column which indicates whether the transaction is fraudulent.

In [8]:
# Define features and target variable
X = data.drop(['isFraud', 'isFlaggedFraud'], axis=1)
y = data['isFraud']

# This line splits the dataset into training and testing sets:

## 80% of the data is used for training (X_train, y_train).
## 20% of the data is used for testing (X_test, y_test).
## random_state=42 ensures reproducibility of the split.

In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# These lines scale the features to have a mean of 0 and a standard deviation of 1, which helps in improving the performance of the model. The StandardScaler is first fitted on the training data and then applied to both the training and testing data.

In [10]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# This block creates and trains an XGBoost classifier:

## n_estimators=100: The number of trees in the forest.
## learning_rate=0.1: The step size shrinkage used to prevent overfitting.
## max_depth=5: The maximum depth of a tree.
## random_state=42: Ensures reproducibility.

In [11]:
# Model Training
model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)


# This block evaluates the model:

## y_pred: Predictions of the test set.
## y_pred_proba: Predicted probabilities for the positive class (fraud).
## confusion_matrix: Prints the confusion matrix showing the counts of true/false positives/negatives.
## classification_report: Prints precision, recall, f1-score, and support for each class.
## roc_auc_score: Prints the Area Under the Receiver Operating Characteristic Curve, which is a measure of the model's ability to distinguish between classes.

In [12]:
# Model Evaluation
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:,1]

In [13]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nROC AUC Score:")
print(roc_auc_score(y_test, y_pred_proba))

Confusion Matrix:
[[1270839      65]
 [     84    1536]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270904
           1       0.96      0.95      0.95      1620

    accuracy                           1.00   1272524
   macro avg       0.98      0.97      0.98   1272524
weighted avg       1.00      1.00      1.00   1272524


ROC AUC Score:
0.9998080466180076
