import neccessary libraries

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')


Load the dataset

In [24]:
df = pd.read_csv(r'D:\DataScience\Dataset\creditcard.csv')
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

Explore the dataset

In [25]:
print("\n" + "="*50)
print("DATA EXPLORATION")
print("="*50)


DATA EXPLORATION


Check for missing values

In [26]:
 
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


Check class distribution

In [27]:
 
print("\nClass distribution:")
print(df['Class'].value_counts())
print(f"\nFraud percentage: {df['Class'].value_counts()[1] / len(df) * 100:.4f}%")



Class distribution:
0    284315
1       492
Name: Class, dtype: int64

Fraud percentage: 0.1727%


Basic statistics

In [28]:

print("\nBasic statistics:")
print(df[['Time', 'Amount', 'Class']].describe())



Basic statistics:
                Time         Amount          Class
count  284807.000000  284807.000000  284807.000000
mean    94813.859575      88.349619       0.001727
std     47488.145955     250.120109       0.041527
min         0.000000       0.000000       0.000000
25%     54201.500000       5.600000       0.000000
50%     84692.000000      22.000000       0.000000
75%    139320.500000      77.165000       0.000000
max    172792.000000   25691.160000       1.000000


DATA PREPROCESSING

In [29]:

print("\n" + "="*50)
print("DATA PREPROCESSING")
print("="*50)


DATA PREPROCESSING


 Handle the 'Time' feature - convert to meaningful hours

In [30]:

df['Hour'] = df['Time'] % 24
df['Day'] = (df['Time'] // 24) % 7


Scale the 'Amount' feature

In [31]:

scaler = StandardScaler()
df['Amount_Scaled'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))


Create new features

In [32]:

df['Amount_Time_Ratio'] = df['Amount'] / (df['Time'] + 1)
df['Transaction_Frequency'] = df.groupby('Hour')['Hour'].transform('count')


Prepare features and target

In [33]:

features = [col for col in df.columns if col not in ['Time', 'Amount', 'Class']]
X = df[features]
y = df['Class']

print(f"Features used: {len(features)}")
print(f"Feature names: {features}")


Features used: 33
Feature names: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Hour', 'Day', 'Amount_Scaled', 'Amount_Time_Ratio', 'Transaction_Frequency']


Split the data

In [34]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")



Training set shape: (227845, 33)
Test set shape: (56962, 33)


Handle class imbalance using SMOTE

In [35]:

print("\nApplying SMOTE to handle class imbalance...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"After SMOTE - Training set shape: {X_train_resampled.shape}")
print(f"Class distribution after SMOTE: {pd.Series(y_train_resampled).value_counts()}")



Applying SMOTE to handle class imbalance...
After SMOTE - Training set shape: (454902, 33)
Class distribution after SMOTE: 0    227451
1    227451
Name: Class, dtype: int64


Scale the 'Amount' feature

In [36]:

scaler = StandardScaler()
df['Amount_Scaled'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))


Create new features

In [37]:

df['Amount_Time_Ratio'] = df['Amount'] / (df['Time'] + 1)
df['Transaction_Frequency'] = df.groupby('Hour')['Hour'].transform('count')


In [38]:
# Prepare features and target
features = [col for col in df.columns if col not in ['Time', 'Amount', 'Class']]
X = df[features]
y = df['Class']

print(f"Features used: {len(features)}")
print(f"Feature names: {features}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Handle class imbalance using SMOTE
print("\nApplying SMOTE to handle class imbalance...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"After SMOTE - Training set shape: {X_train_resampled.shape}")
print(f"Class distribution after SMOTE: {pd.Series(y_train_resampled).value_counts()}")




Features used: 33
Feature names: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Hour', 'Day', 'Amount_Scaled', 'Amount_Time_Ratio', 'Transaction_Frequency']

Training set shape: (227845, 33)
Test set shape: (56962, 33)

Applying SMOTE to handle class imbalance...
After SMOTE - Training set shape: (454902, 33)
Class distribution after SMOTE: 0    227451
1    227451
Name: Class, dtype: int64


In [39]:
# MODEL 2: RANDOM FOREST
print("\n" + "="*50)
print("RANDOM FOREST MODEL")
print("="*50)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced'  # Handle imbalance
)

rf_model.fit(X_train_resampled, y_train_resampled)

# Predictions
y_train_pred_rf = rf_model.predict(X_train_resampled)
y_test_pred_rf = rf_model.predict(X_test)
y_test_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Calculate metrics
train_accuracy_rf = accuracy_score(y_train_resampled, y_train_pred_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
auc_score_rf = roc_auc_score(y_test, y_test_prob_rf)

print(f"Random Forest Training Accuracy: {train_accuracy_rf:.4f} ({train_accuracy_rf*100:.2f}%)")
print(f"Random Forest Test Accuracy:     {test_accuracy_rf:.4f} ({test_accuracy_rf*100:.2f}%)")
print(f"Random Forest AUC Score:         {auc_score_rf:.4f}")
print(f"Accuracy Difference:             {abs(train_accuracy_rf - test_accuracy_rf):.4f}")



RANDOM FOREST MODEL
Random Forest Training Accuracy: 0.9916 (99.16%)
Random Forest Test Accuracy:     0.9973 (99.73%)
Random Forest AUC Score:         0.9856
Accuracy Difference:             0.0057
