In [61]:
#libraries imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

#scikit-learn imports
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,IsolationForest
from sklearn.metrics import (
    classification_report , confusion_matrix , precision_recall_curve, average_precision_score , 
    f1_score , precision_score , recall_score, roc_auc_score
)


In [62]:
# starting by loading dataset
df = pd.read_csv('creditcard.csv')
df.shape

(284807, 31)

In [63]:
df.info

<bound method DataFrame.info of             Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  

In [64]:
print(f"Fraud Transactions: {df['Class'].sum():,} ({df['Class'].mean()*100:.3f}%)")
print(f"Normal Transactions: {(1-df['Class']).sum():,} ({(1-df['Class']).mean()*100:.3f}%)")

Fraud Transactions: 492 (0.173%)
Normal Transactions: 284,315 (99.827%)


In [65]:
#Feature Engineering
df['Amount_log']= np.log(df['Amount']+1)
df['Hour'] = (df['Time'] % (24*3600)) // 3600

#Create Time Based Features 
df['Time_sin'] = np.sin(2 * np.pi * df['Hour']/24)
df['Time_cos'] = np.cos(2 * np.pi * df['Hour']/24)

In [66]:
#exclude time and class 
features_cols = []
for col in df.columns:
    if col not in ['Time' , 'Class']:
        features_cols.append(col)
X=df[features_cols]
y=df['Class']

In [67]:
X.shape
X.columns



Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Amount_log',
       'Hour', 'Time_sin', 'Time_cos'],
      dtype='object')

In [68]:
X_train,X_test,y_train,y_test = train_test_split(X, y , test_size=0.2 , random_state=42, stratify=y)
print(f"Train Set: {len(X_train):,} samples")
print(f"fRAUD: {y_train.sum():,} ({y_train.mean()*100:.3f}%)")
print(f"Test Set: {len(X_test):,} samples ")
print(f"fRAUD: {y_test.sum():,} ({y_test.mean()*100:.3f}%)")

Train Set: 227,845 samples
fRAUD: 394 (0.173%)
Test Set: 56,962 samples 
fRAUD: 98 (0.172%)


In [69]:
#Feature Scaling
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=features_cols, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features_cols, index=X_test.index)

In [70]:
X_train_normal = X_train_scaled[y_train == 0]
iso_forest = IsolationForest(
    contamination=0.002,
    random_state=42,
    n_jobs=-1 
)
iso_forest.fit(X_train_normal)



y_pred_iso = iso_forest.predict(X_test_scaled)
y_pred_iso_binary = np.where(y_pred_iso == -1 , 1 , 0)

y_scores_iso = -iso_forest.score_samples(X_test_scaled)

#calculate metrics 
iso_metrics = {
    'precision': precision_score(y_test, y_pred_iso_binary),
    'recall': recall_score(y_test, y_pred_iso_binary),
    'f1': f1_score(y_test, y_pred_iso_binary),
    'roc_auc': roc_auc_score(y_test, y_scores_iso)

}



In [71]:
print(iso_metrics)

{'precision': 0.17647058823529413, 'recall': 0.24489795918367346, 'f1': 0.20512820512820512, 'roc_auc': 0.955913608409036}


In [72]:
# 1. Create a DataFrame for test set analysis
test_analysis_df = X_test_scaled.copy()

# 2. Add the true labels and binary predictions
test_analysis_df['True_Class'] = y_test
test_analysis_df['Predicted_Fraud'] = y_pred_iso_binary # 1 for predicted fraud, 0 for predicted normal

In [73]:
# Filter for all transactions the model predicted as fraud
flagged_transactions = test_analysis_df[test_analysis_df['Predicted_Fraud'] == 1]

print(f"Total transactions flagged by Isolation Forest: {len(flagged_transactions):,}")

Total transactions flagged by Isolation Forest: 136


In [74]:
# Count the True Positives (TP) and False Positives (FP)
TP_transactions = flagged_transactions[flagged_transactions['True_Class'] == 1]
FP_transactions = flagged_transactions[flagged_transactions['True_Class'] == 0]

print(f"True Positives (Actual Fraud): {len(TP_transactions)}")
print(f"False Positives (False Alarms): {len(FP_transactions)}")

True Positives (Actual Fraud): 24
False Positives (False Alarms): 112


In [75]:
# Features of True Positives (Correctly flagged fraud)
print("\nMean Feature Values for True Positives:")
print(TP_transactions[features_cols].mean().sort_values(ascending=False).head(5))

# Features of False Positives (False Alarms)
print("\nMean Feature Values for False Positives (False Alarms):")
print(FP_transactions[features_cols].mean().sort_values(ascending=False).head(5))


Mean Feature Values for True Positives:
V11    6.080441
V2     5.540434
V4     5.050788
V21    3.456789
V19    2.057692
dtype: float64

Mean Feature Values for False Positives (False Alarms):
Amount        7.828583
V4            2.738057
V28           2.066464
Amount_log    1.775852
V6            1.466523
dtype: float64
