In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
file_path = ''
df = pd.read_csv(file_path)

#Display the first few rows
df.head(10)

In [None]:
print(df.info())
df.describe()

In [None]:
fraud_counts = df['isFraud'].value_counts()
print(fraud_counts)

plt.figure(figsize=(6, 4))
sns.barplot(x=fraud_counts.index, y=fraud_counts.values)
plt.title("Distribution of Fraud vs Normal Transactions")
plt.xlabel('isFraud (0: Normal, 1: Fraud)')
plt.ylablel('Count')
plt.yscale('log')
plt.show()

In [None]:
missing_values = df.isnull().sum()
print(missing_values)

In [None]:
plt.figure(figsize=(10,4))
sns.boxplot(x=df['amount'])
plt.title('Boxplot of Transaction Amounts')
plt.show()

In [None]:
#Drop non-behavioural columns
cols_to_drop = ['nameOrig', 'nameDest', 'isFlaggedFraud']
df_processed = df.drop(cols_to_drop)

# Encode 'type' column
le = LabelEncoder()
df_processed['type'] = le.fit_transform(df_processed['type'])

# Feature Engineering: Balance Errors
df_processed['errorBalanceOrigin'] = df_processed['newbalanceOrig'] + df_processed['amount'] - df_processed['oldbalance']
df_processed['errorBalanceDest'] = df_processed['oldbalanceDest'] + df_processed['amount'] - df_processed['newbalanceDest']

#Preview processed data
df_processed.head(10)


In [None]:
sample_df = df_processed.sample(frac=0.1, random_state=42)

X = sample_df.drop(columns=['isFraud'])
y = sample_df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training Data Shape: {X_train.shape}')
print(f'Testing Data Shape: {X_test.shape}')

In [None]:
contamination_factor = y_train.value_counts(normalize=True)[1]
print(f'Dynamic Contamination Factor: {contamination_factor}')

iso_forest = IsolationForest(n_estimators=100, contamination=contamination_factor, random_state=42, n_jobs=-1)

iso_forest.fit(X_train_scaled)

In [4]:
def evaluate_model(model, X, y, set_name="Daya"):
  y_pred = model.predict(X)

  y_pred_mapped = [1 if x == -1 else 0 for x in y_pred]

  print(f"--- {set_name} Evaluation ---")
  print(classification_report(y, y_pred_mapped))

  return y_pred_mapped
  y_train_pred = evaluate_model(iso_forst, X_train_scaled, y_train, set_name="Training")

  y_test_pred = evaluation_model(iso_forst, X_train_scaled, y_test, set_name="Testing")

SyntaxError: invalid syntax (ipython-input-3039763580.py, line 9)

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Normal', 'Predicted Fraud', yticklabels=['Actual Normal'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Test set)')
plt.show()