# In [None]:
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Extract ZIP and Load Dataset
# Unpack every member of the archive into the current working directory,
# then read the extracted CSV into a DataFrame and preview its first rows.
with zipfile.ZipFile("fraudTest.csv.zip", mode='r') as archive:
    archive.extractall()

df = pd.read_csv("fraudTest.csv")
print("✅ Dataset Loaded", df.head(), sep="\n")

# Dataset Overview
# Three quick summaries: column names, per-column null counts, and how many
# rows fall into each value of the `is_fraud` target.
print("\nColumns in the dataset:", df.columns, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")
print("\nClass distribution:", df["is_fraud"].value_counts(), sep="\n")

# Visualize Class Distribution
# Row counts per `is_fraud` value, sorted by label so the entry for 0 comes
# before the entry for 1. `labels` below assumes exactly those two classes
# are present, in that order.
fraud_count = df['is_fraud'].value_counts().sort_index()
labels = ['Legit (0)', 'Fraud (1)']
colors_bar = ['#66bb6a', '#ef5350']
colors_pie = ['#42a5f5', '#ef5350']

# Bar chart of absolute counts.
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 and scheduled
# for removal, so the x variable is also assigned to `hue`. `dodge=False`
# keeps each bar centred on its tick, and any legend seaborn adds for the
# (redundant) hue is removed afterwards; this form works on both older and
# newer seaborn releases.
ax = sns.barplot(
    x=labels,
    y=fraud_count.values,
    hue=labels,
    palette=colors_bar,
    dodge=False,
)
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
# Write the count just above each bar. The gap is given in points rather than
# data units so it looks the same whether the counts are in the tens or the
# hundreds of thousands.
for i, val in enumerate(fraud_count.values):
    ax.annotate(
        f'{val:,}',
        xy=(i, val),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontweight='bold',
    )
plt.title("Fraud vs Legit Transactions")
plt.xlabel("Type")
plt.ylabel("Count")
plt.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()

# Pie chart of the same counts, shown as percentages.
plt.figure(figsize=(5, 5))
plt.pie(fraud_count, labels=labels, colors=colors_pie, autopct='%1.2f%%', startangle=90)
plt.title("Class Distribution")
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

# Drop Unnecessary Columns
# Columns removed before modelling. Judging by their names these are
# identifier-like or raw date/time fields; none of them is encoded later on.
columns_to_drop = [
    # pandas names a header-less leading column 'Unnamed: 0'. The CSV is read
    # without `index_col`, so a saved row index would otherwise be passed to
    # the models as a feature. NOTE(review): presumably present in this
    # dataset - it is dropped only if it exists.
    'Unnamed: 0',
    'trans_num', 'trans_date_trans_time', 'first', 'last',
    'dob', 'unix_time', 'merchant', 'cc_num', 'street'
]
# errors='ignore' skips any listed column that is absent instead of raising.
df = df.drop(columns=columns_to_drop, errors='ignore')

# Encode Categorical Columns
# Each listed column that exists in `df` is replaced by integer codes.
categorical_cols = ['gender', 'category', 'city', 'state', 'job']
# One fitted encoder is kept per column so the integer codes can later be
# mapped back to the original values (`inverse_transform`) or applied to new
# data. Refitting a single shared encoder would keep only the mapping of the
# last column processed.
label_encoders = {}
# `le` stays bound for backward compatibility: after the loop it is the
# encoder fitted on the last encoded column (or an unfitted one if none of
# the columns exist).
le = LabelEncoder()
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Train/Test Split
# Separate the `is_fraud` target from the feature columns, then hold out 20%
# of the rows for evaluation. Stratifying on the target keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
y = df['is_fraud']
X = df.drop(columns='is_fraud')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Logistic Regression
# The features are standardized before the linear model. The label-encoded
# columns and the remaining numeric columns sit on very different scales,
# which makes the solver converge slowly and lets large-magnitude columns
# dominate the regularized fit. Wrapping both steps in one pipeline means the
# scaler is fitted on the training split only and the same transform is
# applied automatically whenever `log_model.predict` is called.
# The underlying estimator (e.g. for `coef_`) is available as `log_model[-1]`.
log_model = make_pipeline(
    StandardScaler(),
    # class_weight='balanced' reweights samples inversely to class frequency.
    LogisticRegression(max_iter=5000, class_weight='balanced'),
)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Confusion matrix and per-class precision/recall/F1 on the held-out split.
print("\n📊 Logistic Regression Evaluation")
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

# Random Forest
# A 100-tree forest with a fixed seed; class_weight='balanced' reweights
# samples inversely to class frequency. It is trained on the training split
# and then used to predict labels for the held-out split.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Report the confusion matrix followed by per-class precision/recall/F1.
print("\n🌲 Random Forest Evaluation")
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print(rf_confusion)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

# ROC Curve
# Score the held-out rows with column index 1 of predict_proba (the second
# entry of `rf_model.classes_`), then derive the ROC points and the area
# under the curve from those scores.
y_probs = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probs)
rf_auc = roc_auc_score(y_test, y_probs)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label='Random Forest (AUC = {:.2f})'.format(rf_auc))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
ax.grid()
plt.show()