In [None]:
#DECISION TREES

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load data.
# The source is a plain-text CSV, so it is parsed with read_csv; read_excel
# only understands Excel workbooks and cannot open this file.
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")

# Drop irrelevant and high-correlation features
drop_cols = [
    'User_ID', 'Signup_Time', 'IP_Address', 'Device_Fingerprint',
    'VPN_Used', 'Failed_Attempts', 'Time_Taken_To_Signup',
    'Typing_Speed', 'Geolocation_Mismatch', 'ID_Verification_Time'
]
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)

# Convert binary Yes/No columns to 1/0
binary_cols = ['Name_Matches_ID', 'Phone_Number_Reuse', 'Social_Media_Linked',
               'Is_CNIC_Verified_With_NADRA', 'Is_SIM_Active',
               'Accounts_From_Same_Device', 'All_Documents_Uploaded',
               'Social_Media_Verification', 'Historical_Fraud',
               'Email_Verified', 'Phone_Verified', 'Browser_Language_Mismatch']

# Lookup keys are lower-case because text values are normalised before lookup.
yes_no_map = {'yes': 1, 'no': 0, 'true': 1, 'false': 0}

for col in binary_cols:
    if col not in df.columns:
        continue  # tolerate absent columns, as the drop_cols handling does
    if not pd.api.types.is_numeric_dtype(df[col]):
        # Text (or mixed) column: normalise and translate. Anything that is
        # not a recognised flag, including a missing value, becomes NaN here
        # and 0 on the next line.
        df[col] = df[col].astype(str).str.strip().str.lower().map(yes_no_map)
    # Columns that are already numeric or boolean keep their values; sending
    # them through the text lookup would turn every entry into NaN and then 0.
    df[col] = df[col].fillna(0).astype(int)

# One-hot encode Email_Domain
df = pd.get_dummies(df, columns=['Email_Domain'], drop_first=True)

# Drop remaining NA rows
df.dropna(inplace=True)

# Define features and label
X = df.drop(columns=['Is_Fraud'])
y = df['Is_Fraud']

# Train-test split (stratified so both parts keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Apply SMOTE to the training part only, so the test set stays untouched
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Train Decision Tree
clf = DecisionTreeClassifier(random_state=42, max_depth=5)
clf.fit(X_train_sm, y_train_sm)

# Predict
y_pred = clf.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
#CATBOOST

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from catboost import CatBoostClassifier

# Load dataset.
# The file is a CSV, so read_csv is the matching reader (read_excel expects
# an Excel workbook).
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")

# Drop User_ID (not useful for prediction)
df = df.drop(columns=['User_ID'])

# Read from CSV, Signup_Time is a text column. It is not one of the declared
# categorical features below, and CatBoost cannot use text as a numeric
# feature, so it is removed when it is not numeric.
if 'Signup_Time' in df.columns and not pd.api.types.is_numeric_dtype(df['Signup_Time']):
    df = df.drop(columns=['Signup_Time'])

# List of binary columns (Yes/No to 1/0)
binary_cols = [
    'Name_Matches_ID', 'Phone_Number_Reuse', 'Social_Media_Linked',
    'Geolocation_Mismatch', 'VPN_Used', 'Is_CNIC_Verified_With_NADRA',
    'Is_SIM_Active', 'Accounts_From_Same_Device', 'All_Documents_Uploaded',
    'Social_Media_Verification', 'Email_Verified', 'Phone_Verified',
    'Browser_Language_Mismatch', 'Historical_Fraud'
]

# Convert binary columns to 1/0.
# Only text columns are translated: a column that is already numeric would
# be turned entirely into NaN by the Yes/No lookup.
for col in binary_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map({'Yes': 1, 'No': 0})

# Define target and features
y = df['Is_Fraud']
X = df.drop(columns=['Is_Fraud'])

# List of categorical columns that should be handled by CatBoost
categorical_cols = ['IP_Address', 'Device_Fingerprint', 'Email_Domain']

# CatBoost requires categorical values to be strings or integers; casting to
# str also turns missing values into the literal category 'nan'.
X[categorical_cols] = X[categorical_cols].astype(str)

# Get indices of categorical columns
cat_feature_indices = [X.columns.get_loc(col) for col in categorical_cols]

# Split dataset (stratified, like the other supervised cells, so the class
# ratio is the same in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Train CatBoost model
model = CatBoostClassifier(verbose=0, random_state=42)
model.fit(X_train, y_train, cat_features=cat_feature_indices)

# Predict
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Computed once and reused for the ROC legend below
auc_score = roc_auc_score(y_test, y_proba)
print("AUC-ROC Score:", auc_score)

# Plot feature importance
feature_importance = model.get_feature_importance()
feature_names = X.columns

plt.figure(figsize=(12, 6))
plt.barh(feature_names, feature_importance)
plt.xlabel('Feature Importance')
plt.title('CatBoost Feature Importance')
plt.gca().invert_yaxis()  # first feature at the top
plt.tight_layout()
plt.show()

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'CatBoost (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#CATBOOST WITH CLASS WEIGHTS

import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# Load dataset.
# The file is a CSV, so read_csv is the matching reader (read_excel expects
# an Excel workbook).
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")

# Drop columns you don't want.
# User_ID is removed as well, matching the plain CatBoost cell, which drops
# it as not useful for prediction.
df.drop(
    columns=['User_ID', 'IP_Address', 'Device_Fingerprint', 'Signup_Time'],
    errors='ignore',
    inplace=True,
)

# Auto-detect categorical columns (non-numeric)
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col != 'Is_Fraud']

# Convert to string (required by CatBoost); missing values become 'nan'
df[categorical_cols] = df[categorical_cols].astype(str)

# Separate features and target
X = df.drop(columns='Is_Fraud')
y = df['Is_Fraud']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Create Pools
train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
test_pool = Pool(X_test, cat_features=categorical_cols)

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights ('balanced' = inversely proportional to frequency).
# tolist() converts numpy scalars to plain Python values so the dictionary
# prints cleanly and holds only built-in types.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes.tolist(), weights.tolist()))
print("Class Weights:", class_weights)

# Train model
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    class_weights=class_weights,  # Pass the computed dictionary here
    random_seed=42,
    verbose=100
)

model.fit(train_pool)

# Predict through the test Pool so the categorical feature declaration used
# for training is applied to the test data as well
y_pred = model.predict(test_pool)
y_proba = model.predict_proba(test_pool)[:, 1]

# Evaluate
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_proba):.4f}")



In [None]:
#EXTRA TREES CLASSIFIER

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load your dataset.
# The file is a CSV, so read_csv is the matching reader (read_excel expects
# an Excel workbook).
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")

# Drop non-numeric or high-cardinality columns
columns_to_drop = ['User_ID', 'IP_Address', 'Device_Fingerprint', 'Signup_Time']
df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)

# Convert binary 'Yes'/'No' to 1/0
binary_cols = [
    'Name_Matches_ID', 'Phone_Number_Reuse', 'Social_Media_Linked',
    'Geolocation_Mismatch', 'VPN_Used', 'Is_CNIC_Verified_With_NADRA',
    'Is_SIM_Active', 'Accounts_From_Same_Device', 'All_Documents_Uploaded',
    'Social_Media_Verification', 'Email_Verified', 'Phone_Verified',
    'Browser_Language_Mismatch', 'Historical_Fraud'
]

for col in binary_cols:
    # Only text columns go through the Yes/No lookup. A column that is
    # already numeric would come out entirely NaN and later be discarded by
    # the imputer.
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map({'Yes': 1, 'No': 0})

# Define target and features
target = 'Is_Fraud'
X = df.drop(columns=[target])
y = df[target]

# Label encode categorical columns like Email_Domain
categorical_cols = ['Email_Domain']
X_encoded = X.copy()
label_encoders = {}

for col in categorical_cols:
    if col in X_encoded.columns:
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
        label_encoders[col] = le

# SimpleImputer discards columns that have no observed value at all, which
# would leave fewer output columns than input names. Removing such columns
# explicitly first keeps names and values aligned one-to-one.
X_encoded = X_encoded.dropna(axis=1, how='all')

# Handle missing values
imputer = SimpleImputer(strategy="most_frequent")
X_imputed_array = imputer.fit_transform(X_encoded)

# Rebuild the frame with the matching column names and the original row index
X_imputed = pd.DataFrame(
    X_imputed_array, columns=X_encoded.columns, index=X_encoded.index
)


# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, stratify=y, random_state=42
)

# Initialize Extra Trees Classifier
model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# K-MEANS

# 1. Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

# --- Load and clean data -----------------------------------------------
# The dataset is read from a CSV file. Identifier columns (User_ID,
# Device_Fingerprint, IP_Address, Signup_Time) are dropped so they do not
# take part in the clustering.

# 2. Load Data
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")
df = df.drop(columns=['User_ID', 'Device_Fingerprint', 'IP_Address', 'Signup_Time'])  # Drop identifiers

# --- Feature encoding and scaling --------------------------------------
# Categorical variables are one-hot encoded with get_dummies; the target
# Is_Fraud is excluded from the feature matrix. Features are then
# standardised so each one carries equal weight in the distance computation.

# 3. Encode categorical features
df_encoded = pd.get_dummies(df.drop(columns=['Is_Fraud']), drop_first=True)

# 4. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_encoded)

# --- K-Means clustering ------------------------------------------------
# Two clusters are requested (fraud vs. non-fraud). n_init is set
# explicitly because its default differs between scikit-learn versions,
# which would otherwise change the result.

# 5. K-Means Clustering
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# --- Visualise clusters ------------------------------------------------
# PCA reduces the scaled features to two components for plotting only; the
# clustering itself was done in the full feature space.

# 6. Visualize Clusters with PCA
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=pca_components[:, 0], y=pca_components[:, 1], hue=df['Cluster'], palette='Set2')
plt.title("K-Means Clustering (PCA 2D Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()

# --- Map cluster labels to fraud labels --------------------------------
# K-Means labels are arbitrary, so each cluster is first given the most
# frequent Is_Fraud value among its members.
mapping = {}
for cluster_label in df['Cluster'].unique():
    members = df.loc[df['Cluster'] == cluster_label, 'Is_Fraud']
    # Series.mode() is sorted, so iloc[0] is the smallest value on a tie
    mapping[cluster_label] = members.mode().iloc[0]

# When one class is the majority in every cluster, the vote above gives all
# clusters the same label and the evaluation is degenerate (a single
# predicted class). In that case the cluster with the highest fraud rate is
# labelled 1 and the rest 0.
# NOTE(review): this fallback assumes Is_Fraud is coded 0/1 - confirm.
if len(mapping) > 1 and len(set(mapping.values())) < 2:
    fraud_rate = df.groupby('Cluster')['Is_Fraud'].mean()
    fraud_cluster = fraud_rate.idxmax()
    mapping = {label: int(label == fraud_cluster) for label in fraud_rate.index}


df['Cluster_Mapped'] = df['Cluster'].map(mapping)

# --- Evaluation --------------------------------------------------------
# Confusion matrix, classification report and ROC AUC compare the mapped
# cluster labels with the actual fraud labels. The AUC is computed from
# hard 0/1 labels, not from scores.

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(df['Is_Fraud'], df['Cluster_Mapped']))

print("\nClassification Report:")
print(classification_report(df['Is_Fraud'], df['Cluster_Mapped']))

print("\nAUC-ROC Score:")
print(roc_auc_score(df['Is_Fraud'], df['Cluster_Mapped']))

In [None]:
#LIGHTGBM

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score
from lightgbm import LGBMClassifier

# Read the raw table and discard rows that carry no label
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")
df = df.dropna(subset=['Is_Fraud'])

# Target vector, then the feature frame (identifier and label removed)
y = df['Is_Fraud']
X = df.drop(columns=['User_ID', 'Is_Fraud'])

# Recode text columns whose observed values are only Yes/No as 1/0
binary_map = {'Yes': 1, 'No': 0}
yes_no_cols = [
    col for col in X.columns
    if X[col].dtype == 'object' and set(X[col].dropna().unique()).issubset({'Yes', 'No'})
]
for col in yes_no_cols:
    X[col] = X[col].map(binary_map)

# Whatever is still text is categorical; int64/float64 columns are numeric
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric branch: median imputation followed by standardisation
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: mode imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_steps, num_cols),
    ('cat', categorical_steps, cat_cols)
])

# Stratified 80/20 hold-out split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the preprocessing on the training part only, then apply it to both
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# LightGBM hyper-parameters gathered in one place
lgbm_params = {
    'n_estimators': 400,
    'max_depth': 7,
    'learning_rate': 0.03,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.3,
    'reg_lambda': 1.2,
    'random_state': 42,
}
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train_processed, y_train)

# Score the hold-out predictions
val_preds = lgbm_model.predict(X_val_processed)
val_accuracy = accuracy_score(y_val, val_preds)
val_f1 = f1_score(y_val, val_preds)
print(f"LightGBM Accuracy: {val_accuracy:.4f}")
print(f"LightGBM F1 Score: {val_f1:.4f}")

In [None]:
#RANDOM FOREST

# Install imbalanced-learn if not already installed
!pip install -q imbalanced-learn

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, accuracy_score, precision_score, recall_score,
    f1_score, RocCurveDisplay # Import RocCurveDisplay here
)
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

# Load the CSV file.
# read_csv is the matching reader for a .csv path (read_excel expects an
# Excel workbook).
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")  # Replace with your actual file name

# Display basic info
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Drop datetime columns to avoid type errors.
# NOTE(review): read_csv does not parse dates by default, so timestamp
# columns arrive as text and are label-encoded below instead of being
# dropped here - confirm that is acceptable.
datetime_cols = df.select_dtypes(include=['datetime64', 'datetime64[ns]']).columns
df = df.drop(columns=datetime_cols)

# Print dropped datetime columns (if any)
if len(datetime_cols) > 0:
    print("Dropped datetime columns:", list(datetime_cols))

# Feature Engineering (the +1 in each denominator avoids division by zero)
df['Contribution_Income_Ratio'] = df['Committee_Contribution_Amount'] / (df['Income'] + 1)
df['Payout_Income_Ratio'] = df['Committee_Payout_Amount'] / (df['Income'] + 1)
df['Net_Committee_Profit'] = df['Committee_Payout_Amount'] - df['Committee_Contribution_Amount']
df['Committee_ROI'] = df['Net_Committee_Profit'] / (df['Committee_Contribution_Amount'] + 1)


# Encode categorical features using LabelEncoder
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['Identity_theft'])
y = df['Identity_theft']


# Train-test split comes first so the test set holds only real,
# untouched observations
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Applying SMOTE to the training part only. Resampling before the split
# would let synthetic points derived from test rows reach the training data;
# the resampled output also has to be the data the model is fitted on.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train Random Forest model on the balanced training data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)

# Make predictions
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_proba)

# Print evaluation metrics
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"ROC AUC:   {roc_auc:.4f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC curve
RocCurveDisplay.from_estimator(rf_model, X_test, y_test)
plt.title("ROC Curve")
plt.show()

# Feature importances
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
plt.figure(figsize=(10, 6))
importances.sort_values(ascending=False).plot(kind='bar')
plt.title('Feature Importances from Random Forest')
plt.tight_layout()
plt.show()

In [None]:
#STACKED

# Import libraries
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Read the dataset from disk
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")

# Rows without a label cannot be used for supervised training
df = df.dropna(subset=['Is_Fraud'])

# Target first, then the feature frame (identifier and label removed)
y = df['Is_Fraud']
X = df.drop(columns=['User_ID', 'Is_Fraud'])

# Recode text columns whose observed values are only Yes/No as 1/0
binary_map = {'Yes': 1, 'No': 0}
yes_no_cols = [
    col for col in X.columns
    if X[col].dtype == 'object' and set(X[col].dropna().unique()).issubset({'Yes', 'No'})
]
for col in yes_no_cols:
    X[col] = X[col].map(binary_map)

# Column groups, determined after the recoding above
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric branch: median imputation, then standardisation
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories unseen during fit
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_steps, num_cols),
    ('cat', categorical_steps, cat_cols)
])

# Stratified 80/20 hold-out split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing is fitted on the training part and reused for validation
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# Settings shared by both gradient-boosting base learners
shared_boost_params = {
    'n_estimators': 350,
    'max_depth': 6,
    'learning_rate': 0.03,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'reg_alpha': 0.3,
    'reg_lambda': 1,
    'random_state': 42,
}

# Base learners
xgb = XGBClassifier(
    gamma=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    **shared_boost_params
)
lgbm = LGBMClassifier(**shared_boost_params)

# Meta learner that combines the base learners' class probabilities
meta_model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Stacking ensemble: the meta learner is trained on out-of-fold
# probabilities from a shuffled, stratified 5-fold split
base_estimators = [
    ('xgb', xgb),
    ('lgbm', lgbm),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
]
stacking_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stacked_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=stacking_cv,
    stack_method='predict_proba',
    passthrough=False
)

# Fit the whole ensemble
stacked_model.fit(X_train_processed, y_train)

# Hold-out evaluation
val_preds = stacked_model.predict(X_val_processed)
accuracy = accuracy_score(y_val, val_preds)
f1 = f1_score(y_val, val_preds)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation F1 Score : {f1:.4f}")


In [None]:
#NAIVE BAYES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    roc_curve, ConfusionMatrixDisplay
)

from functools import partial
from sklearn.base import clone

# Load dataset
df = pd.read_csv(r"C:\Users\hasee\OneDrive\Desktop\FDA Final Project\fraud_detection_dataset3.csv")

# Drop identifier columns
df = df.drop(columns=['User_ID', 'IP_Address', 'Device_Fingerprint', 'Signup_Time'])

# Encode binary categorical features.
# Missing values are ignored when deciding whether a column is Yes/No, the
# same rule the LightGBM and stacked cells use. Without this, a Yes/No
# column with a gap would be label-encoded with the gap as an extra class.
binary_map = {'Yes': 1, 'No': 0}
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    if set(df[col].dropna().unique()) <= {'Yes', 'No'}:
        df[col] = df[col].map(binary_map)

# Encode other categorical variables
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Handle missing values
df.fillna(df.mean(numeric_only=True), inplace=True)

# Split features and target
X = df.drop(columns='Is_Fraud')
y = df['Is_Fraud']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Preprocessing Pipeline.
# mutual_info_classif is randomised; fixing its random_state makes the
# selected features identical between runs and between the two models.
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(
        score_func=partial(mutual_info_classif, random_state=42), k=10))  # You can tune k
])

# Define classifiers
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

# Train and evaluate each model
for name, model in models.items():
    # Each model gets its own unfitted copy of the preprocessing steps, so
    # fitting one pipeline does not refit the steps held by another.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"\nModel: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix - {name}")
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.figure()
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {name}')
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()