In [34]:
# Team Members:
# Madhavan M
# Sriram V
# Siddarth Hilari
# Auston

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import numpy as np


# Load the dataset from CSV
df = pd.read_csv('/kaggle/input/datathon/DATATHON_EVENT_DATASET.csv')

# ---- Data Cleaning ----
# Use ffill directly to forward-fill missing values without a method parameter
df.ffill(inplace=True)  # Forward fill missing values

# ---- Label Conversion ----
# Convert 'Yes'/'No' in 'Fraud' column to 1 and 0
# This conversion makes the target column numerical, as the model requires numerical input
df['Fraud'] = df['Fraud'].map({'Yes': 1, 'No': 0})

# ---- Define Features (X) and Target (y) ----
# X is the dataset with the 'Fraud' column dropped, as it's the target
# y is the 'Fraud' column, which we are trying to predict
X = df.drop(columns=['Fraud'])  # Features (drop the target column)
y = df['Fraud']  # Target (fraud label)

# ---- Handling High Cardinality Columns ----
# Label encode columns with high cardinality (i.e., many unique values like Origin_ID and Destination_ID)
# Label encoding converts these categorical values into numerical labels
label_encoder = LabelEncoder()
X['Origin_ID'] = label_encoder.fit_transform(X['Origin_ID'])
X['Destination_ID'] = label_encoder.fit_transform(X['Destination_ID'])

# ---- One-Hot Encoding for Categorical Columns ----
# One-Hot Encoding is used for categorical columns with few unique values (like 'Transaction_Type', 'Expected_Fraud')
# This converts categorical data into a binary format (0s and 1s) for model input
categorical_cols = ['Transaction_Type', 'Expected_Fraud']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)],  # Apply One-Hot Encoding
    remainder='passthrough'  # Leave other columns (numerical ones) unchanged
)

# ---- Create the Pipeline ----
# Pipeline combines preprocessing (encoding + scaling) and model training into one step
# 1. Preprocessor: Applies one-hot encoding and scaling
# 2. Scaler: Standardizes numerical features (scaling to make values comparable)
# 3. Classifier: RandomForestClassifier used for the fraud detection task
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing steps (One-Hot Encoding + Pass-through other features)
    ('scaler', StandardScaler()),  # Standardizes numerical features to have mean 0 and variance 1
    ('classifier', RandomForestClassifier(random_state=42))  # Random forest model for classification
])

# ---- Splitting the Data ----
# Split the data into training and testing sets
# 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ---- Train the Model ----
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# ---- Predictions and Evaluation ----
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate F1 Score, Confusion Matrix, and Accuracy
# F1-Score: Evaluates model performance (balance between precision and recall)
# Confusion Matrix: Provides insight into true positives, false positives, true negatives, and false negatives
# Accuracy: Percentage of correct predictions
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the results to evaluate model performance
print(f'Accuracy: {accuracy * 100:.2f}%')  # Accuracy in percentage
print(f'F1-Score: {f1}')  # F1 score to measure the balance between precision and recall
print('Confusion Matrix:')
print(cm)  # Show confusion matrix for evaluation

# ---- Save the Trained Model ----
# Save the entire pipeline (preprocessing + model) to a file using joblib
# The saved model can be reloaded for making predictions on new data without retraining
joblib.dump(pipeline, 'fraud_detection_pipeline.pkl')
print("Model saved successfully.")

# The output we got by running this code:
# Accuracy: 98.91%
# F1-Score: 0.9604976671850699
# Confusion Matrix:
# [[9959   41]   -> (9959 true negatives, 41 false positives)
#  [  86 1544]]  -> (86 false negatives, 1544 true positives)
# Model saved successfully.


Accuracy: 98.91%
F1-Score: 0.9604976671850699
Confusion Matrix:
[[9959   41]
 [  86 1544]]
Model saved successfully.
