In [None]:
import io
import os
import pickle

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Section 2: Load Data Files
# Ask the user to upload each dataset through Colab's file-upload helper.
from google.colab import files


def _prompt_upload(filename):
    """Print the upload prompt for ``filename`` and return ``files.upload()``'s result."""
    print(f"Upload '{filename}'")
    return files.upload()


fraud_file = _prompt_upload("Fraud_Data.csv")
ip_file = _prompt_upload("IpAddress_to_Country.csv")

In [None]:
# Load datasets into DataFrames
def _read_uploaded_csv(uploaded, expected_name):
    """Parse an uploaded CSV from the in-memory result of ``files.upload()``.

    The bytes are read directly instead of re-opening ``expected_name`` from
    disk. NOTE(review): Colab is understood to save a repeated upload under a
    de-duplicated name (e.g. ``name (1).csv``), in which case a disk read of
    the fixed name would silently return the older copy -- confirm against the
    Colab ``files.upload`` documentation.

    Args:
        uploaded: Mapping of uploaded filename -> file contents (bytes), as
            returned by ``google.colab.files.upload()``.
        expected_name: Filename the user was asked to upload.

    Returns:
        pandas.DataFrame parsed from the uploaded bytes.

    Raises:
        FileNotFoundError: If ``expected_name`` is not among the uploaded
            files (wrong file chosen, or the upload was cancelled).
    """
    if expected_name not in uploaded:
        raise FileNotFoundError(
            f"Expected an upload named '{expected_name}', got: {sorted(uploaded)}"
        )
    return pd.read_csv(io.BytesIO(uploaded[expected_name]))


fraud_data = _read_uploaded_csv(fraud_file, 'Fraud_Data.csv')
ip_data = _read_uploaded_csv(ip_file, 'IpAddress_to_Country.csv')

In [None]:
# Section 3: Data Preprocessing
# Preview the leading rows of the fraud dataset.
fraud_preview = fraud_data.head()
print("Fraud Data Sample:", fraud_preview, sep="\n")

In [None]:
# Count missing entries per column and report them.
missing_counts = fraud_data.isna().sum()
print("\nMissing values:", missing_counts, sep="\n")

In [None]:
# Replace every missing entry in the frame with 0.
# NOTE(review): this applies to all columns, including any non-numeric ones --
# confirm that 0 is a sensible placeholder there.
fraud_data = fraud_data.fillna(value=0)

In [None]:
# One-hot encode the frame, dropping the first level of each encoded column.
# NOTE(review): the whole frame is passed in, so any categorical column
# (identifiers, timestamps stored as text, a text target) would be expanded
# too -- verify against the actual schema.
fraud_data = pd.get_dummies(data=fraud_data, drop_first=True)

In [None]:
# Preview the leading rows of the IP-address lookup table.
ip_preview = ip_data.head()
print("\nIP Address Data Sample:", ip_preview, sep="\n")

In [None]:
# Section 4: Feature Selection
# Everything except the label column is a feature; the label column is the target.
target_col = 'is_fraud'
X = fraud_data.drop([target_col], axis=1)  # Features
y = fraud_data[target_col]  # Target

In [None]:
# Split the data into training and testing sets (70% train / 30% test).
# stratify=y keeps the class proportions of the target the same in both
# partitions; an unstratified split can leave the test set with a different
# share of the positive class, which distorts the evaluation below.
# NOTE(review): assumes every class in y has at least two rows -- stratified
# splitting raises ValueError otherwise.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Section 5: Train the RandomForest Model
# Build the classifier from an explicit hyperparameter mapping.
rf_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**rf_params)

In [None]:
# Fit the forest on the training partition (the fitted estimator is the cell output).
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Generate class predictions for the held-out test features.
y_pred = rf_model.predict(X=X_test)

In [None]:
# Section 6: Evaluate the Model
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}".format(accuracy))

In [None]:
# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw on an explicit Axes so the title and labels attach to this figure.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d', ax=ax)
ax.set_title('Confusion Matrix')
# confusion_matrix(y_true, y_pred) puts true labels on rows and predicted
# labels on columns; label the axes so the figure can be read on its own.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
# Build the text classification report and print it under a heading.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:", class_report, sep="\n")

In [None]:
# Section 7: Log with MLflow
# Record the run's hyperparameters, accuracy and fitted model under the
# "Fraud Detection" experiment, and write a local pickle of the model.
mlflow.set_experiment("Fraud Detection")

with mlflow.start_run() as run:
    # Log model parameters.
    # Read them from the estimator itself rather than repeating the literals
    # used at construction time, so the logged values cannot drift from the
    # model that was actually trained.
    fitted_params = rf_model.get_params()
    mlflow.log_param("n_estimators", fitted_params["n_estimators"])
    mlflow.log_param("random_state", fitted_params["random_state"])

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)

    # Save and log model.
    # The pickle is a local copy (model_path is reused by the download cell);
    # the MLflow-tracked copy is written by log_model.
    model_path = "fraud_detection_model.pkl"
    with open(model_path, 'wb') as f:
        pickle.dump(rf_model, f)
    mlflow.sklearn.log_model(rf_model, "model")

print(f"MLflow run ID: {run.info.run_id}")


In [None]:
# Section 8: Save Trained Model Locally
# Hand the pickled model file to Colab's download helper, then confirm.
# Relies on `model_path` set in the MLflow cell and `files` imported in the upload cell.
download_target = model_path
files.download(download_target)

print("Model saved and downloaded successfully!")