<a href="https://colab.research.google.com/github/MehrdadDastouri/fraud_detection_random_forest/blob/main/fraud_detection_random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# Dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud
# Download the CSV and place it in the working directory as "creditcard.csv"
data = pd.read_csv("creditcard.csv")

# Explore the dataset: first rows, then the column/dtype summary
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()

# Check for class imbalance by counting the rows carrying each label
print("\nClass Distribution:")
print(data['Class'].value_counts())

# Visualize class distribution
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not `x`, so a positional Series is no longer
# counted per category.
sns.countplot(x='Class', data=data)
plt.title("Class Distribution (0 = Legit, 1 = Fraud)")
plt.show()

# Separate the label column from the predictors
y = data["Class"]                               # Target (0 = Legit, 1 = Fraud)
X = data.drop(labels="Class", axis="columns")   # Features: every other column

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in the training and testing partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Build a Random Forest of 100 trees with a fixed seed for repeatable runs
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit the forest on the training partition
print("\nTraining the Random Forest model...")
model.fit(X_train, y_train)

# Predict a label for every held-out row
y_pred = model.predict(X_test)

# Score the held-out predictions against the true labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Per-class precision / recall / F1 summary
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

# Overall accuracy, shown as a percentage with two decimals.
# NOTE(review): if the class check earlier in the script shows a strong
# imbalance, read this figure alongside the per-class report above.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("\nAccuracy Score:")
print(f"{accuracy_pct:.2f}%")

# Draw the confusion matrix as an annotated heatmap
class_names = ["Legit", "Fraud"]
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Confusion Matrix")
plt.show()

# Feature importance
# Rank the features by the importance scores stored on the fitted model and
# chart the ten highest-scoring ones.
feature_importances = model.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]  # indices, highest score first
top_features = X.columns[sorted_indices][:10]
top_importances = feature_importances[sorted_indices][:10]

plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecates `palette` without `hue` (removal planned for 0.14).
# Mapping `hue` to the same labels as `y` gives one palette colour per bar, and
# `dodge=False` keeps each bar at full width, matching the previous appearance.
importance_ax = sns.barplot(
    x=top_importances,
    y=top_features,
    hue=top_features,
    palette="viridis",
    dodge=False,
)
# Some seaborn versions add a legend once `hue` is set; it only repeats the
# y-axis labels, so remove it when present. Done through the Axes rather than
# `legend=False` because older seaborn releases do not accept that parameter.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()
plt.title("Top 10 Feature Importances")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()