In [None]:
# fraudDetection.ipynb

# 📌 Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 📌 Step 2: Create sample invoice dataset
# Columns: buyer, seller, amount, due_days (days from today until the invoice is due)
# NOTE: the order of the np.random calls below determines the generated data;
# keep it stable so the seeded dataset stays reproducible.
np.random.seed(42)

n_samples = 200

# Ordinary invoices: amounts ~ N(5000, 2000), clamped to the range [100, 20000].
amounts = np.clip(np.random.normal(5000, 2000, n_samples), 100, 20000)
# Ordinary due dates: integers from 5 to 59 (randint's upper bound is exclusive).
due_dates = np.random.randint(5, 60, n_samples)

# Hand-crafted anomalies appended after the ordinary rows.
fraud_amounts = [1000000, 2000000, 500000]   # unusually high amounts
fraud_due = [0, -10, 1]                      # due today / already overdue / due tomorrow

amounts = np.append(amounts, fraud_amounts)
due_dates = np.append(due_dates, fraud_due)

# Party names are drawn uniformly at random for every row (anomalies included);
# buyers are drawn before sellers.
n_total = len(amounts)
buyers = np.random.choice(["CompanyA", "CompanyB", "CompanyC"], n_total)
sellers = np.random.choice(["VendorX", "VendorY", "VendorZ"], n_total)

df = pd.DataFrame(
    {
        "buyer": buyers,
        "seller": sellers,
        "amount": amounts,
        "due_days": due_dates,
    }
)

print("📊 Sample Data:")
print(df.head())

# 📌 Step 3: Feature Scaling
# Standardize the two numeric columns so that the very different magnitudes of
# "amount" and "due_days" are put on a comparable scale before modelling.
feature_columns = ["amount", "due_days"]
scaler = StandardScaler()
X = scaler.fit(df[feature_columns]).transform(df[feature_columns])

# 📌 Step 4: Train Isolation Forest (unsupervised anomaly detection)
# contamination=0.05 tells the forest to flag roughly 5% of the rows as outliers.
model = IsolationForest(contamination=0.05, random_state=42)

# fit_predict returns a label per row: -1 = anomaly, 1 = normal.
# Despite its name, "fraud_score" stores that label rather than a continuous score.
df["fraud_score"] = model.fit_predict(X)
df["fraudulent"] = np.where(df["fraud_score"] == -1, "Yes", "No")

flagged_invoices = df[df["fraudulent"] == "Yes"]
print("\n🚨 Fraudulent Invoices Detected:")
print(flagged_invoices)

# 📌 Step 5: Visualization
# Scatter plot of amount vs. due days; flagged invoices are red, the rest blue.
palette = {"No": "blue", "Yes": "red"}
colors = df["fraudulent"].map(palette)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df["amount"], df["due_days"], c=colors)
ax.set_xlabel("Invoice Amount ($)")
ax.set_ylabel("Due Days")
ax.set_title("Fraud Detection on Invoices")
plt.show()

# 📌 Step 6: Save trained model (and its fitted scaler) for backend use
import joblib

# The Isolation Forest was fitted on standardized features, so any consumer of
# fraud_model.pkl must apply the *same* fitted StandardScaler to raw
# (amount, due_days) values before calling model.predict. Persist the scaler
# next to the model; otherwise the backend would score unscaled inputs and
# produce meaningless predictions.
joblib.dump(model, "fraud_model.pkl")
joblib.dump(scaler, "fraud_scaler.pkl")
print("\n✅ Model saved as fraud_model.pkl")
print("✅ Scaler saved as fraud_scaler.pkl")
