In [None]:
# 1. IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# 2. LOAD DATA
# Read the raw dataset into a DataFrame and print a quick preview of its contents.
df = pd.read_csv("financial_data.csv")  # Replace with actual file name
print("Initial Data Preview:\n", df.head())
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emits a stray "None" line.
df.info()

# 3. EDA AND DATA QUALITY CHECKS
# Report per-column missing counts, the number of fully duplicated rows, and
# summary statistics before any cleaning is applied.
print("\nMissing Values:\n", df.isnull().sum())
print("\nDuplicate Records:", df.duplicated().sum())
print("\nStatistical Summary:\n", df.describe())

# Visualize correlation
# Correlation is only defined for numeric data. Since pandas 2.0, DataFrame.corr()
# raises on string/object columns instead of silently skipping them, so restrict
# the frame to numeric and boolean columns explicitly (works on every pandas version).
numeric_df = df.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

# 4. DATA CLEANING
# Drop duplicates
df = df.drop_duplicates()

# Fill missing values (forward fill as example)
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is deprecated.
# Forward fill cannot fill gaps in the leading rows (there is no earlier value to
# carry forward), so NaNs may remain -- the completeness check below covers that.
df = df.ffill()

# 5. ENFORCE DATA SLAs
# The checks raise explicitly instead of using `assert`, because assert statements
# are removed when Python runs with -O and the SLAs would then be skipped silently.
# AssertionError and the original messages are kept so failures look the same.
if df.empty:
    # Guard first: an empty frame would make the ratios below 0/0.
    raise AssertionError("Data SLA Failed: dataset is empty after cleaning")

# Share of missing cells across the whole frame must stay under 2%.
missing_ratio = df.isnull().sum().sum() / df.size
if not (missing_ratio < 0.02):
    raise AssertionError("Completeness SLA Failed: >2% missing")

# Share of fully duplicated rows must stay under 1%.
duplicate_ratio = df.duplicated().sum() / len(df)
if not (duplicate_ratio < 0.01):
    raise AssertionError("Duplicate SLA Failed: >1% duplicated")

# Optional: Assert numeric ranges (example SLA)
# NOTE(review): assumes the dataset has a numeric 'amount' column -- confirm against the real schema.
if not (df['amount'].min() >= 0):
    raise AssertionError("Amount field has negative values")

# 6. FEATURE ENGINEERING
# Convert categorical if present
# drop_first=True drops one dummy level per categorical column.
df = pd.get_dummies(df, drop_first=True)

# Define features and target
target_column = "is_fraud"  # Replace with actual target column
X = df.drop(columns=[target_column])
y = df[target_column]

# Split BEFORE scaling and resampling. Fitting the scaler or SMOTE on the full
# dataset leaks test-set information into training: the scaler would learn test
# statistics, and SMOTE would synthesize training points from test rows and put
# synthetic rows in the test set, inflating every reported metric.
# stratify=y keeps the class ratio the same in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Normalize numeric features
# Fit on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Handle class imbalance using SMOTE
# Only the training data is oversampled; the test set keeps its original rows.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)

# 7. MODEL TRAINING
# Train on the scaled, resampled training data; predict on the untouched test split.
X_train, y_train = X_resampled, y_resampled

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# 8. EVALUATION
# Text metrics for the test split: the classification report first, then the
# confusion matrix of true vs. predicted labels.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)
matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", matrix)

# ROC Curve
# Uses column 1 of predict_proba as the score.
# NOTE(review): presumably the positive (fraud) class of a binary target -- confirm.
y_proba = clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
fpr, tpr, _ = roc_curve(y_test, y_proba)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title("ROC Curve")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.grid()
plt.legend()
plt.show()