In [None]:
!pip install numpy pandas scikit-learn xgboost

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
from sklearn.ensemble import IsolationForest
import xgboost as xgb

# --- 1. Data Simulation/Loading ---
# In a real project, you would replace this with loading your own data.
# E.g., df = pd.read_csv('your_transactions_data.csv')

def generate_synthetic_data(num_transactions=20000, seed=42):
    """
    Generates a synthetic transaction dataset for demonstration purposes.

    Columns: transaction_id, amount, time_in_minutes, location_risk_score,
    device_type, merchant_category and the binary label is_fraud (drawn with
    probability 0.015 per row).

    Args:
        num_transactions: Number of rows to generate.
        seed: Seed for the private random generator. The default (42)
            reproduces the data the function produced when it seeded the
            global NumPy generator with 42.

    Returns:
        pandas.DataFrame with one row per transaction.
    """
    # Use a private legacy generator instead of np.random.seed(): it yields the
    # same random stream for the same seed, but does not reset the process-wide
    # NumPy RNG state as a hidden side effect of calling this function.
    rng = np.random.RandomState(seed)

    # NOTE: the order of the draws below determines the generated values;
    # keep it stable to preserve reproducibility.
    data = {
        'transaction_id': range(num_transactions),
        'amount': rng.uniform(10, 5000, num_transactions),
        'time_in_minutes': rng.randint(0, 1440, num_transactions),
        'location_risk_score': rng.uniform(0, 1, num_transactions),
        'device_type': rng.choice(['mobile', 'desktop', 'tablet'], num_transactions),
        'merchant_category': rng.choice(['retail', 'online', 'travel', 'services'], num_transactions),
        'is_fraud': rng.choice([0, 1], num_transactions, p=[0.985, 0.015])
    }
    df = pd.DataFrame(data)

    # Introduce some patterns for fraud: overwrite the fraud rows with larger
    # amounts (2000-10000) and late-day timestamps (minute 1300-1439).
    fraud_mask = df['is_fraud'] == 1
    fraud_count = int(fraud_mask.sum())
    df.loc[fraud_mask, 'amount'] = rng.uniform(2000, 10000, fraud_count)
    df.loc[fraud_mask, 'time_in_minutes'] = rng.randint(1300, 1440, fraud_count)

    return df

# Build the demo dataset and show a short preview of it.
df = generate_synthetic_data()
preview_rows = df.head()
print("Synthetic Data Head:")
print(preview_rows)

# Report how many rows carry the positive (fraud) label.
fraud_total = df['is_fraud'].sum()
print("\nFraudulent transactions count:", fraud_total)

# --- 2. Feature Engineering ---

def feature_engineering(df):
    """
    Creates new features from the raw transaction data.

    Adds an 'hour_of_day' column (time_in_minutes integer-divided by 60) and
    one-hot encodes 'device_type' and 'merchant_category', dropping the first
    level of each.

    Args:
        df: DataFrame containing at least the columns 'time_in_minutes',
            'device_type' and 'merchant_category'.

    Returns:
        A new DataFrame with the engineered features. The input frame is
        left unmodified.
    """
    # Example: Simple behavioral features (in a real project, this would be more complex).
    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain an 'hour_of_day' column.
    df = df.assign(hour_of_day=df['time_in_minutes'] // 60)

    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=['device_type', 'merchant_category'], drop_first=True)

    return df

# Engineer features on a copy so the raw frame `df` stays as generated.
df_features = feature_engineering(df.copy())
print("\nFeatures after Engineering:")
print(df_features.head())

# --- 3. Data Preprocessing ---

# Target (y) is the fraud label. Features (X) are every remaining column except
# the row identifier and the raw minute-of-day (hour_of_day is derived from it).
y = df_features['is_fraud']
X = df_features.drop(columns=['transaction_id', 'is_fraud', 'time_in_minutes'])

# Stratified 80/20 train/test split so both parts keep the same fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits; the scaled arrays feed the Logistic Regression
# baseline, while the tree-based models below use the unscaled frames.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)
# --- 4. Model Building & Evaluation ---

def evaluate_model(y_true, y_pred, model_name):
    """
    Prints precision, recall and the confusion matrix for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with y_true.
        model_name: Name shown in the report header.

    Returns:
        Tuple (precision, recall).
    """
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    conf_mat = confusion_matrix(y_true, y_pred)

    separator = "=" * 40
    print(f"\n--- {model_name} Performance ---")
    for label, score in (("Precision", prec), ("Recall", rec)):
        print(f"{label}: {score:.4f}")
    print("Confusion Matrix:\n", conf_mat)
    print("\n" + separator + "\n")
    return prec, rec

# a) Baseline Model: Logistic Regression
# Fitted on the standardised training features and scored on the standardised
# test features; its precision/recall serve as the reference for later models.
print("Building Baseline Model: Logistic Regression...")
lr_model = LogisticRegression(random_state=42, solver='liblinear')
lr_model = lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
precision_lr, recall_lr = evaluate_model(
    y_true=y_test,
    y_pred=y_pred_lr,
    model_name="Logistic Regression Baseline",
)

# b) Ensemble Model: XGBoost + Isolation Forest
print("Building Ensemble Model: XGBoost + Isolation Forest...")

# Part 1: XGBoost Model (supervised)
# Trained on the unscaled feature frame; we keep the predicted probability of
# the positive (fraud) class for every test row.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.1,
    # NOTE(review): newer XGBoost releases appear to deprecate/ignore this
    # argument - confirm against the installed version before removing it.
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train, y_train)
y_pred_xgb_proba = xgb_model.predict_proba(X_test)[:, 1]

# Part 2: Isolation Forest for Anomaly Detection (unsupervised)
# Fitted on the training split (fraud rows included), without using the labels
# as a target. The contamination is set to the fraud ratio of the TRAINING
# labels only, so no test-set label information leaks into the model.
isf_model = IsolationForest(
    contamination=float(y_train.mean()),
    random_state=42,
    n_jobs=-1
)
isf_model.fit(X_train)
# decision_function returns a score; the lower the score, the more anomalous.
isf_scores = isf_model.decision_function(X_test)

# Part 3: Combining the models (hybrid rule, not averaging or stacking)
# A transaction is classified as fraud if XGBoost's probability exceeds the
# threshold, OR if XGBoost scored it below the threshold but the Isolation
# Forest considers it highly anomalous.
xgb_threshold = 0.5

# XGBoost-only predictions, kept for comparison with the hybrid rule.
y_pred_ensemble = (y_pred_xgb_proba > xgb_threshold).astype(int)

# "Highly anomalous" = below the 10th percentile of the anomaly scores seen on
# the TRAINING data. Deriving the cut-off from the test scores would always
# flag exactly 10% of whatever batch is being scored and could not be applied
# to a single new transaction.
isf_anomaly_threshold = np.percentile(isf_model.decision_function(X_train), 10)
isf_anomalies = (isf_scores < isf_anomaly_threshold)

# Final ensemble prediction logic
# Start from XGBoost's probabilities and lift to 1 every row that XGBoost
# scored below the threshold but the Isolation Forest flagged as an anomaly.
isf_override = (y_pred_xgb_proba < xgb_threshold) & isf_anomalies
y_pred_final_ensemble = np.where(isf_override, 1.0, y_pred_xgb_proba)
# Convert scores to classes using the same threshold as above.
y_pred_final_ensemble_class = (y_pred_final_ensemble > xgb_threshold).astype(int)

precision_ensemble, recall_ensemble = evaluate_model(y_test, y_pred_final_ensemble_class, "Ensemble XGBoost + Isolation Forest")

# --- 5. Project Metrics & Conclusion ---

def _relative_percent(delta, baseline):
    """Return delta as a percentage of baseline, or NaN when baseline is zero."""
    if baseline == 0:
        # A relative change against a zero baseline is undefined; avoid the
        # division-by-zero warning/inf (or ZeroDivisionError for plain floats).
        return float('nan')
    return (delta / baseline) * 100


# Calculate improvements
recall_improvement_percent = _relative_percent(recall_ensemble - recall_lr, recall_lr)

# False alerts = transactions predicted as fraud whose true label is non-fraud.
false_alerts_lr = np.sum((y_pred_lr == 1) & (y_test == 0))
false_alerts_ensemble = np.sum((y_pred_final_ensemble_class == 1) & (y_test == 0))
false_alerts_reduction_percent = _relative_percent(
    false_alerts_lr - false_alerts_ensemble, false_alerts_lr
)

if np.isnan(recall_improvement_percent):
    print("Recall improvement vs. Logistic Regression baseline is undefined (baseline recall is 0).")
else:
    print(f"Recall improved by: {recall_improvement_percent:.2f}% vs. Logistic Regression baseline.")

if np.isnan(false_alerts_reduction_percent):
    print("False Alerts reduction vs. Logistic Regression baseline is undefined (baseline has 0 false alerts).")
else:
    print(f"False Alerts reduced by: {false_alerts_reduction_percent:.2f}% vs. Logistic Regression baseline.")