In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Read the two processed CSV files into DataFrames (fraud first, then credit).
fraud_df, credit_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        "../data/processed/fraud_geo_processed.csv",
        "../data/processed/credit_cleaned.csv",
    )
)

# -----------------------------
# 1️⃣ Fraud Data Feature Selection
# -----------------------------
# Columns fed to the model; the last three ('source', 'browser', 'sex') are
# one-hot encoded by get_dummies below.
features = ['purchase_value', 'age', 'time_to_purchase', 'time_since_signup',
            'device_usage_count', 'ip_usage_count', 'hour_of_day', 'day_of_week',
            'country_risk_score', 'source', 'browser', 'sex']

X_fraud = pd.get_dummies(fraud_df[features], drop_first=True)
y_fraud = fraud_df['class']

# Train-test split (stratified) on the UNSCALED features.
# Splitting before scaling keeps test-set statistics out of the scaler
# (fitting on all rows first leaks test information into training).
X_train_fraud_raw, X_test_fraud_raw, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42
)

# Scaling: fit on the training rows only, then apply the same transform to
# the test rows. `index=` keeps each scaled frame aligned with its labels.
scaler = StandardScaler()
X_train_fraud = pd.DataFrame(
    scaler.fit_transform(X_train_fraud_raw),
    columns=X_fraud.columns,
    index=X_train_fraud_raw.index,
)
X_test_fraud = pd.DataFrame(
    scaler.transform(X_test_fraud_raw),
    columns=X_fraud.columns,
    index=X_test_fraud_raw.index,
)

# Kept so any later code that refers to `X_fraud_scaled` still works; it now
# holds every row transformed with the train-fitted scaler.
X_fraud_scaled = pd.DataFrame(
    scaler.transform(X_fraud), columns=X_fraud.columns, index=X_fraud.index
)

# SMOTE on training only, so the test set keeps its original class balance.
smote = SMOTE(random_state=42)
X_train_fraud_res, y_train_fraud_res = smote.fit_resample(X_train_fraud, y_train_fraud)

print(f"Class distribution after SMOTE: {np.bincount(y_train_fraud_res)}")

# -----------------------------
# 2️⃣ Credit Card Data Feature Engineering
# -----------------------------
X_credit = credit_df.drop('Class', axis=1)
y_credit = credit_df['Class']

# Train-test split (stratified) BEFORE scaling, so the scaler never sees
# test rows.
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit, y_credit, test_size=0.2, stratify=y_credit, random_state=42
)

# Work on explicit copies: the column assignment below must not write through
# to `credit_df`, which is left holding the unscaled values as loaded.
X_train_credit = X_train_credit.copy()
X_test_credit = X_test_credit.copy()

# Scale Amount and Time: fit on the training rows only, then reuse the fitted
# scaler for the test rows. All other columns are passed through unchanged.
credit_scale_cols = ['Amount', 'Time']
credit_scaler = StandardScaler()
X_train_credit[credit_scale_cols] = credit_scaler.fit_transform(
    X_train_credit[credit_scale_cols]
)
X_test_credit[credit_scale_cols] = credit_scaler.transform(
    X_test_credit[credit_scale_cols]
)

# Write the training features and labels to CSV for Task 2.
# Each entry pairs a destination path with the object saved there; the row
# index is omitted from every file.
training_exports = (
    ("../data/processed/fraud_X_train.csv", X_train_fraud_res),
    ("../data/processed/fraud_y_train.csv", y_train_fraud_res),
    ("../data/processed/credit_X_train.csv", X_train_credit),
    ("../data/processed/credit_y_train.csv", y_train_credit),
)
for export_path, export_table in training_exports:
    export_table.to_csv(export_path, index=False)

print("Feature engineering complete. Data ready for Task 2.")


Class distribution after SMOTE: [109568 109568]
Feature engineering complete. Data ready for Task 2.
