# In [22]:
# %% [Imports]
import gc

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from lime.lime_tabular import LimeTabularExplainer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# In [None]:
# %% [Data Loading and Preprocessing]
def load_and_preprocess(path='Dataset.csv'):
    """Load the transaction CSV and return the frame used for modelling.

    Parameters
    ----------
    path : str, path-like or file-like, default 'Dataset.csv'
        Source passed straight to ``pandas.read_csv``. The file must provide
        the columns ``customer``, ``merchant``, ``amount``, ``age``,
        ``gender``, ``category`` and ``fraud``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``amount``, ``amount_to_avg``,
        ``merchant_fraud_rate``, ``age``, ``gender``, ``category``, ``fraud``.
    """
    df = pd.read_csv(path)

    # Data cleaning
    # Strip the literal 'es_' marker from category labels.
    df['category'] = df['category'].str.replace('es_', '', regex=False)
    # Non-numeric ages become -1 so they form their own bucket downstream.
    df['age'] = pd.to_numeric(df['age'], errors='coerce').fillna(-1).astype('Int64')

    # Feature engineering
    # Ratio of each amount to the customer's mean amount. A zero mean would
    # produce +/-inf (which StandardScaler rejects), so the ratio is left as
    # NaN for those rows instead.
    customer_mean = df.groupby('customer')['amount'].transform('mean')
    df['amount_to_avg'] = df['amount'] / customer_mean.replace(0, np.nan)

    # NOTE(review): this rate is derived from the `fraud` label over every row
    # in the file. Any later train/test split of the returned frame therefore
    # sees held-out labels through this feature (target leakage). Prefer
    # computing the rate on the training fold only.
    df['merchant_fraud_rate'] = df.groupby('merchant')['fraud'].transform('mean')

    # Select only the features we'll actually use
    selected_features = [
        'amount', 'amount_to_avg', 'merchant_fraud_rate',
        'age', 'gender', 'category', 'fraud',
    ]
    return df[selected_features]

# Load the cleaned, feature-engineered frame once for the rest of the script.
df = load_and_preprocess()

# %% [Feature Selection]
# Label column, then the inputs grouped by how they are encoded.
target = 'fraud'
categorical_features = ['age', 'gender', 'category']
numeric_features = ['amount', 'amount_to_avg', 'merchant_fraud_rate']

# %% [Preprocessing Pipeline]
# Numeric inputs are standardised; categorical inputs are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(
    [numeric_step, categorical_step],
    remainder='drop',  # columns not listed above are discarded
)

# %% [Base Models]
# Both base learners use the same tree depth, ensemble size and seed; they
# differ only in learning rate (and LightGBM's leaf cap).
_shared_params = dict(max_depth=6, n_estimators=300, random_state=42)

lgb_model = lgb.LGBMClassifier(
    num_leaves=31,
    learning_rate=0.05,
    **_shared_params,
)

xgb_model = xgb.XGBClassifier(
    learning_rate=0.1,
    **_shared_params,
)

# %% [Stacked Model]
# End-to-end estimator: raw feature frame -> preprocessing -> stacked ensemble.
# StackingClassifier comes from sklearn.ensemble (imported at the top of the
# file). The LightGBM and XGBoost base learners are combined by a small
# XGBoost meta-learner trained on their 3-fold cross-validated predictions.
stacked_model = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', StackingClassifier(
        estimators=[
            ('lgb', lgb_model),
            ('xgb', xgb_model),
        ],
        # Seeded like the base learners so the whole stack is reproducible.
        final_estimator=xgb.XGBClassifier(
            max_depth=3,
            n_estimators=100,
            random_state=42,
        ),
        cv=3,
    )),
])

# %% [Train-Test Split]
# Hold out 20% of the rows, stratified on the label so both partitions keep
# the same fraud ratio; the seed makes the split repeatable.
y = df[target]
X = df.drop(columns=[target])
split_options = {'test_size': 0.2, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# %% [Model Training]
# Fits the preprocessor and the stacked ensemble on the training partition.
print("Training stacked model...")
stacked_model.fit(X_train, y_train)

# %% [Evaluation]
# Score the held-out partition: probability of the positive class (column 1)
# feeds ROC AUC, hard labels feed the per-class report.
y_proba = stacked_model.predict_proba(X_test)[:, 1]
y_pred = stacked_model.predict(X_test)

print("\nStacked Model Performance:")
report = classification_report(y_test, y_pred)
print(report)
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# %% [LIME Explainer - Fixed]
# LIME works in the *preprocessed* feature space, so the explainer needs the
# post-transform column names and a background sample in that same space.
fitted_preprocessor = stacked_model.named_steps['preprocessor']

# Get the correct feature names after preprocessing: numeric columns keep
# their names, each categorical column expands into one name per category.
cat_encoder = fitted_preprocessor.named_transformers_['cat']
processed_features = numeric_features.copy()
processed_features += list(cat_encoder.get_feature_names_out(categorical_features))

# The ColumnTransformer may return a SciPy sparse matrix (one-hot output is
# sparse by default); LIME requires a dense 2-D array.
# NOTE(review): this densifies the whole training partition. Subsample
# X_train first if memory is a concern.
X_train_processed = fitted_preprocessor.transform(X_train)
if hasattr(X_train_processed, 'toarray'):
    X_train_processed = X_train_processed.toarray()
X_train_processed = np.asarray(X_train_processed)

# One-hot columns follow the numeric ones; flag them as categorical so LIME
# samples them from observed frequencies instead of discretising them.
one_hot_indices = list(range(len(numeric_features), len(processed_features)))

lime_explainer = LimeTabularExplainer(
    X_train_processed,
    mode='classification',
    feature_names=processed_features,
    categorical_features=one_hot_indices,
    class_names=[str(label) for label in stacked_model.named_steps['stack'].classes_],
    random_state=42,
)


# %% [Fixed Explanation Function]
def explain_transaction(model, transaction, num_features=5):
    """Explain the model's prediction for one transaction with LIME.

    Relies on the module-level ``numeric_features``, ``categorical_features``
    and ``lime_explainer`` (a ``LimeTabularExplainer`` built on preprocessed
    data).

    Parameters
    ----------
    model : sklearn.pipeline.Pipeline
        Fitted pipeline with a ``'preprocessor'`` step and a ``'stack'`` step.
    transaction : pandas.DataFrame
        Frame whose first row is the transaction to explain. It must contain
        the numeric and categorical feature columns; extra columns are ignored.
    num_features : int, default 5
        Maximum number of features reported in the explanation.

    Returns
    -------
    lime.explanation.Explanation or None
        The explanation, or ``None`` if anything failed. The error is printed
        rather than raised (best-effort behaviour).
    """
    try:
        # Ensure we only use the expected features
        transaction = transaction[numeric_features + categorical_features]
        processed_tx = model.named_steps['preprocessor'].transform(transaction)

        # The preprocessor may emit a sparse matrix; LIME needs a dense 1-D row.
        if hasattr(processed_tx, 'toarray'):
            processed_tx = processed_tx.toarray()
        instance = np.asarray(processed_tx)[0]

        # LIME perturbs the already-preprocessed row and feeds those arrays to
        # the prediction function, so it must be the classifier step's
        # predict_proba. The full pipeline's would preprocess a second time.
        exp = lime_explainer.explain_instance(
            instance,
            model.named_steps['stack'].predict_proba,
            num_features=num_features,
        )
        return exp
    except Exception as e:
        print(f"Explanation error: {str(e)}")
        return None


# %% [Save Models]
# Persist the whole fitted pipeline (preprocessor + stack) so it can be
# reloaded with joblib.load and applied to raw feature frames.
model_path = 'stacked_fraud_model.joblib'
joblib.dump(stacked_model, model_path)
print("\nModel saved successfully!")

# Output: Training stacked model...


