In [17]:
# %% [Imports]
import gc

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from lime.lime_tabular import LimeTabularExplainer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# %% [Data Loading and Preprocessing]
def load_and_preprocess(path='Dataset.csv'):
    """Read the transaction CSV and return the modelling frame.

    Parameters
    ----------
    path : str or path-like, default 'Dataset.csv'
        CSV file to load. It must contain the columns ``customer``,
        ``merchant``, ``category``, ``age``, ``gender``, ``amount`` and
        ``fraud``.

    Returns
    -------
    pandas.DataFrame
        Columns ``amount``, ``amount_to_avg``, ``merchant_fraud_rate``,
        ``age``, ``gender``, ``category``, ``fraud`` (in that order).

    Raises
    ------
    KeyError
        If one or more required columns are absent from the file.
    """
    df = pd.read_csv(path)

    required = ['customer', 'merchant', 'category', 'age', 'gender', 'amount', 'fraud']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{path!r} is missing required column(s): {missing}")

    # Data cleaning
    # Remove every literal 'es_' occurrence from the category labels (no regex).
    df['category'] = df['category'].str.replace('es_', '', regex=False)
    # Values that cannot be parsed as numbers become NaN and then the sentinel -1;
    # the column is stored as pandas' nullable integer dtype.
    df['age'] = pd.to_numeric(df['age'], errors='coerce').fillna(-1).astype('Int64')

    # Feature engineering
    customer_mean = df.groupby('customer')['amount'].transform('mean')
    # A customer mean of exactly 0 with a non-zero amount divides to +/-inf.
    # Infinite values are mapped to NaN so the feature stays finite-or-missing.
    df['amount_to_avg'] = (df['amount'] / customer_mean).replace([np.inf, -np.inf], np.nan)
    # NOTE(review): this rate is the mean of the *label* over every row of the
    # file, including the row's own label. If the returned frame is later split
    # into train/test, the feature leaks the target and evaluation metrics will
    # be optimistic. Computing it from training rows only requires the
    # 'merchant' column after the split, which this function does not return.
    df['merchant_fraud_rate'] = df.groupby('merchant')['fraud'].transform('mean')

    # Select only the features we'll actually use
    selected_features = ['amount', 'amount_to_avg', 'merchant_fraud_rate',
                         'age', 'gender', 'category', 'fraud']
    return df[selected_features]

# Modelling frame produced by load_and_preprocess() with its default arguments.
df = load_and_preprocess()

# %% [Feature Selection]
# Column roles: the label, the columns that get scaled, and the columns that
# get one-hot encoded ('age' is treated as a category here, not as a number).
target = 'fraud'
numeric_features = ['amount', 'amount_to_avg', 'merchant_fraud_rate']
categorical_features = ['age', 'gender', 'category']

# %% [Preprocessing Pipeline]
# One (name, transformer, columns) triple per column group.
numeric_step = ('num', StandardScaler(), numeric_features)
# handle_unknown='ignore': categories unseen during fit do not raise at transform time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    # Columns not listed in either group are discarded.
    remainder='drop',
)

# %% [Base Models]
# Hyper-parameters are kept in plain dicts so they can be read at a glance.
lgb_params = dict(
    num_leaves=31,
    max_depth=6,
    learning_rate=0.05,
    n_estimators=300,
    random_state=42,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

xgb_params = dict(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=300,
    random_state=42,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# %% [Stacked Model]
# Two gradient-boosting base learners feed a smaller XGBoost final estimator.
base_learners = [('lgb', lgb_model), ('xgb', xgb_model)]
meta_learner = xgb.XGBClassifier(max_depth=3, n_estimators=100)
stacking_step = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
)

# Raw feature frame -> preprocessor -> stacking ensemble.
stacked_model = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stacking_step),
])

# %% [Train-Test Split]
# NOTE(review): `merchant_fraud_rate` in `df` is derived from the `fraud` label
# over all rows inside load_and_preprocess(), i.e. before this split. Test-row
# labels therefore influence a model input, and the metrics printed below are
# optimistic estimates.
X = df.drop(columns=[target])
y = df[target]
# Stratified 80/20 split with a fixed seed so the class ratio and the partition
# are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# %% [Model Training]
print("Training stacked model...")
stacked_model.fit(X_train, y_train)

# %% [Evaluation]
y_pred = stacked_model.predict(X_test)
# Column 1 of predict_proba is the score for the second class (fraud == 1).
y_proba = stacked_model.predict_proba(X_test)[:, 1]
print("\nStacked Model Performance:")
print(classification_report(y_test, y_pred))
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
# With a rare positive class, ROC AUC alone can look excellent while precision
# is poor; average precision summarises the precision-recall curve instead.
print(f"PR AUC (average precision): {average_precision_score(y_test, y_proba):.4f}")

# %% [LIME Explainer - Fixed]
# Get the correct feature names after preprocessing
# Column names in the order the fitted preprocessor emits them: the scaled
# numeric columns first, then the one-hot columns of the 'cat' encoder.
processed_features = numeric_features.copy()
cat_encoder = stacked_model.named_steps['preprocessor'].named_transformers_['cat']
processed_features += list(cat_encoder.get_feature_names_out(categorical_features))

# Build the explainer that explain_transaction() relies on. LIME only uses the
# training data to estimate per-feature statistics, so a fixed-seed sample of
# rows is transformed instead of the whole training split.
print("Setting up LIME explainer...")
LIME_BACKGROUND_ROWS = 5000
lime_background = stacked_model.named_steps['preprocessor'].transform(
    X_train.sample(n=min(len(X_train), LIME_BACKGROUND_ROWS), random_state=42)
)
lime_explainer = LimeTabularExplainer(
    training_data=lime_background,
    feature_names=processed_features,
    class_names=['Legit', 'Fraud'],
    mode='classification',
    discretize_continuous=False,
    random_state=42,  # reproducible perturbation sampling
)


# %% [Fixed Explanation Function]
def explain_transaction(model, transaction, explainer=None):
    """Return a LIME explanation for one raw transaction row.

    Parameters
    ----------
    model : fitted pipeline
        Must expose ``named_steps['preprocessor']`` (with ``transform``) and
        ``named_steps['stack']`` (with ``predict_proba``), as ``stacked_model``
        does.
    transaction : pandas.DataFrame
        Raw feature rows; only the first row is explained. Columns other than
        ``numeric_features + categorical_features`` are ignored.
    explainer : object with ``explain_instance``, optional
        LIME explainer to use. Defaults to the module-level ``lime_explainer``.

    Returns
    -------
    The explanation object returned by ``explain_instance``, or ``None`` if
    anything fails (the error message is printed, not raised).

    Notes
    -----
    A later cell of this notebook defines another function with the same name,
    ``explain_transaction(transaction_idx)``; once that cell runs it replaces
    this definition.
    """
    try:
        # Resolved inside the try so a missing global explainer is reported
        # through the same "print and return None" path as other failures.
        active_explainer = lime_explainer if explainer is None else explainer

        # Ensure we only use the expected features
        transaction = transaction[numeric_features + categorical_features]
        processed_tx = model.named_steps['preprocessor'].transform(transaction)

        # LIME perturbs the *preprocessed* row and hands those arrays to the
        # prediction callback. The full pipeline expects a raw DataFrame with
        # named columns, so the callback must be the step that consumes
        # preprocessed input, not `model.predict_proba`.
        return active_explainer.explain_instance(
            processed_tx[0],
            model.named_steps['stack'].predict_proba,
            num_features=5
        )
    except Exception as e:
        print(f"Explanation error: {str(e)}")
        return None


# %% [Save Models]
# Persist the complete fitted pipeline (preprocessor + stacking ensemble) in
# one file, then confirm on stdout.
_stacked_model_path = 'stacked_fraud_model.joblib'
joblib.dump(stacked_model, _stacked_model_path)
print("\nModel saved successfully!")

Training stacked model...





Stacked Model Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    105738
           1       0.87      0.75      0.81      1298

    accuracy                           1.00    107036
   macro avg       0.93      0.88      0.90    107036
weighted avg       1.00      1.00      1.00    107036

ROC AUC: 0.9975

Setting up LIME explainer...

Model saved successfully!


Training LightGBM...
LightGBM training error: Unable to allocate 13.3 GiB for an array with shape (428142, 4183) and data type float64


MemoryError: Unable to allocate 13.3 GiB for an array with shape (428142, 4183) and data type float64

In [None]:

# %% [Base Model 2: Isotonic Calibration]
try:
    print("Calibrating probabilities...")
    # Second predict_proba column = score for class index 1 (presumably fraud).
    # NOTE(review): these are scores on X_train; if lgb_pipe was itself fitted
    # on X_train they are in-sample scores, which makes the calibration map
    # optimistic — confirm, and prefer held-out or cross-validated scores.
    lgb_probs = lgb_pipe.predict_proba(X_train)[:, 1]
    # out_of_bounds='clip': scores outside the fitted range map to the nearest
    # boundary value instead of NaN.
    iso_model = IsotonicRegression(out_of_bounds='clip')
    iso_model.fit(lgb_probs, y_train)
except Exception as calibration_error:
    # Surface the message in the cell output, then let the error propagate.
    print(f"Isotonic regression error: {calibration_error}")
    raise

# %% [Meta-Feature Creation]
try:
    # Meta-features for the rows the meta-model learns from (training split):
    # calibrated LightGBM score plus three raw risk features.
    # NOTE(review): if lgb_pipe was fitted on X_train these are in-sample
    # scores; out-of-fold scores would give a less optimistic meta-model.
    train_calibrated_probs = iso_model.predict(lgb_pipe.predict_proba(X_train)[:, 1])
    meta_features_train = np.column_stack([
        train_calibrated_probs,
        X_train['merchant_fraud_rate'],
        X_train['customer_risk_score'],
        X_train['amount_to_avg']
    ])

    # Same construction for the held-out rows. These keep the original names
    # (`calibrated_probs`, `meta_features`) and are for evaluation only.
    calibrated_probs = iso_model.predict(lgb_pipe.predict_proba(X_test)[:, 1])
    meta_features = np.column_stack([
        calibrated_probs,
        X_test['merchant_fraud_rate'],
        X_test['customer_risk_score'],
        X_test['amount_to_avg']
    ])
except Exception as e:
    print(f"Meta-feature creation error: {str(e)}")
    raise

# %% [Meta-Model: XGBoost]
try:
    print("Training XGBoost meta-model...")
    xgb_meta = xgb.XGBClassifier(
        # Ratio of negative to positive training labels, to offset class imbalance.
        scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
        max_depth=4,
        learning_rate=0.1,
        n_estimators=200,
        random_state=42
    )
    # Fit on the training split. Fitting on (meta_features, y_test), as this
    # cell previously did, trains the model on the very rows it is meant to be
    # evaluated on.
    xgb_meta.fit(meta_features_train, y_train)
except Exception as e:
    print(f"XGBoost training error: {str(e)}")
    raise

# %% [LIME Explainer Setup]
try:
    # Get feature names after preprocessing: numeric columns first, then the
    # one-hot columns produced by the 'cat' encoder (if the preprocessor has one).
    num_feature_names = numeric_features.copy()
    cat_feature_names = []

    if 'cat' in preprocessor.named_transformers_:
        cat_encoder = preprocessor.named_transformers_['cat']
        cat_feature_names = list(cat_encoder.get_feature_names_out(categorical_features))

    all_feature_names = num_feature_names + cat_feature_names

    # LIME only needs the training data to estimate per-feature statistics.
    # Transforming a fixed-seed sample of rows keeps memory bounded instead of
    # materialising the whole training split.
    LIME_BACKGROUND_ROWS = 10000
    lime_background = preprocessor.transform(
        X_train.sample(n=min(len(X_train), LIME_BACKGROUND_ROWS), random_state=42)
    )

    # Initialize LIME explainer
    lime_explainer = LimeTabularExplainer(
        training_data=lime_background,
        feature_names=all_feature_names,
        class_names=['Legit', 'Fraud'],
        mode='classification',
        discretize_continuous=False,
        random_state=42  # reproducible perturbation sampling
    )
except Exception as e:
    print(f"LIME setup error: {str(e)}")
    raise

# %% [Example LIME Explanation]
def explain_transaction(transaction_idx):
    """Print the stacked fraud probability of one test row and explain it with LIME.

    The row is scored as: ``lgb_pipe`` probability -> ``iso_model`` calibration
    -> ``xgb_meta`` on [calibrated probability, merchant_fraud_rate,
    customer_risk_score, amount_to_avg].

    Parameters
    ----------
    transaction_idx : int
        Positional index into ``X_test``.

    Returns
    -------
    The object returned by ``lime_explainer.explain_instance``, or ``None`` if
    any step fails (the error message is printed, not raised).

    Notes
    -----
    * This definition replaces the earlier ``explain_transaction(model,
      transaction)`` from the previous cell once this cell runs.
    * LIME perturbs rows in the preprocessed space. ``ColumnTransformer`` has no
      ``inverse_transform``, so perturbed rows are mapped back to raw features
      through the fitted 'num' and 'cat' transformers. This assumes the
      preprocessed layout is [numeric columns, one-hot columns], matching how
      ``all_feature_names`` is assembled, and that ``lgb_pipe`` accepts a frame
      holding ``numeric_features + categorical_features`` — confirm against
      where ``lgb_pipe`` is built.
    """

    def _meta_model_proba(raw_frame):
        """Score raw feature rows with the full LightGBM -> isotonic -> XGBoost chain."""
        calibrated = iso_model.predict(lgb_pipe.predict_proba(raw_frame)[:, 1])
        return xgb_meta.predict_proba(
            np.column_stack([
                calibrated,
                raw_frame['merchant_fraud_rate'],
                raw_frame['customer_risk_score'],
                raw_frame['amount_to_avg']
            ])
        )

    def _proba_from_preprocessed(x):
        """Map LIME's perturbed preprocessed rows back to raw features and score them."""
        dense = x.toarray() if hasattr(x, 'toarray') else np.asarray(x)
        n_numeric = len(numeric_features)
        fitted = preprocessor.named_transformers_

        # Undo the numeric transform so the meta-model sees values on the same
        # (raw) scale it was trained on.
        raw_frame = pd.DataFrame(
            fitted['num'].inverse_transform(dense[:, :n_numeric]),
            columns=numeric_features
        )
        # Collapse each perturbed one-hot group back to a single category.
        if categorical_features and 'cat' in fitted:
            raw_categories = fitted['cat'].inverse_transform(dense[:, n_numeric:])
            for position, column in enumerate(categorical_features):
                raw_frame[column] = raw_categories[:, position]
        return _meta_model_proba(raw_frame)

    try:
        # Select transaction (double brackets keep it a one-row DataFrame)
        transaction = X_test.iloc[[transaction_idx]]

        # Get prediction
        proba = _meta_model_proba(transaction)[0, 1]

        # Generate LIME explanation
        exp = lime_explainer.explain_instance(
            preprocessor.transform(transaction)[0],
            _proba_from_preprocessed,
            num_features=10
        )

        print(f"Transaction #{transaction_idx} | Fraud Probability: {proba:.1%}")
        return exp
    except Exception as e:
        print(f"Explanation error: {str(e)}")
        return None

# Example usage
# Explain the first test row; explain_transaction returns None on failure, in
# which case nothing is rendered.
exp = explain_transaction(0)
if exp:
    exp.show_in_notebook()

# %% [Save Models]
# Persist the three fitted stages in the order they are applied at inference:
# LightGBM pipeline, isotonic calibrator, XGBoost meta-model.
try:
    joblib.dump(lgb_pipe, 'lgb_pipeline.joblib')
    joblib.dump(iso_model, 'isotonic_calibrator.joblib')
    joblib.dump(xgb_meta, 'xgb_meta_model.joblib')
    print("Models saved successfully!")
except Exception as save_error:
    # Saving is best-effort here: the failure is reported but not re-raised.
    print(f"Model saving error: {save_error}")