# In [None]:  (Jupyter cell marker left over from the notebook export)
from collections import Counter
from pathlib import Path

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from imblearn.over_sampling import SMOTE  # NEW: Import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # NEW: Use imblearn's pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# --- 1. Load data and Feature Engineering ---
print("--- 1. Loading and Feature Engineering Data ---")
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only point this at a pickle produced by a trusted pipeline.
df = pd.read_pickle('../data/feature_engineered_transactions.pkl')
row_count, column_count = df.shape
print(f"Data loaded: Rows={row_count}, Columns={column_count}")

# --- NEW: Feature Engineering - Velocity by IP (Transactions in last 24 hours) ---
def _count_prior_txns(frame, key_col, time_col, id_col, window):
    """Count, for every row, the earlier rows of the same key inside a look-back window.

    The window for a row with time ``t`` is ``[t - window, t)``: rows at exactly
    ``t - window`` are counted, the row itself and any row sharing its exact
    timestamp are not. Only rows whose ``id_col`` is non-null are counted.
    Rows whose ``key_col`` is missing are left at 0.

    Args:
        frame: DataFrame already sorted ascending by ``time_col``.
        key_col: Column that defines the groups (here the IP address).
        time_col: Datetime column the window is measured on.
        id_col: Column whose non-null entries are counted.
        window: Look-back length, anything ``pd.Timedelta`` accepts (e.g. '24h').

    Returns:
        int32 NumPy array with one count per row, in ``frame``'s row order.
    """
    # Normalise to datetime64[ns] so tz-aware and non-nanosecond inputs compare
    # as plain NumPy datetimes.
    times = frame[time_col].to_numpy(dtype='datetime64[ns]')
    countable = frame[id_col].notna().to_numpy()
    span = pd.Timedelta(window).to_timedelta64()
    counts = np.zeros(len(frame), dtype='int32')

    # .indices maps each key to the integer row positions of its group.
    for rows in frame.groupby(key_col, sort=False).indices.values():
        rows = np.sort(rows)  # ascending position == ascending time (frame is time-sorted)
        group_times = times[rows]
        # running[i] = countable rows among the first i rows of this group.
        running = np.concatenate(([0], np.cumsum(countable[rows])))
        upper = np.searchsorted(group_times, group_times, side='left')         # first row with time >= t
        lower = np.searchsorted(group_times, group_times - span, side='left')  # first row with time >= t - window
        counts[rows] = running[upper] - running[lower]
    return counts


df['timestamp'] = pd.to_datetime(df['timestamp'])  # Ensure timestamp is datetime
df = df.sort_values('timestamp').reset_index(drop=True)

# Calculate transaction count per IP in the preceding 24 hours.
# The counts are written back by row position. A groupby().rolling(on=...)
# result with a column selected is indexed by (ip_address, timestamp) rather
# than by df's row labels, so assigning it to df does not line up row-for-row.
df['ip_txn_count_prev_24h'] = _count_prior_txns(
    df, 'ip_address', 'timestamp', 'transaction_id', '24h')
print("New feature 'ip_txn_count_prev_24h' created.")

# Prepare data for modeling
y = df['is_fraud']
# The target and the raw timestamp are not model inputs.
X = df.drop(columns=['is_fraud', 'timestamp'])
# ID-like columns are removed from the feature matrix when present. This
# includes 'ip_address', so none of the models below see the raw IP.
identifiers = ['customer_id', 'device_id', 'transaction_id', 'ip_address']
X = X.drop(columns=identifiers, errors='ignore')

# NOTE(review): a column of any other dtype (e.g. bool, float32, int8) lands in
# neither list below — confirm the input frame has none.
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)
numerical_features = list(X.select_dtypes(include=['int64', 'float64', 'int32']).columns)
print(f"Numerical: {len(numerical_features)}, Categorical: {len(categorical_features)}")

# --- 2. Train/test split ---
# Stratified 80/20 hold-out so both parts keep the same fraud rate.
split_options = {'test_size': 0.2, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_fraud_rate = y_train.mean()
print(f"Train={len(X_train)}, Test={len(X_test)}, Fraud rate={train_fraud_rate:.4f}")
print(f"Original Training Class Distribution: {Counter(y_train)}")

# --- 3. Preprocessing for XGBoost & LightGBM (using imblearn pipeline) ---
# Numeric columns: fill gaps with the column median.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
])

# Route each column list to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- 4. Function to evaluate models ---
def evaluate_model(model, X_test, y_test, model_name, preprocessor=None):
    """Score a fitted classifier and report fraud-class metrics at its best-F1 threshold.

    Prints AUC-ROC, AUC-PR, the chosen threshold and a classification report,
    then returns the headline numbers.

    NOTE(review): the threshold is picked by maximising F1 on the same
    ``X_test``/``y_test`` the metrics are then reported on, so the reported
    precision/recall/F1 are optimistic. Tune the threshold on a separate
    validation split if an unbiased estimate is needed.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Features to score.
        y_test: True binary labels for ``X_test``.
        model_name: Label used in the printout and in the returned dict.
        preprocessor: Optional fitted transformer applied to ``X_test`` before
            prediction. Leave as ``None`` when ``model`` consumes ``X_test``
            directly (e.g. a full pipeline, or CatBoost on raw features).

    Returns:
        dict with keys ``model_name``, ``model_object``, ``auc_roc``,
        ``auc_pr``, ``f1_fraud``, ``precision_fraud`` and ``recall_fraud``.
    """
    # Compare against None explicitly: truthiness of an estimator is not a
    # reliable "was it supplied" signal (sklearn's Pipeline defines __len__).
    if preprocessor is not None:
        features = preprocessor.transform(X_test)
    else:
        features = X_test
    y_pred_proba = model.predict_proba(features)[:, 1]

    auc_roc = roc_auc_score(y_test, y_pred_proba)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    auc_pr = auc(recall, precision)

    # precision/recall carry one more entry than thresholds; drop the final
    # point so every F1 value pairs with a threshold.
    prec_at_thr = precision[:-1]
    rec_at_thr = recall[:-1]
    denom = prec_at_thr + rec_at_thr
    # F1 is defined as 0 where precision + recall == 0; dividing only where the
    # denominator is positive avoids the 0/0 RuntimeWarning.
    f1_scores = np.divide(
        2 * prec_at_thr * rec_at_thr,
        denom,
        out=np.zeros_like(denom, dtype=float),
        where=denom > 0,
    )

    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx]
    y_pred_opt = (y_pred_proba >= best_threshold).astype(int)

    print(f"\n--- {model_name} ---")
    print(f"AUC-ROC: {auc_roc:.4f}, AUC-PR: {auc_pr:.4f}, Best threshold={best_threshold:.4f}")
    print(classification_report(y_test, y_pred_opt, target_names=['Non-Fraud','Fraud']))

    report_dict = classification_report(y_test, y_pred_opt, target_names=['Non-Fraud','Fraud'], output_dict=True)
    fraud_metrics = report_dict['Fraud']
    return {
        'model_name': model_name,
        'model_object': model,
        'auc_roc': auc_roc,
        'auc_pr': auc_pr,
        'f1_fraud': fraud_metrics['f1-score'],
        'precision_fraud': fraud_metrics['precision'],
        'recall_fraud': fraud_metrics['recall']
    }

# --- 5. XGBoost with SMOTE Pipeline ---
print("\n--- 5. XGBoost with SMOTE Pipeline ---")
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    # NOTE(review): deprecated in recent xgboost releases — confirm against the
    # installed version before removing.
    use_label_encoder=False,
    # REMOVED: We no longer use scale_pos_weight because SMOTE balances the data.
)

# Pipeline order: preprocessing, then SMOTE, then the model. imblearn's
# pipeline applies the sampler during fit only, so prediction data is never
# resampled.
# SMOTE is built without n_jobs: that argument is deprecated since
# imbalanced-learn 0.10 and dropped from the constructor in newer releases.
# For a parallel neighbour search, pass a NearestNeighbors estimator with
# n_jobs set as k_neighbors instead.
xgb_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)),
    ('classifier', xgb_model)
])

xgb_pipeline.fit(X_train, y_train)

# The fitted pipeline preprocesses X_test itself, so evaluate_model receives
# preprocessor=None and hands it the raw test features.
results = [evaluate_model(xgb_pipeline, X_test, y_test, "XGBoost (SMOTE)", preprocessor=None)]

# --- 6. LightGBM with SMOTE Pipeline ---
print("\n--- 6. LightGBM with SMOTE Pipeline ---")
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    # LightGBM only applies `subsample` (bagging_fraction) when bagging is
    # switched on with a positive `subsample_freq`; without this line the 0.7
    # above has no effect.
    subsample_freq=1,
    colsample_bytree=0.7,
    # REMOVED: We no longer use is_unbalance=True because SMOTE balances the data.
    random_state=42
)

# SMOTE is built without n_jobs: that argument is deprecated since
# imbalanced-learn 0.10 and dropped from the constructor in newer releases.
lgb_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)),
    ('classifier', lgb_model)
])

lgb_pipeline.fit(X_train, y_train)
# The pipeline preprocesses X_test internally, hence preprocessor=None.
results.append(evaluate_model(lgb_pipeline, X_test, y_test, "LightGBM (SMOTE)", preprocessor=None))

# --- 7. CatBoost (Keeping the original approach for comparison) ---
print("\n--- 7. CatBoost (Original Approach with new feature) ---")
# CatBoost handles categorical features natively, so we don't need the preprocessor/SMOTE pipeline
# Class imbalance is handled by weighting positives with the negative/positive ratio.
pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()

cat_model = CatBoostClassifier(
    iterations=500,
    eval_metric='F1',
    verbose=0,
    early_stopping_rounds=50,
    scale_pos_weight=pos_weight,
    allow_writing_files=False,
    depth=6, # Using the best depth from your previous tuning
    learning_rate=0.03, # Using the best LR from your previous tuning
    random_seed=42)

# early_stopping_rounds only takes effect when fit() is given an eval_set.
# Hold out a stratified 10% slice of the *training* data for that purpose;
# the test set stays untouched so the final evaluation remains independent.
X_cb_fit, X_cb_val, y_cb_fit, y_cb_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=42)

# NOTE(review): the raw categorical columns are passed as-is — presumably they
# contain no missing values; confirm, or fill them before fitting.
cat_model.fit(
    X_cb_fit, y_cb_fit,
    cat_features=categorical_features,
    eval_set=(X_cb_val, y_cb_val))
results.append(evaluate_model(cat_model, X_test, y_test, "CatBoost (Weighted)"))


# --- 8. Final Model Selection and Saving ---
print("\n--- 8. Final Model Performance Summary ---")
# One row per evaluated model, best fraud-class F1 first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='f1_fraud', ascending=False)

# The top row is the winner.
best_row = results_df.iloc[0]
final_model, final_model_name = best_row['model_object'], best_row['model_name']

summary_columns = ['model_name', 'auc_roc', 'auc_pr', 'f1_fraud', 'precision_fraud', 'recall_fraud']
print(results_df[summary_columns].to_string(index=False))

# --- 9. Save the Best Model and Preprocessor ---
print(f"\n--- 9. Saving Selected Model: {final_model_name} ---")

# joblib.dump does not create missing parent directories; make sure the target
# exists so a finished training run is not lost at the very last step.
Path('../models').mkdir(parents=True, exist_ok=True)

# The file name embeds the model's display name verbatim (spaces and
# parentheses included), so anything loading the artefact must use the same name.
if final_model_name.startswith('CatBoost'):
    model_path = f'../models/{final_model_name}_fraud_model.joblib'
    joblib.dump(final_model, model_path)
    print(f"Final model ({final_model_name}) saved to '{model_path}'.")
    print("No preprocessor saved, as CatBoost handles features natively.")
else:
    # Save the entire imblearn pipeline object for XGBoost/LightGBM
    model_path = f'../models/{final_model_name}_fraud_model_pipeline.joblib'
    joblib.dump(final_model, model_path)
    print(f"Final model pipeline ({final_model_name}) saved to '{model_path}'.")

print("--- Script Finished ---")