# In [None]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load data: read both labeled CSV exports and stack them into a single
# frame, discarding rows that are exact duplicates of an earlier row.
df1 = pd.read_csv("../data/old.csv")
df2 = pd.read_csv("../data/new_labeled_data.csv")
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop_duplicates()
# Expose each surviving row's position in the concatenated frame as the
# 'original_index' column so rows can still be identified after splitting.
df = df.reset_index()
df = df.rename(columns={'index': 'original_index'})

# Subset (90%): keep a stratified 90% sample; the other 10% is discarded.
df_subset, _ = train_test_split(
    df,
    train_size=0.9,
    stratify=df['is_fraud'],
    random_state=42,
)

# Feature selection: the row identifier, the raw input columns and the label.
selected_features = [
    'original_index', 'first', 'last', 'cc_num', 'amt', 'gender', 'city', 'state',
    'zip', 'dob', 'job', 'category', 'is_fraud'
]
df_selected = df_subset.loc[:, selected_features].copy()

# Feature engineering: replace the date of birth with an age in years.
# Unparseable dates become NaT and therefore a missing age (filled below).
# NOTE(review): age is measured against the clock at run time, so the same
# row yields a different age on a later run — confirm this is acceptable.
birth_dates = pd.to_datetime(df_selected['dob'], errors='coerce')
df_selected['age'] = (pd.Timestamp.now() - birth_dates).dt.days / 365.25
df_selected = df_selected.drop(columns=['dob'])

# Fill numeric: coerce to numbers, mapping unparseable/missing entries to 0.
for numeric_col in ('cc_num', 'amt', 'zip'):
    df_selected[numeric_col] = pd.to_numeric(
        df_selected[numeric_col], errors='coerce'
    ).fillna(0)
# Missing ages take the median age of the selected rows.
df_selected['age'] = df_selected['age'].fillna(df_selected['age'].median())

# Encode categoricals
# Every categorical column gets its own LabelEncoder. The fitted encoders are
# collected in `label_encoders`, keyed by column name, so they can be saved
# alongside the model.
categorical_columns = ['first', 'last', 'gender', 'city', 'state', 'job', 'category']
label_encoders = {}
for col in categorical_columns:
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into
    # the literal string 'nan', after which fillna() has nothing to replace.
    df_selected[col] = df_selected[col].fillna('unknown').astype(str)
    le = LabelEncoder()
    # 'unknown' is always part of the vocabulary, even when this column has
    # no missing values (presumably so unseen categories can be mapped to it
    # at inference time — confirm against the scoring code).
    le.fit(list(df_selected[col].unique()) + ['unknown'])
    # The encoder was fitted on every value present in this column, so a
    # per-row "is it in classes_" check would never replace anything; it is
    # omitted because it costs O(rows x classes).
    df_selected[col] = le.transform(df_selected[col])
    label_encoders[col] = le

# Split features & target
# 'original_index' only identifies rows, so it is carried alongside the
# features instead of being used as one.
X = df_selected.drop(['is_fraud', 'original_index'], axis=1)
y = df_selected['is_fraud']
original_indices = df_selected['original_index']

# Train-test split
# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only; fitting on all rows would leak test-set statistics into
# the transformation. Row membership of each split depends only on the number
# of rows, `y` and `random_state`, so it is unaffected by splitting the
# unscaled frame.
X_train_raw, X_test_raw, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, original_indices, test_size=0.2, stratify=y, random_state=42
)

# Scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept under its original name for any code that still
# reads it; it now uses the training-set statistics as well.
X_scaled = scaler.transform(X)

# SMOTE: oversample the minority class of the training split only, up to a
# minority/majority ratio of 0.5.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Final model (from best params)
# NOTE(review): the positive-class weight below is the negative/positive ratio
# of the training split BEFORE oversampling, while the fit itself uses the
# oversampled data — confirm that stacking both imbalance corrections is
# intended (the saved decision threshold was chosen with this setup).
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
xgb_params = {
    'colsample_bytree': 0.7569998494820424,
    'learning_rate': 0.24737555188414592,
    'max_depth': 6,
    'min_child_weight': 4,
    'n_estimators': 311,
    'subsample': 0.9570351922786028,
    'scale_pos_weight': n_negative / n_positive,
    'random_state': 42,
    'eval_metric': 'logloss',
    'tree_method': 'hist',
    'device': 'cuda',
}
model = XGBClassifier(**xgb_params)
model.fit(X_train_resampled, y_train_resampled)

# Best threshold (based on earlier evaluation)
best_threshold = 0.85

# Save model and artifacts: the booster goes to JSON via XGBoost's own
# serializer; the threshold and the fitted preprocessing objects are pickled,
# one file each, in the order listed.
model.save_model('credit_fraud_xgboost_model.json')
for artifact_path, artifact in (
    ('best_threshold.pkl', best_threshold),
    ('scaler_xgboost.pkl', scaler),
    ('label_encoders_xgboost.pkl', label_encoders),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("✅ Model and preprocessing artifacts saved successfully.")
