In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import get_scorer, make_scorer, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API removals only show up as hard errors after an upgrade.
warnings.filterwarnings('ignore')

# --- 1. Load and Prepare Data ---
df_train = pd.read_csv('train.csv')
df_train['Y'] = df_train['Y'].astype(int)
# Turn +/-inf into NaN so they are covered by the median fill below.
df_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# Features are every column except the 'id' column and the target 'Y'.
X_raw = df_train.drop(['id', 'Y'], axis=1)
y = df_train['Y']

# Fill missing values with the per-column median.
# NOTE(review): the medians are computed on the full training set before
# cross-validation, so each validation fold contributes to its own imputation
# values. This can make the CV score slightly optimistic; consider imputing
# inside a Pipeline if that matters. Left unchanged here so `median_values`
# and `X_imputed` keep their current meaning.
median_values = X_raw.median()
X_imputed = X_raw.fillna(median_values)

# --- 2. Define the Search Space and Model ---
# Discrete candidate values; RandomizedSearchCV samples combinations of these.
param_dist = {
    'n_estimators': [100, 200, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_lambda': [0.1, 1, 10],
    'reg_alpha': [0, 0.1, 0.5, 1],
}

# Use scikit-learn's built-in ROC-AUC scorer. The previous
# make_scorer(roc_auc_score, needs_proba=True) relied on the `needs_proba`
# keyword, which newer scikit-learn releases deprecated and then dropped;
# the built-in scorer works on both old and new versions.
auc_scorer = get_scorer('roc_auc')

# Shuffled, stratified 5-fold split with a fixed seed for repeatable folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): `use_label_encoder` is only meaningful on older XGBoost
# releases; kept as-is so behaviour there is unchanged -- confirm against the
# installed version before removing.
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42
)

# --- 3. Run the Randomized Search ---
# 50 sampled parameter combinations x 5 folds, evaluated in parallel on all
# available cores (n_jobs=-1).
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring=auc_scorer,
    cv=skf,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

print("Starting Randomized Search with XGBoost...")
random_search.fit(X_imputed, y)

# --- 4. Report Results ---
print("\n" + "="*60)
print("🏁 Randomized Search Complete!")
print(f"Best AUC-ROC Score: {random_search.best_score_:.4f}")
print("Best Hyperparameters:")
print(random_search.best_params_)
print("="*60)
