In [None]:
  !pip install --upgrade xgboost


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.4
    Uninstalling xgboost-2.1.4:
      Successfully uninstalled xgboost-2.1.4
Successfully installed xgboost-3.0.2


In [None]:
!pip uninstall -y xgboost
!pip install xgboost==1.7.6



Found existing installation: xgboost 2.1.4
Uninstalling xgboost-2.1.4:
  Successfully uninstalled xgboost-2.1.4
Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-1.7.6


In [None]:
import gc
import time
import warnings
warnings.filterwarnings('ignore')  # kept ahead of the third-party imports, as before

import numpy as np
import pandas as pd
from autofeat import AutoFeatClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load Data
# Reads the four CSV files from the working directory, joins each transaction
# table with its identity table, and separates the target / id columns.
print("Loading data...")
train_trans = pd.read_csv("train_transaction.csv")
train_id = pd.read_csv("train_identity.csv")
test_trans = pd.read_csv("test_transaction.csv")
test_id = pd.read_csv("test_identity.csv")

# The preprocessing step keeps only columns present in BOTH frames, so any
# spelling difference between the two identity files silently discards those
# features.
# NOTE(review): the public IEEE-CIS release spells the test identity columns
# "id-01" where train uses "id_01" - confirm for these files. The rename is a
# no-op when the names already use underscores.
test_id.columns = test_id.columns.str.replace(r'^id-', 'id_', regex=True)

# Left joins keep every transaction; rows without an identity record get NaN
# in the identity columns.
train_df = train_trans.merge(train_id, on='TransactionID', how='left')
test_df = test_trans.merge(test_id, on='TransactionID', how='left')
print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")

# Target vector and model inputs for training.
y = train_df['isFraud'].values
train_df = train_df.drop(columns=['TransactionID', 'isFraud'])

# Test ids are kept aside for the submission file; they are not a feature.
test_ids = test_df['TransactionID'].values
test_df = test_df.drop(columns=['TransactionID'])

# Preprocessing & Feature Reduction

# Align first, so every statistic learned below refers to a column that exists
# in both frames. `.copy()` gives independent frames that are safe to assign into.
shared_cols = train_df.columns.intersection(test_df.columns)
train_df = train_df[shared_cols].copy()
test_df = test_df[shared_cols].copy()

print("Filling missing values and encoding...")
# Column roles are decided by the training frame and applied to both frames.
num_cols = train_df.select_dtypes(include='number').columns
cat_cols = train_df.select_dtypes(include=['object', 'category']).columns

# A column that is numeric in train but was parsed as text in test is coerced
# to numbers; unparseable entries become NaN and are imputed below.
for col in test_df[num_cols].select_dtypes(include='object').columns:
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

# Impute with medians learned on the training data only, so test rows are
# transformed exactly like training rows and no test statistic is used.
train_medians = train_df[num_cols].median()
train_df[num_cols] = train_df[num_cols].fillna(train_medians)
test_df[num_cols] = test_df[num_cols].fillna(train_medians)

# Integer-encode categoricals with ONE mapping per column, learned on train.
# Encoding each frame separately would give the same string different codes in
# train and test whenever their value sets differ. Values that appear only in
# the test frame are encoded as -1.
for col in cat_cols:
    train_cat = train_df[col].astype(object).fillna('missing').astype('category')
    test_values = test_df[col].astype(object).fillna('missing')
    train_df[col] = train_cat.cat.codes
    test_df[col] = pd.Categorical(test_values, categories=train_cat.cat.categories).codes


# Keep only the columns whose variance on the training frame exceeds 0.01.
var_thresh = VarianceThreshold(threshold=0.01).fit(train_df)
selected_cols = train_df.columns[var_thresh.get_support()]
train_df, test_df = train_df[selected_cols], test_df[selected_cols]


# For every pair of columns with absolute correlation above 0.95, drop the
# later one. Only the strict upper triangle is inspected, so each pair is
# examined once and the diagonal is ignored.
corr_matrix = train_df.corr().abs()
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper_mask)
to_drop = upper.columns[(upper > 0.95).any(axis=0)].tolist()
train_df = train_df.drop(columns=to_drop)
test_df = test_df.drop(columns=to_drop)


# Rank the remaining columns with a shallow random forest and keep the 100
# with the highest importance.
# NOTE(review): the forest is fitted on every training row, including the rows
# that are later held out as the validation split, so the validation AUC
# reported further down may be optimistic.
rf = RandomForestClassifier(n_estimators=50, max_depth=5, n_jobs=-1, random_state=42)
rf.fit(train_df, y)
importances = rf.feature_importances_

imp_df = (
    pd.DataFrame({'feature': train_df.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
    .head(100)
)
top_features = imp_df['feature'].to_list()


# Record the chosen feature names, one per line.
with open('selected_features.txt', 'w') as f:
    f.writelines(f"{feat}\n" for feat in top_features)

# Final model inputs; the wide frames and the forest are released afterwards.
X, X_test = train_df[top_features], test_df[top_features]
del train_df, test_df, rf, importances
gc.collect()

# AutoFeat Feature Engineering
# `AutoFeatClassifier` is imported with the other dependencies at the top of
# the cell, so a missing package is reported before the data is loaded.
print("\nRunning AutoFeat to generate new features...")

# Fit the feature generator on the training features and apply the same fitted
# transformation to the test features.
af_model = AutoFeatClassifier(verbose=1, featsel_runs=1)
X_af = af_model.fit_transform(X, y)
X_test_af = af_model.transform(X_test)

print(f"Original feature count: {X.shape[1]}")
print(f"New AutoFeat-enhanced feature count: {X_af.shape[1]}")

# NOTE(review): the train/validation split and the test prediction further
# down use `X` and `X_test`, not `X_af` / `X_test_af`, so the engineered
# features are computed but never reach the model. Either switch both places
# to the AutoFeat frames or drop this stage - confirm the intent.


# Train XGBoost Model
# Stratified 80/20 hold-out, so both parts keep the same fraud rate.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Early stopping is configured on the estimator itself: passing
# `early_stopping_rounds` to `fit()` is deprecated xgboost API, while the
# constructor argument is the supported form (see the XGBClassifier reference).
# `use_label_encoder` is dropped because it is a deprecated flag that no
# longer influences training.
xgb_clf = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='auc',
    tree_method='hist',
    early_stopping_rounds=20,
    random_state=42,
)

# The validation split doubles as the early-stopping monitor: training stops
# once its AUC has not improved for 20 consecutive rounds.
xgb_clf.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

# Probability of the positive (fraud) class on the held-out rows.
val_preds = xgb_clf.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_preds)
print(f"\nValidation ROC AUC: {val_auc:.4f}")

# Predict on Test Data
# Score the test rows with the fitted classifier and write one
# (TransactionID, fraud probability) pair per row.
print("Predicting on test set...")
test_proba = xgb_clf.predict_proba(X_test)
test_preds = test_proba[:, 1]  # column 1 = probability of the positive class

submission = pd.DataFrame(dict(TransactionID=test_ids, isFraud=test_preds))
submission.to_csv("xgb_submission.csv", index=False)
print("Submission saved to 'xgb_submission.csv'")


Loading data...
Train shape: (52143, 434) | Test shape: (54450, 433)
Filling missing values and encoding...
Aligning train/test columns before feature reduction...
Removing low-variance features...
Removing highly correlated features...
Selecting top 100 features with RandomForest...
Training XGBoost...
[0]	validation_0-auc:0.78258
[1]	validation_0-auc:0.78081
[2]	validation_0-auc:0.78355
[3]	validation_0-auc:0.78274
[4]	validation_0-auc:0.78354
[5]	validation_0-auc:0.79693
[6]	validation_0-auc:0.81865
[7]	validation_0-auc:0.81653
[8]	validation_0-auc:0.81926
[9]	validation_0-auc:0.81925
[10]	validation_0-auc:0.82034
[11]	validation_0-auc:0.82219
[12]	validation_0-auc:0.82174
[13]	validation_0-auc:0.82209
[14]	validation_0-auc:0.82393
[15]	validation_0-auc:0.84255
[16]	validation_0-auc:0.84681
[17]	validation_0-auc:0.84934
[18]	validation_0-auc:0.85056
[19]	validation_0-auc:0.85330
[20]	validation_0-auc:0.85483
[21]	validation_0-auc:0.85513
[22]	validation_0-auc:0.85558
[23]	validation

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Candidate hyper-parameter sets for the search below. All candidates share
# the same tree depth, learning rate and min_child_weight; each entry lists
# only the sampling / regularisation / class-weight settings it varies.
_shared_params = {'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 3}

params_grid = [
    {**_shared_params, 'subsample': 0.7, 'colsample_bytree': 0.7},
    {**_shared_params, 'subsample': 0.9, 'colsample_bytree': 0.8, 'gamma': 1},
    {**_shared_params, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 15},
]


# Manual search over `params_grid`: fit one model per candidate on the
# existing train split, score it on the validation split, and remember the
# candidate with the highest validation AUC.
# NOTE(review): the same validation rows drive early stopping and the choice
# of the winner, so the reported best AUC is an optimistic estimate.
best_score = float('-inf')  # sentinel: nothing evaluated yet
best_model = None
best_params = None

for params in params_grid:
    print(f"\nTrying params: {params}")

    # Early stopping is set on the estimator: passing `early_stopping_rounds`
    # to `fit()` is deprecated xgboost API. `use_label_encoder` is omitted
    # because it is a deprecated flag that no longer influences training.
    model = XGBClassifier(
        n_estimators=500,
        eval_metric='auc',
        tree_method='hist',
        early_stopping_rounds=20,
        random_state=42,
        **params
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Validation AUC from the positive-class probabilities.
    preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds)
    print(f"AUC: {auc:.4f}")

    # Strict comparison: on a tie the earlier candidate is kept.
    if auc > best_score:
        best_score, best_model, best_params = auc, model, params

print(f"\n✅ Best AUC: {best_score:.4f}")
print(f"✅ Best Parameters: {best_params}")



Trying params: {'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 3, 'subsample': 0.7, 'colsample_bytree': 0.7}
AUC: 0.9130

Trying params: {'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 3, 'subsample': 0.9, 'colsample_bytree': 0.8, 'gamma': 1}
AUC: 0.9179

Trying params: {'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 3, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 15}
AUC: 0.9217

✅ Best AUC: 0.9217
✅ Best Parameters: {'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 3, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 15}
