In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [None]:

# Read the training and test CSVs.
train = pd.read_csv('Data/train_final.csv')
test_df = pd.read_csv('Data/test_final.csv')

# Training data: the 'income>50K' column is the target, everything else is a feature.
y = train['income>50K']
X = train.drop(columns=['income>50K'])

# Test data: set the 'ID' column aside (it is needed for the submission file)
# and treat the remaining columns as features.
ids = test_df['ID']
X_test = test_df.drop(columns=['ID'])


def preprocess_missing_values(df):
    """Return a copy of ``df`` in which every cell equal to ``'?'`` is NaN.

    The input frame is not modified; ``DataFrame.replace`` returns a new frame.
    """
    placeholder = '?'
    cleaned = df.replace(to_replace=placeholder, value=np.nan)
    return cleaned

# Turn the '?' placeholder into NaN in both feature frames.
X, X_test = (preprocess_missing_values(frame) for frame in (X, X_test))

In [None]:
def encode_categorical_columns(df):
    """Integer-encode every object-dtype column of ``df``.

    Each object column is converted to a pandas categorical and replaced by
    its category codes. Missing values stay missing: ``cat.codes`` reports
    them as ``-1``, which would otherwise look like a regular category and
    hide them from any later imputation step, so they are restored to NaN.

    The codes are derived from the labels present in ``df`` itself, so two
    frames encoded separately are only comparable if they contain the same
    set of labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns should be encoded. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with object columns replaced by integer codes
        (float codes with NaN for columns that contain missing values).
    """
    encoded = df.copy()
    for col in encoded.select_dtypes(include=['object']).columns:
        codes = encoded[col].astype('category').cat.codes
        missing = codes < 0
        # Only switch to float when NaN actually has to be represented.
        encoded[col] = codes.astype('float64').mask(missing) if missing.any() else codes
    return encoded

# Encode train and test in a single pass so that every label receives the same
# integer code in both frames. Category codes are derived from the labels
# present in the frame being encoded, so encoding the two frames separately
# yields different codes for the same label whenever one frame lacks a label
# the other has. Only the label vocabulary is shared; no target is involved.
train_columns, test_columns = X.columns, X_test.columns
combined_encoded = encode_categorical_columns(
    pd.concat([X, X_test], keys=['train', 'test'])
)
# Split back on the outer index level; each frame keeps its own rows, row
# index and original column set.
X = combined_encoded.xs('train')[train_columns]
X_test = combined_encoded.xs('test')[test_columns]

def impute_missing_values(X_train, X_test):
    """Fill NaNs in both frames with an ``IterativeImputer`` fitted on train.

    The imputer is fitted on ``X_train`` only and then applied to ``X_test``,
    so no test information is used for fitting. Imputed values are model
    estimates, so integer-coded columns can come back with fractional values.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Numeric feature frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_imputed, X_test_imputed)`` carrying the column labels and
        row index of the corresponding input frame.
    """
    imputer = IterativeImputer(
        max_iter=1000000,
        random_state=42,
        initial_strategy='most_frequent',
    )
    train_values = imputer.fit_transform(X_train)
    test_values = imputer.transform(X_test)

    # Re-attach the original row index as well as the columns; without it the
    # result gets a fresh 0..n-1 index and no longer lines up with labels that
    # are indexed like the input frame.
    train_imputed = pd.DataFrame(train_values, columns=X_train.columns, index=X_train.index)
    test_imputed = pd.DataFrame(test_values, columns=X_test.columns, index=X_test.index)
    return train_imputed, test_imputed

# Fit the imputer on the training features and apply it to both frames.
X, X_test = impute_missing_values(X_train=X, X_test=X_test)

In [None]:
def handle_categorical(X_train, X_test):
    """One-hot encode both frames and give test exactly the train columns.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; non-categorical columns pass through unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` where ``X_test`` has the same columns, in the
        same order, as ``X_train``. Dummy columns that test lacks are filled
        with 0; dummy columns that only test has are dropped.
    """
    X_train = pd.get_dummies(X_train, drop_first=True)

    # Do not pass drop_first for test: the "first" level is chosen per frame,
    # so if test lacks train's baseline level a different, real level would be
    # dropped and then zero-filled by the alignment below. Encoding every test
    # level and letting the left join discard what train does not have keeps
    # the baseline identical to train's.
    X_test = pd.get_dummies(X_test)

    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    return X_train, X_test

In [None]:
# One-hot encode any remaining categorical columns and align test to train.
X, X_test = handle_categorical(X_train=X, X_test=X_test)


In [None]:
# Hold out 20% of the training rows for validation, stratified on the target
# so both parts keep the same class balance; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
def objective(params):
    """Hyperopt objective: negative validation AUC of an XGBoost model.

    Trains a booster on the notebook-level ``X_train`` / ``y_train`` with the
    sampled hyperparameters, early-stopping on ``X_val`` / ``y_val``, and
    scores the early-stopped model on the validation set.

    Parameters
    ----------
    params : dict
        Sampled point with keys ``eta``, ``max_depth``, ``subsample``,
        ``colsample_bytree`` and ``n_estimators``. ``max_depth`` and
        ``n_estimators`` may arrive as floats and are cast to int.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}``; the loss is negated because
        hyperopt minimises.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    xgb_params = {
        'eta': params['eta'],
        'max_depth': int(params['max_depth']),
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'seed': 42
    }

    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=int(params['n_estimators']),
        evals=[(dval, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Score the best iteration explicitly. xgb.train returns the booster from
    # the last round, and whether predict() limits itself to the best
    # iteration by default depends on the xgboost version; spelling out the
    # range makes the score the same everywhere.
    preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))

    auc = roc_auc_score(y_val, preds)

    return {'loss': -auc, 'status': STATUS_OK}



# Search space explored by hyperopt. hp.quniform samples floats on a grid, so
# max_depth and n_estimators are cast to int wherever they are consumed.
param_space = {
    'eta': hp.uniform('eta', 0.01, 0.4),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'n_estimators': hp.quniform('n_estimators', 50, 700, 10)
}

# Records every evaluated point and its result.
trials = Trials()

# NOTE(review): the search itself is unseeded (no `rstate` is passed), so two
# runs can return different best parameters. Consider passing `rstate`; the
# expected object type depends on the installed hyperopt version — confirm.
best_params = fmin(
    fn=objective,
    space=param_space,
    algo=tpe.suggest,
    max_evals=10000,
    trials=trials
)

# fmin returns the raw sampled values; restore the integer-valued ones.
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])

print("Best Parameters:", best_params)

# Booster parameters for the final model: the tuned values plus the same fixed
# settings the objective used.
final_params = {
    'eta': best_params['eta'],
    'max_depth': best_params['max_depth'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree'],
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 42
}

# The training/test DMatrix objects are built in the next cell, right before
# they are used; building them here as well only duplicated that work.

In [None]:
dtrain_final = xgb.DMatrix(X_train, label=y_train)
dval_final = xgb.DMatrix(X_val, label=y_val)
dtest_final = xgb.DMatrix(X_test)

# Train with the same early-stopping setup the objective used during the
# search. Without it the booster always runs the full n_estimators rounds,
# which is not the model whose validation AUC was optimised.
final_model = xgb.train(
    final_params,
    dtrain_final,
    num_boost_round=best_params['n_estimators'],
    evals=[(dval_final, 'validation')],
    early_stopping_rounds=50,
    verbose_eval=False
)

# xgb.train returns the booster from the last round; keep only the trees up to
# the best iteration so a plain predict() uses the early-stopped model.
final_model = final_model[: final_model.best_iteration + 1]


In [None]:
# Score the test set with the final booster; the 'binary:logistic' objective
# makes these probabilities of the positive class.
y_test_pred_proba = final_model.predict(data=dtest_final)


In [None]:
# Assemble the submission (one prediction per test ID) and write it to disk
# without the row index.
submission_path = 'Predictions/attempt_27_nulls_hyperopt_10000_iter_expanded_normal_param.csv'
output_df = pd.DataFrame({'ID': ids, 'Prediction': y_test_pred_proba})
output_df.to_csv(submission_path, index=False)