In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Training data: 'income>50K' is split off as the target, the rest are features.
train = pd.read_csv('Data/train_final.csv')
y = train['income>50K']
X = train.drop(columns=['income>50K'])

# Test data: 'ID' is kept aside for the submission file and is not a feature.
test_df = pd.read_csv('Data/test_final.csv')
ids = test_df['ID']
X_test = test_df.drop(columns=['ID'])

In [None]:
def handle_categorical(X_train, X_test):
    """One-hot encode train and test features onto the same set of columns.

    The training frame defines the feature space. Its categorical columns are
    expanded with ``pd.get_dummies(drop_first=True)``, so the first level seen
    in training becomes the implicit all-zeros baseline of each column.

    The test frame is expanded *without* dropping a level and is then
    reindexed to the training columns. Encoding the test frame with
    ``drop_first=True`` as well would drop the first level *observed in the
    test data*, which is not necessarily the training baseline; the left
    alignment would then zero-fill that column and silently encode those rows
    as the baseline level.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features; determines the output columns and their order.
    X_test : pd.DataFrame
        Test features to encode consistently with ``X_train``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        Encoded ``(X_train, X_test)`` sharing identical columns. Dummy columns
        that exist only in training are filled with 0 in the test frame;
        dummy columns that exist only in the test frame (including the
        training baseline level) are discarded.
    """
    X_train = pd.get_dummies(X_train, drop_first=True)

    # Keep every level here: which dummy is the baseline is decided by the
    # training frame alone, and the left-join below removes it from the test
    # frame along with any level never seen in training.
    X_test = pd.get_dummies(X_test)

    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    return X_train, X_test
    

In [None]:
# Replace the raw feature frames with their one-hot encoded, column-aligned versions.
X, X_test = handle_categorical(X_train=X, X_test=X_test)

In [None]:
# Hold out 20% of the rows, stratified on the target; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:

# Base estimator for the grid search; only the evaluation metric is fixed here,
# every other hyper-parameter is left at the library default.
xgb_model = xgb.XGBClassifier(
    eval_metric='logloss',
)

# Hyper-parameter candidates: 8 * 4 * 4 * 2 = 256 combinations in total.
param_grid = dict(
    n_estimators=[50, 100, *range(200, 451, 50)],
    eta=[0.01, 0.1, 0.2, 0.3],
    max_depth=list(range(3, 10, 2)),
    subsample=[0.8, 1.0],
)



In [None]:
%%time
# Exhaustive search over param_grid, scored by ROC AUC with 5-fold
# cross-validation on the training split only (X_val / y_val are not used here).
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning combination for the final refit.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

In [None]:
# Build a fresh classifier from the selected hyper-parameters and train it on
# the full training data (X, y), not only on the X_train split.
xgb_best_model = xgb.XGBClassifier(eval_metric='logloss', **best_params)
xgb_best_model.fit(X, y)

In [None]:
# Keep only column 1 of predict_proba's output for every test row.
# NOTE(review): presumably the probability of the positive class
# ('income>50K' == 1) — confirm against xgb_best_model.classes_.
y_test_pred_proba = xgb_best_model.predict_proba(X_test)[:, 1]


In [None]:
# Submission frame: one row per test ID with its predicted probability.
output_df = pd.DataFrame(
    {
        'ID': ids,
        'Prediction': y_test_pred_proba,
    }
)
# NOTE(review): the filename says "non_probabilities" but the 'Prediction'
# column holds predict_proba output — confirm the intended name.
output_df.to_csv(
    'Predictions/attempt_16_main_4_params_450_estimators_non_probabilities.csv',
    index=False,
)