In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score

  from pandas import MultiIndex, Int64Index


In [2]:
data = pd.read_csv('../data/train_clean.csv')

In [3]:
data.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,no_response,no_response,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,no_response,no_response,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [4]:
data.dtypes

respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1                object
doctor_recc_seasonal            object
chronic_med_condition           object
child_under_6_months           float64
health_worker                   object
health_insurance                object
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [5]:
categorical_columns = list(data.select_dtypes('object').columns)
categorical_columns

['doctor_recc_h1n1',
 'doctor_recc_seasonal',
 'chronic_med_condition',
 'health_worker',
 'health_insurance',
 'age_group',
 'education',
 'race',
 'sex',
 'income_poverty',
 'marital_status',
 'rent_or_own',
 'employment_status',
 'hhs_geo_region',
 'census_msa',
 'employment_industry',
 'employment_occupation']

In [6]:
X = data.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = data['seasonal_vaccine']

In [7]:
Xd = pd.get_dummies(data=X, columns=categorical_columns, drop_first=True)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(Xd, y, random_state=42, stratify=y)

In [9]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [10]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [108]:
xg_cls = xgb.XGBClassifier(random_state=42, use_label_encoder=False)

params = {
    'n_estimators' : [100, 250, 500],
    'max_depth' : [3, 5, 6, 10],
    'learning_rate' : [0.01, 0.1, 0.3],
    'colsample_bytree' : [0.5, 1],
    'subsample' : [0.6, 1] 
}

gs = GridSearchCV(estimator=xg_cls, param_grid=params, scoring='roc_auc')

gs.fit(X_train, y_train)

y_pred = gs.predict(X_test)



In [109]:
gs.best_score_

0.8623440999692423

In [110]:
gs.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1,
              predictor='auto', random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.6, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [115]:
gs.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 6,
 'n_estimators': 500,
 'subsample': 0.6}

In [111]:
accuracy_score(y_test, y_pred), accuracy_score(y_train, gs.predict(X_train))

(0.7834356747042085, 0.8124812780828757)

In [112]:
precision_score(y_test, y_pred), precision_score(y_train, gs.predict(X_train))

(0.7864278332759215, 0.810479375696767)

In [113]:
recall_score(y_test, y_pred), recall_score(y_train, gs.predict(X_train))

(0.734319716950788, 0.7795410679819859)

In [114]:
roc_auc_score(y_test, y_pred), roc_auc_score(y_train, gs.predict(X_train))

(0.7802764504036451, 0.810360967473803)

**Just model**

In [None]:
xg_cls = xgb.XGBClassifier(random_state=42,
                           use_label_encoder=False,
                           max_depth=8,
                           colsample_bytree = 0.7,
                           subsample=0.7,
                           learning_rate=0.005,
                           n_estimators=1000,
                           reg_lambda=5
                            )

xg_cls.fit(X_train, y_train)

y_pred = xg_cls.predict(X_test)

In [104]:
accuracy_score(y_test, y_pred), accuracy_score(y_train, xg_cls.predict(X_train))

(0.7831361389845739, 0.836445332001997)

In [105]:
precision_score(y_test, y_pred), precision_score(y_train, xg_cls.predict(X_train))

(0.7833503923575571, 0.83440194561132)

In [106]:
recall_score(y_test, y_pred), recall_score(y_train, xg_cls.predict(X_train))

(0.7385011257639112, 0.8093502037315033)

In [107]:
roc_auc_score(y_test, y_pred), roc_auc_score(y_train, xg_cls.predict(X_train))

(0.7802651368729869, 0.8347012603111927)

Drop some features and see if that helps