In [None]:
from initialise import *

# Load the dataset from the Excel workbook (path is relative to the notebook's working directory)
DATA_PATH = "Data/publichealth_v10i1e47979_app2.xlsx"
df = pd.read_excel(DATA_PATH)

In [2]:
# Preview the first five rows to sanity-check the load
df.head(n = 5)

Unnamed: 0,Response ID,age,income,marital status,employment,sex,ethnicity,education level,mask wearing,social distancing,...,peak negative risk,reward tipping point,aversion tipping point,total reward risk,total aversion risk,reward aversion tradeoff,tradeoff range,reward aversion consistency,consistency range,full vaccine
0,1,31,2,2,2,2,1,3,1,1,...,1.86,1.473,1.554,3.101,3.879,52.321,41.821,2.629,0.294,1
1,2,20,5,2,2,1,1,4,1,1,...,1.644,0.988,1.477,1.242,3.225,47.266,38.139,2.706,0.118,1
2,3,23,5,2,2,1,1,4,1,1,...,1.553,1.868,1.123,2.346,2.324,66.2,13.957,2.804,0.103,1
3,4,41,2,1,2,1,1,2,1,1,...,1.599,1.303,1.476,2.234,3.221,44.86,45.001,1.071,1.26,0
4,5,36,2,2,2,2,1,2,1,1,...,1.609,1.555,1.531,2.848,3.384,52.876,40.814,2.112,1.055,1


In [None]:
# Don't need Response ID as a predictor. Rebind instead of dropping in place,
# and tolerate the column already being gone, so re-running this cell is a
# no-op rather than a KeyError.
df = df.drop(columns = "Response ID", errors = "ignore")

# Response is the "full vaccine" column; every remaining column is a predictor
y = df["full vaccine"]
X = df.drop(columns = "full vaccine")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Pipelines for baseline models: standard scaler followed by each classifier
# with its default hyper-parameters (no parameter tuning).
# The random forest is seeded with the same seed used for the split and the
# random searches: left unseeded, its baseline score and the CV scores of the
# search that reuses rf_pipe change from run to run.
logreg_pipe = Pipeline([("scaler", StandardScaler()), ("logreg", LogisticRegression())])
rf_pipe     = Pipeline([("scaler", StandardScaler()), ("rf", RandomForestClassifier(random_state = 42))])
xgb_pipe    = Pipeline([("scaler", StandardScaler()), ("xgb", XGBClassifier())])

In [6]:
# Baseline scores: fit each untuned pipeline on the training split, then score
# it on the held-out test split (for classifiers, .score is mean accuracy)
logreg_score = logreg_pipe.fit(X_train, y_train).score(X_test, y_test)
rf_score     = rf_pipe.fit(X_train, y_train).score(X_test, y_test)
xgb_score    = xgb_pipe.fit(X_train, y_train).score(X_test, y_test)

print(f'''
      Baseline scores ---
      LogReg: {logreg_score:.3f}
      Random Forest: {rf_score:.3f}
      XGBoost: {xgb_score:.3f}
''')


      Baseline scores ---
      LogReg: 0.767
      Random Forest: 0.789
      XGBoost: 0.753



In [23]:
# Candidate values for each random-forest hyper-parameter. The "rf__" prefix
# routes every setting to the "rf" step of rf_pipe.
rf_param_dist = {
    f"rf__{name}": candidates
    for name, candidates in {
        "n_estimators": [100, 300, 500],
        "max_depth": [None, 10, 30],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", "log2"],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
        "class_weight": [None, "balanced"],
    }.items()
}

# Randomised search: 50 sampled candidates, each scored by 5-fold
# cross-validated ROC AUC, using all available cores
rf_random_search = RandomizedSearchCV(
    rf_pipe,
    rf_param_dist,
    n_iter = 50,
    scoring = "roc_auc",
    cv = 5,
    n_jobs = -1,
    verbose = 1,
    random_state = 42,
)

# Run the search on the training split only
rf_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [17]:
# Report the best candidate from the RF search: its cross-validated ROC AUC
# and the hyper-parameters that produced it
print(
    f"Best ROC AUC score: {rf_random_search.best_score_:.3f}",
    f"Best parameters: {rf_random_search.best_params_}",
    sep = "\n",
)

Best ROC AUC score: 0.786
Best parameters: {'rf__n_estimators': 500, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_features': 'log2', 'rf__max_depth': 10, 'rf__criterion': 'entropy', 'rf__class_weight': None, 'rf__bootstrap': True}


In [9]:
# Candidate values for each XGBoost hyper-parameter. The "xgb__" prefix routes
# every setting to the "xgb" step of xgb_pipe.
xgb_param_dist = {
    f"xgb__{name}": candidates
    for name, candidates in {
        "n_estimators": [100, 200, 300, 500],
        "max_depth": [3, 5, 7, 10],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "gamma": [0, 0.1, 0.3, 0.5],
        "reg_alpha": [0, 0.1, 1, 10],
        "reg_lambda": [0.1, 1, 10, 100],
        "scale_pos_weight": [1, 2, 5, 10],
        "min_child_weight": [1, 3, 5, 10],
    }.items()
}

# Randomised search: 50 sampled candidates, each scored by 5-fold
# cross-validated ROC AUC, using all available cores
xgb_random_search = RandomizedSearchCV(
    xgb_pipe,
    xgb_param_dist,
    n_iter = 50,
    scoring = "roc_auc",
    cv = 5,
    n_jobs = -1,
    verbose = 1,
    random_state = 42,
)

# Run the search on the training split only
xgb_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [14]:
# Report the best candidate from the XGB search: its cross-validated ROC AUC
# and the hyper-parameters that produced it
print(
    f"Best ROC AUC score: {xgb_random_search.best_score_:.3f}",
    f"Best parameters found: {xgb_random_search.best_params_}",
    sep = "\n",
)

Best ROC AUC score: 0.794
Best parameters found: {'xgb__subsample': 0.6, 'xgb__scale_pos_weight': 2, 'xgb__reg_lambda': 100, 'xgb__reg_alpha': 0.1, 'xgb__n_estimators': 100, 'xgb__min_child_weight': 10, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.3, 'xgb__colsample_bytree': 0.8}


In [None]:
# Pipelines for tuned RF and XGB models.
# best_params_ keys have the form "<step>__<param>"; keep only the part after
# the first "__" so they can be passed as plain constructor keyword arguments
# (splitting once, unlike str.replace, cannot touch the parameter name itself).
rf_best_kwargs  = {key.split("__", 1)[1]: value for key, value in rf_random_search.best_params_.items()}
xgb_best_kwargs = {key.split("__", 1)[1]: value for key, value in xgb_random_search.best_params_.items()}

# The forest is seeded with the same seed used for the split and the searches so
# the final model, its predictions and the saved artefact are repeatable.
rf_tuned_pipe   = Pipeline([("scaler", StandardScaler()), ("rf_tuned", RandomForestClassifier(random_state = 42, **rf_best_kwargs))])
xgb_tuned_pipe  = Pipeline([("scaler", StandardScaler()), ("xgb_tuned", XGBClassifier(**xgb_best_kwargs))])

In [40]:
# Display the tuned random-forest pipeline to check the steps and hyper-parameters it was built with
rf_tuned_pipe

In [53]:
# Final models to compare, keyed by the label used in the "model" column
final_models = {"logreg": logreg_pipe, "rf": rf_tuned_pipe, "xgb": xgb_tuned_pipe}

# Fit each model on the training split and pair its test-set predictions with
# the true labels, one frame per model (same order as final_models)
per_model_frames = [
    pd.DataFrame({"actual": list(y_test), "predicted": model.fit(X_train, y_train).predict(X_test)})
    for model in final_models.values()
]

# Stack the frames; the concat key becomes the "model" column and the
# per-frame row index is discarded
predictions = (
    pd.concat(per_model_frames, keys = list(final_models))
    .reset_index(names = ["model", "index"])
    .drop("index", axis = 1)
)

predictions.head()

Unnamed: 0,model,actual,predicted
0,logreg,1,1
1,logreg,1,1
2,logreg,1,1
3,logreg,1,1
4,logreg,1,1


In [54]:
# Bundle the train/test splits, the logistic-regression pipeline, the tuned RF
# and XGB pipelines and the stacked predictions into one object
modelling_output = dict(
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    logreg_model = logreg_pipe,
    rf_model = rf_tuned_pipe,
    xgb_model = xgb_tuned_pipe,
    predictions = predictions,
)

# Serialise the bundle with the highest pickle protocol available.
# NOTE(review): the target file name is just ".pickle" (extension only, no stem) — confirm this is intended.
with open(".pickle", "wb") as handle:
    pickle.dump(modelling_output, handle, protocol = pickle.HIGHEST_PROTOCOL)