# **Loading the data**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Read the pre-cleaned training table, using respondent_id as the row index
joined_df = pd.read_csv(
    'treemodel_joined_df.csv',
    index_col='respondent_id',
)
print("joined_df.shape", joined_df.shape)
joined_df.head()


joined_df.shape (26707, 46)


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,hhs_geo_region_dqpwygqj,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0


## **Random Forest + Grid Search**

In [None]:
# Split the frame column-wise: the two vaccine label columns vs. everything else
target_cols = ['h1n1_vaccine', 'seasonal_vaccine']
y = joined_df[target_cols]  # label columns (targets)
X = joined_df.drop(columns=target_cols)  # all remaining columns (features)


In [None]:
# Show the dimensions of the feature and target frames
for label, frame in (("Features shape:", X), ("Targets shape:", y)):
    print(label, frame.shape)

Features shape: (26707, 45)
Targets shape: (26707, 2)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Show the dimensions of every piece produced by the split
for label, part in (
    ("Training features shape:", X_train),
    ("Validation features shape:", X_val),
    ("Training targets shape:", y_train),
    ("Validation targets shape:", y_val),
):
    print(label, part.shape)


Training features shape: (21365, 45)
Validation features shape: (5342, 45)
Training targets shape: (21365, 2)
Validation targets shape: (5342, 2)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each Random Forest hyperparameter (3 x 3 x 3 x 3 = 81 combinations)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:

# Base estimator whose hyperparameters are searched (seeded)
rf = RandomForestClassifier(random_state=42)

# Search every combination in param_grid with 3-fold cross-validation,
# scoring by ROC AUC and using all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)


In [None]:
# Run the cross-validated search on the training split
grid_search.fit(X_train, y_train)

# Report the winning combination and the score it achieved
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best ROC AUC Score:", grid_search.best_score_),
):
    print(label, value)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


KeyboardInterrupt: 

In [None]:
# Take the final model straight from the search.
# grid_search was built without overriding refit (default True), so
# best_estimator_ has already been refit on the full X_train / y_train with the
# best parameters; calling .fit() on it again would only repeat that training.
best_rf_model = grid_search.best_estimator_


In [None]:
# Predict class probabilities for the validation rows.
# The result is indexed per target in the following cell
# (y_val_proba[0] and y_val_proba[1]).
y_val_proba = best_rf_model.predict_proba(X_val)

In [None]:
# Extract probabilities for each target: element 0 is used for h1n1_vaccine and
# element 1 for seasonal_vaccine; column 1 of each array is kept as the
# positive-class probability.
# NOTE(review): presumably the element order follows target_cols — confirm.
h1n1_val_proba = y_val_proba[0][:, 1]
seasonal_val_proba = y_val_proba[1][:, 1]

In [None]:
# Calculate ROC AUC for each target on the validation split.
# roc_auc_score comes from the notebook's top import cell, so this cell also
# works on a fresh kernel run top-to-bottom.
h1n1_auc = roc_auc_score(y_val['h1n1_vaccine'], h1n1_val_proba)
seasonal_auc = roc_auc_score(y_val['seasonal_vaccine'], seasonal_val_proba)

# Report each score and their mean, using the same 3-decimal formatting as the
# XGBoost evaluation cells (the original seasonal line had an empty format spec).
print(f"H1N1 Vaccine AUC: {h1n1_auc:.3f}")
print(f"Seasonal Vaccine AUC: {seasonal_auc:.3f}")
print(f"Overall AUC: {(h1n1_auc + seasonal_auc) / 2:.3f}")

**Predict on Test Set**

In [None]:
# Load the cleaned test-set features (row index: respondent_id) and show their dimensions
test_features_df = pd.read_csv("clean_test_df.csv", index_col="respondent_id")
test_features_df.shape

In [None]:
# Preview the first rows of the test features
test_features_df.head()

In [None]:
# List the test feature column names
test_features_df.columns

In [None]:
# Predict class probabilities for the test rows (indexed per target below)
y_test_proba = best_rf_model.predict_proba(test_features_df)

# Extract the predicted probabilities (column 1 of each per-target array)
h1n1_test_preds = y_test_proba[0][:, 1]  # H1N1 vaccine predicted probability
seasonal_test_preds = y_test_proba[1][:, 1]  # seasonal vaccine predicted probability

# Build the submission table
submission = pd.DataFrame(dict(
    respondent_id=test_features_df.index,
    h1n1_vaccine=h1n1_test_preds,
    seasonal_vaccine=seasonal_test_preds,
))


In [None]:
# Preview the first rows of the Random Forest submission table
submission.head()

In [None]:
# # save CSV
# submission.to_csv('random_tree_prediction.csv', index=False)
# print("Submission file saved as 'random_tree_prediction.csv'.")

## **Multilabel XGBoost + Optuna Model**

In [None]:
! pip install xgboost optuna




In [None]:
import xgboost as xgb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [None]:
def objective(trial):
    """Optuna objective: mean validation ROC AUC over both vaccine targets.

    One hyperparameter set is sampled from ``trial`` and shared by two
    independent XGBoost classifiers, one per target column. Each classifier is
    fitted on the notebook-level X_train / y_train and scored on X_val / y_val.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The average of the h1n1_vaccine and seasonal_vaccine ROC AUC values.
    """
    # Settings held constant for every trial
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',  # Use 'gpu_hist' if using GPU
    }
    # Values sampled for this trial
    param.update({
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
    })

    # Fit and score one classifier per target, h1n1_vaccine first
    target_aucs = []
    for target in ('h1n1_vaccine', 'seasonal_vaccine'):
        model = xgb.XGBClassifier(**param)
        model.fit(X_train, y_train[target])
        positive_proba = model.predict_proba(X_val)[:, 1]
        target_aucs.append(roc_auc_score(y_val[target], positive_proba))

    # The mean AUC is the value Optuna optimizes
    return sum(target_aucs) / 2


In [None]:
# Create an Optuna study that maximizes the objective. The sampler is seeded so
# the sequence of suggested hyperparameters is repeatable from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the study: stop after 50 trials or 3600 seconds
study.optimize(objective, n_trials=50, timeout=3600)

# Print the best parameters and best score
print("Best Parameters:", study.best_params)
print("Best ROC AUC Score:", study.best_value)


In [None]:
# Keep the best trial's hyperparameters for training the final models
best_params = study.best_params

In [None]:
# Train the final models for each target.
# study.best_params only contains the values Optuna sampled, so re-apply the
# settings that objective() held fixed; otherwise the final models could be
# trained with a different configuration than the one that was tuned.
fixed_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
}
final_params = {**fixed_params, **best_params}

final_h1n1_model = xgb.XGBClassifier(**final_params)
final_seasonal_model = xgb.XGBClassifier(**final_params)

final_h1n1_model.fit(X_train, y_train['h1n1_vaccine'])
final_seasonal_model.fit(X_train, y_train['seasonal_vaccine'])


In [None]:
# Predict probabilities on the validation set with each final model;
# column 1 of predict_proba is kept as the positive-class probability
h1n1_val_proba = final_h1n1_model.predict_proba(X_val)[:, 1]
seasonal_val_proba = final_seasonal_model.predict_proba(X_val)[:, 1]



In [None]:
# Score each final model on its own validation target
h1n1_auc = roc_auc_score(y_val['h1n1_vaccine'], h1n1_val_proba)
seasonal_auc = roc_auc_score(y_val['seasonal_vaccine'], seasonal_val_proba)

# Print the per-target scores followed by their mean, all to 3 decimals
for report_line in (
    f"Final H1N1 Vaccine AUC: {h1n1_auc:.3f}",
    f"Final Seasonal Vaccine AUC: {seasonal_auc:.3f}",
    f"Final Overall AUC: {(h1n1_auc + seasonal_auc) / 2:.3f}",
):
    print(report_line)



**Predict on Test Set**

In [None]:
# Positive-class probabilities for the test rows, one final model per target
h1n1_test_proba = final_h1n1_model.predict_proba(test_features_df)[:, 1]
seasonal_test_proba = final_seasonal_model.predict_proba(test_features_df)[:, 1]

# Assemble the submission table: respondent id plus one probability column per target
submission = pd.DataFrame(dict(
    respondent_id=test_features_df.index,
    h1n1_vaccine=h1n1_test_proba,
    seasonal_vaccine=seasonal_test_proba,
))

submission

In [None]:

# Save to CSV. index=False leaves out the DataFrame's row index, so the file
# contains only the respondent_id and the two probability columns.
submission.to_csv('xgboost_optuna_submission.csv', index=False)
print("Submission file saved as 'xgboost_optuna_submission.csv'.")

## **Separate XGBoost + Optuna Models**

In [None]:
! pip install optuna



In [None]:
import xgboost as xgb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [None]:
# Rebuild the feature / target frames from joined_df for this section
target_cols = ['h1n1_vaccine', 'seasonal_vaccine']
y = joined_df[target_cols]  # label columns (targets)
X = joined_df.drop(columns=target_cols)  # all remaining columns (features)

In [None]:
# Confirm the separation by showing both frames' dimensions
for label, frame in (("Features shape:", X), ("Targets shape:", y)):
    print(label, frame.shape)

Features shape: (26707, 44)
Targets shape: (26707, 2)


In [None]:
# Hold out 20% of the rows for validation (seeded for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each resulting piece
for label, part in (
    ("Training features shape:", X_train),
    ("Validation features shape:", X_val),
    ("Training targets shape:", y_train),
    ("Validation targets shape:", y_val),
):
    print(label, part.shape)


Training features shape: (22700, 44)
Validation features shape: (4007, 44)
Training target shape: (22700,)
Validation target shape: (4007,)


In [None]:
# Check for missing values
print("Missing values in training features:", X_train.isnull().sum().sum())
print("Missing values in validation features:", X_val.isnull().sum().sum())

# Fill missing values with -999 as a placeholder.
# Reassign instead of using fillna(inplace=True): X_train / X_val are subsets
# returned by train_test_split, and mutating them in place can raise
# SettingWithCopyWarning. Reassignment also keeps the cell safe to re-run.
X_train = X_train.fillna(-999)
X_val = X_val.fillna(-999)


Missing values in training features: 0
Missing values in validation features: 0


### H1N1 Model

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score, classification_report

# Initialize a baseline XGBoost classifier with hand-picked (untuned) hyperparameters;
# it is fitted and scored in the following cells before the Optuna search.
basic_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',      # AUC metric for evaluation
    tree_method='hist',     # Use 'gpu_hist' for GPU acceleration if available
    max_depth=6,            # Maximum depth of a tree (prevents overfitting)
    learning_rate=0.1,      # Controls the step size for weight updates
    n_estimators=500,       # Number of boosting rounds (trees)
    subsample=0.8,          # Fraction of training samples used for each tree
    colsample_bytree=0.8,   # Fraction of features used for each tree
    gamma=1,                # Minimum loss reduction required for further splits
    min_child_weight=1,     # Minimum sum of weights in a child node
    alpha=0.0,              # L1 regularization weight; 0.0 applies no L1 penalty
    random_state=42         # For reproducibility
)


In [None]:
# Train the baseline on the h1n1_vaccine target only.
# y_train still holds both target columns from the split above, so the column
# has to be selected explicitly for this single-target model.
basic_model.fit(X_train, y_train['h1n1_vaccine'])

In [None]:
# Predict positive-class probabilities for the validation set
y_val_proba = basic_model.predict_proba(X_val)[:, 1]

# Evaluate AUC against the h1n1_vaccine column only
# (y_val holds both target columns at this point in the notebook)
auc_score = roc_auc_score(y_val['h1n1_vaccine'], y_val_proba)
print(f"Baseline XGBoost AUC for h1n1_vaccine: {auc_score:.4f}")

# Classification report for the hard class predictions on the same target
y_val_preds = basic_model.predict(X_val)
print("Classification Report:")
print(classification_report(y_val['h1n1_vaccine'], y_val_preds))

Baseline XGBoost AUC for h1n1_vaccine: 0.8550
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.80      2855
           1       0.77      0.74      0.76      2487

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342



In [None]:
import optuna

def h1n1_objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for h1n1_vaccine.

    Samples one hyperparameter set from ``trial``, fits a classifier on the
    notebook-level X_train against the ``h1n1_vaccine`` column of y_train, and
    scores it on X_val against the same column of y_val.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ROC AUC of the positive-class probabilities on the validation set.
    """
    # Define the parameter space
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',  # Use 'gpu_hist' for GPU acceleration
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
    }

    # Train the XGBoost model on the H1N1 target only: y_train / y_val hold
    # both target columns here, so the column is selected explicitly
    # (same pattern as the shared-hyperparameter objective earlier).
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train['h1n1_vaccine'])

    # Predict probabilities and calculate AUC
    y_val_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val['h1n1_vaccine'], y_val_proba)
    return auc


In [None]:
# Create an Optuna study and optimize. The sampler is seeded so the sequence of
# suggested hyperparameters is repeatable from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 50 trials or 3600 seconds
study.optimize(h1n1_objective, n_trials=50, timeout=3600)

[I 2024-11-27 09:26:27,246] A new study created in memory with name: no-name-9c59f07f-6a7d-4c71-aabd-090c4581562d
[I 2024-11-27 09:26:31,737] Trial 0 finished with value: 0.8354161147405887 and parameters: {'max_depth': 6, 'learning_rate': 0.14736617327957008, 'n_estimators': 411, 'subsample': 0.7774111907796395, 'colsample_bytree': 0.748501166815782, 'lambda': 3.222184722739105, 'alpha': 5.681871918208234}. Best is trial 0 with value: 0.8354161147405887.
[I 2024-11-27 09:26:35,655] Trial 1 finished with value: 0.8268935034853805 and parameters: {'max_depth': 5, 'learning_rate': 0.2105065349307634, 'n_estimators': 597, 'subsample': 0.8262843782419276, 'colsample_bytree': 0.8309806167855158, 'lambda': 7.105309235664158, 'alpha': 3.962075822525857}. Best is trial 0 with value: 0.8354161147405887.
[I 2024-11-27 09:26:38,230] Trial 2 finished with value: 0.8200339480350036 and parameters: {'max_depth': 8, 'learning_rate': 0.25458318784726425, 'n_estimators': 348, 'subsample': 0.71634786672

In [None]:
# Print the best trial's hyperparameters and the objective value it reached
print("Best Parameters for h1n1_vaccine:", study.best_params)
print("Best ROC AUC for h1n1_vaccine:", study.best_value)


Best Parameters for h1n1_vaccine: {'max_depth': 7, 'learning_rate': 0.04561606918951591, 'n_estimators': 423, 'subsample': 0.8324850343116067, 'colsample_bytree': 0.779364188528628, 'lambda': 4.738190503905115, 'alpha': 3.1868647295703854}
Best ROC AUC for h1n1_vaccine: 0.8383608607010133


In [None]:
# Train the final H1N1 model with the best parameters.
# study.best_params only contains the values Optuna sampled, so the settings
# that h1n1_objective() held fixed are re-applied here.
best_params = study.best_params
h1n1_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    **best_params,
)
# y_train holds both target columns at this point; fit on h1n1_vaccine only
h1n1_model.fit(X_train, y_train['h1n1_vaccine'])


In [None]:
# Evaluate on the validation set against the h1n1_vaccine column only
# (y_val holds both target columns at this point in the notebook)
y_val_proba = h1n1_model.predict_proba(X_val)[:, 1]
final_auc_score = roc_auc_score(y_val['h1n1_vaccine'], y_val_proba)
print(f"Final XGBoost AUC for h1n1_vaccine: {final_auc_score:.4f}")

Final XGBoost AUC for h1n1_vaccine: 0.8613


In [None]:
# Classification report for the tuned H1N1 model, compared against the
# h1n1_vaccine column only (y_val holds both target columns here)
y_val_preds = h1n1_model.predict(X_val)
print("Final Classification Report:")
print(classification_report(y_val['h1n1_vaccine'], y_val_preds))

Final Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81      2855
           1       0.78      0.75      0.77      2487

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.79      5342
weighted avg       0.79      0.79      0.79      5342



### Seasonal Model

In [None]:
# Extract the seasonal_vaccine target as a single column
y_seasonal = joined_df['seasonal_vaccine']

# The feature frame X built earlier in this section is reused unchanged


In [None]:
# Stratified 80/20 split on the seasonal_vaccine target: stratify keeps the
# class proportions the same in both parts, and the seed makes it repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_seasonal,
    test_size=0.2,
    random_state=42,
    stratify=y_seasonal,
)

# Report the dimensions of each resulting piece
for label, part in (
    ("Training features shape:", X_train),
    ("Validation features shape:", X_val),
    ("Training target shape:", y_train),
    ("Validation target shape:", y_val),
):
    print(label, part.shape)


Training features shape: (21365, 44)
Validation features shape: (5342, 44)
Training target shape: (21365,)
Validation target shape: (5342,)


In [None]:
# Initialize a fresh baseline XGBoost classifier for the seasonal target, using
# the same hand-picked (untuned) hyperparameters as the H1N1 baseline above
basic_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',      # AUC metric for evaluation
    tree_method='hist',     # Use 'gpu_hist' for GPU acceleration if available
    max_depth=6,            # Maximum depth of a tree (prevents overfitting)
    learning_rate=0.1,      # Controls the step size for weight updates
    n_estimators=500,       # Number of boosting rounds (trees)
    subsample=0.8,          # Fraction of training samples used for each tree
    colsample_bytree=0.8,   # Fraction of features used for each tree
    gamma=1,                # Minimum loss reduction required for further splits
    min_child_weight=1,     # Minimum sum of weights in a child node
    alpha=0.0,              # L1 regularization weight; 0.0 applies no L1 penalty
    random_state=42         # For reproducibility
)


In [None]:

# Fit the baseline on the seasonal training split
basic_model.fit(X_train, y_train)

# Score the validation rows by ROC AUC of the positive-class probabilities
y_val_proba = basic_model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_val_proba)
print(f"Baseline XGBoost AUC for seasonal_vaccine: {auc_score:.4f}")

# Per-class precision / recall / F1 for the hard class predictions
y_val_preds = basic_model.predict(X_val)
print("Classification Report:", classification_report(y_val, y_val_preds), sep="\n")


Baseline XGBoost AUC for seasonal_vaccine: 0.8550
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.80      2855
           1       0.77      0.74      0.76      2487

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342



In [None]:
def seasonal_objective(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model for seasonal_vaccine.

    Samples one hyperparameter set from ``trial``, fits a classifier on the
    notebook-level X_train / y_train and scores it on X_val / y_val.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ROC AUC of the positive-class probabilities on the validation set.
    """
    # Settings held constant for every trial
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',  # Use 'gpu_hist' for GPU acceleration
    }
    # Values sampled for this trial
    param.update({
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
    })

    # Fit on the training split
    classifier = xgb.XGBClassifier(**param)
    classifier.fit(X_train, y_train)

    # Score the positive-class probabilities on the validation split
    positive_proba = classifier.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


In [None]:
# Create an Optuna study and optimize. The sampler is seeded so the sequence of
# suggested hyperparameters is repeatable from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 50 trials or 3600 seconds
study.optimize(seasonal_objective, n_trials=50, timeout=3600)

# Print the best parameters and the best score
print("Best Parameters for seasonal_vaccine:", study.best_params)
print("Best ROC AUC for seasonal_vaccine:", study.best_value)


[I 2024-11-27 09:35:14,579] A new study created in memory with name: no-name-2a0ed610-4be9-41be-8f3b-16412eabac50
[I 2024-11-27 09:35:19,114] Trial 0 finished with value: 0.8582568410022837 and parameters: {'max_depth': 3, 'learning_rate': 0.19123251323761997, 'n_estimators': 916, 'subsample': 0.8629159586969057, 'colsample_bytree': 0.9264075246395871, 'lambda': 9.751728080911152, 'alpha': 9.679484060727221}. Best is trial 0 with value: 0.8582568410022837.
[I 2024-11-27 09:35:23,187] Trial 1 finished with value: 0.8460572208408418 and parameters: {'max_depth': 9, 'learning_rate': 0.141014346019758, 'n_estimators': 865, 'subsample': 0.8544385364429403, 'colsample_bytree': 0.6121812692184435, 'lambda': 7.007103836934582, 'alpha': 6.550423378050255}. Best is trial 0 with value: 0.8582568410022837.
[I 2024-11-27 09:35:24,273] Trial 2 finished with value: 0.8578670029864578 and parameters: {'max_depth': 7, 'learning_rate': 0.01521813368261019, 'n_estimators': 211, 'subsample': 0.72841142206

Best Parameters for seasonal_vaccine: {'max_depth': 3, 'learning_rate': 0.08882348477386529, 'n_estimators': 996, 'subsample': 0.7437243195263293, 'colsample_bytree': 0.6614805378795324, 'lambda': 3.2290125736102437, 'alpha': 9.979286188017088}
Best ROC AUC for seasonal_vaccine: 0.861322026904175


In [None]:
# Train the final seasonal model with the best parameters.
# study.best_params only contains the values Optuna sampled, so the settings
# that seasonal_objective() held fixed are re-applied here to train with the
# same configuration that was tuned.
best_params = study.best_params
seasonal_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    **best_params,
)
seasonal_model.fit(X_train, y_train)


In [None]:
# Evaluate the tuned seasonal model on the validation set:
# ROC AUC of the positive-class probabilities (column 1 of predict_proba)
y_val_proba = seasonal_model.predict_proba(X_val)[:, 1]
final_auc_score = roc_auc_score(y_val, y_val_proba)
print(f"Final XGBoost AUC for seasonal_vaccine: {final_auc_score:.4f}")

Final XGBoost AUC for seasonal_vaccine: 0.8613


In [None]:

# Classification report for the tuned seasonal model's hard class predictions
y_val_preds = seasonal_model.predict(X_val)
print("Final Classification Report:")
print(classification_report(y_val, y_val_preds))

Final Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81      2855
           1       0.78      0.75      0.77      2487

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.79      5342
weighted avg       0.79      0.79      0.79      5342



**Predict on Test Set**

In [None]:
# Load the test-set features (row index: respondent_id) and show their dimensions
test_features_df = pd.read_csv("treemodel_test_data.csv", index_col="respondent_id")
test_features_df.shape

(26708, 44)

In [None]:
# Display the test features
test_features_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,hhs_geo_region_atmpeygn,hhs_geo_region_bhuqouqj,hhs_geo_region_dqpwygqj,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Positive-class probabilities for the test rows from the tuned H1N1 model
h1n1_test_proba = h1n1_model.predict_proba(test_features_df)[:, 1]

# Positive-class probabilities for the test rows from the tuned seasonal model
seasonal_test_proba = seasonal_model.predict_proba(test_features_df)[:, 1]

# Assemble the submission table: respondent id plus one probability column per target
submission = pd.DataFrame(dict(
    respondent_id=test_features_df.index,
    h1n1_vaccine=h1n1_test_proba,
    seasonal_vaccine=seasonal_test_proba,
))

In [None]:
# Preview the first rows of the separate-models submission table
submission.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.10389,0.248065
1,26708,0.039672,0.059094
2,26709,0.534211,0.804516
3,26710,0.640749,0.868613
4,26711,0.262284,0.41901


In [None]:
# Save submission file to CSV.
# index=False: respondent_id is already a regular column of `submission`, so
# writing the default row index as well would add a spurious unnamed first
# column to the file (the other submission cell in this notebook also uses
# index=False).
submission_path = 'Separate XGBoost Models submission.csv'
submission.to_csv(submission_path, index=False)
# Report the name of the file that was actually written
print(f"Submission file saved as '{submission_path}'")


Submission file saved as 'submission.csv'


In [None]:
# Load a previously saved submission file for inspection, indexed by respondent_id.
# NOTE(review): "separate_model_submission.csv" is not the file name written by the
# save cell above ('Separate XGBoost Models submission.csv') — confirm which file
# is intended here.
separate_df = pd.read_csv("separate_model_submission.csv",
                               index_col="respondent_id")
separate_df

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.035659,0.028121
26708,0.026314,0.021786
26709,0.040227,0.030755
26710,0.104622,0.080550
26711,0.075981,0.076892
...,...,...
53410,0.083258,0.073584
53411,0.053661,0.053549
53412,0.020437,0.023202
53413,0.021263,0.042632
