# **Problem Statement : Predicting probabilities for xyz_vaccine and seasonal_vaccine**

# 1. **Imports**

In [None]:
# to work with dataframes
import pandas as pd
import numpy as np

# to split data into training and validation sets
from sklearn.model_selection import train_test_split

# to impute missing values
from sklearn.impute import SimpleImputer

# to normalize data
from sklearn.preprocessing import StandardScaler

# to build the model
from sklearn.ensemble import RandomForestClassifier

# to evaluate the model
from sklearn.metrics import roc_auc_score


# **2. Load Datasets**

# 2.1 Load Training and Testing data

In [None]:
# Training set: features and labels are stored in separate CSV files
train_data = pd.read_csv('training_set_features.csv')
labels = pd.read_csv('training_set_labels.csv')

# Attach the labels to the features using the shared `respondent_id` key
training_data = pd.merge(train_data, labels, on='respondent_id')

# Test-set features
testing_data = pd.read_csv('test_set_features.csv')

# **3. Data preprocessing for training data**

# 3.1 Separation of features (X) and target variables (y)

In [None]:
# Model inputs: every column except the row identifier and the two target columns
X = training_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

# One target series per vaccine
y_xyz, y_seasonal = training_data['xyz_vaccine'], training_data['seasonal_vaccine']


In [None]:
# Preview the first rows of the feature matrix (identifier and targets already removed)
X.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [None]:
# Preview the first rows of the xyz_vaccine target series
y_xyz.head()

0    0
1    0
2    0
3    0
4    0
Name: xyz_vaccine, dtype: int64

# 3.2 One Hot Encoding

In [None]:
# One-hot encode the training features and the test features with the same settings.
# NOTE(review): the two frames are encoded independently, so if a categorical column
# has different levels in train and test, `drop_first=True` could drop a different
# level in each frame — confirm the level sets match.
encoded_X_train, encoded_test_data = (
    pd.get_dummies(frame, drop_first=True) for frame in (X, testing_data)
)

# testing data after one hot encoding
encoded_test_data.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Preview the first rows of the training features after one-hot encoding
encoded_X_train.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,True,False,False,False
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,False,False,False,False,False,False,False,False,True,False
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Give the encoded test frame exactly the same columns as the encoded training frame.
# join='left' keeps the columns of encoded_X_train: columns present only in
# encoded_test_data (e.g. respondent_id) are dropped, and training columns missing
# from the test frame are added and filled with 0.
# Note: `X` is rebound here from the raw features to the encoded training features.
X, test_data = encoded_X_train.align(encoded_test_data, join='left', axis=1, fill_value=0)

# 3.3 Splitting the training data into training and validation sets

In [None]:
# Split the features and BOTH targets in one call so that all of them are guaranteed
# to share the same row partition (75% training / 25% validation). Previously the
# features were split twice and the result only lined up because both calls used the
# same random_state.
(
    X_train, X_val,
    y_xyz_train, y_xyz_val,
    y_seasonal_train, y_seasonal_val,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.25, random_state=56)

In [None]:
# Learn the per-column means from the training split only, so that no statistics
# from the validation or test rows influence the fill values
imputer = SimpleImputer(strategy='mean').fit(X_train)

# Fill missing values in every split with the training means
X_train_imputed = imputer.transform(X_train)
X_val_imputed, test_data_imputed = map(imputer.transform, (X_val, test_data))

# 3.4 Data Normalization

In [None]:
# Standardize every feature using statistics learned from the training split only.
scaler = StandardScaler()

# Keep all three splits as DataFrames with the same column names. The models are
# fitted on a DataFrame, and scikit-learn warns about missing feature names when an
# estimator fitted with names is later given a bare array to predict on.
normalized_X_train = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=X_train.columns)
normalized_X_val = pd.DataFrame(scaler.transform(X_val_imputed), columns=X_train.columns)
normalized_test_data = pd.DataFrame(scaler.transform(test_data_imputed), columns=X_train.columns)
normalized_X_train.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,-0.681793,-2.041194,-0.22644,0.616005,-0.27281,0.463435,1.339451,1.404925,-1.452913,-0.552699,...,-0.145455,-0.103181,-0.120782,-0.117112,-0.131013,-0.116012,-0.201831,-0.138502,-0.267547,-0.098895
1,-1.782847,-0.420767,-0.22644,-1.636107,-0.27281,-2.161253,-0.74908,-0.713813,-1.452913,-0.552699,...,-0.145455,-0.103181,-0.120782,-0.117112,-0.131013,-0.116012,-0.201831,-0.138502,-0.267547,-0.098895
2,0.41926,-0.420767,-0.22644,0.616005,-0.27281,0.463435,-0.74908,-0.713813,0.691448,-0.552699,...,-0.145455,-0.103181,-0.120782,-0.117112,-0.131013,-0.116012,4.954641,-0.138502,-0.267547,-0.098895
3,1.520313,-0.420767,-0.22644,-1.636107,-0.27281,0.463435,1.339451,1.404925,0.691448,-0.552699,...,-0.145455,-0.103181,-0.120782,-0.117112,-0.131013,-0.116012,-0.201831,-0.138502,-0.267547,-0.098895
4,-0.681793,-0.420767,-0.22644,-1.636107,-0.27281,0.463435,-0.74908,-0.713813,-1.452913,-0.552699,...,-0.145455,-0.103181,-0.120782,-0.117112,-0.131013,-0.116012,-0.201831,-0.138502,-0.267547,-0.098895


# **4. Building the Random Forest Classifier Model**

# 4.1 Initialization

In [None]:
# Two separate forests with identical hyper-parameters, one per vaccine target
xyz_model, seasonal_model = (
    RandomForestClassifier(n_estimators=100, random_state=56) for _ in range(2)
)

# 4.2 Train the model

In [None]:
# Both models see the same normalized training features; only the target differs
xyz_model.fit(X=normalized_X_train, y=y_xyz_train)
seasonal_model.fit(X=normalized_X_train, y=y_seasonal_train)

# 4.3 Prediction for validation set

In [None]:
# Class-probability matrices for the validation rows (one column per class)
xyz_val_proba = xyz_model.predict_proba(normalized_X_val)
seasonal_val_proba = seasonal_model.predict_proba(normalized_X_val)

# Keep column 1 only — assumes the labels are 0/1 so that column 1 is P(label == 1)
y_xyz_pred = xyz_val_proba[:, 1]
y_seasonal_pred = seasonal_val_proba[:, 1]

# 4.4 Calculation of ROC AUC Scores

In [None]:
# Area under the ROC curve on the validation split, one score per target
xyz_roc_auc = roc_auc_score(y_true=y_xyz_val, y_score=y_xyz_pred)
seasonal_roc_auc = roc_auc_score(y_true=y_seasonal_val, y_score=y_seasonal_pred)


In [None]:
# Report both validation scores, one per line, rounded to four decimals
print(
    f'xyz Vaccine AUC-ROC: {xyz_roc_auc:.4f}',
    f'Seasonal Vaccine AUC-ROC: {seasonal_roc_auc:.4f}',
    sep='\n',
)

xyz Vaccine AUC-ROC: 0.8528
Seasonal Vaccine AUC-ROC: 0.8572


In [None]:
# Unweighted average of the two per-target validation scores
mean_roc_auc = sum((xyz_roc_auc, seasonal_roc_auc)) / 2
print(f'Mean ROC AUC Score: {mean_roc_auc:.4f}')

Mean ROC AUC Score: 0.8550


# 4.5 Prediction for test data

In [None]:
# Column-1 probabilities for the test rows, one array per model
# (assumes the labels are 0/1 so that column 1 is P(label == 1))
xyz_test_pred, seasonal_test_pred = (
    model.predict_proba(normalized_test_data)[:, 1]
    for model in (xyz_model, seasonal_model)
)

## **5. Dataframe containing final prediction**

In [None]:
# Assemble the submission table: one row per test respondent with the predicted
# probability for each vaccine.
# The ids must come from the original test file. `test_data` lost its `respondent_id`
# column during column alignment, and a positional range (0, 1, 2, ...) does not match
# the real ids. Row order is unchanged by the preprocessing steps, so the ids line up
# with the predictions position by position.
predicted_answer = pd.DataFrame({
    'respondent_id': testing_data['respondent_id'].to_numpy(),
    'xyz_vaccine': xyz_test_pred,
    'seasonal_vaccine': seasonal_test_pred,
})

In [None]:
# Preview the first rows of the submission table
predicted_answer.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0.3,0.35
1,1,0.06,0.11
2,2,0.29,0.72
3,3,0.55,0.87
4,4,0.34,0.49


In [None]:
# Write the submission file without the DataFrame index column
predicted_answer.to_csv('predictedAnswer.csv', index=False)

# Offer a browser download only when running inside Google Colab. Elsewhere the
# `google.colab` package is unavailable and the CSV simply stays in the working
# directory, so the notebook still runs to completion.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('predictedAnswer.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>