In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [3]:
# Read the three input CSVs: training labels, training features, test features
train_labels, train_features, test_features = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_labels.csv',
        'training_set_features.csv',
        'test_set_features.csv',
    )
)

In [4]:
# Join the training features with their labels on the shared respondent_id key
train_data = train_features.merge(train_labels, on='respondent_id')

In [5]:
# Inspecting the data: preview the first rows, then summarise columns and dtypes
print(train_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train_data.info()

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [6]:
# Fit the imputer on the training features only (train_features holds no label
# columns), then reuse the fitted most-frequent values to fill the test features.
imputer = SimpleImputer(strategy='most_frequent')
# The original code had to re-label the imputed frames afterwards, i.e. the
# transformed result carries no column names. Attach them at construction so the
# frames are never left with positional 0..N labels.
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(train_features),
    columns=train_features.columns,
)
X_test_imputed = pd.DataFrame(
    imputer.transform(test_features),
    columns=test_features.columns,
)

In [7]:
# Carry the source frames' column labels over to the imputed frames
X_train_imputed = X_train_imputed.set_axis(train_features.columns, axis=1)
X_test_imputed = X_test_imputed.set_axis(test_features.columns, axis=1)


In [8]:
# Attach the label columns to the imputed training features, matching on respondent_id
train_data_imputed = X_train_imputed.merge(train_labels, on='respondent_id')

In [9]:
# Pull out the two target series, then keep the remaining columns as model inputs
y_xyz, y_seasonal = (
    train_data_imputed[target] for target in ('xyz_vaccine', 'seasonal_vaccine')
)
X = train_data_imputed.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

In [10]:
# Test inputs: every imputed column except the respondent identifier
X_test = X_test_imputed.drop(columns=['respondent_id'])

In [11]:
# Categorical columns that will be one-hot encoded
categorical_features = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [12]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# rows and then reused, unchanged, for the test rows.
# drop='first' removes one indicator column per feature; handle_unknown='ignore'
# makes a category that only appears at transform time encode as all zeros
# instead of raising.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
# Label the encoded columns with the encoder's own feature names and keep each
# source frame's index, so a later column-wise concat lines up row-for-row.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[categorical_features]),
    columns=encoder.get_feature_names_out(),
    index=X.index,
)
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[categorical_features]),
    columns=encoder.get_feature_names_out(),
    index=X_test.index,
)

In [13]:
# Swap the raw categorical columns for their one-hot encoded counterparts
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_features), X_test_encoded], axis=1)

In [14]:
# Make every column label a string in both feature frames
X.columns = X.columns.map(str)
X_test.columns = X_test.columns.map(str)

In [15]:
# Split the data into training and validation sets (test_size=0.2).
# Both targets go through a single train_test_split call so they are partitioned
# with exactly the same rows as X, instead of relying on two separate calls
# staying in sync through a repeated random_state. This also avoids assigning
# to `_`, which IPython uses for the last cell output.
(
    X_train, X_val,
    y_train_xyz, y_val_xyz,
    y_train_seasonal, y_val_seasonal,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.2, random_state=42)

In [16]:
# Fit one random forest per target on the same training features (seed fixed at 42)
model_xyz = RandomForestClassifier(random_state=42)
model_xyz.fit(X_train, y_train_xyz)

model_seasonal = RandomForestClassifier(random_state=42)
model_seasonal.fit(X_train, y_train_seasonal)


In [17]:
# Validation-set scores: keep the second predict_proba column from each fitted model
val_pred_xyz, val_pred_seasonal = (
    fitted.predict_proba(X_val)[:, 1] for fitted in (model_xyz, model_seasonal)
)

In [18]:
# Score each model on the validation split with ROC AUC and report the result
roc_auc_xyz = roc_auc_score(y_val_xyz, val_pred_xyz)
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')

roc_auc_seasonal = roc_auc_score(y_val_seasonal, val_pred_seasonal)
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')

ROC AUC for xyz_vaccine: 0.8307497961996488
ROC AUC for seasonal_vaccine: 0.8523577511829576


In [19]:
# Test-set scores: keep the second predict_proba column from each fitted model
test_pred_xyz, test_pred_seasonal = (
    fitted.predict_proba(X_test)[:, 1] for fitted in (model_xyz, model_seasonal)
)

# Echo the xyz_vaccine predictions as the cell output
test_pred_xyz


array([0.37, 0.06, 0.71, ..., 0.28, 0.25, 0.54])

In [25]:
# Display the seasonal_vaccine model's predicted values for the test set
test_pred_seasonal

array([0.37, 0.06, 0.71, ..., 0.28, 0.25, 0.54])

In [20]:

# Assemble the submission table: respondent ids alongside both sets of predictions
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=test_pred_xyz,
    seasonal_vaccine=test_pred_seasonal,
)

In [23]:

# Write the submission to disk without the row index, then report completion
submission.to_csv(path_or_buf='submission_rfr.csv', index=False)
print("Submission file created successfully!")


Submission file created successfully!


In [24]:
# Read the written submission file back from disk to inspect its contents
pd.read_csv('submission_rfr.csv')

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.19,0.37
1,26708,0.04,0.06
2,26709,0.52,0.71
3,26710,0.57,0.84
4,26711,0.25,0.49
...,...,...,...
26703,53410,0.29,0.48
26704,53411,0.18,0.36
26705,53412,0.04,0.28
26706,53413,0.06,0.25
