# HACKATHON SUBMISSION WEEK 3

###### IMPORTING NECESSARY LIBRARIES

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

###### LOADING NECESSARY FILES

In [21]:
train_features = pd.read_csv(r"C:\Users\Ritama\Downloads\dataset and all\training_set_features.csv")
train_labels = pd.read_csv(r"C:\Users\Ritama\Downloads\dataset and all\training_set_labels.csv")
test_features = pd.read_csv(r"C:\Users\Ritama\Downloads\dataset and all\test_set_features.csv")

###### IDENTIFYING CATEGORICAL COLUMNS

In [22]:
categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty', 
                        'marital_status', 'rent_or_own', 'employment_status', 
                        'hhs_geo_region', 'census_msa', 'employment_industry', 
                        'employment_occupation']

In [23]:

# Create a column transformer with one-hot encoder for categorical features and simple imputer for numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), [col for col in train_features.columns if col not in categorical_features]),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create the RandomForest model pipeline
model_xyz = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

model_seasonal = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data
X = train_features
y_xyz = train_labels['xyz_vaccine']
y_seasonal = train_labels['seasonal_vaccine']
X_train, X_val, y_train_xyz, y_val_xyz = train_test_split(X, y_xyz, test_size=0.2, random_state=42)
_, _, y_train_seasonal, y_val_seasonal = train_test_split(X, y_seasonal, test_size=0.2, random_state=42)

# Train the model for xyz_vaccine
model_xyz.fit(X_train, y_train_xyz)
preds_val_xyz = model_xyz.predict_proba(X_val)[:, 1]

# Train the model for seasonal_vaccine
model_seasonal.fit(X_train, y_train_seasonal)
preds_val_seasonal = model_seasonal.predict_proba(X_val)[:, 1]

# Predict for the test set
X_test = test_features
test_preds_xyz = model_xyz.predict_proba(X_test)[:, 1]
test_preds_seasonal = model_seasonal.predict_proba(X_test)[:, 1]





###### CALCULATING ROC AUC SCORE

In [24]:
# Calculate ROC AUC scores
roc_auc_xyz = roc_auc_score(y_val_xyz, preds_val_xyz)
roc_auc_seasonal = roc_auc_score(y_val_seasonal, preds_val_seasonal)

# Calculate overall score (mean of ROC AUC scores)
overall_score = (roc_auc_xyz + roc_auc_seasonal) / 2.0

print(f'ROC AUC Score for xyz_vaccine: {roc_auc_xyz:.4f}')
print(f'ROC AUC Score for seasonal_vaccine: {roc_auc_seasonal:.4f}')
print(f'Overall Score (Mean ROC AUC): {overall_score:.4f}')

ROC AUC Score for xyz_vaccine: 0.8294
ROC AUC Score for seasonal_vaccine: 0.8521
Overall Score (Mean ROC AUC): 0.8408


###### SAVING SUBMISSION FILE

In [25]:
# Prepare the submission file
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_preds_xyz,
    'seasonal_vaccine': test_preds_seasonal
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')


Submission file saved as submission.csv
