<a href="https://colab.research.google.com/github/Ishanml/Vaccine/blob/main/Vaccine(IITG).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Preprocessing, Model Training and Validation

In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Load the dataset
data = pd.read_csv('training_set_features.csv')
Labels = pd.read_csv('training_set_labels.csv')
data = pd.concat([data,Labels])

# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Encode categorical features
categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty',
                        'marital_status', 'rent_or_own', 'employment_status',
                        'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']
numeric_features = data.columns.difference(categorical_features + ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Apply preprocessing
X = data_imputed.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)
y = data_imputed[['xyz_vaccine', 'seasonal_vaccine']]

# Convert target columns to integer type
y = y.astype(int)

# Preprocess features
X_preprocessed = preprocessor.fit_transform(X)

# Split the data
X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42, stratify=y)

# Define the model
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred_proba = model.predict_proba(X_valid)
y_pred_proba = np.array([pred[:, 1] for pred in y_pred_proba]).T

# Calculate ROC AUC for each target
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_proba[:, 0])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_proba[:, 1])

# Mean ROC AUC
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])
print(f'Mean ROC AUC: {mean_roc_auc}')


Mean ROC AUC: 0.8055804042706749


## Prediction and submission

In [9]:
# Predict on the test set
test_data = pd.read_csv('test_set_features.csv')

# Add missing target columns to the test data (filled with NaN)
test_data['xyz_vaccine'] = np.nan
test_data['seasonal_vaccine'] = np.nan

test_data_imputed = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)

# Now drop the target columns before further processing
X_test = test_data_imputed.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)
X_test_preprocessed = preprocessor.transform(X_test)

# Make predictions
test_pred_proba = model.predict_proba(X_test_preprocessed)
test_pred_proba = np.array([pred[:, 1] for pred in test_pred_proba]).T

# Prepare submission file
submission = pd.DataFrame({
    'respondent_id': test_data['respondent_id'],
    'xyz_vaccine': test_pred_proba[:, 0],
    'seasonal_vaccine': test_pred_proba[:, 1]
})

submission.to_csv('submission.csv', index=False)