<a href="https://colab.research.google.com/github/Navaneeth1174/Samala_Navaneeth_Datahack-/blob/main/Summer_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

train_features = pd.read_csv('/content/drive/MyDrive/Summer Analytics/training_set_features.csv')
train_labels = pd.read_excel('/content/drive/MyDrive/Summer Analytics/training set labels 1.xlsx')
test_features = pd.read_csv('/content/drive/MyDrive/Summer Analytics/training_set_features.csv')

train_data = train_features.merge(train_labels, on='respondent_id')

X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_valid_pred = pipeline.predict_proba(X_valid)
y_valid_pred = np.hstack([pred[:, 1].reshape(-1, 1) for pred in y_valid_pred])

roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_valid_pred[:, 0])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_valid_pred[:, 1])
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print(f'ROC AUC for xyz vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

X_test = test_features.drop(columns=['respondent_id'])

y_test_pred = pipeline.predict_proba(X_test)
y_test_pred = np.hstack([pred[:, 1].reshape(-1, 1) for pred in y_test_pred])

submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_test_pred[:, 0],
    'seasonal_vaccine': y_test_pred[:, 1]
})

submission.to_csv('/content/drive/MyDrive/Summer Analytics/submission.csv', index=False)
submission.head()


ROC AUC for xyz vaccine: 0.8294325525888947
ROC AUC for seasonal vaccine: 0.8518072872366175
Mean ROC AUC: 0.8406199199127561


Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0.0,0.06
1,1,0.1,0.68
2,2,0.01,0.03
3,3,0.02,0.95
4,4,0.01,0.08
