In [15]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [16]:
# Load the raw feature table; the path is relative to the notebook's working directory.
features_path = 'test_set_features.csv'
data = pd.read_csv(features_path)

In [17]:
# Separate the model inputs from the identifier and the two target columns.
# NOTE(review): both targets are columns of the file loaded as
# 'test_set_features.csv' — confirm these are the intended labels and that
# this is the intended training file.
id_and_target_columns = ['respondent_id', 'doctor_recc_xyz', 'doctor_recc_seasonal']
X = data.drop(columns=id_and_target_columns)
y_xyz, y_seasonal = data['doctor_recc_xyz'], data['doctor_recc_seasonal']

In [19]:
# Group the feature columns by dtype so each group can be routed to its own
# preprocessing branch: object columns are treated as categorical, int64 and
# float64 columns as numerical.
categorical_features, numerical_features = (
    X.select_dtypes(include=dtypes).columns
    for dtypes in (['object'], ['int64', 'float64'])
)

In [20]:
# Per-group preprocessing: one-hot encode the columns in categorical_features
# (categories unseen at fit time are ignored at transform time) and
# standardise the columns in numerical_features.
# NOTE(review): there is no imputation step here — confirm the downstream
# estimator tolerates missing feature values.
onehot_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=onehot_steps)

scaler_steps = [('scaler', StandardScaler())]
numerical_transformer = Pipeline(steps=scaler_steps)

# Route each column group to its transformer; columns in neither group are
# not listed here.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [21]:
# Seed the forest so repeated runs produce the same fitted model, scores and
# predictions; 42 matches the random_state already used for train_test_split.
model = RandomForestClassifier(random_state=42)

In [22]:
# Chain preprocessing and the classifier so both are fitted together and
# applied consistently at prediction time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [23]:

mask_xyz = ~y_xyz.isna()
X_cleaned_xyz = X[mask_xyz]
y_cleaned_xyz = y_xyz[mask_xyz]


mask_seasonal = ~y_seasonal.isna()
X_cleaned_seasonal = X[mask_seasonal]
y_cleaned_seasonal = y_seasonal[mask_seasonal]


X_train, X_test, y_train_xyz, y_test_xyz = train_test_split(X_cleaned_xyz, y_cleaned_xyz, test_size=0.2, stratify=y_cleaned_xyz, random_state=42)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X_cleaned_seasonal, y_cleaned_seasonal, test_size=0.2, stratify=y_cleaned_seasonal, random_state=42)


In [24]:
pipeline.fit(X_train, y_train_xyz)
y_pred_xyz = pipeline.predict_proba(X_test)[:, 1]
roc_auc_xyz = roc_auc_score(y_test_xyz, y_pred_xyz)

In [25]:
pipeline.fit(X_train, y_train_seasonal)
y_pred_seasonal = pipeline.predict_proba(X_test)[:, 1]
roc_auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_seasonal)


In [26]:
predictions_xyz = pipeline.predict_proba(X)[:, 1]
predictions_seasonal = pipeline.predict_proba(X)[:, 1]

In [27]:
# Assemble the output table: one row per respondent with the two predicted
# probabilities, then write it to CSV without the index column.
# NOTE(review): the output columns are named 'xyz_vaccine' / 'seasonal_vaccine'
# while the models upstream are fitted on the 'doctor_recc_*' columns —
# confirm the predictions match what these column names promise.
submission_columns = {
    'respondent_id': data['respondent_id'],
    'xyz_vaccine': predictions_xyz,
    'seasonal_vaccine': predictions_seasonal,
}
Gaurav_datahack = pd.DataFrame(submission_columns)

Gaurav_datahack.to_csv('Gaurav_datahack.csv', index=False)