In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [71]:
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

In [72]:
train_data = train_features.merge(train_labels, on='respondent_id')

X = train_data.drop(columns=['xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

In [73]:
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [74]:
binary_cols = [col for col in numerical_cols if X[col].dropna().isin([0, 1]).all()]

for col in binary_cols:
    X[col].fillna(X[col].mode()[0], inplace=True)
    test_features[col].fillna(test_features[col].mode()[0], inplace=True)


In [75]:
non_binary_cols = [col for col in numerical_cols if col not in binary_cols]
for col in non_binary_cols:
    X[col].fillna(X[col].median(), inplace=True)
    test_features[col].fillna(test_features[col].median(), inplace=True)

for col in categorical_cols:
    X[col].fillna(X[col].mode()[0], inplace=True)
    test_features[col].fillna(test_features[col].mode()[0], inplace=True)


In [76]:
# One-hot encoding krdete h category wlo ki
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
test_features = pd.get_dummies(test_features, columns=categorical_cols, drop_first=True)


In [77]:
test_features = test_features.reindex(columns=X.columns, fill_value=0)


scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
test_features[numerical_cols] = scaler.transform(test_features[numerical_cols])

In [83]:
#guyss datasplit krdege ab ham
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# modelbna kr training krdege ab jldi jldi
model_xyz = LogisticRegression(random_state=42, max_iter=1000)
model_seasonal = LogisticRegression(random_state=42, max_iter=1000)

model_xyz.fit(X_train, y_train['xyz_vaccine'])
model_seasonal.fit(X_train, y_train['seasonal_vaccine'])


In [79]:
#prediction timeee
y_pred_xyz = model_xyz.predict_proba(X_valid)[:, 1] 
y_pred_seasonal = model_seasonal.predict_proba(X_valid)[:, 1]  


In [80]:
# ROC AUC score :/
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_seasonal)
roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2
print(f'Validation ROC AUC Score: {roc_auc}')

Validation ROC AUC Score: 0.8437459341505344


In [81]:
test_pred_xyz = model_xyz.predict_proba(test_features)[:, 1]  
test_pred_seasonal = model_seasonal.predict_proba(test_features)[:, 1]  

In [82]:
submission = pd.DataFrame({
    'respondent_id': test_features.index,
    'xyz_vaccine': test_pred_xyz,
    'seasonal_vaccine': test_pred_seasonal
})

submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

Submission file created successfully!
