In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

In [6]:
data = pd.read_csv('test_set_features.csv')

In [7]:
X = data.drop(['respondent_id', 'doctor_recc_xyz', 'doctor_recc_seasonal'], axis=1)
y_xyz = data['doctor_recc_xyz']
y_seasonal = data['doctor_recc_seasonal']

In [8]:
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

In [9]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
model = SVC(probability=True)

In [11]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

In [16]:

mask_xyz = ~y_xyz.isna()
X_cleaned_xyz = X[mask_xyz]
y_cleaned_xyz = y_xyz[mask_xyz]

mask_seasonal = ~y_seasonal.isna()
X_cleaned_seasonal = X[mask_seasonal]
y_cleaned_seasonal = y_seasonal[mask_seasonal]


X_train, X_test, y_train_xyz, y_test_xyz = train_test_split(X_cleaned_xyz, y_cleaned_xyz, test_size=0.2, stratify=y_cleaned_xyz, random_state=42)
X_train, X_test, y_train_seasonal, y_test_seasonal = train_test_split(X_cleaned_seasonal, y_cleaned_seasonal, test_size=0.2, stratify=y_cleaned_seasonal, random_state=42)


In [17]:
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': [1, 0.1, 0.01],
    'classifier__kernel': ['rbf']
}

In [19]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

In [21]:
grid_search.fit(X_train, y_train_xyz)
best_model_xyz = grid_search.best_estimator_
y_pred_xyz = best_model_xyz.predict_proba(X_test)[:, 1]
roc_auc_xyz = roc_auc_score(y_test_xyz, y_pred_xyz)

ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 476, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\svm\_base.py", line 190, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1273, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1059, in check_array
    _assert_all_finite(
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 126, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 175, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [22]:
grid_search.fit(X_train, y_train_seasonal)
best_model_seasonal = grid_search.best_estimator_
y_pred_seasonal = best_model_seasonal.predict_proba(X_test)[:, 1]
roc_auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_seasonal)

ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 476, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\svm\_base.py", line 190, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1273, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1059, in check_array
    _assert_all_finite(
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 126, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\engin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 175, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [None]:
predictions_xyz = pipeline.predict_proba(X)[:, 1]
predictions_seasonal = pipeline.predict_proba(X)[:, 1]

In [None]:
Gaurav_datahack = pd.DataFrame({
    'respondent_id': data['respondent_id'],
    'xyz_vaccine': predictions_xyz,
    'seasonal_vaccine': predictions_seasonal
})

Gaurav_datahack.to_csv('Gaurav_datahack.csv', index=False)