IMPORT ALL NECESSARY LIBRARIES

In [10]:
import os
import zipfile
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score


LOAD ALL THE DATASETS

In [11]:
# Unpack the competition archive so the CSV files can be read from disk.
file = '/content/dataset and all.zip'
# The context manager closes the archive even when extraction raises, which
# the manual open/close sequence did not guarantee.
with zipfile.ZipFile(file, 'r') as file_reading:
    file_reading.extractall('/content/dataset and all')

In [12]:
# Read the four extracted CSV files, in this order: training features,
# training labels, test features, submission template.
training_features, training_labels, testing_features, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/dataset and all/training_set_features.csv',
        '/content/dataset and all/training_set_labels.csv',
        '/content/dataset and all/test_set_features.csv',
        '/content/dataset and all/submission_format.csv',
    )
)

TRAINING AND TESTING DATASETS

In [13]:
# Preview the first five rows of the training features.
training_features.head(n=5)

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [14]:
# Preview the first five rows of the training labels.
training_labels.head(n=5)

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [15]:
# Preview the first five rows of the test features.
testing_features.head(n=5)

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


DROP THE RESPONDENT_ID COLUMN

In [16]:
# From here on features and labels are paired purely by row position, so make
# sure both files list the same respondents in the same order before the
# identifier column is discarded.
if not training_features['respondent_id'].equals(training_labels['respondent_id']):
    raise ValueError('training features and labels are not aligned on respondent_id')

# respondent_id is a row identifier, so it is removed from the model inputs
# and from the targets.
x = training_features.drop(columns=['respondent_id'])
y = training_labels.drop(columns=['respondent_id'])
test_data = testing_features.drop(columns=['respondent_id'])

SPLIT THE TRAINING DATASET

In [17]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

PREPROCESSING

In [18]:
# Split the feature columns into numeric and non-numeric groups.
# select_dtypes puts every column into exactly one of the two lists; the
# previous exact match on 'object' / 'int64' / 'float64' left columns of any
# other dtype (for example int32 or category) in neither list, and
# ColumnTransformer discards columns that no transformer claims.
num_data = x.select_dtypes(include='number').columns.tolist()
cat_data = x.select_dtypes(exclude='number').columns.tolist()

# Numeric columns: fill missing values with the median, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Non-numeric columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that were not present when the encoder was fitted.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocess = ColumnTransformer(transformers=[
    ('num', num_transformer, num_data),
    ('cat', cat_transformer, cat_data),
])

SVM MODEL

In [19]:
# probability=True makes SVC calibrate its probabilities with an internal
# cross-validation that shuffles the data randomly. Pinning random_state makes
# predict_proba output, and every score derived from it, reproducible.
svm_model = SVC(probability=True, random_state=42)

In [21]:
# Full model: preprocessing followed by a multi-output wrapper around the SVM,
# so both target columns are handled by a single estimator object.
pipeline_steps = [
    ('preprocessor', preprocess),
    ('classifier', MultiOutputClassifier(svm_model)),
]
result = Pipeline(steps=pipeline_steps)

In [22]:
# Fit the preprocessing steps and the classifiers on the training split.
result.fit(X=x_train, y=y_train)

In [32]:
# Class probabilities for the held-out split: a list with one array per target.
y_predict = result.predict_proba(X=x_test)

ROC AND AUC

In [34]:
# Take column 1 of each probability array as the score for that target:
# entry 0 is paired with xyz_vaccine and entry 1 with seasonal_vaccine.
xyz_scores = y_predict[0][:, 1]
seasonal_scores = y_predict[1][:, 1]

xyz_roc_auc = roc_auc_score(y_test['xyz_vaccine'], xyz_scores)
seasonal_roc_auc = roc_auc_score(y_test['seasonal_vaccine'], seasonal_scores)
mean_score = (xyz_roc_auc + seasonal_roc_auc) / 2

# One value per line: xyz AUC, seasonal AUC, then their mean.
print(xyz_roc_auc, seasonal_roc_auc, mean_score, sep='\n')

0.8051805208884855
0.8553952875883046
0.830287904238395


TESTING DATA

In [35]:
# Class probabilities for the unlabelled test set, one array per target.
predict_test_data = result.predict_proba(X=test_data)

In [36]:
# Display the raw probability arrays returned by predict_proba for the test set.
predict_test_data

[array([[0.91274022, 0.08725978],
        [0.86997132, 0.13002868],
        [0.71343643, 0.28656357],
        ...,
        [0.87601234, 0.12398766],
        [0.93483196, 0.06516804],
        [0.53960974, 0.46039026]]),
 array([[0.81225206, 0.18774794],
        [0.86626838, 0.13373162],
        [0.30790309, 0.69209691],
        ...,
        [0.82906583, 0.17093417],
        [0.70613491, 0.29386509],
        [0.22593731, 0.77406269]])]

CONVERT TO SUBMISSION FILE

In [38]:
# Start from the respondent ids of the test set and append, for each target,
# column 1 of the corresponding probability array.
submissionfile = testing_features[['respondent_id']].assign(
    xyz_vaccine=predict_test_data[0][:, 1],
    seasonal_vaccine=predict_test_data[1][:, 1],
)


In [39]:
# Write the predictions to disk; index=False keeps the row index out of the file.
file = "submissionfile.csv"
submissionfile.to_csv(path_or_buf=file, index=False)