# Getting Started

In this notebook, we will do multilabel classification for two labels: h1n1_vaccine and seasonal_vaccine. The Output format of the labels will be in the from of independent probabilties with the respective IDs.  

First well will Preprocess the data by:

1. Label encoding of categorical columns
2. making up for the missing values thorugh Simple Imputer ("mean"}.
3. Scaling the Data though Min_max_scaler.

Thereafter, we will apply two algorithms: Random Forest and SVC.
The best out of the two models will be selected on the basis of AUC_ROC accuracy for binary labels in train_test_split (sklearn).



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

In [2]:
# Loading the files

df1 = training_set_features = pd.read_csv("OneDrive/Desktop/GIT/Required datasets/training_set_features.csv")
df2 = training_set_labels = pd.read_csv("OneDrive/Desktop/GIT/Required datasets/training_set_labels.csv")

df1.info()

FileNotFoundError: [Errno 2] No such file or directory: 'OneDrive/Desktop/GIT/Required datasets/training_set_features.csv'

In [4]:
df1.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


So we will one hot encode these categorical columns


In [5]:
def encoder(data):

   encoder = LabelEncoder()
   bin = data.select_dtypes(include = ['object'])

   for columns in bin.columns:

    bin[columns] = encoder.fit_transform(bin[columns])

   return bin

In [6]:
encoded  = encoder(df1)

In [7]:
encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   age_group              26707 non-null  int64
 1   education              26707 non-null  int64
 2   race                   26707 non-null  int64
 3   sex                    26707 non-null  int64
 4   income_poverty         26707 non-null  int64
 5   marital_status         26707 non-null  int64
 6   rent_or_own            26707 non-null  int64
 7   employment_status      26707 non-null  int64
 8   hhs_geo_region         26707 non-null  int64
 9   census_msa             26707 non-null  int64
 10  employment_industry    26707 non-null  int64
 11  employment_occupation  26707 non-null  int64
dtypes: int64(12)
memory usage: 2.4 MB


In [8]:
non_encoded = df1[[cols for cols in df1.columns if cols not in encoded.columns]]
non_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [9]:
newdf1 = pd.concat([non_encoded,encoded],axis = 1)
newdf1.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3,0,2,1,0,1,8,2,21,23
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,3,1,2,1,1,0,1,0,12,19
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3,1,0,1,0,0,9,0,14,21
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,3,0,2,1,1,1,5,1,21,23
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,3,0,0,0,0,0,9,0,18,5


Simple Imputation for the missing values

In [10]:
# Simple Imputation for missing values


num_imputer = SimpleImputer(strategy='mean')
imppdf = num_imputer.fit_transform(newdf1)

impdf = pd.DataFrame(imppdf, columns = newdf1.columns)

In [11]:
impdf.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3.0,0.0,2.0,1.0,0.0,1.0,8.0,2.0,21.0,23.0
1,1.0,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,3.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,12.0,19.0
2,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,14.0,21.0
3,3.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,3.0,0.0,2.0,1.0,1.0,1.0,5.0,1.0,21.0,23.0
4,4.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,18.0,5.0


Scaling the dataset

In [12]:
# Scaling the data :

min_max_scaler = MinMaxScaler()
scaled_df = pd.DataFrame(min_max_scaler.fit_transform(impdf))

In [13]:
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       26707 non-null  float64
 1   1       26707 non-null  float64
 2   2       26707 non-null  float64
 3   3       26707 non-null  float64
 4   4       26707 non-null  float64
 5   5       26707 non-null  float64
 6   6       26707 non-null  float64
 7   7       26707 non-null  float64
 8   8       26707 non-null  float64
 9   9       26707 non-null  float64
 10  10      26707 non-null  float64
 11  11      26707 non-null  float64
 12  12      26707 non-null  float64
 13  13      26707 non-null  float64
 14  14      26707 non-null  float64
 15  15      26707 non-null  float64
 16  16      26707 non-null  float64
 17  17      26707 non-null  float64
 18  18      26707 non-null  float64
 19  19      26707 non-null  float64
 20  20      26707 non-null  float64
 21  21      26707 non-null  float64
 22

In [14]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   respondent_id     26707 non-null  int64
 1   xyz_vaccine       26707 non-null  int64
 2   seasonal_vaccine  26707 non-null  int64
dtypes: int64(3)
memory usage: 626.1 KB


In [15]:
X = scaled_df.iloc[:,1:]
y = df2.iloc[:,1:]

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
print(type(y_test))

<class 'pandas.core.frame.DataFrame'>


So for our estimation, we will use two models : Support Vector Machines and Random Forest

## Random Forest

In [17]:


# random forest - HyperParameter Tuned

# I got these parameters through GridSearch CV method code for which is present in commented cell nos. 21,22,23
rf = RandomForestClassifier(max_depth=  None,
 max_features= 1,
 max_samples = 0.5,
 n_estimators= 120)

In [18]:
rf.fit(X_train,y_train)

In [19]:
y_rfpred = rf.predict(X_test)

In [20]:
print("Random Forest Hyperparameter_tuned (for binary labels):\n")

roc_auc = roc_auc_score(pd.DataFrame(y_test).iloc[:,0], pd.DataFrame(y_rfpred).iloc[:,0])

# Step 4: Print the ROC AUC score
print("ROC AUC Score for 1st Target :", roc_auc)

roc_auc = roc_auc_score(pd.DataFrame(y_test).iloc[:,1], pd.DataFrame(y_rfpred).iloc[:,1])

# Step 4: Print the ROC AUC score
print("ROC AUC Score for 2nd Target :", roc_auc)



Random Forest Hyperparameter_tuned:

ROC AUC Score for 1st Target : 0.6081051609812672
ROC AUC Score for 2nd Target : 0.7678478108667693


In [21]:
"""

# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# 108 diff random forest with different parameters
# here we are storing the whole tuning to be tested by gridsearch,
# in a dictionary called param_grid, which can be called directly later on.

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples
             }
print(param_grid)

"""

"\n\n# Number of trees in random forest\nn_estimators = [20,60,100,120]\n\n# Number of features to consider at every split\nmax_features = [0.2,0.6,1.0]\n\n# Maximum number of levels in tree\nmax_depth = [2,8,None]\n\n# Number of samples\nmax_samples = [0.5,0.75,1.0]\n\n# 108 diff random forest with different parameters\n# here we are storing the whole tuning to be tested by gridsearch,\n# in a dictionary called param_grid, which can be called directly later on.\n\nparam_grid = {'n_estimators': n_estimators,\n               'max_features': max_features,\n               'max_depth': max_depth,\n              'max_samples':max_samples\n             }\nprint(param_grid)\n\n"

In [22]:
"""
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator = rf, # selectiong the model,
                       param_grid = param_grid, # calling the tunings to be checked stored in a dict
                       cv = 5, # number of times to be done
                       verbose=2, # this is for selecting the type of output
                       n_jobs = -1) # this just for increasing the speed for testing 540 Random Forests
"""

'\nfrom sklearn.model_selection import GridSearchCV\n\nrf_grid = GridSearchCV(estimator = rf, # selectiong the model,\n                       param_grid = param_grid, # calling the tunings to be checked stored in a dict\n                       cv = 5, # number of times to be done\n                       verbose=2, # this is for selecting the type of output\n                       n_jobs = -1) # this just for increasing the speed for testing 540 Random Forests\n'

In [23]:
"""

# Training the Model

rf_grid.fit(X_train,y_train)

# Since the grid search model has been trained, now we can extract the best
# possible parameters found by our algo so that we don't have to run the model repeatedly

rf_grid.best_params_

"""

"\n\n# Training the Model\n\nrf_grid.fit(X_train,y_train)\n\n# Since the grid search model has been trained, now we can extract the best\n# possible parameters found by our algo so that we don't have to run the model repeatedly\n\nrf_grid.best_params_\n\n"

## Support Vector Machines


First we will compare the accuracy of SVC through binary output AUC ROC scores with that of Random Forest

In [24]:
# SVC Classifier objects:

svc2 = SVC(probability = True,
    C = 10,gamma = 0.01, kernel = 'rbf'
)

svc1 = SVC(probability = True,
    C = 10,gamma = 0.1, kernel = 'rbf'
)

In [25]:
print("For SVC (binary labels):\n")

svc1.fit(X_train,y_train.iloc[:,0])
y_svcpred = svc1.predict(X_test)
roc_auc = roc_auc_score(y_test.iloc[:,0], y_svcpred)

# Step 4: Print the ROC AUC score
print("ROC AUC Score for target1 :", roc_auc)

svc2.fit(X_train,y_train.iloc[:,1])
y_svcpred2 = svc2.predict(X_test)
roc_auc = roc_auc_score(y_test.iloc[:,1], y_svcpred2)

# Step 4: Print the ROC AUC score
print("ROC AUC Score for target2 :", roc_auc)

For SVC (binary labels):

ROC AUC Score for target1 : 0.7054727747943087
ROC AUC Score for target2 : 0.7830412085170977


In [26]:
# Fitting the model and getting the desired probability prediction arrays:

svc1.fit(X_train,y_train.iloc[:,0])
y_svcpred = svc1.predict_proba(X_test)


svc2.fit(X_train,y_train.iloc[:,1])
y_svcpred2 = svc2.predict_proba(X_test)


For SVC (normal):



## testing


In [27]:
testdf = pd.read_csv("/content/drive/MyDrive/dataset and all/test_set_features.csv")
testdf.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [28]:
encoded_test  = encoder(testdf)

In [29]:
non_encoded_test = testdf[[cols for cols in testdf.columns if cols not in encoded_test.columns]]
non_encoded_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  int64  
 1   xyz_concern                  26623 non-null  float64
 2   xyz_knowledge                26586 non-null  float64
 3   behavioral_antiviral_meds    26629 non-null  float64
 4   behavioral_avoidance         26495 non-null  float64
 5   behavioral_face_mask         26689 non-null  float64
 6   behavioral_wash_hands        26668 non-null  float64
 7   behavioral_large_gatherings  26636 non-null  float64
 8   behavioral_outside_home      26626 non-null  float64
 9   behavioral_touch_face        26580 non-null  float64
 10  doctor_recc_xyz              24548 non-null  float64
 11  doctor_recc_seasonal         24548 non-null  float64
 12  chronic_med_condition        25776 non-null  float64
 13  child_under_6_mo

In [30]:
newtest = pd.concat([non_encoded_test,encoded_test],axis = 1)
newtest.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1,0,1,1,1,0,7,0,1,7
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,1,2,1,1,0,1,2,1,20
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,3,1,1,0,0,0,5,2,10,12
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,0,0,0,0,1,5,0,21,23
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0,0,0,1,0,0,6,2,4,10


In [31]:
# Simple Imputation for missing values


num_imputer = SimpleImputer(strategy='mean')
impptest = num_imputer.fit_transform(newtest)

imptest = pd.DataFrame(impptest, columns = newtest.columns)

In [32]:
# Scaling the data :

min_max_scaler = MinMaxScaler()
scaled_test = pd.DataFrame(min_max_scaler.fit_transform(imptest))

In [33]:
scaled_test[0] = testdf['respondent_id']

In [34]:
scaled_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,26707,0.666667,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.333333,0.0,0.333333,0.5,0.5,0.0,0.777778,0.0,0.047619,0.304348
1,26708,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.666667,0.5,0.5,0.0,0.111111,1.0,0.047619,0.869565
2,26709,0.666667,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.333333,0.0,0.0,0.0,0.555556,1.0,0.47619,0.521739
3,26710,0.333333,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.333333,0.555556,0.0,1.0,1.0
4,26711,1.0,0.5,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.666667,1.0,0.190476,0.434783


In [35]:
h1n1_vaccine = svc1.predict_proba(scaled_test.iloc[:,1:])
seasonal_vaccine = svc2.predict_proba(scaled_test.iloc[:,1:])

In [36]:
print(type(seasonal_vaccine))

<class 'numpy.ndarray'>


In [37]:
respondent_id = scaled_test[0].to_numpy()

In [48]:
submitdf = pd.DataFrame({
    'respondent_id': respondent_id ,
    'h1n1_vaccine': h1n1_vaccine[:,0] ,
    'seasonal_vaccine': seasonal_vaccine[:,0]
})

## Final Result:

In [51]:
submitdf.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.928236,0.757312
1,26708,0.767084,0.944479
2,26709,0.812173,0.31889
3,26710,0.249182,0.147931
4,26711,0.510658,0.5


In [52]:
# Downloading the Submission.csv file:
from google.colab import files
submitdf.to_csv('Submission.csv', index=False)
files.download('Submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>