In [1]:
#Import required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
#Data reading from file
data= pd.read_csv("training_set_features.csv")
op=pd.read_csv("training_set_labels.csv")

In [3]:
#Data checking for data manipulation
data.shape
data.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [4]:
#Data checking for data label manipulation 
op.shape
op.columns
respondent_id1=op['respondent_id']

In [5]:
#For classification into numerical values for mapping 
data['education'].unique()
data['age_group'].unique()

array(['55 - 64 Years', '35 - 44 Years', '18 - 34 Years', '65+ Years',
       '45 - 54 Years'], dtype=object)

In [6]:
selector = VarianceThreshold(threshold=0.01)#feature_selection initiation

In [7]:
#Formatting string data to usable number format
agegroupformat={"55 - 64 Years":59.5,"35 - 44 Years":39.5,"18 - 34 Years":26.0,'65+ Years':65.0,'45 - 54 Years':49.5}
data['age_group']=data['age_group'].apply(lambda x:agegroupformat.get(x,0))
raceformat={'White':4,'Black':3,'Other or Multiple':2,'Hispanic':1}
data['race']=data['race'].apply(lambda x:raceformat.get(x,2))
data['rent_or_own'] = data['rent_or_own'].apply(lambda x: 1 if x == "Own" else 0)
data['sex']=data['sex'].apply(lambda x: 1 if x == "Male" else 0)
data['marital_status']=data['marital_status'].apply(lambda x: 1 if x == "Married" else 0)
employmenteffect={"Unemployed":1,"Not in Labor Force":0,"Employed":2}
data['employment_status']=data['employment_status'].apply(lambda x:employmenteffect.get(x,0))
msaeffect={"Non-MSA":0,"MSA, Not Principle  City":1,"MSA, Principle City":2}
data['census_msa']=data['census_msa'].apply(lambda x:msaeffect.get(x,0))
data['income_poverty']=data['income_poverty'].apply(lambda x: 0 if x == "Below Poverty" else 1)
educationeffect = {'< 12 Years': 0,'12 Years': 1,'College Graduate': 2,'Some College': 3}
data['education']=data['education'].apply(lambda x:educationeffect.get(x,0))

In [8]:
#Selecting the best columns with maximum effect
selector.fit(data.drop(columns=['hhs_geo_region','employment_industry','employment_occupation']))
high_variance_features = data.drop(columns=['hhs_geo_region','employment_industry','employment_occupation']).columns[selector.get_support()]
data = data[high_variance_features.to_list()]

In [9]:
#Data making ready for training
data=data.drop("respondent_id",axis=1)

In [10]:
#Datalabel making ready for training
datalabel=op
datalabel=datalabel.drop("respondent_id",axis=1)
datalabel

Unnamed: 0,xyz_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0
...,...,...
26702,0,0
26703,0,0
26704,0,1
26705,0,0


In [11]:
data

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0,4,0,0,0,1,0,0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1,4,1,0,0,0,2,1,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,2,4,1,1,0,1,2,1,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1,4,0,0,0,0,0,2,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,3,4,0,1,1,1,2,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,3,4,0,1,0,1,0,0,0.0,0.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,2,4,1,1,0,0,2,2,1.0,0.0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,3,4,0,1,0,1,0,1,0.0,0.0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,3,1,0,1,1,0,2,0,1.0,0.0


In [12]:
data=data.fillna(0.0)# fill na with 0.0 only for the data columns ; 
#this is done as the low variance features have been filtered out so minimal data is manipulated

In [13]:
data.columns #For ensuring data has fixed data columns with properly fixed values

Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'census_msa', 'household_adults',
       'household_children'],
      dtype='object')

In [14]:
#logistic regression

In [15]:
model1 = LogisticRegression(max_iter=10000)
model1.fit(data,datalabel['xyz_vaccine'])
model2=LogisticRegression(max_iter=10000)
model2.fit(data,datalabel['seasonal_vaccine'])

In [16]:
#output formatted similar to input test data    
req=pd.read_csv("test_set_features.csv")
agegroupformat={"55 - 64 Years":59.5,"35 - 44 Years":39.5,"18 - 34 Years":26.0,'65+ Years':65.0,'45 - 54 Years':49.5}
req['age_group']=req['age_group'].apply(lambda x:agegroupformat.get(x,0))
raceformat={'White':4,'Black':3,'Other or Multiple':2,'Hispanic':1}
req['race']=req['race'].apply(lambda x:raceformat.get(x,2))
req['rent_or_own'] = req['rent_or_own'].apply(lambda x: 1 if x == "Own" else 0)
req['sex']=req['sex'].apply(lambda x: 1 if x == "Male" else 0)
req['marital_status']=req['marital_status'].apply(lambda x: 1 if x == "Married" else 0)
employmenteffect={"Unemployed":1,"Not in Labor Force":0,"Employed":2}
req['employment_status']=req['employment_status'].apply(lambda x:employmenteffect.get(x,0))
msaeffect={"Non-MSA":0,"MSA, Not Principle  City":1,"MSA, Principle City":2}
req['census_msa']=req['census_msa'].apply(lambda x:msaeffect.get(x,0))
req['income_poverty']=req['income_poverty'].apply(lambda x: 0 if x == "Below Poverty" else 1)
educationeffect = {'< 12 Years': 0,'12 Years': 1,'College Graduate': 2,'Some College': 3}
req['education']=req['education'].apply(lambda x:educationeffect.get(x,0))
high_variance_features = req.drop(columns=['hhs_geo_region','employment_industry','employment_occupation']).columns[selector.get_support()]
req = req[high_variance_features.to_list() ]
req=req.fillna(0.0)
req.head

<bound method NDFrame.head of        respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              26707          2.0            2.0                        0.0   
1              26708          1.0            1.0                        0.0   
2              26709          2.0            2.0                        0.0   
3              26710          1.0            1.0                        0.0   
4              26711          3.0            1.0                        1.0   
...              ...          ...            ...                        ...   
26703          53410          1.0            1.0                        0.0   
26704          53411          3.0            1.0                        0.0   
26705          53412          0.0            1.0                        0.0   
26706          53413          3.0            1.0                        0.0   
26707          53414          2.0            1.0                        0.0   

       behavioral_avo

In [17]:
respondent_id=req['respondent_id']
req=req.drop('respondent_id',axis=1)

In [18]:
#process through model
xyz= model1.predict(req)
seasonal=model2.predict(req)
result =  pd.DataFrame({
    'respondent_id': respondent_id,  # Assuming respondent_id is correctly defined
    'xyz_vaccine': xyz,
    'seasonal_vaccine': seasonal})
outputfile='predictions.csv'
result.to_csv(outputfile,index=False)

In [19]:
#model ROC AUC test
from sklearn.metrics import roc_auc_score

In [20]:
xyz = model1.predict_proba(data)[:, 1]  # Assuming 1 is the positive class probability
seasonal = model2.predict_proba(data)[:, 1]
# Assuming `respondent_id1` is defined correctly
test = pd.DataFrame({
    'respondent_id': respondent_id1,
    'xyz_vaccine': xyz,
    'seasonal_vaccine': seasonal
})

In [21]:
roc_auc_scores1 = roc_auc_score(op['xyz_vaccine'], test['xyz_vaccine'])
roc_auc_scores2 = roc_auc_score(op['seasonal_vaccine'], test['seasonal_vaccine'])

print("ROC AUC Score (xyz_vaccine):", roc_auc_scores1)
print("ROC AUC Score (seasonal_vaccine):", roc_auc_scores2)

ROC AUC Score (xyz_vaccine): 0.8481349682953693
ROC AUC Score (seasonal_vaccine): 0.840116002315178


In [23]:
#model ROC AUC test for input