In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
import re

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
#data = pd.read_csv('training_set_features (1).csv', 'training_set_labels (1).csv', 'test_set_features (1).csv')
notebook_path = os.path.abspath("") + '/'

data = pd.read_csv(notebook_path + 'training_set_features (1).csv')
labels = pd.read_csv(notebook_path + 'training_set_labels (1).csv')
test = pd.read_csv(notebook_path + 'test_set_features (1).csv')
feature_list = []

In [3]:
class CustomOrderLabelEncoder(LabelEncoder):
  def fit(self, y):
    y = column_or_1d(y, warn=True)
    self.classes_ = pd.Series(y).unique()
    return self

In [4]:
data.count()

respondent_id                  26707
h1n1_concern                   26615
h1n1_knowledge                 26591
behavioral_antiviral_meds      26636
behavioral_avoidance           26499
behavioral_face_mask           26688
behavioral_wash_hands          26665
behavioral_large_gatherings    26620
behavioral_outside_home        26625
behavioral_touch_face          26579
doctor_recc_h1n1               24547
doctor_recc_seasonal           24547
chronic_med_condition          25736
child_under_6_months           25887
health_worker                  25903
health_insurance               14433
opinion_h1n1_vacc_effective    26316
opinion_h1n1_risk              26319
opinion_h1n1_sick_from_vacc    26312
opinion_seas_vacc_effective    26245
opinion_seas_risk              26193
opinion_seas_sick_from_vacc    26170
age_group                      26707
education                      25300
race                           26707
sex                            26707
income_poverty                 22284
m

In [5]:
data.values

array([[0, 1.0, 0.0, ..., 0.0, nan, nan],
       [1, 3.0, 2.0, ..., 0.0, 'pxcmvdjn', 'xgwztkwe'],
       [2, 1.0, 1.0, ..., 0.0, 'rucpziij', 'xtkaffoo'],
       ...,
       [26704, 2.0, 2.0, ..., 0.0, nan, nan],
       [26705, 1.0, 1.0, ..., 0.0, 'fcxhlnwr', 'haliazsg'],
       [26706, 0.0, 0.0, ..., 0.0, nan, nan]], dtype=object)

In [6]:
for (columnName, columnData) in data.items():
    print('Column Name : ', columnName, columnData.dtype)
    print('Unique Contents : ', columnData.unique(), "\n")

Column Name :  respondent_id int64
Unique Contents :  [    0     1     2 ... 26704 26705 26706] 

Column Name :  h1n1_concern float64
Unique Contents :  [ 1.  3.  2.  0. nan] 

Column Name :  h1n1_knowledge float64
Unique Contents :  [ 0.  2.  1. nan] 

Column Name :  behavioral_antiviral_meds float64
Unique Contents :  [ 0.  1. nan] 

Column Name :  behavioral_avoidance float64
Unique Contents :  [ 0.  1. nan] 

Column Name :  behavioral_face_mask float64
Unique Contents :  [ 0.  1. nan] 

Column Name :  behavioral_wash_hands float64
Unique Contents :  [ 0.  1. nan] 

Column Name :  behavioral_large_gatherings float64
Unique Contents :  [ 0.  1. nan] 

Column Name :  behavioral_outside_home float64
Unique Contents :  [ 1.  0. nan] 

Column Name :  behavioral_touch_face float64
Unique Contents :  [ 1.  0. nan] 

Column Name :  doctor_recc_h1n1 float64
Unique Contents :  [ 0. nan  1.] 

Column Name :  doctor_recc_seasonal float64
Unique Contents :  [ 0. nan  1.] 

Column Name :  chronic

In [7]:
#Are we creating two different lists when we do the encoded data into feature_list and all numeric data into numeric_list?

le_age_group = CustomOrderLabelEncoder()

le_age_group.fit(['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'])

 

data['enc_age_group'] = le_age_group.transform(data['age_group'])
if 'enc_age_group' not in feature_list:
    feature_list.append('enc_age_group')

 

data.loc[1:10, ['enc_age_group', 'age_group']]

Unnamed: 0,enc_age_group,age_group
1,1,35 - 44 Years
2,0,18 - 34 Years
3,4,65+ Years
4,2,45 - 54 Years
5,4,65+ Years
6,3,55 - 64 Years
7,2,45 - 54 Years
8,2,45 - 54 Years
9,3,55 - 64 Years
10,2,45 - 54 Years


In [8]:
le_education = CustomOrderLabelEncoder()

le_education.fit(['< 12 Years', '12 Years', 'Some College', 'College Graduate'])

 

data['enc_education'] = pd.Series(

    le_education.transform(data['education'][data['education'].notnull()]),

    index=data['education'][data['education'].notnull()].index

)
if 'enc_education' not in feature_list:
    feature_list.append('enc_education')

 

data.loc[1:10, ['enc_education', 'education']]

Unnamed: 0,enc_education,education
1,1.0,12 Years
2,3.0,College Graduate
3,1.0,12 Years
4,2.0,Some College
5,1.0,12 Years
6,0.0,< 12 Years
7,2.0,Some College
8,3.0,College Graduate
9,1.0,12 Years
10,1.0,12 Years


In [9]:
le_income_poverty = CustomOrderLabelEncoder()

le_income_poverty.fit(['Below Poverty', '<= $75,000, Above Poverty', '> $75,000'])

 

data['enc_income_poverty'] = pd.Series(
    
    le_income_poverty.transform(data['income_poverty'][data['income_poverty'].notnull()]),
    
    index=data['income_poverty'][data['income_poverty'].notnull()].index
               
)
if 'enc_income_poverty' not in feature_list:
    feature_list.append('enc_income_poverty')
 

data.loc[1:10, ['enc_income_poverty', 'income_poverty']]

Unnamed: 0,enc_income_poverty,income_poverty
1,0.0,Below Poverty
2,1.0,"<= $75,000, Above Poverty"
3,0.0,Below Poverty
4,1.0,"<= $75,000, Above Poverty"
5,1.0,"<= $75,000, Above Poverty"
6,1.0,"<= $75,000, Above Poverty"
7,1.0,"<= $75,000, Above Poverty"
8,2.0,"> $75,000"
9,1.0,"<= $75,000, Above Poverty"
10,1.0,"<= $75,000, Above Poverty"


In [10]:
# define One_Hot(prefix, x, y, z= 'blah'):

In [11]:
data = data.drop(columns=['race_White', 'race_Black', 'race_Other or Multiple', 'race_Hispanic'], errors='ignore')

data = pd.concat([data, pd.get_dummies(data.race, prefix='race')], axis=1)

 

data.loc[4107:4111, ['race', 'race_White', 'race_Black', 'race_Other or Multiple', 'race_Hispanic']]
column_list = list(data)
pattern = re.compile(r'^race_(?!.*_nan$).*')
column_list =  list(filter(pattern.match, column_list))
for x in column_list: 
    if x not in feature_list: feature_list.append(x)
print (column_list) 

['race_Black', 'race_Hispanic', 'race_Other or Multiple', 'race_White']


In [12]:
le_sex = CustomOrderLabelEncoder()

le_sex.fit(['Female', 'Male'])

 

data['enc_sex'] = pd.Series(
    
    le_sex.transform(data['sex'][data['sex'].notnull()]),
    
    index=data['sex'][data['sex'].notnull()].index
               
)
if 'enc_sex' not in feature_list:
    feature_list.append('enc_sex')
 

data.loc[1:10, ['enc_sex', 'sex']]

Unnamed: 0,enc_sex,sex
1,1,Male
2,1,Male
3,0,Female
4,0,Female
5,1,Male
6,1,Male
7,0,Female
8,1,Male
9,1,Male
10,1,Male


In [13]:
le_marital_status = CustomOrderLabelEncoder()

le_marital_status.fit(['Not Married', 'Married'])

 

data['enc_marital_status'] = pd.Series(
    
    le_marital_status.transform(data['marital_status'][data['marital_status'].notnull()]),
    
    index=data['marital_status'][data['marital_status'].notnull()].index
               
)
if 'enc_marital_status' not in feature_list:
    feature_list.append('enc_marital_status')
 

data.loc[1:10, ['enc_marital_status', 'marital_status']]

Unnamed: 0,enc_marital_status,marital_status
1,0.0,Not Married
2,0.0,Not Married
3,0.0,Not Married
4,1.0,Married
5,1.0,Married
6,0.0,Not Married
7,1.0,Married
8,1.0,Married
9,0.0,Not Married
10,1.0,Married


In [14]:
le_rent_or_own = CustomOrderLabelEncoder()

le_rent_or_own.fit(['Own', 'Rent'])

 

data['enc_rent_or_own'] = pd.Series(
    
    le_rent_or_own.transform(data['rent_or_own'][data['rent_or_own'].notnull()]),
    
    index=data['rent_or_own'][data['rent_or_own'].notnull()].index
               
)
if 'enc_rent_or_own' not in feature_list:
    feature_list.append('enc_rent_or_own')
 

data.loc[1:10, ['enc_rent_or_own', 'rent_or_own']]

Unnamed: 0,enc_rent_or_own,rent_or_own
1,1.0,Rent
2,0.0,Own
3,1.0,Rent
4,0.0,Own
5,0.0,Own
6,0.0,Own
7,0.0,Own
8,0.0,Own
9,0.0,Own
10,1.0,Rent


In [15]:
le_employment_status = CustomOrderLabelEncoder()

le_employment_status.fit(['Not in Labor Force', 'Employed', 'Unemployed'])

 

data['enc_employment_status'] = pd.Series(
    
    le_employment_status.transform(data['employment_status'][data['employment_status'].notnull()]),
    
    index=data['employment_status'][data['employment_status'].notnull()].index
               
)
if 'enc_employment_staus' not in feature_list:
    feature_list.append('enc_employment_status')
 

data.loc[1:10, ['enc_employment_status', 'employment_status']]

Unnamed: 0,enc_employment_status,employment_status
1,1.0,Employed
2,1.0,Employed
3,0.0,Not in Labor Force
4,1.0,Employed
5,1.0,Employed
6,1.0,Employed
7,1.0,Employed
8,1.0,Employed
9,0.0,Not in Labor Force
10,1.0,Employed


In [16]:
data = data.drop(columns=['region_oxchjgsf', 'region_bhuqouqj', 'region_qufhixun', 'region_lrircsnp', 'region_atmpeygn', 'region_lzgpxyit',
 'region_fpwskwrf', 'region_mlyzmhmf', 'region_dqpwygqj', 'region_kbazzjca'], errors='ignore')

data = pd.concat([data, pd.get_dummies(data.hhs_geo_region, prefix='region')], axis=1)

 

data.loc[4107:4111, ['hhs_geo_region', 'region_oxchjgsf', 'region_bhuqouqj', 'region_qufhixun', 'region_lrircsnp', 'region_atmpeygn', 'region_lzgpxyit',
 'region_fpwskwrf', 'region_mlyzmhmf', 'region_dqpwygqj', 'region_kbazzjca']]
column_list = list(data)
pattern = re.compile(r'^region_(?!.*_nan$).*')
column_list =  list(filter(pattern.match, column_list))
for x in column_list: 
    if x not in feature_list: feature_list.append(x)
print (column_list) 

['region_atmpeygn', 'region_bhuqouqj', 'region_dqpwygqj', 'region_fpwskwrf', 'region_kbazzjca', 'region_lrircsnp', 'region_lzgpxyit', 'region_mlyzmhmf', 'region_oxchjgsf', 'region_qufhixun']


In [17]:
data = data.drop(columns=['msa_Non-MSA', 'msa_MSA, Not Principle  City', 'msa_MSA, Principle City'], errors='ignore')

data = pd.concat([data, pd.get_dummies(data.census_msa, prefix='msa')], axis=1)

 

data.loc[4107:4111, ['census_msa', 'msa_Non-MSA', 'msa_MSA, Not Principle  City', 'msa_MSA, Principle City']]
column_list = list(data)
pattern = re.compile(r'^msa_(?!.*_nan$).*')
column_list =  list(filter(pattern.match, column_list))
for x in column_list: 
    if x not in feature_list: feature_list.append(x)
print (column_list) 

['msa_MSA, Not Principle  City', 'msa_MSA, Principle City', 'msa_Non-MSA']


In [18]:
data = data.drop(columns=['pxcmvdjn', 'rucpziij', 'wxleyezf', 'saaquncn', 'xicduogh', 'ldnlellj',
 'wlfvacwt', 'nduyfdeo', 'fcxhlnwr', 'vjjrobsf', 'arjwrbjb', 'atmlpfrs',
 'msuufmds', 'xqicxuve', 'phxvnwax', 'dotnnunm', 'mfikgejo', 'cfqqtusy',
 'mcubkhph', 'haxffmxo', 'qnlwzans'], errors='ignore')

data = pd.concat([data, pd.get_dummies(data.employment_industry, prefix='industry')], axis=1)
 

data.loc[4107:4111, ['employment_industry', 'industry_pxcmvdjn', 'industry_rucpziij', 'industry_wxleyezf', 'industry_saaquncn', 'industry_xicduogh', 'industry_ldnlellj',
 'industry_wlfvacwt', 'industry_nduyfdeo', 'industry_fcxhlnwr', 'industry_vjjrobsf', 'industry_arjwrbjb', 'industry_atmlpfrs',
 'industry_msuufmds', 'industry_xqicxuve', 'industry_phxvnwax', 'industry_dotnnunm', 'industry_mfikgejo', 'industry_cfqqtusy',
 'industry_mcubkhph', 'industry_haxffmxo', 'industry_qnlwzans']]
column_list = list(data)
pattern = re.compile(r'^industry_(?!.*_nan$).*')
column_list =  list(filter(pattern.match, column_list))
for x in column_list: 
    if x not in feature_list: feature_list.append(x)
print (column_list) 

['industry_arjwrbjb', 'industry_atmlpfrs', 'industry_cfqqtusy', 'industry_dotnnunm', 'industry_fcxhlnwr', 'industry_haxffmxo', 'industry_ldnlellj', 'industry_mcubkhph', 'industry_mfikgejo', 'industry_msuufmds', 'industry_nduyfdeo', 'industry_phxvnwax', 'industry_pxcmvdjn', 'industry_qnlwzans', 'industry_rucpziij', 'industry_saaquncn', 'industry_vjjrobsf', 'industry_wlfvacwt', 'industry_wxleyezf', 'industry_xicduogh', 'industry_xqicxuve']


In [19]:
data = data.drop(columns=['xgwztkwe', 'xtkaffoo', 'emcorrxb', 'vlluhbov', 'xqwwgdyp', 'ccgxvspp',
 'qxajmpny', 'kldqjyjy', 'mxkfnird', 'hfxkjkmi', 'bxpfxfdn', 'ukymxvdu',
 'cmhcxjea', 'haliazsg', 'dlvbwzss', 'xzmlyyjv', 'oijqvulv', 'rcertsgn',
 'tfqavkke', 'hodpvpew', 'uqqtjvyb', 'pvmttkik', 'dcjcmpih'], errors='ignore')

data = pd.concat([data, pd.get_dummies(data.employment_occupation, prefix='occupation')], axis=1)

 

data.loc[4107:4111, ['employment_occupation', 'occupation_xgwztkwe', 'occupation_xtkaffoo', 'occupation_emcorrxb', 'occupation_vlluhbov', 'occupation_xqwwgdyp', 'occupation_ccgxvspp',
 'occupation_qxajmpny', 'occupation_kldqjyjy', 'occupation_mxkfnird', 'occupation_hfxkjkmi', 'occupation_bxpfxfdn', 'occupation_ukymxvdu',
 'occupation_cmhcxjea', 'occupation_haliazsg', 'occupation_dlvbwzss', 'occupation_xzmlyyjv', 'occupation_oijqvulv', 'occupation_rcertsgn',
 'occupation_tfqavkke', 'occupation_hodpvpew', 'occupation_uqqtjvyb', 'occupation_pvmttkik', 'occupation_dcjcmpih']]
column_list = list(data)
pattern = re.compile(r'^occupation_(?!.*_nan$).*')
column_list =  list(filter(pattern.match, column_list))
for x in column_list: 
    if x not in feature_list: feature_list.append(x)
print (column_list) 

['occupation_bxpfxfdn', 'occupation_ccgxvspp', 'occupation_cmhcxjea', 'occupation_dcjcmpih', 'occupation_dlvbwzss', 'occupation_emcorrxb', 'occupation_haliazsg', 'occupation_hfxkjkmi', 'occupation_hodpvpew', 'occupation_kldqjyjy', 'occupation_mxkfnird', 'occupation_oijqvulv', 'occupation_pvmttkik', 'occupation_qxajmpny', 'occupation_rcertsgn', 'occupation_tfqavkke', 'occupation_ukymxvdu', 'occupation_uqqtjvyb', 'occupation_vlluhbov', 'occupation_xgwztkwe', 'occupation_xqwwgdyp', 'occupation_xtkaffoo', 'occupation_xzmlyyjv']


In [20]:
# Changed List to Numeric_List

numeric_lists = ['h1n1_concern','h1n1_knowledge','behavioral_antiviral_meds','behavioral_avoidance',
         'behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home',
         'behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal','chronic_med_condition','child_under_6_months',
         'health_worker','health_insurance','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc',
         'opinion_seas_vacc_effective','opinion_seas_risk','opinion_seas_sick_from_vacc','household_adults','household_children'] 
for x in numeric_lists: 
    if x not in feature_list: feature_list.append(x)
print (feature_list)


['enc_age_group', 'enc_education', 'enc_income_poverty', 'race_Black', 'race_Hispanic', 'race_Other or Multiple', 'race_White', 'enc_sex', 'enc_marital_status', 'enc_rent_or_own', 'enc_employment_status', 'region_atmpeygn', 'region_bhuqouqj', 'region_dqpwygqj', 'region_fpwskwrf', 'region_kbazzjca', 'region_lrircsnp', 'region_lzgpxyit', 'region_mlyzmhmf', 'region_oxchjgsf', 'region_qufhixun', 'msa_MSA, Not Principle  City', 'msa_MSA, Principle City', 'msa_Non-MSA', 'industry_arjwrbjb', 'industry_atmlpfrs', 'industry_cfqqtusy', 'industry_dotnnunm', 'industry_fcxhlnwr', 'industry_haxffmxo', 'industry_ldnlellj', 'industry_mcubkhph', 'industry_mfikgejo', 'industry_msuufmds', 'industry_nduyfdeo', 'industry_phxvnwax', 'industry_pxcmvdjn', 'industry_qnlwzans', 'industry_rucpziij', 'industry_saaquncn', 'industry_vjjrobsf', 'industry_wlfvacwt', 'industry_wxleyezf', 'industry_xicduogh', 'industry_xqicxuve', 'occupation_bxpfxfdn', 'occupation_ccgxvspp', 'occupation_cmhcxjea', 'occupation_dcjcmpih'

In [21]:
features = data.get(feature_list)

print (features.count())

mean_features = data.dropna().mean

imputed_features = data.fillna(mean_features)

enc_age_group                  26707
enc_education                  25300
enc_income_poverty             22284
race_Black                     26707
race_Hispanic                  26707
                               ...  
opinion_seas_vacc_effective    26245
opinion_seas_risk              26193
opinion_seas_sick_from_vacc    26170
household_adults               26458
household_children             26458
Length: 91, dtype: int64


In [22]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):

    print(imputed_features.count())


respondent_id                   26707
h1n1_concern                    26707
h1n1_knowledge                  26707
behavioral_antiviral_meds       26707
behavioral_avoidance            26707
behavioral_face_mask            26707
behavioral_wash_hands           26707
behavioral_large_gatherings     26707
behavioral_outside_home         26707
behavioral_touch_face           26707
doctor_recc_h1n1                26707
doctor_recc_seasonal            26707
chronic_med_condition           26707
child_under_6_months            26707
health_worker                   26707
health_insurance                26707
opinion_h1n1_vacc_effective     26707
opinion_h1n1_risk               26707
opinion_h1n1_sick_from_vacc     26707
opinion_seas_vacc_effective     26707
opinion_seas_risk               26707
opinion_seas_sick_from_vacc     26707
age_group                       26707
education                       26707
race                            26707
sex                             26707
income_pover

In [26]:
rf = RandomForestClassifier (n_estimators=300)

rf_scores = cross_val_score(rf, imputed_features.get(feature_list), labels['h1n1_vaccine'], cv=10, n_jobs=4, scoring= 'roc_auc')

print('h1n1', rf_scores.min(), rf_scores.mean(), rf_scores.max())

rf_scores = cross_val_score(rf, imputed_features.get(feature_list), labels['seasonal_vaccine'], cv=10, n_jobs=4, scoring= 'roc_auc')

print('seas', rf_scores.min(), rf_scores.mean(), rf_scores.max())

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/daal4py/sklearn/_device_offload.py", line 88, in wrapper_with_self
    return wrapper_impl(self, *args, **kwargs)
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/daal4py/sklearn/_device_offload.py", line 74, in wrapper_impl
    result = _run_on_device(func, q, obj, *hostargs, **hostkwargs)
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/daal4py/sklearn/_device_offload.py", line 65, in _run_on_device
    return dispatch_by_obj(obj, func, *args, **kwargs)
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/daal4py/sklearn/_device_offload.py", line 53, in dispatch_by_obj
    return func(obj, *args, **kwargs)
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/daal4py/sklearn/ensemble/_forest.py", line 672, in fit
    return _fit_classifier(self, X, y, sample_weight=sample_weight)
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/daal4py/sklearn/ensemble/_forest.py", line 323, in _fit_classifier
    X = check_array(X, dtype=[np.float32, np.float64])
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/opt/intel/inteloneapi/intelpython/latest/lib/python3.9/site-packages/pandas/core/generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
TypeError: float() argument must be a string or a number, not 'method'
