In [1]:
import pandas as pd
import numpy as np

In [2]:
test_df = pd.read_csv('test_set_features.csv', index_col='respondent_id')
training_df = pd.read_csv('training_set_features.csv', index_col='respondent_id')
training_labels = pd.read_csv('training_set_labels.csv', index_col='respondent_id')

In [3]:
df = pd.concat([training_labels, training_df], axis=1)
df.isna().sum()

h1n1_vaccine                       0
seasonal_vaccine                   0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
i

In [4]:
numeric_col = training_df.columns[training_df.dtypes != 'object'].values
cat_col = training_df.columns[training_df.dtypes == 'object'].values

df[cat_col] = df[cat_col].fillna('None')

In [6]:
not_in_labor = df.loc[df['employment_status'].str.contains('Not in Labor Force')]
unemployed = df.loc[df['employment_status'].str.contains('Unemployed')]
none = df.loc[df['employment_status'].str.contains('None')]

no_job = pd.concat([not_in_labor, unemployed, none])
len(no_job)

13147

In [8]:
for i in no_job.index:
    df.employment_industry.iloc[i] == df.employment_industry.iloc[i].replace('None', 'Not working')
    df.employment_occupation.iloc[i] == df.employment_occupation.iloc[i].replace('None', 'No Job Title')
    
for i in no_job.loc[no_job['health_worker'].isna()].index:
    df.health_worker.iloc[i] == 0

In [10]:
X = df.drop(['h1n1_vaccine', 'seasonal_vaccine'], axis=1)
y_h1n1 = training_labels['h1n1_vaccine']
y_seasonal = training_labels['seasonal_vaccine']

None        13470
xtkaffoo     1778
mxkfnird     1509
emcorrxb     1270
cmhcxjea     1247
xgwztkwe     1082
hfxkjkmi      766
qxajmpny      548
xqwwgdyp      485
kldqjyjy      469
uqqtjvyb      452
tfqavkke      388
ukymxvdu      372
vlluhbov      354
oijqvulv      344
ccgxvspp      341
bxpfxfdn      331
haliazsg      296
rcertsgn      276
xzmlyyjv      248
dlvbwzss      227
hodpvpew      208
dcjcmpih      148
pvmttkik       98
Name: employment_occupation, dtype: int64

In [76]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

X_train_h1n1_num = X_train_h1n1[num_col(h1n1_df)]
X_test_h1n1_num = X_test_h1n1[num_col(h1n1_df)]

imputer.fit(X_train_h1n1_num)

X_train_filled_h1n1_num = pd.DataFrame(imputer.transform(X_train_h1n1_num),
                                       index=X_train_h1n1_num.index,
                                       columns=X_train_h1n1_num.columns)
X_test_filled_h1n1_num = pd.DataFrame(imputer.transform(X_test_h1n1_num),
                                      index=X_test_h1n1_num.index,
                                      columns=X_test_h1n1_num.columns)

In [77]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(X_train_filled_h1n1_num)

X_train_scaled_h1n1_num = pd.DataFrame(std.transform(X_train_filled_h1n1_num),
                                       index=X_train_filled_h1n1_num.index,
                                       columns=X_train_filled_h1n1_num.columns)
X_test_scaled_h1n1_num = pd.DataFrame(std.transform(X_test_filled_h1n1_num),
                                      index=X_test_filled_h1n1_num.index,
                                      columns=X_test_filled_h1n1_num.columns)

In [79]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

X_train_h1n1_cat = X_train_h1n1[cat_col(h1n1_df)]
X_test_h1n1_cat = X_test_h1n1[cat_col(h1n1_df)]
ohe.fit(X_train_h1n1_cat)
X_train_h1n1_ohe = pd.DataFrame(ohe.transform(X_train_h1n1_cat),
                                index=X_train_h1n1_cat.index,
                                columns=np.hstack(ohe.categories_))
X_test_h1n1_ohe = pd.DataFrame(ohe.transform(X_test_h1n1_cat),
                                index=X_test_h1n1_cat.index,
                                columns=np.hstack(ohe.categories_))

In [81]:
X_train_h1n1_processed = pd.concat([X_train_scaled_h1n1_num, X_train_h1n1_ohe], axis=1)
X_test_h1n1_processed = pd.concat([X_test_scaled_h1n1_num, X_test_h1n1_ohe], axis=1)

In [83]:
X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(seasonal_df, y_seasonal, test_size=0.25, random_state=42)

In [84]:
X_train_seasonal_num = X_train_seasonal[num_col(seasonal_df)]
X_test_seasonal_num = X_test_seasonal[num_col(seasonal_df)]

imputer.fit(X_train_seasonal_num)

X_train_filled_seasonal_num = pd.DataFrame(imputer.transform(X_train_seasonal_num),
                                       index=X_train_seasonal_num.index,
                                       columns=X_train_seasonal_num.columns)
X_test_filled_seasonal_num = pd.DataFrame(imputer.transform(X_test_seasonal_num),
                                      index=X_test_seasonal_num.index,
                                      columns=X_test_seasonal_num.columns)

In [85]:
std.fit(X_train_filled_seasonal_num)

X_train_scaled_seasonal_num = pd.DataFrame(std.transform(X_train_filled_seasonal_num),
                                       index=X_train_filled_seasonal_num.index,
                                       columns=X_train_filled_seasonal_num.columns)
X_test_scaled_seasonal_num = pd.DataFrame(std.transform(X_test_filled_seasonal_num),
                                      index=X_test_filled_seasonal_num.index,
                                      columns=X_test_filled_seasonal_num.columns)

In [86]:
X_train_seasonal_cat = X_train_seasonal[cat_col(seasonal_df)]
X_test_seasonal_cat = X_test_seasonal[cat_col(seasonal_df)]

ohe.fit(X_train_seasonal_cat)
X_train_seasonal_ohe = pd.DataFrame(ohe.transform(X_train_seasonal_cat),
                                index=X_train_seasonal_cat.index,
                                columns=np.hstack(ohe.categories_))
X_test_seasonal_ohe = pd.DataFrame(ohe.transform(X_test_seasonal_cat),
                                index=X_test_seasonal_cat.index,
                                columns=np.hstack(ohe.categories_))

In [87]:
X_train_seasonal_processed = pd.concat([X_train_scaled_seasonal_num, X_train_seasonal_ohe], axis=1)
X_test_seasonal_processed = pd.concat([X_test_scaled_seasonal_num, X_test_seasonal_ohe], axis=1)

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_model = LogisticRegression(random_state=42, max_iter=200)
log_loss_cv = cross_val_score(log_model, X_train_h1n1_processed, y_train_h1n1, scoring='neg_log_loss')

log_loss_h1n1 = -(log_loss_cv.mean())
log_loss_h1n1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.3866392792171969

In [92]:
log_loss_cv = cross_val_score(log_model, X_train_seasonal_processed, y_train_seasonal, scoring='neg_log_loss')

log_loss_seasonal = -(log_loss_cv.mean())
log_loss_seasonal



0.4868397228883469