In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from pathlib import Path
import nbimporter

root = Path(".")

%matplotlib inline
sns.set()

plt.rcParams['figure.figsize'] = [8.0, 8.0]
plt.rcParams['figure.dpi'] = 120

In [32]:
# Training features and labels, both indexed by respondent_id.
# The paths are built with pathlib instead of hard-coded backslashes so the
# notebook also runs on Linux/macOS, where '\\' is not a path separator.
input_dir = Path('input_data')
X_train_full = pd.read_csv(input_dir / 'training_set_features.csv', index_col='respondent_id')
y_train_full = pd.read_csv(input_dir / 'training_set_labels.csv', index_col='respondent_id')

In [33]:
# Test-set features, indexed by respondent_id. The path is built with pathlib
# so it resolves on every OS (a literal backslash only works on Windows).
X_test = pd.read_csv(Path('input_data') / 'test_set_features.csv', index_col='respondent_id')

# Defining Helper Functions

In [34]:
def simplify_col_names(df):
    """Shorten the category labels of three demographic columns of ``df``.

    Despite the function's name, it rewrites cell *values*, not column names.
    The columns ``income_poverty``, ``age_group`` and ``education`` are
    relabelled; values that are not listed in the mappings below (including
    missing values) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns above. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the three columns is missing from ``df``.
    """
    label_maps = {
        'income_poverty': {
            'Below Poverty': 'Low',
            '<= $75,000, Above Poverty': 'Medium',
            '> $75,000': 'High',
        },
        'age_group': {
            '65+ Years': '65+',
            '55 - 64 Years': '55+',
            '45 - 54 Years': '45+',
            '35 - 44 Years': '35+',
            '18 - 34 Years': '18+',
        },
        'education': {
            'College Graduate': 'Very High',
            'Some College': 'High',
            '12 Years': 'Medium',
            '< 12 Years': 'Low',
        },
    }

    for column, mapping in label_maps.items():
        # Assign the result back to the frame. Calling
        # ``df[column].replace(..., inplace=True)`` mutates a temporary Series:
        # it is deprecated in pandas 2.x and leaves ``df`` unchanged when
        # Copy-on-Write is active.
        df[column] = df[column].replace(mapping)

In [35]:
def engineer_features(df):
    """Add four summed feature columns to ``df`` in place.

    New columns:

    * ``general_effective``      = ``opinion_h1n1_vacc_effective`` + ``opinion_seas_vacc_effective``
    * ``general_risk``           = ``opinion_h1n1_risk`` + ``opinion_seas_risk``
    * ``general_eff_risk``       = ``general_effective`` + ``general_risk``
    * ``general_reccomendation`` = ``doctor_recc_h1n1`` + ``doctor_recc_seasonal``

    A row with a missing value in either operand gets a missing value in the
    corresponding sum, because pandas addition propagates NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six source columns named above. It is modified
        in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the source columns is missing from ``df``.
    """
    df['general_effective'] = df['opinion_h1n1_vacc_effective'] + df['opinion_seas_vacc_effective']

    df['general_risk'] = df['opinion_h1n1_risk'] + df['opinion_seas_risk']

    # Built from the two sums above, so it must come after them.
    df['general_eff_risk'] = df['general_effective'] + df['general_risk']

    # NOTE(review): the column name is misspelled ("reccomendation"). It is kept
    # as-is because the pickled preprocessor presumably selects columns by
    # name - confirm before renaming.
    df['general_reccomendation'] = df['doctor_recc_h1n1'] + df['doctor_recc_seasonal']

# Loading Models

In [36]:
# SECURITY: pickle.load can execute arbitrary code, so only load files that
# were produced by a trusted source.
# The context manager closes the file handle (the bare open() call never did),
# and pathlib keeps the path valid on non-Windows systems.
with open(Path('models') / 'preprocessor.pkl', 'rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

In [37]:
# SECURITY: pickle.load can execute arbitrary code, so only load files that
# were produced by a trusted source.
# Each file is opened in a context manager so its handle is closed right after
# loading; pathlib keeps the paths valid on non-Windows systems.
with open(Path('models') / 'estimator_h1n1.pkl', 'rb') as estimator_file:
    estimator_h1n1 = pickle.load(estimator_file)

with open(Path('models') / 'estimator_seas.pkl', 'rb') as estimator_file:
    estimator_seas = pickle.load(estimator_file)

# Modelling

Build final pipelines:

In [38]:
from sklearn.pipeline import Pipeline

# One pipeline per target: the loaded preprocessor followed by that target's
# estimator (H1N1 first, seasonal second).
# NOTE(review): both pipelines receive the very same `preprocessor` object, so
# presumably fitting one pipeline also refits the transformer used by the
# other - confirm this sharing is intended.
full_pipeline_h1n1, full_pipeline_seas = (
    Pipeline(steps=[('preprocessor', preprocessor),
                    ('estimator', target_estimator)])
    for target_estimator in (estimator_h1n1, estimator_seas)
)

Prepare input data:

In [39]:
# Relabel the categorical values of the training features; the helper works on
# the frame it is given and returns nothing.
simplify_col_names(X_train_full)

In [40]:
# Add the summed "general_*" feature columns to the training features.
engineer_features(X_train_full)

Fit both pipelines to the full training data:

In [41]:
# Fit the H1N1 pipeline on all training rows, using the 'h1n1_vaccine' label column.
full_pipeline_h1n1.fit(X_train_full, y_train_full['h1n1_vaccine'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   30.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   22.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concur

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_cols',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor_recc_h1n1...
                                                          

In [42]:
# Fit the seasonal pipeline on all training rows, using the 'seasonal_vaccine' label column.
full_pipeline_seas.fit(X_train_full, y_train_full['seasonal_vaccine'])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_cols',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor_recc_h1n1...
                                                          

# Making predictions

In [48]:
# Apply the same value relabelling to the test features as was applied to the
# training features.
simplify_col_names(X_test)

In [49]:
# Add the same summed "general_*" feature columns to the test features.
engineer_features(X_test)

In [45]:
# Take column 1 of each pipeline's predict_proba output
# (presumably the probability of the positive class - verify against classes_).
pred_h1n1, pred_seas = (
    fitted_pipeline.predict_proba(X_test)[:, 1]
    for fitted_pipeline in (full_pipeline_h1n1, full_pipeline_seas)
)

# One row per test respondent: the id taken from the test index plus the two
# predicted columns.
predictions = pd.DataFrame(
    dict(
        respondent_id=X_test.index,
        h1n1_vaccine=pred_h1n1,
        seasonal_vaccine=pred_seas,
    )
)

In [46]:
# Display the assembled predictions table as the cell output.
predictions

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.074206,0.157077
1,26708,0.059458,0.076690
2,26709,0.392030,0.791130
3,26710,0.635329,0.890688
4,26711,0.220992,0.443258
...,...,...,...
26703,53410,0.384730,0.558179
26704,53411,0.098033,0.237435
26705,53412,0.089594,0.152253
26706,53413,0.070351,0.323656


In [47]:
# Write the predictions without the DataFrame's positional index.
# The path is built with pathlib so it resolves on every OS (a literal
# backslash only works on Windows), and the folder is created if it is missing
# so the export does not fail on a fresh checkout.
output_dir = Path('output_data')
output_dir.mkdir(parents=True, exist_ok=True)
predictions.to_csv(output_dir / 'predictions.csv', index=False)