In [1]:
from __future__ import print_function
from __future__ import division
%matplotlib inline
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# load the provided data
train_features = pd.read_csv('data-processed/dengue_features_train.csv',
                             index_col=[0,1,2])

train_labels = pd.read_csv('data-processed/dengue_labels_train.csv',
                           index_col=[0,1,2])

In [3]:
# Remove `week_start_date` string.
train_features.drop('week_start_date', axis=1, inplace=True)
#train_features.drop('week_start_date', axis=1, inplace=True)

In [4]:
train_features.shape

(1456, 20)

In [5]:
train_features_shifted1 = train_features.shift(1,axis = 0)
train_features_shifted2 = train_features.shift(2,axis = 0)

In [6]:
train_features_shifted1 = train_features_shifted1.rename(columns={'week_start_date':'week_start_date_pr1',
 'ndvi_ne':'ndvi_ne_pr1',
 'ndvi_nw':'ndvi_nw_pr1',
 'ndvi_se':'ndvi_se_Pr1',
 'ndvi_sw':'ndvi_sw_pr1',
 'precipitation_amt_mm':'precipitation_amt_mm_pr1',
 'reanalysis_air_temp_k':'reanalysis_air_temp_k_pr1',
 'reanalysis_avg_temp_k':'reanalysis_avg_temp_k_pr1',
 'reanalysis_dew_point_temp_k':'reanalysis_dew_point_temp_k_pr1',
 'reanalysis_max_air_temp_k':'reanalysis_max_air_temp_k_pr1',
 'reanalysis_min_air_temp_k':'reanalysis_min_air_temp_k_pr1',
 'reanalysis_precip_amt_kg_per_m2':'reanalysis_precip_amt_kg_per_m2_pr1',
 'reanalysis_relative_humidity_percent':'reanalysis_relative_humidity_percent_pr1',
 'reanalysis_sat_precip_amt_mm':'reanalysis_sat_precip_amt_mm_pr1',
 'reanalysis_specific_humidity_g_per_kg':'reanalysis_specific_humidity_g_per_kg_pr1',
 'reanalysis_tdtr_k':'reanalysis_tdtr_k_pr1',
 'station_avg_temp_c':'station_avg_temp_c_pr1',
 'station_diur_temp_rng_c':'station_diur_temp_rng_c_pr1',
 'station_max_temp_c':'station_max_temp_c_pr1',
 'station_min_temp_c':'station_min_temp_c_pr1',
 'station_precip_mm':'station_precip_mm_pr1'})

In [7]:
train_features_shifted1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ndvi_ne_pr1,ndvi_nw_pr1,ndvi_se_Pr1,ndvi_sw_pr1,precipitation_amt_mm_pr1,reanalysis_air_temp_k_pr1,reanalysis_avg_temp_k_pr1,reanalysis_dew_point_temp_k_pr1,reanalysis_max_air_temp_k_pr1,reanalysis_min_air_temp_k_pr1,reanalysis_precip_amt_kg_per_m2_pr1,reanalysis_relative_humidity_percent_pr1,reanalysis_sat_precip_amt_mm_pr1,reanalysis_specific_humidity_g_per_kg_pr1,reanalysis_tdtr_k_pr1,station_avg_temp_c_pr1,station_diur_temp_rng_c_pr1,station_max_temp_c_pr1,station_min_temp_c_pr1,station_precip_mm_pr1
city,year,weekofyear,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
sj,1990,18,,,,,,,,,,,,,,,,,,,,
sj,1990,19,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
sj,1990,20,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
sj,1990,21,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
sj,1990,22,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,297.0,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0


In [8]:
train_features_shifted2 = train_features_shifted2.rename(columns={'week_start_date':'week_start_date_pr2',
 'ndvi_ne':'ndvi_ne_pr2',
 'ndvi_nw':'ndvi_nw_pr2',
 'ndvi_se':'ndvi_se_Pr2',
 'ndvi_sw':'ndvi_sw_pr2',
 'precipitation_amt_mm':'precipitation_amt_mm_pr2',
 'reanalysis_air_temp_k':'reanalysis_air_temp_k_pr2',
 'reanalysis_avg_temp_k':'reanalysis_avg_temp_k_pr2',
 'reanalysis_dew_point_temp_k':'reanalysis_dew_point_temp_k_pr2',
 'reanalysis_max_air_temp_k':'reanalysis_max_air_temp_k_pr2',
 'reanalysis_min_air_temp_k':'reanalysis_min_air_temp_k_pr2',
 'reanalysis_precip_amt_kg_per_m2':'reanalysis_precip_amt_kg_per_m2_pr2',
 'reanalysis_relative_humidity_percent':'reanalysis_relative_humidity_percent_pr2',
 'reanalysis_sat_precip_amt_mm':'reanalysis_sat_precip_amt_mm_pr2',
 'reanalysis_specific_humidity_g_per_kg':'reanalysis_specific_humidity_g_per_kg_pr2',
 'reanalysis_tdtr_k':'reanalysis_tdtr_k_pr2',
 'station_avg_temp_c':'station_avg_temp_c_pr2',
 'station_diur_temp_rng_c':'station_diur_temp_rng_c_pr2',
 'station_max_temp_c':'station_max_temp_c_pr2',
 'station_min_temp_c':'station_min_temp_c_pr2',
 'station_precip_mm':'station_precip_mm_pr2'})

In [9]:
train_features_shifted2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ndvi_ne_pr2,ndvi_nw_pr2,ndvi_se_Pr2,ndvi_sw_pr2,precipitation_amt_mm_pr2,reanalysis_air_temp_k_pr2,reanalysis_avg_temp_k_pr2,reanalysis_dew_point_temp_k_pr2,reanalysis_max_air_temp_k_pr2,reanalysis_min_air_temp_k_pr2,reanalysis_precip_amt_kg_per_m2_pr2,reanalysis_relative_humidity_percent_pr2,reanalysis_sat_precip_amt_mm_pr2,reanalysis_specific_humidity_g_per_kg_pr2,reanalysis_tdtr_k_pr2,station_avg_temp_c_pr2,station_diur_temp_rng_c_pr2,station_max_temp_c_pr2,station_min_temp_c_pr2,station_precip_mm_pr2
city,year,weekofyear,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
sj,1990,18,,,,,,,,,,,,,,,,,,,,
sj,1990,19,,,,,,,,,,,,,,,,,,,,
sj,1990,20,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
sj,1990,21,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
sj,1990,22,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4


In [10]:
df_cd = pd.merge(train_features_shifted1, train_features_shifted2, how='inner',on =['city','year','weekofyear' ])

In [11]:
shifted_dataset_full = pd.merge(df_cd, train_features, how='inner',on =['city','year','weekofyear' ])

In [12]:
shifted_dataset_full.shape

(1456, 60)

In [13]:
# Seperate data for San Juan
sj_train_features = train_features.loc['sj']
sj_train_labels = train_labels.loc['sj']

# Separate data for Iquitos
iq_train_features = train_features.loc['iq']
iq_train_labels = train_labels.loc['iq']

In [14]:
# Seperate data for San Juan
#sj_train_features = shifted_dataset_full.loc['sj']
#sj_train_labels = train_labels.loc['sj']

# Separate data for Iquitos
#iq_train_features = shifted_dataset_full.loc['iq']
#iq_train_labels = train_labels.loc['iq']


In [15]:
# Remove `week_start_date` string.
#sj_train_features.drop('week_start_date', axis=1, inplace=True)
#iq_train_features.drop('week_start_date', axis=1, inplace=True)
#sj_train_features.drop('week_start_date_pr1', axis=1, inplace=True)
#iq_train_features.drop('week_start_date_pr1', axis=1, inplace=True)
#sj_train_features.drop('week_start_date_pr2', axis=1, inplace=True)
#iq_train_features.drop('week_start_date_pr2', axis=1, inplace=True)



In [16]:
sj_train_features.fillna(sj_train_features.mean(), inplace=True)
iq_train_features.fillna(iq_train_features.mean(), inplace=True)

In [19]:
sj_train_features = preprocessing.normalize(sj_train_features)
iq_train_features = preprocessing.normalize(iq_train_features)

In [25]:
pd.DataFrame(sj_train_features=sj_train_features[1:,1:],
             index=sj_train_features[1:,0],
             columns=sj_train_features[0,1:]) 

TypeError: __init__() got an unexpected keyword argument 'sj_train_features'

In [18]:
sj_train_features['total_cases'] = sj_train_labels.total_cases
iq_train_features['total_cases'] = iq_train_labels.total_cases

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# compute the correlations
sj_correlations = sj_train_features.corr()
iq_correlations = iq_train_features.corr()

In [None]:
# San Juan
(sj_correlations
     .total_cases
     .drop('total_cases') # don't compare with myself
     .sort_values(ascending=False)
     .plot
     .barh())

In [None]:
sj_train_features[sj_train_features.columns[1:]].corr()['total_cases'][:]

In [None]:
iq_train_features[iq_train_features.columns[1:]].corr()['total_cases'][:]

In [None]:
def preprocess_data_iq(data_path, labels_path=None):
    # load data and set index to city, year, weekofyear
    df = data_path
    
    # select features we want
    features = ['reanalysis_dew_point_temp_k',
                'reanalysis_min_air_temp_k',
                'reanalysis_precip_amt_kg_per_m2',
                'reanalysis_relative_humidity_percent',
                'reanalysis_specific_humidity_g_per_kg',
                'reanalysis_tdtr_k',
                'station_min_temp_c',
                
                'reanalysis_dew_point_temp_k_pr1',
                'reanalysis_min_air_temp_k_pr1',
                'reanalysis_precip_amt_kg_per_m2_pr1',
                'reanalysis_relative_humidity_percent_pr1',
                'reanalysis_specific_humidity_g_per_kg_pr1',
                'reanalysis_tdtr_k_pr1',
                'station_min_temp_c_pr1',
                
                'reanalysis_dew_point_temp_k_pr2',
                'reanalysis_min_air_temp_k_pr2',
                'reanalysis_precip_amt_kg_per_m2_pr2',
                'reanalysis_relative_humidity_percent_pr2',
                'reanalysis_specific_humidity_g_per_kg_pr2',
                'reanalysis_tdtr_k_pr2',
                'station_min_temp_c_pr2'
                
             ]
    df = df[features]
    
    # fill missing values
    df.fillna(df.mean(), inplace=True)

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df = df.join(labels)
    
    # separate saoin juan and iquitos
    iq = df.loc['iq']
   # iq = df.loc['iq']
    
    return iq

In [None]:
def preprocess_data_sj(data_path, labels_path=None):
    # load data and set index to city, year, weekofyear
    df = data_path
    
    # select features we want
    features = ['reanalysis_air_temp_k',
                'reanalysis_avg_temp_k',
                'reanalysis_dew_point_temp_k',
                'reanalysis_max_air_temp_k',
                'reanalysis_min_air_temp_k',
                'reanalysis_precip_amt_kg_per_m2',
                'reanalysis_relative_humidity_percent',
                'reanalysis_specific_humidity_g_per_kg',
                'station_avg_temp_c',
                'station_max_temp_c',
                'station_min_temp_c',
                'reanalysis_air_temp_k_pr1',
                'reanalysis_avg_temp_k_pr1',
                'reanalysis_dew_point_temp_k_pr1',
                'reanalysis_max_air_temp_k_pr1',
                'reanalysis_min_air_temp_k_pr1',
                'reanalysis_precip_amt_kg_per_m2_pr1',
                'reanalysis_relative_humidity_percent_pr1',
                'reanalysis_specific_humidity_g_per_kg_pr1',
                'station_avg_temp_c_pr1',
                'station_max_temp_c_pr1',
                'station_min_temp_c_pr1',
                'reanalysis_air_temp_k_pr2',
                'reanalysis_avg_temp_k_pr2',
                'reanalysis_dew_point_temp_k_pr2',
                'reanalysis_max_air_temp_k_pr2',
                'reanalysis_min_air_temp_k_pr2',
                'reanalysis_precip_amt_kg_per_m2_pr2',
                'reanalysis_relative_humidity_percent_pr2',
                'reanalysis_specific_humidity_g_per_kg_pr2',
                'station_avg_temp_c_pr2',
                'station_max_temp_c_pr2',
                'station_min_temp_c_pr2'
             ]
    df = df[features]
    
    # fill missing values
    df.fillna(df.mean(), inplace=True)

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df = df.join(labels)
    
    # separate saoin juan and iquitos
    sj = df.loc['sj']
   # iq = df.loc['iq']
    
    return sj

In [None]:
sj_train = preprocess_data_sj(shifted_dataset_full,
                                    labels_path="data-processed/dengue_labels_train.csv")
iq_train = preprocess_data_iq(shifted_dataset_full,
                                    labels_path="data-processed/dengue_labels_train.csv")

In [None]:
sj_train_subtrain = sj_train.head(800)
sj_train_subtest = sj_train.tail(sj_train.shape[0] - 800)

iq_train_subtrain = iq_train.head(400)
iq_train_subtest = iq_train.tail(iq_train.shape[0] - 400)

In [None]:
sj_train.shape

In [None]:
iq_train.shape

In [None]:
from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

def get_best_model_sj(train, test):
    # Step 1: specify the form of the model
    model_formula = "total_cases ~ 1 + " \
    "reanalysis_air_temp_k + " \
                "reanalysis_avg_temp_k + " \
                "reanalysis_dew_point_temp_k + " \
                "reanalysis_max_air_temp_k+"\
                "reanalysis_min_air_temp_k + " \
                "reanalysis_precip_amt_kg_per_m2 + " \
                "reanalysis_relative_humidity_percent + " \
                "reanalysis_specific_humidity_g_per_kg + " \
                "station_avg_temp_c + " \
                "station_max_temp_c + " \
                "station_min_temp_c + " \
                "reanalysis_air_temp_k_pr1 + " \
                "reanalysis_avg_temp_k_pr1 + " \
                "reanalysis_dew_point_temp_k_pr1 + " \
                "reanalysis_max_air_temp_k_pr1 + " \
                "reanalysis_min_air_temp_k_pr1 + " \
                "reanalysis_precip_amt_kg_per_m2_pr1 + " \
                "reanalysis_relative_humidity_percent_pr1 + " \
                "reanalysis_specific_humidity_g_per_kg_pr1 + " \
                "station_avg_temp_c_pr1 + " \
                "station_max_temp_c_pr1 + " \
                "station_min_temp_c_pr1 + " \
                "reanalysis_air_temp_k_pr2 + " \
                "reanalysis_avg_temp_k_pr2 + " \
                "reanalysis_dew_point_temp_k_pr2 + " \
                "reanalysis_max_air_temp_k_pr2 + " \
                "reanalysis_min_air_temp_k_pr2 + " \
                "reanalysis_precip_amt_kg_per_m2_pr2 + " \
                "reanalysis_relative_humidity_percent_pr2 + " \
                "reanalysis_specific_humidity_g_per_kg_pr2 + " \
                "station_avg_temp_c_pr2 + " \
                "station_max_temp_c_pr2 + " \
                "station_min_temp_c_pr2"
    
    grid = 10 ** np.arange(-8, -3, dtype=np.float64)
                    
    best_alpha = []
    best_score = 1000
        
    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))

        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)
            
    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    fitted_model = model.fit()
    return fitted_model
    
sj_best_model = get_best_model_sj(sj_train_subtrain, sj_train_subtest)
#iq_best_model = get_best_model_sj(iq_train_subtrain, iq_train_subtest)

figs, axes = plt.subplots(nrows=2, ncols=1)

# plot sj
sj_train['fitted'] = sj_best_model.fittedvalues
sj_train.fitted.plot(ax=axes[0], label="Predictions")
sj_train.total_cases.plot(ax=axes[0], label="Actual")

# plot iq
#iq_train['fitted'] = iq_best_model.fittedvalues
#iq_train.fitted.plot(ax=axes[1], label="Predictions")
#iq_train.total_cases.plot(ax=axes[1], label="Actual")

plt.suptitle("Dengue Predicted Cases vs. Actual Cases")
plt.legend()

In [None]:
def get_best_model_iq(train, test):
    # Step 1: specify the form of the model
    model_formula = "total_cases ~ 1 + " \
                "reanalysis_dew_point_temp_k + " \
                "reanalysis_min_air_temp_k + " \
                "reanalysis_precip_amt_kg_per_m2 + " \
                "reanalysis_relative_humidity_percent + " \
                "reanalysis_specific_humidity_g_per_kg + " \
                "reanalysis_tdtr_k + " \
                "station_min_temp_c + " \
                "reanalysis_dew_point_temp_k_pr1 + " \
                "reanalysis_min_air_temp_k_pr1 + " \
                "reanalysis_precip_amt_kg_per_m2_pr1 + " \
                "reanalysis_relative_humidity_percent_pr1 + " \
                "reanalysis_specific_humidity_g_per_kg_pr1 + " \
                "reanalysis_tdtr_k_pr1 + " \
                "station_min_temp_c_pr1 + " \
                "reanalysis_dew_point_temp_k_pr2 + " \
                "reanalysis_min_air_temp_k_pr2 + " \
                "reanalysis_precip_amt_kg_per_m2_pr2 + " \
                "reanalysis_relative_humidity_percent_pr2 + " \
                "reanalysis_specific_humidity_g_per_kg_pr2 + " \
                "reanalysis_tdtr_k_pr2 + " \
                "station_min_temp_c_pr2 "
    
    grid = 10 ** np.arange(-8, -3, dtype=np.float64)
                    
    best_alpha = []
    best_score = 1000
        
    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))

        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)
            
    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    fitted_model = model.fit()
    return fitted_model
    
#sj_best_model = get_best_model_iq(sj_train_subtrain, sj_train_subtest)
iq_best_model = get_best_model_iq(iq_train_subtrain, iq_train_subtest)

figs, axes = plt.subplots(nrows=2, ncols=1)

# plot sj
#sj_train['fitted'] = sj_best_model.fittedvalues
#sj_train.fitted.plot(ax=axes[0], label="Predictions")
#sj_train.total_cases.plot(ax=axes[0], label="Actual")

# plot iq
iq_train['fitted'] = iq_best_model.fittedvalues
iq_train.fitted.plot(ax=axes[1], label="Predictions")
iq_train.total_cases.plot(ax=axes[1], label="Actual")

plt.suptitle("Dengue Predicted Cases vs. Actual Cases")
plt.legend()

In [None]:
test_features = pd.read_csv('data-processed/dengue_features_test.csv',
                             index_col=[0,1,2])

In [None]:
test_features_shifted1 = test_features.shift(1,axis = 0)
test_features_shifted2 = test_features.shift(2,axis = 0)

In [None]:
test_features_shifted1 = test_features_shifted1.rename(columns={'week_start_date':'week_start_date_pr1',
 'ndvi_ne':'ndvi_ne_pr1',
 'ndvi_nw':'ndvi_nw_pr1',
 'ndvi_se':'ndvi_se_Pr1',
 'ndvi_sw':'ndvi_sw_pr1',
 'precipitation_amt_mm':'precipitation_amt_mm_pr1',
 'reanalysis_air_temp_k':'reanalysis_air_temp_k_pr1',
 'reanalysis_avg_temp_k':'reanalysis_avg_temp_k_pr1',
 'reanalysis_dew_point_temp_k':'reanalysis_dew_point_temp_k_pr1',
 'reanalysis_max_air_temp_k':'reanalysis_max_air_temp_k_pr1',
 'reanalysis_min_air_temp_k':'reanalysis_min_air_temp_k_pr1',
 'reanalysis_precip_amt_kg_per_m2':'reanalysis_precip_amt_kg_per_m2_pr1',
 'reanalysis_relative_humidity_percent':'reanalysis_relative_humidity_percent_pr1',
 'reanalysis_sat_precip_amt_mm':'reanalysis_sat_precip_amt_mm_pr1',
 'reanalysis_specific_humidity_g_per_kg':'reanalysis_specific_humidity_g_per_kg_pr1',
 'reanalysis_tdtr_k':'reanalysis_tdtr_k_pr1',
 'station_avg_temp_c':'station_avg_temp_c_pr1',
 'station_diur_temp_rng_c':'station_diur_temp_rng_c_pr1',
 'station_max_temp_c':'station_max_temp_c_pr1',
 'station_min_temp_c':'station_min_temp_c_pr1',
 'station_precip_mm':'station_precip_mm_pr1'})

In [None]:
test_features_shifted2 = test_features_shifted2.rename(columns={'week_start_date':'week_start_date_pr2',
 'ndvi_ne':'ndvi_ne_pr2',
 'ndvi_nw':'ndvi_nw_pr2',
 'ndvi_se':'ndvi_se_Pr2',
 'ndvi_sw':'ndvi_sw_pr2',
 'precipitation_amt_mm':'precipitation_amt_mm_pr2',
 'reanalysis_air_temp_k':'reanalysis_air_temp_k_pr2',
 'reanalysis_avg_temp_k':'reanalysis_avg_temp_k_pr2',
 'reanalysis_dew_point_temp_k':'reanalysis_dew_point_temp_k_pr2',
 'reanalysis_max_air_temp_k':'reanalysis_max_air_temp_k_pr2',
 'reanalysis_min_air_temp_k':'reanalysis_min_air_temp_k_pr2',
 'reanalysis_precip_amt_kg_per_m2':'reanalysis_precip_amt_kg_per_m2_pr2',
 'reanalysis_relative_humidity_percent':'reanalysis_relative_humidity_percent_pr2',
 'reanalysis_sat_precip_amt_mm':'reanalysis_sat_precip_amt_mm_pr2',
 'reanalysis_specific_humidity_g_per_kg':'reanalysis_specific_humidity_g_per_kg_pr2',
 'reanalysis_tdtr_k':'reanalysis_tdtr_k_pr2',
 'station_avg_temp_c':'station_avg_temp_c_pr2',
 'station_diur_temp_rng_c':'station_diur_temp_rng_c_pr2',
 'station_max_temp_c':'station_max_temp_c_pr2',
 'station_min_temp_c':'station_min_temp_c_pr2',
 'station_precip_mm':'station_precip_mm_pr2'})

In [None]:
df_test = pd.merge(test_features_shifted1, test_features_shifted2, how='inner',on =['city','year','weekofyear' ])

In [None]:
shifted_testset_full = pd.merge(df_test, test_features, how='inner',on =['city','year','weekofyear' ])

In [None]:
shifted_testset_full.shape

In [None]:
sj_test = preprocess_data_sj(shifted_testset_full)
iq_test = preprocess_data_iq(shifted_testset_full)

In [None]:
sj_test = preprocessing.normalize(sj_test)
iq_test = preprocessing.normalize(iq_test)

In [None]:
sj_predictions = sj_best_model.predict(sj_test).astype(int)
iq_predictions = iq_best_model.predict(iq_test).astype(int)

submission = pd.read_csv("data-processed/submission_format.csv",
                         index_col=[0, 1, 2])

print(sj_predictions.shape)
print(iq_predictions.shape)
submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv("data-processed/benchmark.csv")
