In [526]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import StandardScaler,RobustScaler
import os

In [527]:
# Load the three per-(country, date) tables: the main daily series, the daily
# weather features and the daily policy features.
daily_ts, daily_weather_ts, daily_policy_ts = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/daily_ts_feature.csv',
        '../data/daily_weather_feature.csv',
        '../data/daily_policy_feature.csv',
    )
)

In [528]:
# Feature columns of each daily table = every column except the two join keys
# ('country', 'date'). Column order is kept as it appears in each frame.
ts_features, weather_features, policy_features = (
    [col for col in frame.columns if col not in ('country', 'date')]
    for frame in (daily_ts, daily_weather_ts, daily_policy_ts)
)

In [529]:
# Left-join weather and policy features onto the main daily series, so every
# (country, date) row of daily_ts is kept even when the other tables lack it.
ts = (
    daily_ts
    .merge(daily_weather_ts, how='left', on=['country', 'date'])
    .merge(daily_policy_ts, how='left', on=['country', 'date'])
)

In [530]:
# Replace +inf in the 'ah' column with 25.0 (only positive infinity is touched;
# the rationale for 25.0 is not recorded in this notebook). This runs before the
# column means are taken in the next statement.
ts = ts.assign(ah=ts['ah'].replace(np.inf, 25.0))

# Missing weather values -> column-wise mean over all rows of the merged frame.
ts[weather_features] = ts[weather_features].pipe(
    lambda block: block.fillna(block.mean(axis=0))
)

# Every remaining NaN (any column) becomes 0.0.
ts = ts.fillna(0.0)

In [531]:
# Unique dates at positions 10..69 (in order of first appearance in daily_ts).
# Original author's note: "2-1->3.31" -- presumably Feb 1 through Mar 31; that
# depends on the first date present in the CSV, so confirm against the data.
# This variable is not referenced by any other cell visible in this notebook.
date_selected = daily_ts['date'].unique()[10:70]

In [532]:
# Fixed list of 98 countries kept for modelling, in ascending string order.
# The commented-out line below is an earlier, data-driven way of picking
# countries (top 100 by maximum 'confirmed'); it is kept for reference only.
# countries_selected = ts.groupby('country')['confirmed'].max().sort_values(ascending=False)[:100].index.values
countries_selected = [
    # A
    'Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    # B
    'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria',
    # C
    'Cameroon', 'Canada', 'Chile', 'China', 'Colombia', 'Congo (Kinshasa)', "Cote d'Ivoire", 'Croatia',
    'Cuba', 'Cyprus', 'Czechia',
    # D
    'Denmark', 'Djibouti', 'Dominican Republic',
    # E
    'Ecuador', 'Egypt', 'Estonia',
    # F
    'Finland', 'France',
    # G
    'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea',
    # H
    'Honduras', 'Hungary',
    # I
    'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
    # J
    'Japan',
    # K
    'Kazakhstan', 'Kuwait', 'Kyrgyzstan',
    # L
    'Latvia', 'Lithuania', 'Luxembourg',
    # M
    'Malaysia', 'Mexico', 'Moldova', 'Morocco',
    # N
    'Netherlands', 'New Zealand', 'Niger', 'Nigeria', 'North Macedonia', 'Norway',
    # O
    'Oman',
    # P
    'Pakistan', 'Panama', 'Peru', 'Philippines', 'Poland', 'Portugal',
    # Q
    'Qatar',
    # R
    'Romania', 'Russia',
    # S
    'Saudi Arabia', 'Senegal', 'Serbia', 'Singapore', 'Slovakia', 'Slovenia', 'Somalia', 'South Africa',
    'Spain', 'Sri Lanka', 'Sudan', 'Sweden', 'Switzerland',
    # T
    'Thailand', 'Tunisia', 'Turkey',
    # U
    'US', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'Uzbekistan',
]

In [533]:
# Keep only the selected countries and order rows by (country, date) with a
# fresh 0..n-1 index.
# NOTE(review): 'date' is sorted in whatever dtype read_csv produced (no
# parse_dates is passed when the CSVs are loaded) -- confirm that the stored
# date format sorts chronologically.
ts = (
    ts.loc[ts['country'].isin(countries_selected)]
    .sort_values(['country', 'date'])
    .reset_index(drop=True)
)

In [534]:
# Load the two static (one value set per country) tables.
healthcare, population = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/healthcare_features.csv',
        '../data/population_features.csv',
    )
)

In [535]:
# Restrict both static tables to the selected countries and sort them by
# country, resetting the index to 0..n-1.
healthcare, population = (
    frame.loc[frame['country'].isin(countries_selected)]
    .sort_values('country')
    .reset_index(drop=True)
    for frame in (healthcare, population)
)

In [536]:
# Healthcare feature columns: everything except the country key and the three
# h1n1_* columns listed here (why those three are excluded is not stated in
# this notebook).
healthcare_features = [
    col
    for col in healthcare.columns
    if col not in (
        'country',
        'h1n1_Geographic_spread',
        'h1n1_Intensity',
        'h1n1_Impact_on_healthcare_services',
    )
]

# Population feature columns: everything except the country key.
population_features = [col for col in population.columns if col != 'country']

In [537]:
# Fill missing healthcare feature values with the column-wise mean taken over
# the selected countries.
healthcare[healthcare_features] = healthcare[healthcare_features].pipe(
    lambda block: block.fillna(block.mean(axis=0))
)

In [538]:
def normalized_digit(x):
    """Convert a loosely formatted numeric cell to ``float``.

    Anything ``float()`` accepts directly is returned as-is (converted to
    float). Otherwise the value is stringified, thousands separators (``,``)
    and surrounding whitespace are removed, and then:

    * ``''``, ``'N.A.'`` and ``'-'`` are treated as missing -> ``np.nan``;
    * a trailing ``%`` marks a percentage, returned as a fraction
      (``'88%'`` -> ``0.88``, ``'12.5%'`` -> ``0.125``);
    * anything else is parsed with ``float()``.

    Raises:
        ValueError: if the cleaned text is still not a valid number.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; anything else should
        # propagate instead of being hidden by a bare ``except``.
        pass

    text = str(x).replace(',', '').strip()
    if text in ('', 'N.A.', '-'):
        return np.nan
    if text.endswith('%'):
        # float (not int) so that decimal percentages such as '12.5%' parse.
        return float(text[:-1]) / 100.0
    return float(text)
# Run the text-to-float cleanup on the five population columns named below;
# all other columns are left as loaded.
population = population.assign(**{
    column: population[column].map(normalized_digit)
    for column in (
        'Urban_pop_pct',
        'Density_KM2m',
        'Fertility_rate',
        'Median_age',
        'sex_male_to_female_total',
    )
})

In [539]:
# Fill missing population feature values with the column-wise mean taken over
# the selected countries.
population[population_features] = population[population_features].pipe(
    lambda block: block.fillna(block.mean(axis=0))
)

In [540]:
def _window_sample(ts_block, weather_block, policy_block, static_rows, slot, input_length, output_length):
    """Cut one sample out of a single country's rows.

    Returns a 6-tuple: the three input windows (rows ``slot`` up to but not
    including ``slot + input_length``), the two per-country static rows in the
    order given by ``static_rows``, and the target window (the next
    ``output_length`` rows of ``ts_block`` after the input window).
    """
    split = slot + input_length
    first_static, second_static = static_rows
    return (
        ts_block[slot:split],
        weather_block[slot:split],
        policy_block[slot:split],
        first_static,
        second_static,
        ts_block[split:split + output_length],
    )


def prepare_data(data_list,feature_list,input_length=14,output_length=7, start_index=10, end_index=70, test_len=14):
    """Build sliding-window train/test arrays from the daily and static tables.

    Args:
        data_list: three frames, unpacked here as ``(ts, healthcare, population)``.
            ``ts`` must hold one equally long, contiguous block of rows per
            country; the two static frames must hold one row per country, in the
            same country order as the blocks in ``ts``.
        feature_list: five lists of column names, unpacked here as
            ``(ts_features, weather_features, policy_features,
            healthcare_features, population_features)``.
        input_length: number of rows in each input window.
        output_length: number of rows in each target window.
        start_index: first window start (row offset inside a country block)
            used for training.
        end_index, test_len: training window starts run over
            ``range(start_index, end_index - test_len)``; the single test window
            per country starts at ``end_index - test_len``.

    Returns:
        ``(train_set, train_y, test_set, test_y)``. Each ``*_set`` is a list of
        five arrays ordered ``[ts, weather, policy, population, healthcare]``
        -- note the two static arrays come back in the reverse of the order in
        which their frames are unpacked from ``data_list``. ``*_y`` holds the
        target windows taken from the (log1p-transformed) ts features.

    Raises:
        ValueError: if countries have different row counts, if a country's rows
            are not contiguous in ``ts``, or if a static frame does not have
            exactly one row per country. Any of these would otherwise silently
            pair windows with the wrong country.

    Transformations: ts features go through ``np.log1p``; the other four
    feature groups are each scaled with ``RobustScaler.fit_transform``.

    NOTE(review): the scaler is fitted on all rows, including those that end up
    in test windows, and the fitted scalers are not returned.
    NOTE(review): with the default arguments the last training target covers
    rows 69..75 of a country block while the test target covers rows 70..76, so
    training targets overlap the test target. Confirm whether that is intended.
    NOTE(review): windows that run past the end of a country block are cut
    short by slicing rather than rejected; callers should make sure each block
    has at least ``end_index - test_len + input_length + output_length`` rows.
    """
    ts, healthcare, population = data_list
    ts_features, weather_features, policy_features, healthcare_features, population_features = feature_list

    # log1p on the main series; robust scaling on every other feature group.
    ts_npy = np.log1p(ts[ts_features].values)

    # fit_transform refits the scaler on every call, so one instance can be reused.
    scaler = RobustScaler()
    weather_npy = scaler.fit_transform(ts[weather_features].values)
    policy_npy = scaler.fit_transform(ts[policy_features].values)
    healthcare_npy = scaler.fit_transform(healthcare[healthcare_features].values)
    population_npy = scaler.fit_transform(population[population_features].values)

    # value_counts() is indexed by country name, so the first entry has to be
    # fetched by position (.iloc); a plain [0] relies on a deprecated fallback.
    counts = ts.country.value_counts()
    ts_len = int(counts.iloc[0])
    country_len = counts.shape[0]

    # The block arithmetic below is only valid for equal-length, contiguous
    # per-country blocks and row-aligned static tables -- verify instead of
    # assuming.
    if (counts != ts_len).any():
        raise ValueError('every country in ts must have the same number of rows')
    country_grid = ts.country.to_numpy().reshape(country_len, ts_len)
    if (country_grid != country_grid[:, :1]).any():
        raise ValueError("rows of each country must be contiguous in ts (sort by ['country', 'date'])")
    if healthcare_npy.shape[0] != country_len or population_npy.shape[0] != country_len:
        raise ValueError('static tables must contain exactly one row per country present in ts')

    # One accumulator per returned array: ts, weather, policy, population, healthcare, y.
    train_columns = [[] for _ in range(6)]
    test_columns = [[] for _ in range(6)]
    test_slot = end_index - test_len

    for _country_idx in tqdm(range(country_len)):
        row_lo = _country_idx * ts_len
        row_hi = row_lo + ts_len
        blocks = (ts_npy[row_lo:row_hi], weather_npy[row_lo:row_hi], policy_npy[row_lo:row_hi])
        static_rows = (population_npy[_country_idx, :], healthcare_npy[_country_idx, :])

        # Training samples: one per window start before the test window.
        for slot in range(start_index, test_slot):
            sample = _window_sample(*blocks, static_rows, slot, input_length, output_length)
            for column, value in zip(train_columns, sample):
                column.append(value)

        # Test sample: exactly one window per country.
        sample = _window_sample(*blocks, static_rows, test_slot, input_length, output_length)
        for column, value in zip(test_columns, sample):
            column.append(value)

    train_arrays = [np.array(column) for column in train_columns]
    test_arrays = [np.array(column) for column in test_columns]

    return train_arrays[:5], train_arrays[5], test_arrays[:5], test_arrays[5]

In [541]:
# Inputs for prepare_data (also pickled as-is further down).
# NOTE(review): prepare_data unpacks data_list as (ts, healthcare, population)
# and the tail of feature_list as (healthcare_features, population_features),
# whereas both lists below carry the population entry BEFORE the healthcare
# entry. The two swaps match each other, so each frame is still paired with its
# own feature list; only the local names inside prepare_data are crossed.
# Do not reorder one list without the other.
data_list = [
    ts,
    population,
    healthcare,
]
feature_list = [
    ts_features,
    weather_features,
    policy_features,
    population_features,
    healthcare_features,
]

In [542]:
# Persist the cleaned frames and their feature-name lists so they can be
# reloaded without re-running the preprocessing above.
os.makedirs('../features', exist_ok=True)
pd.to_pickle(
    data_list,
    '../features/data_list.5.7.pkl',
)
pd.to_pickle(
    feature_list,
    '../features/feature_list.5.7.pkl',
)

In [543]:
# Build the windowed arrays using prepare_data's default window settings.
train_set, train_y, test_set, test_y = prepare_data(
    data_list=data_list,
    feature_list=feature_list,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=98.0), HTML(value='')))




In [544]:
# Shapes of the five input arrays followed by the target array, for each split.
# Only the value of the last expression (the test split) is displayed by the
# notebook; the first line is evaluated but its result is discarded.
tuple(arr.shape for arr in train_set) + (train_y.shape,)
tuple(arr.shape for arr in test_set) + (test_y.shape,)

((98, 14, 3), (98, 14, 11), (98, 14, 18), (98, 11), (98, 18), (98, 7, 3))

In [545]:
# Save each split as a two-element list [inputs, targets], plus the country
# list used to build them.
pd.to_pickle(
    [train_set, train_y],
    '../features/train_set.5.7.pkl',
)
pd.to_pickle(
    [test_set, test_y],
    '../features/test_set.5.7.pkl',
)
pd.to_pickle(
    countries_selected,
    '../features/countries.5.7.pkl',
)