In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBRegressor

In [2]:
def mae(pred, true):
    return np.mean(np.abs(pred - true))

In [3]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
# Local file
DATA_FILE = 'data/OxCGRT_latest.csv'

In [4]:
import os
import urllib.request
if not os.path.exists('data'):
    os.mkdir('data')
urllib.request.urlretrieve(DATA_URL, DATA_FILE)

('data/OxCGRT_latest.csv', <http.client.HTTPMessage at 0x7f7f03c48c10>)

In [5]:
df = pd.read_csv(DATA_FILE, 
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 error_bad_lines=False)

In [19]:
df = df[df.Date >= np.datetime64("2020-07-02")]
df = df[df.Date <= np.datetime64("2020-08-31")]

In [20]:
df['GeoID'] = df['CountryName'] + '__' + df['RegionName'].astype(str)

In [21]:
df['NewCases'] = df.groupby('GeoID').ConfirmedCases.diff().fillna(0)

In [23]:
id_cols = ['CountryName',
           'RegionName',
           'GeoID',
           'Date']
cases_col = ['NewCases']
npi_cols=['C1_School closing',
       'C2_Workplace closing','C3_Cancel public events',
       'C4_Restrictions on gatherings', 'C5_Close public transport',
       'C6_Stay at home requirements', 
       'C7_Restrictions on internal movement', 
       'H2_Testing policy',
       'H6_Facial Coverings', 'ConfirmedCases',
       'GovernmentResponseIndex', 
       'EconomicSupportIndex']
df = df[id_cols + cases_col + npi_cols]

In [24]:
df.update(df.groupby('GeoID').NewCases.apply(
    lambda group: group.interpolate()).fillna(0))
for npi_col in npi_cols:
    df.update(df.groupby('GeoID')[npi_col].ffill().fillna(0))

In [25]:
df

Unnamed: 0,CountryName,RegionName,GeoID,Date,NewCases,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,H2_Testing policy,H6_Facial Coverings,ConfirmedCases,GovernmentResponseIndex,EconomicSupportIndex
183,Aruba,,Aruba__nan,2020-07-02,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,103.0,48.81,87.5
184,Aruba,,Aruba__nan,2020-07-03,1.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,104.0,48.81,87.5
185,Aruba,,Aruba__nan,2020-07-04,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,104.0,48.81,87.5
186,Aruba,,Aruba__nan,2020-07-05,1.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,105.0,48.81,87.5
187,Aruba,,Aruba__nan,2020-07-06,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,105.0,48.81,87.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87095,Zimbabwe,,Zimbabwe__nan,2020-08-27,55.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,1.0,4.0,6251.0,68.45,25.0
87096,Zimbabwe,,Zimbabwe__nan,2020-08-28,41.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,1.0,4.0,6292.0,68.45,25.0
87097,Zimbabwe,,Zimbabwe__nan,2020-08-29,96.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,1.0,4.0,6388.0,68.45,25.0
87098,Zimbabwe,,Zimbabwe__nan,2020-08-30,18.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,1.0,4.0,6406.0,68.45,25.0


In [26]:
# Set number of past days to use to make predictions
nb_lookback_days = 30

# Create training data across all countries for predicting one day ahead
X_cols = cases_col + npi_cols
y_col = cases_col
X_samples = []
y_samples = []
geo_ids = df.GeoID.unique()
for g in geo_ids:
    gdf = df[df.GeoID == g]
    all_case_data = np.array(gdf[cases_col])
    all_npi_data = np.array(gdf[npi_cols])

    # Create one sample for each day where we have enough data
    # Each sample consists of cases and npis for previous nb_lookback_days
    nb_total_days = len(gdf)
    for d in range(nb_lookback_days, nb_total_days - 1):
        X_cases = all_case_data[d-nb_lookback_days:d]

        # Take negative of npis to support positive
        # weight constraint in Lasso.
        X_npis = -all_npi_data[d - nb_lookback_days:d]

        # Flatten all input data so it fits Lasso input format.
        X_sample = np.concatenate([X_cases.flatten(),
                                   X_npis.flatten()])
        y_sample = all_case_data[d + 1]
        X_samples.append(X_sample)
        y_samples.append(y_sample)

X_samples = np.array(X_samples)
y_samples = np.array(y_samples).flatten()

In [29]:
X_samples.shape

(7950, 390)