In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBRegressor

In [2]:
def mae(pred, true):
    return np.mean(np.abs(pred - true))

In [7]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
# Local file
DATA_FILE = "data/2020-09-30_historical_ip_new.csv" 

In [21]:
# Main source for the training data
DATA_URL2 = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
# Local file
DATA_FILE2 = 'data/OxCGRT_latest.csv'

In [22]:
df = pd.read_csv(DATA_FILE2, 
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 error_bad_lines=False)

In [23]:
HYPOTHETICAL_SUBMISSION_DATE = np.datetime64("2020-07-31")
df = df[df.Date <= HYPOTHETICAL_SUBMISSION_DATE]
df['GeoID'] = df['CountryName'] + '__' + df['RegionName'].astype(str)
df['NewCases'] = df.groupby('GeoID').ConfirmedCases.diff().fillna(0)

In [8]:
import os
import urllib.request
if not os.path.exists('data'):
    os.mkdir('data')
urllib.request.urlretrieve(DATA_URL, DATA_FILE)

('data/2020-09-30_historical_ip_new.csv',
 <http.client.HTTPMessage at 0x7f2dc96b50d0>)

In [9]:
dftest = pd.read_csv(DATA_FILE, 
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 error_bad_lines=False)

In [10]:
dftest = dftest[dftest.Date >= np.datetime64("2020-07-02")]
dftest = dftest[dftest.Date <= np.datetime64("2020-08-31")]

In [11]:
dftest['GeoID'] = dftest['CountryName'] + '__' + dftest['RegionName'].astype(str)

In [12]:
dftest['NewCases'] = dftest.groupby('GeoID').ConfirmedCases.diff().fillna(0)

In [13]:
id_cols = ['CountryName',
           'RegionName',
           'GeoID',
           'Date']
cases_col = ['NewCases']
npi_cols=['C1_School closing',
       'C2_Workplace closing','C3_Cancel public events',
       'C4_Restrictions on gatherings', 'C5_Close public transport',
       'C6_Stay at home requirements', 
       'C7_Restrictions on internal movement', 
       'H2_Testing policy',
       'H6_Facial Coverings', 'ConfirmedCases',
       'GovernmentResponseIndex', 
       'EconomicSupportIndex']
dftest = dftest[id_cols + cases_col + npi_cols]

In [14]:
dftest.update(dftest.groupby('GeoID').NewCases.apply(
    lambda group: group.interpolate()).fillna(0))
for npi_col in npi_cols:
    dftest.update(dftest.groupby('GeoID')[npi_col].ffill().fillna(0))

In [24]:
# Set number of past days to use to make predictions
nb_lookback_days = 30

# Create training data across all countries for predicting one day ahead
X_cols = cases_col + npi_cols
y_col = cases_col
X_samples = []
y_samples = []
geo_ids = dftest.GeoID.unique()
for g in geo_ids:
    gdf = dftest[dftest.GeoID == g]
    all_case_data = np.array(gdf[cases_col])
    all_npi_data = np.array(gdf[npi_cols])

    # Create one sample for each day where we have enough data
    # Each sample consists of cases and npis for previous nb_lookback_days
    nb_total_days = len(gdf)
    for d in range(nb_lookback_days, nb_total_days - 1):
        X_cases = all_case_data[d-nb_lookback_days:d]

        # Take negative of npis to support positive
        # weight constraint in Lasso.
        X_npis = -all_npi_data[d - nb_lookback_days:d]

        # Flatten all input data so it fits Lasso input format.
        X_sample = np.concatenate([X_cases.flatten(),
                                   X_npis.flatten()])
        y_sample = all_case_data[d + 1]
        X_samples.append(X_sample)
        y_samples.append(y_sample)

X_samples = np.array(X_samples)
y_samples = np.array(y_samples).flatten()

In [25]:
df = df[id_cols + cases_col + npi_cols]

In [27]:
df.update(df.groupby('GeoID').NewCases.apply(
    lambda group: group.interpolate()).fillna(0))
for npi_col in npi_cols:
    df.update(df.groupby('GeoID')[npi_col].ffill().fillna(0))

In [29]:
# Set number of past days to use to make predictions
nb_lookback_days = 30

# Create training data across all countries for predicting one day ahead
X_cols = cases_col + npi_cols
y_col = cases_col
X_samples2 = []
y_samples2 = []
geo_ids = df.GeoID.unique()
for g in geo_ids:
    gdf = df[df.GeoID == g]
    all_case_data = np.array(gdf[cases_col])
    all_npi_data = np.array(gdf[npi_cols])

    # Create one sample for each day where we have enough data
    # Each sample consists of cases and npis for previous nb_lookback_days
    nb_total_days = len(gdf)
    for d in range(nb_lookback_days, nb_total_days - 1):
        X_cases = all_case_data[d-nb_lookback_days:d]

        # Take negative of npis to support positive
        # weight constraint in Lasso.
        X_npis = -all_npi_data[d - nb_lookback_days:d]

        # Flatten all input data so it fits Lasso input format.
        X_sample = np.concatenate([X_cases.flatten(),
                                   X_npis.flatten()])
        y_sample = all_case_data[d + 1]
        X_samples2.append(X_sample)
        y_samples2.append(y_sample)

X_samples2 = np.array(X_samples2)
y_samples2 = np.array(y_samples2).flatten()

In [32]:
X_samples.shape

(7950, 390)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_samples2,
                                                    y_samples2,
                                                    test_size=0.2,
                                                    random_state=301)

In [42]:
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators=200, max_depth=4, learning_rate=.75, reg_lambda=2, reg_alpha=0,silent=0, subsample = .9)
model.fit(X_train, y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.75, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=2, scale_pos_weight=1, silent=0,
             subsample=0.9, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [43]:
train_preds = model.predict(X_train)
train_preds = np.maximum(train_preds, 0) # Don't predict negative cases
print('Train MAE:', mae(train_preds, y_train))

test_preds = model.predict(X_test)
test_preds = np.maximum(test_preds, 0) # Don't predict negative cases
print('Test MAE:', mae(test_preds, y_test))

Train MAE: 47.27653791643285
Test MAE: 144.66691271072474


In [48]:
test=model.predict(X_samples)

In [49]:
test.shape

(7950,)

In [50]:
# Set number of past days to use to make predictions
nb_lookback_days = 30

# Create training data across all countries for predicting one day ahead
X_cols = cases_col + npi_cols
y_col = cases_col
X_samples2 = []
y_samples2 = []
geo_ids = df.GeoID.unique()

In [51]:
Date=["01","02","03","04","05","06","07","08","09"]+[str(i) for i in range(10,32)]

In [76]:
predictor = pd.DataFrame(columns=['CountryName','RegionName','GeoID','Date','PredictedDailyNewCases','UID'])

In [77]:
n=0
day="2020-08-"
for g in geo_ids:
    pays=g.split("__")[0]
    region=g.split("__")[1]
    for i in range(31):
        predictor.loc[n]=[pays,region,g,day+Date[i],test[i],g+day+Date[i]]
        n+=1

In [82]:
predictor.to_csv("prediction2.csv" , index = False)

In [83]:
predictor[predictor['CountryName']=='France']

Unnamed: 0,CountryName,RegionName,GeoID,Date,PredictedDailyNewCases,UID
2635,France,,France__nan,2020-08-01,1.027055,France__nan2020-08-01
2636,France,,France__nan,2020-08-02,1.027055,France__nan2020-08-02
2637,France,,France__nan,2020-08-03,1.027055,France__nan2020-08-03
2638,France,,France__nan,2020-08-04,1.027055,France__nan2020-08-04
2639,France,,France__nan,2020-08-05,1.027055,France__nan2020-08-05
2640,France,,France__nan,2020-08-06,1.027055,France__nan2020-08-06
2641,France,,France__nan,2020-08-07,1.027055,France__nan2020-08-07
2642,France,,France__nan,2020-08-08,27.98695,France__nan2020-08-08
2643,France,,France__nan,2020-08-09,71.930817,France__nan2020-08-09
2644,France,,France__nan,2020-08-10,72.343468,France__nan2020-08-10
