In [1]:
import os
import pandas as pd
import requests

# Training the LSTM model with latest data

In [15]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
DATA_FILE = 'data/OxCGRT_latest.csv'

# Download the data set
data = requests.get(DATA_URL)

# Persist the data set locally in order to use it after submission to make predictions,
# as the sandbox won't have access to the internet anymore.
if not os.path.exists('data'):
    os.mkdir('data')
open(DATA_FILE, 'wb').write(data.content)

42998284

In [12]:
# Reload the module to get the latest changes
import xprize_predictor
from importlib import reload
reload(xprize_predictor)
from xprize_predictor import XPrizePredictor

In [7]:
predictor = XPrizePredictor(None, DATA_FILE)

In [8]:
%%time
predictor_model = predictor.train()

Creating numpy arrays for Keras for each country...
Numpy arrays created
Trial 0
Train Loss: 0.04547623172402382
Val Loss: 0.048068657517433167
Test Loss: 0.0442829355597496
Trial 1
Train Loss: 0.04695827513933182
Val Loss: 0.04376831650733948
Test Loss: 0.04482118785381317
Trial 2
Train Loss: 0.046080488711595535
Val Loss: 0.039886172860860825
Test Loss: 0.044399961829185486
Trial 3
Train Loss: 0.039605915546417236
Val Loss: 0.045067813247442245
Test Loss: 0.039661042392253876
Trial 4
Train Loss: 0.04550674185156822
Val Loss: 0.04531722515821457
Test Loss: 0.044185295701026917
Trial 5
Train Loss: 0.04542742297053337
Val Loss: 0.045172035694122314
Test Loss: 0.04438651353120804
Trial 6
Train Loss: 0.04189738258719444
Val Loss: 0.0456903874874115
Test Loss: 0.041700396686792374
Trial 7
Train Loss: 0.04640112817287445
Val Loss: 0.039955753833055496
Test Loss: 0.04436793923377991
Trial 8
Train Loss: 0.04358962923288345
Val Loss: 0.04510807618498802
Test Loss: 0.042336173355579376
Trial 9


In [9]:
if not os.path.exists('models'):
    os.mkdir('models')
predictor_model.save_weights("models/trained_model_weights.h5")

# Predicting 4 days gap using the trained model with latest data

## Load candidate model

In [16]:
model_weights_file = "models/trained_model_weights.h5"

In [17]:
predictor = XPrizePredictor(model_weights_file, DATA_FILE)

## Make prediction

In [18]:
NPIS_INPUT_FILE = "../../../validation/data/2020-09-30_historical_ip.csv"
start_date = "2020-08-01"
end_date = "2020-08-4"
output_file_path = "predictions/2020-08-01_2020-08-4_latest_data.csv"

In [19]:
%%time
preds_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

CPU times: user 28.8 s, sys: 928 ms, total: 29.7 s
Wall time: 25.2 s


In [20]:
# Create the output path
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
# Save to a csv file
preds_df.to_csv(output_file_path, index=False)
print(f"Saved predictions to {output_file_path}")

Saved predictions to predictions/2020-08-01_2020-08-4_latest_data.csv


In [21]:
preds_df.head()

Unnamed: 0,CountryName,RegionName,Date,PredictedDailyNewCases
0,Aruba,,2020-08-01,29.044608
1,Aruba,,2020-08-02,31.191441
2,Aruba,,2020-08-03,0.0
3,Aruba,,2020-08-04,33.244748
4,Afghanistan,,2020-08-01,83.481197


# Predicting the August covid wave 2021 with NPI-LSTM trained with latest data till July of 2021 

## Training the lstm with latest data till 31 July of 2021

### Filtering and saving data till 31 July of 2021 from latest data

In [3]:
latest_df = pd.read_csv(DATA_FILE,
                       parse_dates=['Date'],
                       dtype={"RegionName": str},
                       encoding="ISO-8859-1")
latest_july_df = latest_df[(latest_df.Date < '2021-08-01')]

NameError: name 'DATA_FILE' is not defined

In [14]:
output_file_path = "data/OxCGRT_2021_07_31.csv";
# Create the output path
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
# Save to a csv file
latest_july_df.to_csv(output_file_path, index=False)
print(f"Saved dataframe to {output_file_path}")

Saved dataframe to data/OxCGRT_2021_07_31.csv


### Training

Data for training is randomly splited on 90% for training, and 10% for validation. While the 14 latest days data of the global dataframe is keeped out for testing.

In [16]:
DATA_FILE = output_file_path


In [17]:
predictor = XPrizePredictor(None, DATA_FILE)

We have set number or trials to 10, in order to get the best model lstm which minimize the  val loss MAE 

In [18]:
%%time
predictor_model = predictor.train()

Creating numpy arrays for Keras for each country...
Numpy arrays created
Trial 0
Train Loss: 0.04586868733167648
Val Loss: 0.04298095032572746
Test Loss: 0.04465483874082565
Done
CPU times: user 11min 30s, sys: 1min 40s, total: 13min 10s
Wall time: 5min 42s


Val loss MAE 

In [19]:
if not os.path.exists('models'):
    os.mkdir('models')
predictor_model.save_weights("models/trained_model_weights_2021_07_31.h5")

## Predicting the August 2021 wave

### Preparing historical ip

In [21]:
DATA_FILE = "data/OxCGRT_latest.csv"
latest_df = pd.read_csv(DATA_FILE,
                       parse_dates=['Date'],
                       dtype={"RegionName": str},
                       encoding="ISO-8859-1")
latest_historical_ip_df = latest_df[["CountryName", "RegionName",
                                    "Date","C1_School closing",
                                    "C2_Workplace closing","C3_Cancel public events",
                                    "C4_Restrictions on gatherings","C5_Close public transport",
                                    "C6_Stay at home requirements","C7_Restrictions on internal movement",
                                    "C8_International travel controls","H1_Public information campaigns",
                                    "H2_Testing policy","H3_Contact tracing","H6_Facial Coverings"]]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [22]:
output_file_path = "data/latest_historical_ip.csv";
# Create the output path
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
# Save to a csv file
latest_historical_ip_df.to_csv(output_file_path, index=False)
print(f"Saved dataframe to {output_file_path}")

Saved dataframe to data/latest_historical_ip.csv


### Prediction

In [23]:
NPIS_INPUT_FILE = "data/latest_historical_ip.csv"
start_date = "2021-08-01"
end_date = "2021-08-31"
output_file_path = "predictions/2021-08-01_2021-08-31_latest_data.csv"

In [25]:
DATA_FILE = "data/OxCGRT_2021_07_31.csv"
model_weights_file = "models/trained_model_weights_2021_07_31.h5"
predictor = XPrizePredictor(model_weights_file, DATA_FILE)

In [26]:
%%time
preds_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

CPU times: user 3min 20s, sys: 3.43 s, total: 3min 23s
Wall time: 3min 3s


In [27]:
# Create the output path
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
# Save to a csv file
preds_df.to_csv(output_file_path, index=False)
print(f"Saved predictions to {output_file_path}")

Saved predictions to predictions/2021-08-01_2021-08-31_latest_data.csv


In [28]:
preds_df.head()

Unnamed: 0,CountryName,RegionName,Date,PredictedDailyNewCases
0,Aruba,,2021-08-01,67.57848
1,Aruba,,2021-08-02,59.285661
2,Aruba,,2021-08-03,87.248634
3,Aruba,,2021-08-04,89.011122
4,Aruba,,2021-08-05,87.798978
