In [218]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

import random

import numpy as np
import pandas as pd

In [6]:
index = pd.read_csv("./index.csv")
demographics = pd.read_csv("./demographics.csv")
epidemiology = pd.read_csv("./epidemiology.csv")
geography = pd.read_csv("./geography.csv")

In [241]:
class COVID19Dataset(Dataset):
  def __init__(self, index, demographics, epidemiology):
    super().__init__()
    self.locations = set(index["location_key"].dropna())
    self.base_locations = ["TW", "US", "BR"]
    self.used_indices = []
    self.populations = []
    self.split = 0.7
    self.range = 14
    data = {}
    labels = {}
    populations = {}
    for loc, group in demographics.groupby("location_key"):
      data[loc] = group[group.columns[1:]].to_numpy()
    for loc, group in epidemiology.groupby("location_key"):
      if loc not in data: continue
      population = data[loc][0][0]
      if population == 0: continue
      history = (group[group.columns[2:]] / population).to_numpy()
      data[loc] = np.concatenate((data[loc].repeat(len(history), 0), history), 1) / population
      labels[loc] = (group[group.columns[2:]]["new_confirmed"]).to_numpy() / population
      data[loc] = np.nan_to_num(data[loc])
      labels[loc] = np.nan_to_num(labels[loc])
      populations[loc] = population
    for loc in [loc for loc in self.locations if len([0 for ul in self.base_locations if ul in loc])]:
      if loc in data and loc in labels and int(len(data[loc]) * (1 - self.split)) > self.range: self.used_indices.append(loc)
    self.samples = [torch.tensor(data[loc], dtype=torch.float) for loc in self.used_indices]
    self.labels = [torch.tensor(labels[loc], dtype=torch.float) for loc in self.used_indices]
    self.populations = [populations[loc] for loc in self.used_indices]

  def __getitem__(self, loc):
    r = random.randint(self.range, int(len(self.samples[loc]) * self.split))
    return self.samples[loc][r - self.range:r], self.labels[loc][r - 1]

  def get_validation(self):
    loc = random.randint(0, len(self.used_indices) - 1)
    r = random.randint(int(len(self.samples[loc]) * self.split) + self.range, len(self.samples[loc]))
    return self.samples[loc][r - self.range:r].unsqueeze(0), self.labels[loc][r - 1].unsqueeze(0), self.populations[loc]

  def __len__(self): return len(self.used_indices)

In [297]:
BATCH_SIZE = 4
NUM_STEPS = 1000
NUM_VALID = 100

In [243]:
dataset = COVID19Dataset(index, demographics, epidemiology)
torch.save(dataset, "./data/dataset.pt")
print(f"number of used locations: {len(dataset)}")

number of used locations: 9064


In [244]:
print(np.where(dataset.labels != 0))
print(dataset.labels[0][0])

(array([0], dtype=int64),)
tensor(0.)


In [222]:
dataset = torch.load("./data/dataset.pt")
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [302]:
class Model(nn.Module):
  def __init__(self, r, p = 7):
    super().__init__()
    self.r = r # range of tracking
    self.p = p # period of sampling
    self.seq1 = nn.Sequential(
      nn.Conv1d(26, 64, self.p),
      nn.ReLU(True),
    )
    self.lstm = nn.LSTM(8, 256, 2, batch_first=True, bidirectional=True)
    self.seq2 = nn.Sequential(
      nn.ReLU(),
      nn.Flatten(),
      nn.Dropout(0.3),
      nn.Linear(64 * 256 * 2, 1024),
      nn.ReLU(True),
      nn.Linear(1024, 256),
      nn.ReLU(True),
      nn.Linear(256, 1),
      nn.Sigmoid()
    )

  def forward(self, inputs):
    outputs = self.seq1(inputs.permute(0, 2, 1))
    outputs, _ = self.lstm(outputs)
    outputs = self.seq2(outputs)
    return outputs.squeeze(1)

In [233]:
print(next(iter(dataloader))[0].shape)
print(next(iter(dataloader))[1].shape)
print(next(iter(dataloader))[1])

torch.Size([4, 14, 26])
torch.Size([4])
tensor([0.0000e+00, 0.0000e+00, 3.0487e-05, 0.0000e+00])


In [303]:
loss_fn = nn.MSELoss()
model = Model(dataset.range)
samples, labels = next(iter(dataloader))
outputs = model(samples)
loss = loss_fn(outputs, labels)
print(labels)
print(outputs)
print(loss.item())

tensor([4.9234e-05, 8.6159e-06, 4.1294e-04, 2.0968e-04])
tensor([0.5065, 0.5068, 0.5062, 0.5062], grad_fn=<SqueezeBackward1>)
0.2562870681285858


In [304]:
torch.backends.cudnn.benchmark = True
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Model(dataset.range)
optimizer = optim.Adam(model.parameters(), lr=3e-5, weight_decay=1e-5)
loss_fn = nn.MSELoss()
losses = []

model.train()

if device == "cuda": model.cuda()
for step in range(1, NUM_STEPS + 1):
  optimizer.zero_grad()
  samples, labels = next(iter(dataloader))
  outputs = model(samples.to(device))
  loss = torch.sqrt(loss_fn(outputs.to(device), labels.to(device)))
  losses.append(loss.item())
  loss.backward()
  optimizer.step()
  if step % 100 == 0:
    print(f"{step}/{NUM_STEPS}:\t{np.mean(losses):.4f}")
    losses = []
print("Training Ended!")

100/1000:	0.2204
200/1000:	0.0004
300/1000:	0.0004
400/1000:	0.0004
500/1000:	0.0003
600/1000:	0.0006
700/1000:	0.0003
800/1000:	0.0003
900/1000:	0.0003
1000/1000:	0.0003
Training Ended!


In [305]:
unacceptable = 0
error_rate_sum = 0

with torch.no_grad():
  for step in range(1, NUM_VALID + 1):
    samples, labels, population = dataset.get_validation()
    outputs = model(samples.to(device))
    error_rate = (outputs - labels.to(device)).item()
    error = int(error_rate * population)
    if abs(error_rate) < 1e-3 and abs(error) < 1000: continue
    print(f"population: {int(population)}")
    print(f"predict: {outputs.item():.4f}")
    print(f"reality: {labels.item():.4f}")
    print(f"error rate: {error_rate:.4f}")
    print(f"error: {error}")
    unacceptable += 1
    error_rate_sum += error_rate

print(f"unacceptable cases: {unacceptable}")
print(f"mean error rate: {error_rate_sum / NUM_VALID:.4f}")

population: 8693
predict: 0.0001
reality: 0.0012
error rate: -0.0011
error: -9
population: 51659
predict: 0.0001
reality: 0.0032
error rate: -0.0031
error: -159
population: 35509
predict: 0.0001
reality: 0.0036
error rate: -0.0035
error: -125
population: 41354
predict: 0.0001
reality: 0.0022
error rate: -0.0021
error: -85
population: 9881
predict: 0.0005
reality: 0.0033
error rate: -0.0028
error: -27
population: 2514
predict: 0.0001
reality: 0.0119
error rate: -0.0118
error: -29
population: 2077
predict: 0.0001
reality: 0.0140
error rate: -0.0139
error: -28
population: 17723
predict: 0.0001
reality: 0.0038
error rate: -0.0037
error: -66
population: 5751
predict: 0.0001
reality: 0.0023
error rate: -0.0021
error: -12
population: 22373
predict: 0.0001
reality: 0.0032
error rate: -0.0031
error: -68
population: 6961
predict: 0.0001
reality: 0.0011
error rate: -0.0011
error: -7
population: 14595
predict: 0.0001
reality: 0.0018
error rate: -0.0018
error: -25
population: 3950
predict: 0.0001
r