In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

import random

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

In [26]:
index = pd.read_csv("./index.csv")
demographics = pd.read_csv("./demographics.csv")
epidemiology = pd.read_csv("./epidemiology.csv")
geography = pd.read_csv("./geography.csv")

In [54]:
class COVID19Dataset(Dataset):
  def __init__(self, index, demographics, epidemiology):
    super().__init__()
    self.locations = set(index["location_key"].dropna())
    self.base_locations = ["TW", "US", "BR"]
    self.used_locations = []
    self.split = 0.7
    self.range = 14
    self.bins = (-np.inf, -20, -5, -1, 0, 1, 5, 20, np.inf)
    data = {}
    labels = {}
    scaler = MinMaxScaler()
    for location, group in demographics.groupby("location_key"):
      data[location] = scaler.fit_transform(group[group.columns[1:]].replace([np.inf, -np.inf, np.nan], 0).to_numpy())
    for location, group in epidemiology.groupby("location_key"):
      if location not in data: continue
      previous = group[group.columns[2:]].shift(1, axis=0, fill_value=0)
      history = ((group[group.columns[2:]] - previous) / previous).replace([np.inf, -np.inf, np.nan], 0).to_numpy()
      data[location] = np.concatenate((data[location].repeat(len(history), 0), history), 1)
      digitized = np.digitize(((group[group.columns[2:]] - previous) / previous)["new_confirmed"].to_numpy(), self.bins)
      labels[location] = F.one_hot(torch.tensor(digitized, dtype=torch.long), len(self.bins) + 1).float()
    for location in [location for location in self.locations if len([0 for ul in self.base_locations if ul in location])]:
      if location in data and location in labels and int(len(data[location]) * (1 - self.split)) > self.range: self.used_locations.append(location)
    self.samples = [torch.tensor(data[loc], dtype=torch.float) for loc in self.used_locations]
    self.labels = [labels[loc] for loc in self.used_locations]

  def __getitem__(self, loc):
    r = random.randint(self.range, int(len(self.samples[loc]) * self.split))
    return self.samples[loc][r - self.range:r], self.labels[loc][r - 1]

  def get_validation(self):
    loc = random.randint(0, len(self.used_locations) - 1)
    r = random.randint(int(len(self.samples[loc]) * self.split) + self.range, len(self.samples[loc]))
    return self.samples[loc][r - self.range:r].unsqueeze(0), self.labels[loc][r - 1].unsqueeze(0)

  def __len__(self): return len(self.used_locations)

In [60]:
BATCH_SIZE = 4
NUM_STEPS = 10000
NUM_VALID = 10000

In [55]:
dataset = COVID19Dataset(index, demographics, epidemiology)
torch.save(dataset, "./data/dataset.pt")
print(f"number of used locations: {len(dataset)}")

number of used locations: 9064


In [56]:
dataset = torch.load("./data/dataset.pt")
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [7]:
class Model(nn.Module):
  def __init__(self, n, r, p = 7):
    super().__init__()
    self.n = n # number of bins
    self.r = r # range of tracking
    self.p = p # period of sampling
    self.seq = nn.Sequential(
      nn.Conv1d(26, 64, self.p),
      # nn.BatchNorm1d(64),
      nn.ReLU(True),
      nn.Flatten(),
      nn.Dropout(0.3),
      nn.Linear(64 * (self.r - self.p + 1), 256),
      nn.ReLU(True),
      nn.Linear(256, 1024),
      nn.ReLU(True),
      nn.Linear(1024, self.n),
      nn.Softmax(1)
    )

  def forward(self, inputs):
    return self.seq(inputs.permute(0, 2, 1))

In [8]:
print(next(iter(dataloader))[0].shape)
print(next(iter(dataloader))[1].shape)
print(next(iter(dataloader))[1])

torch.Size([4, 14, 26])
torch.Size([4, 10])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])


In [65]:
loss_fn1 = nn.CrossEntropyLoss()
model = Model(len(dataset.bins) + 1, dataset.range)
samples, labels = next(iter(dataloader))
outputs = model(samples)
loss = loss_fn1(outputs, labels)
print(outputs)
print(loss.item())

tensor([[0.0990, 0.1007, 0.1020, 0.1013, 0.1023, 0.0957, 0.1033, 0.0969, 0.0972,
         0.1015],
        [0.0994, 0.1016, 0.1017, 0.1017, 0.1015, 0.0960, 0.1021, 0.0966, 0.0981,
         0.1013],
        [0.0984, 0.1013, 0.1016, 0.0978, 0.1040, 0.0941, 0.1085, 0.0949, 0.1001,
         0.0993],
        [0.0984, 0.1005, 0.1021, 0.1023, 0.1019, 0.0948, 0.1031, 0.0965, 0.0988,
         0.1017]], grad_fn=<SoftmaxBackward0>)
2.301525115966797


In [62]:
torch.backends.cudnn.benchmark = True
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Model(len(dataset.bins) + 1, dataset.range)
optimizer = optim.Adam(model.parameters(), lr=3e-5, weight_decay=1e-5)
loss_fn1 = nn.CrossEntropyLoss()
loss_fn2 = nn.MSELoss()
losses = []

model.train()

if device == "cuda": model.cuda()
for step in range(1, NUM_STEPS + 1):
  optimizer.zero_grad()
  samples, labels = next(iter(dataloader))
  outputs = model(samples.to(device))
  loss = loss_fn1(outputs.to(device), labels.to(device))
  losses.append(loss.item())
  loss.backward()
  optimizer.step()
  if step % 100 == 0:
    print(f"{step}/{NUM_STEPS}:\t{np.mean(losses):.4f}")
    losses = []
print("Training Ended!")

100/10000:	2.2929
200/10000:	2.2633
300/10000:	2.1785
400/10000:	2.0785
500/10000:	2.0457
600/10000:	1.9881
700/10000:	2.0001
800/10000:	2.0109
900/10000:	1.9892
1000/10000:	1.9867
1100/10000:	1.9750
1200/10000:	2.0010
1300/10000:	1.9833
1400/10000:	1.9688
1500/10000:	1.9584
1600/10000:	1.9763
1700/10000:	1.9168
1800/10000:	1.9233
1900/10000:	1.9067
2000/10000:	1.8807
2100/10000:	1.8996
2200/10000:	1.8882
2300/10000:	1.9175
2400/10000:	1.8688
2500/10000:	1.8659
2600/10000:	1.9057
2700/10000:	1.9255
2800/10000:	1.8823
2900/10000:	1.8504
3000/10000:	1.8574
3100/10000:	1.8805
3200/10000:	1.8803
3300/10000:	1.8841
3400/10000:	1.8877
3500/10000:	1.8770
3600/10000:	1.8713
3700/10000:	1.8714
3800/10000:	1.8560
3900/10000:	1.8549
4000/10000:	1.8603
4100/10000:	1.8676
4200/10000:	1.8503
4300/10000:	1.8569
4400/10000:	1.8532
4500/10000:	1.8731
4600/10000:	1.8458
4700/10000:	1.8463
4800/10000:	1.8759
4900/10000:	1.8463
5000/10000:	1.8450
5100/10000:	1.8532
5200/10000:	1.8366
5300/10000:	1.8204
54

In [63]:
error_count = 0

with torch.no_grad():
  for step in range(1, NUM_VALID + 1):
    samples, labels = dataset.get_validation()
    outputs = model(samples.to(device))
    value1, index1 = outputs.max(1)
    value2, index2 = labels.to(device).max(1)
    if index1 != index2: error_count += 1
    if (index1 - index2).abs() > 2:
      print(f"index: {index1.item()} / {index2.item()}")
      print(f"value: {value1.item():.4f} / {value2.item():.4f}")

print(f"Error Count: {error_count}")

index: 4 / 9
value: 0.6691 / 1.0000
index: 4 / 9
value: 1.0000 / 1.0000
index: 9 / 6
value: 0.9998 / 1.0000
index: 4 / 9
value: 0.5657 / 1.0000
index: 9 / 4
value: 0.9900 / 1.0000
index: 9 / 5
value: 0.9948 / 1.0000
index: 4 / 9
value: 0.7266 / 1.0000
index: 9 / 5
value: 0.9329 / 1.0000
index: 9 / 6
value: 0.9437 / 1.0000
index: 4 / 9
value: 0.6324 / 1.0000
index: 4 / 9
value: 0.7868 / 1.0000
index: 9 / 6
value: 1.0000 / 1.0000
index: 4 / 9
value: 0.9991 / 1.0000
index: 9 / 5
value: 0.8004 / 1.0000
index: 4 / 9
value: 1.0000 / 1.0000
index: 4 / 9
value: 0.7096 / 1.0000
index: 9 / 5
value: 0.9876 / 1.0000
index: 4 / 9
value: 0.6884 / 1.0000
index: 9 / 5
value: 0.7926 / 1.0000
index: 4 / 9
value: 0.9985 / 1.0000
index: 9 / 5
value: 0.9343 / 1.0000
index: 9 / 4
value: 0.5422 / 1.0000
index: 9 / 6
value: 0.9996 / 1.0000
index: 9 / 6
value: 1.0000 / 1.0000
index: 4 / 9
value: 0.6998 / 1.0000
index: 9 / 5
value: 0.8368 / 1.0000
index: 9 / 6
value: 1.0000 / 1.0000
index: 9 / 6
value: 0.9999 /

In [12]:
for samples, labels in iter(dataloader):
  print(samples.shape, labels.shape)

torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Size([4, 10])
torch.Size([4, 14, 26]) torch.Si