<a href="https://colab.research.google.com/github/Jinpeng-Yu/NTU-ML2021-Spring/blob/master/HW01/HW01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

# **Homework 1: COVID-19 Cases Prediction (Regression)**

Objectives:
* Solve a regression problem with deep neural networks (DNN).
* Understand basic DNN training tips.
* Get familiar with PyTorch.

# **Download Data**


If the Google Drive links are dead, you can download the data from [Kaggle](https://www.kaggle.com/c/ml2021spring-hw1/data) and upload it manually to the workspace.

In [None]:
# Local filenames of the training and test CSVs; the download commands below
# write to these same names.
tr_path = 'covid.train.csv'
tt_path = 'covid.test.csv'

# Fetch both files from Google Drive by file ID (IPython shell escapes).
# NOTE(review): recent gdown releases reportedly deprecate `--id` in favour of
# passing the file ID positionally -- confirm against the installed version.
!gdown --id '19CCyCgJrUxtvgZF53vnctJiOJ23T5mqF' --output covid.train.csv
!gdown --id '1CE240jLm2npU-tdz81-oVKEF3T2yfT1O' --output covid.test.csv

Downloading...
From: https://drive.google.com/uc?id=19CCyCgJrUxtvgZF53vnctJiOJ23T5mqF
To: /content/covid.train.csv
100% 2.00M/2.00M [00:00<00:00, 63.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1CE240jLm2npU-tdz81-oVKEF3T2yfT1O
To: /content/covid.test.csv
100% 651k/651k [00:00<00:00, 42.9MB/s]


# **Import Some Packages**

In [None]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Single seed shared by every random number generator seeded below.
myseed = 42069 #set a random seed for reproducibility
# Have cuDNN use deterministic algorithms and switch off its benchmark-mode
# auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Seed NumPy and PyTorch ...
np.random.seed(myseed)
torch.manual_seed(myseed)
# ... and, when a GPU is present, every CUDA device as well.
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(myseed)

# **Some Utilities**

You do not need to modify this part.

In [None]:
def get_device():
  '''Return the device string to compute on: 'cuda' if a GPU is usable, otherwise 'cpu'.'''
  if torch.cuda.is_available():
    return 'cuda'
  return 'cpu'

def plot_learning_curve(loss_record, title=''):
  '''Plot learning curve of your DNN (train & dev loss).

  Args:
    loss_record: dict with keys 'train' and 'dev', each a sequence of loss
      values. 'train' is drawn at every step; 'dev' is spread over the same
      x-axis at a fixed stride of len(train) // len(dev) steps.
    title: text appended to the figure title.

  The dev curve is skipped when no dev losses were recorded. If there are
  more dev points than fit on the train x-axis, only the points that fit
  are drawn.
  '''
  train_loss, dev_loss = loss_record['train'], loss_record['dev']
  total_steps = len(train_loss)
  x_1 = range(total_steps)

  figure(figsize=(6, 4))
  plt.plot(x_1, train_loss, c='tab:red', label='train')
  if len(dev_loss) > 0:
    # Guard against a zero stride (more dev than train records), which is an
    # invalid slice step.
    stride = max(total_steps // len(dev_loss), 1)
    # When the lengths do not divide evenly the strided range has extra
    # trailing positions; trim so x and y always have the same length.
    x_2 = x_1[::stride][:len(dev_loss)]
    plt.plot(x_2, dev_loss[:len(x_2)], c='tab:cyan', label='dev')
  plt.ylim(0.0, 5.)
  plt.xlabel('Training steps')
  plt.ylabel('MSE loss')
  plt.title('Learning curve of {}'.format(title))
  plt.legend()
  plt.show()

def plot_pred(dv_set, model, device, lim=35, preds=None, targets=None):
  '''Scatter-plot the DNN's predictions against the ground-truth values.

  Args:
    dv_set: iterable yielding (x, y) tensor batches; only read when `preds`
      or `targets` is not supplied.
    model: callable module with an `eval()` method; only used when `preds`
      or `targets` is not supplied.
    device: device the batches are moved to before calling the model.
    lim: upper bound of both axes and of the diagonal reference line.
    preds: precomputed predictions; if None they are computed from `dv_set`.
    targets: precomputed ground truth; if None it is collected from `dv_set`.
  '''
  if preds is None or targets is None:
    # Run the model over the whole set without tracking gradients and gather
    # predictions and targets on the CPU.
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
      for x, y in dv_set:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        preds.append(pred.detach().cpu())
        targets.append(y.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    targets = torch.cat(targets, dim=0).numpy()

  figure(figsize=(5, 5))
  plt.scatter(targets, preds, c='r', alpha=0.5)
  # Diagonal y = x: points on this line are perfect predictions.
  plt.plot([-0.2, lim], [-0.2, lim], c='b')
  plt.xlim(-0.2, lim)
  plt.ylim(-0.2, lim)
  plt.xlabel('ground truth value')
  plt.ylabel('predicted value')
  plt.title('Ground Truth v.s. Prediction')
  plt.show()

# **Preprocess**
* `train`: training
* `dev`: validation
* `test`: testing

# **Dataset**

The `COVID19Dataset` below does:
* read `.csv` files
* extract features
* split `covid.train.csv` into train/dev sets
* normalize features

In [None]:
# Scratch exploration (kept commented out) of the same CSV parsing that the
# COVID19Dataset class performs when it reads its input file.
# with open('covid.train.csv', 'r') as fp:
#   data = list(csv.reader(fp))
#   #print(np.array(data[1:])[:,1:])
#   data = np.array(data[1:])[:,1:].astype(float)

# NOTE(review): leftover sanity check of range(); looks safe to delete.
list(range(3))

[0, 1, 2]

In [None]:
class COVID19Dataset(Dataset):
    ''' Dataset for loading and preprocessing the COVID19 dataset '''
    def __init__(self,
                 path,
                 mode='train',
                 target_only=False):
      self.mode = mode

      #Read data into numpy arrays
      with open(path, 'r') as fp:
        data = list(csv.reader(fp))
        data = np.array(data[1:])[:,1:].astype(float)
      
      if not target_only:
        feats = list(range(93))
      else:
        # TODO: Using 40 states & 2 tested_positive features (indices = 57 & 75)
        
        pass
      
      if mode == 'test':
        # Testing data
        # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
        data = data[:, feats]
        self.data = torch.FloatTensor(data)
      else:
        # Training data (train/dev sets)
        # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
        target = data[:,-1]
        data = data[:, feats]

        # Splitting training data into train & dev sets
        if mode == 'train':
          indices = [i for i in range(len(data)) if i % 10 != 0]
        elif mode == 'dev':
          indices = [i for i in range(len(data)) if i % 10 == 0]

        # Convert data into Pytorch tensors
        self.data = torch.FloatTensor(data[indices])
        self.target = torch.FloatTensor(target[indices])

    # Normalize features (you may remove this part to see what will happen)
    self.data[:, 40:] = \
      (self.data)