# COVID-19 Deep Learning

## Dataset

The file covid19.csv contains daily new cases for a selection of countries thru April 1, which was scrapped from https://www.worldometers.info/coronavirus/.

A more comprehensive dataset from John Hopkins University can be obtained as a Google Cloud hosted public dataset using BigQuery.

## Helper Functions

*blankImage(size)*

Creates an blank (0's) grayscale image of shape (size, size)

*getData(csv_file)*

Reads in the tabular data from the specified CSV file into a pandas dataframe. The columns are:

ISO 3166A2 Country Code (e.g., US for United States)
Date (US Standard Format mm/dd/yyyy)
New Cases

*preData(pd)*

Extracts the data from the panda dataframe into list format which can be used to generate bar graph images.<br/>

\[ country code, [ list of daily counts ] ]

*genImages(all_data)*

Uses the data from *prepData* to generate bar graph images, as follows:

    1. The bar graphs are placed under the folder covid-images.
    2. Bar graph images  are generated for each country and are prefixed with the country code.
    3. For each country, the range of new cases is normalized between 0 and 1.
    3. A date sequence (progression) of bar graphs is generated for each date and the sequence order is appended to the image filename.
    4. For each bar graph image, the normalized new cases value for the next subsequent data is appended to the image filename.
    
Ex.  US-1-0.017.jpg

US, Sequence Count 1, normalized next day new cases 0.017

*preprocessData()*

Reads the bar graph images into memory as a training dataset:

    1. The normalized next day new cases in filename is used as the label.
    2. Image pixel data is normalized.
    3. The training dataset is randomly shuffled.
    
*prepInput(path)*

Prepares a bar graph image for input to the model for prediction.

In [1]:
import cv2
import numpy as np
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split

def blankImage(size):
    ''' Create a blank grayscale image as size x size '''
    image = np.zeros([size, size])
    return image

def getData(csv):
    ''' Get the COVID-19 data '''
    data = pd.read_csv(csv)
    return data

def prepData(pd, source='worldometers'):
    ''' Prep the COVID-19 panda data into form for generating images '''
        
    # Per Country Data (normalized new case counts in date sequential order)
    data_by_cc = []

    c_cc   = None # current country code
    c_data = []   # new case list for current country
    for index, row in pd.iterrows():
        if source == 'worldometers':
            # extract feature values
            cc = row['ISO 3166 A2']
            date = row['Date']
            cases = row['New Cases']
        elif source == 'ecdc':
            try:
                if 'JPG' in row['geoId']:
                    continue
            except:
                continue
            if row['year'] == 1999:
                continue
            if ['geoId'] != 'CN' and row['month'] == 1:
                continue
            if ['geoId'] != 'CN' and row['month'] == 2 and row['day'] < 15:
                continue
            cc = row['geoId']
            date = row['dateRep']
            cases = row['cases']
        
        # special case, 0's are a problem, change to a 1
        if cases == 0:
            cases = 1
            
        # special case, CH one day correction
        if cc == 'CN' and cases > 5000:
            cases = 2500
            print("FIXED CN")
            
        # start new country
        if c_cc != cc:
            # complete preparation of previous country
            if c_cc != None:
                 # Reverse the row order if source is ECDC (https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
                if source == 'ecdc':
                    c_data = np.flip(c_data)
                # normalize the data
                max = np.max(c_data)
                c_data = np.asarray(c_data) / max
                data_by_cc.append((c_cc, c_data, max))
            c_cc = cc
            c_data = []
        c_data.append(cases)
        
    return data_by_cc

def genImages(all_data, size=224, bar=2):
    ''' Generate Images for Training Data '''
    try:
        shutil.rmtree("covid-images")
    except:
        pass
    os.mkdir("covid-images")
    for cc_data in all_data:
        cc = cc_data[0]
        x_data = cc_data[1]

        image = blankImage(size)
        for index, cases in enumerate(x_data):
            if cases < 2e-3:
                cases = 1e-2
            r = int(size * cases)
            c = index * (bar * 2)
            image[-r:,c:c+bar] = 255
            if index < len(x_data) - 1:
                next = x_data[index+1]
                if next < 2e-3:
                    next = 1e-2
                cv2.imwrite("covid-images/" + cc + "-" + str(index) + "-" + str(next) + ".jpg", image)
            # special case: last date won't have next day's total
            else:
                cv2.imwrite("covid-images/" + cc + "- last.jpg", image)
                
def preprocessData():
    ''' Read the Bar Graph Images into memory as a training dataset '''
    x_data = []
    y_data = [] # the next day count will be the label
    files = os.scandir('covid-images')
    for file in files:
        # don't use the last date in training data
        if "last" in file.name:
            continue
        next_day = file.name.split('-')[2][:-4]
        image = cv2.imread(file.path, cv2.IMREAD_GRAYSCALE)
        x_data.append(image)
        y_data.append(next_day)
        
    # normalize the image pixel data
    x_data = (np.asarray(x_data) / 255.0).astype(np.float32)
    y_data = np.asarray(y_data).astype(np.float32)
    
    # randomly shuffle the data
    return train_test_split(x_data, y_data, test_size=0.05, shuffle=True)
                
def preprocess_input(path):
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    image = (image / 255.0).astype(np.float32)
    return image

## Prepare the Dataset for Training

In [2]:
# Prepare the training data
data = getData("covid19.csv")
all_data = prepData(data, source='worldometers')
genImages(all_data)
x_train, x_test, y_train, y_test = preprocessData()
# Add a single channel, ie (224, 224) -> (224, 224, 1)
x_train = np.expand_dims(x_train, -1)
x_test  = np.expand_dims(x_test, -1)

FIXED CN
FIXED CN


## Create the ResNet V2 model

In [3]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# TODO: Subclass the Model

from resnet_v2_c import ResNetV2
unet = ResNetV2(50, input_shape=(224, 224, 1), reg=l2(0.001))
unet.compile(loss='mse', optimizer=Adam(lr=0.001), metrics=['mse'])
unet.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 1) 0                                            
__________________________________________________________________________________________________
zero_padding2d (ZeroPadding2D)  (None, 230, 230, 1)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 112, 112, 64) 3136        zero_padding2d[0][0]             
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 112, 112, 64) 256         conv2d[0][0]                     
_____________

## Pre-Train the Model

### Warmup (Numerical Stability)

Do warmup training on a small number of epochs, starting at a very tiny learning rate and incremently raising it to the (presumed) initial learning rate for full training.

In [4]:
# Hyperparameters
LR=0.001 # The initial learning rate for full training
BS=32     # Batch Size

# Do warmup training for numerical stability of the weights
unet.warmup(x_train, y_train, e_lr=LR, batch_size=BS, loss='mse', metrics=['mse'])

*** Warmup (for numerical stability)
Train on 832 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 9.999999974752427e-07.
Epoch 1/5

Epoch 00002: LearningRateScheduler reducing learning rate to 0.00019980000000000003.
Epoch 2/5
*** Loss is diverging, Reducing Warmnup Rate

Epoch 00003: LearningRateScheduler reducing learning rate to 3.9960000000000004e-05.
Epoch 3/5

Epoch 00004: LearningRateScheduler reducing learning rate to 5.994000000000001e-05.
Epoch 4/5

Epoch 00005: LearningRateScheduler reducing learning rate to 7.992000000000001e-05.
Epoch 5/5


### Hyperparameter Tuning

Do a grid search on the best initial learning rate, using very few epochs/steps per trial.

In [5]:
# Do grid search on three magnitudes of learning rate
LR, _ = unet.grid_search(x_train, y_train, x_test, y_test, epochs=3, steps=250, lr_range=[0.0001, 0.001, 0.01], batch_range=[32], loss='mse', metrics=['mse'])

*** Hyperparameter Grid Search
*** Learning Rate 0.0001
Epoch 1/3
Epoch 2/3
Epoch 3/3
*** Learning Rate 0.001
Epoch 1/3
Epoch 2/3
Epoch 3/3
*** Learning Rate 0.01
Epoch 1/3
Epoch 2/3
Epoch 3/3
*** Selected best learning rate: 0.001
*** Selected best batch size: 32


## Train the model

Train the model using the learning rate picked by automatic hyperparameter tuning with cosine decay.

In [6]:
EPOCHS=45

# add the test data back into the training data. During training, 5% will be split off as validation data
x_train = np.concatenate((x_train, x_test))
y_train = np.concatenate((y_train, y_test))
#unet.training(x_train, y_train, epochs=EPOCHS, batch_size=BS, lr=LR, decay=('cosine', 0), loss='mse', metrics=['mse'], split=0.05)
unet.training(x_train, y_train, epochs=EPOCHS, batch_size=BS, lr=LR, loss='mse', metrics=['mse'], split=0.05)

*** Full Training
Train on 832 samples, validate on 44 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 1/45

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 2/45

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 3/45

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 4/45

Epoch 00005: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 5/45

Epoch 00006: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 6/45

Epoch 00007: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 7/45

Epoch 00008: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 8/45

Epoch 00009: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 9/45

Epoch 00010: LearningRateScheduler reducing learning rate to 0.001000000

## Predictions

### Today

Let's first predict using yesterday's data for daily new cases for today and compare to the actual values.

In [None]:
# US
image = preprocess_input('covid-images/US-47-1.0.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("US", unet.model.predict(batch), 1.0)

# Canada
image = preprocess_input('covid-images/CA-47-0.7426367461430575.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("CA", unet.model.predict(batch), 0.74)

# Great Britian
image = preprocess_input('covid-images/UK-47-1.0.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("GB", unet.model.predict(batch), 1.0)

# Germany
image = preprocess_input('covid-images/DE-47-0.9780743565300286.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("DE", unet.model.predict(batch), 0.97)

# Spain
image = preprocess_input('covid-images/ES-47-0.8370201691607027.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("ES", unet.model.predict(batch), 0.84)

# Iran
image = preprocess_input('covid-images/IR-47-0.9375392341494037.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("IR", unet.model.predict(batch), 0.94)

# Italy
image = preprocess_input('covid-images/IT-47-0.729296934573738.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("IT", unet.model.predict(batch), 0.73)

# France
image = preprocess_input('covid-images/FR-47-0.6414621272103458.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("FR", unet.model.predict(batch), 0.64)

# Austrialia
image = preprocess_input('covid-images/AU-47-0.4402618657937807.jpg')
batch = np.asarray([image])
batch = np.expand_dims(batch, -1)
print("AU", unet.model.predict(batch), 0.44)


### Tomorrow

Let's now predict tomorrow's daily new cases.

In [7]:
GB="GB"  # ECDC code

countries = ['US', 'CA', GB, 'AU', 'IT', 'ES', 'DE', 'FR', 'IR']
for country in countries:
    image = preprocess_input('covid-images/' + country + '- last.jpg')
    batch = np.asarray([image])
    batch = np.expand_dims(batch, -1)
    for data in all_data:
        if data[0] == country:
            max = data[2]
            break
    print(country, int(unet.model.predict(batch) * max))


US 23379
CA 980
GB 2482
AU 203
IT 2994
ES 7642
DE 5107
FR 2082
IR 2902
