In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Render numpy arrays compactly: three decimals and no scientific notation.
np.set_printoptions(suppress=True, precision=3)

In [None]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Print the installed TensorFlow version so it is recorded with the notebook output.
print(tf.__version__)

## Dataset Importing

In [None]:
# Path of the CSV file, resolved relative to the notebook's working directory.
url = 'covid_data.csv'

# Column names are supplied here, so every line of the file is read as data
# (header=None spells out what passing `names` already implies).
column_names = [
    'Country (label)',
    'Date (label)',
    'new_cases',
    'new_cases_smoothed',
    'new_deaths_smoothed',
    'total_cases',
    'new_deaths',
    'total_deaths',
    'new_tests',
    'total_tests',
    'stringency_index',
    'population',
    'population_density',
    'median_age',
    'aged_65_older',
    'gdp_per_capita',
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'human_development_index',
]

raw_dataset = pd.read_csv(url, names=column_names, header=None)

In [None]:
# Work on a copy so raw_dataset stays exactly as loaded and this cell can be re-run.
dataset = raw_dataset.copy()
dataset.tail()

In [None]:
# Count the missing values in each column before any rows are removed.
dataset.isna().sum()

In [None]:
# Discard every row with a missing value in any column, then confirm no NaNs remain.
dataset = dataset.dropna()
dataset.isna().sum()

In [None]:
# Remove the raw death counts from the table.
# NOTE(review): presumably they are dropped because they would leak the
# 'new_deaths_smoothed' target into the features -- confirm.
# drop(..., errors='ignore') replaces the two in-place pop() calls so the cell
# can be re-run without a KeyError once the columns are already gone.
dataset = dataset.drop(columns=['total_deaths', 'new_deaths'], errors='ignore')
dataset.tail()

In [None]:
# One DataFrame per country, in ascending order of 'Country (label)'.
# groupby takes the labels from the data itself, so nothing depends on the
# labels being exactly 1..156, countries without any remaining rows do not
# show up as empty frames, and the column dtypes are left as they are
# (where() + dropna() upcast integer columns to float).
split_by_country = [
    country_rows
    for _, country_rows in dataset.groupby('Country (label)', sort=True)
]

In [None]:
# Fractions of the countries (not of the rows) used for training and for
# validation; whatever is left over becomes the test set.
TRAIN_PERCENT = .7
VALIDATION_PERCENT = .15
train_split_index = round(TRAIN_PERCENT * len(split_by_country))
valid_split_index = round((TRAIN_PERCENT + VALIDATION_PERCENT) * len(split_by_country))

# Shuffle a copy, not split_by_country itself: shuffling the original list in
# place means that re-running this cell reshuffles an already shuffled list and
# yields a different split even though the seed is fixed.
np.random.seed(4)
shuffled_countries = list(split_by_country)
np.random.shuffle(shuffled_countries)

country_train_dataset = shuffled_countries[:train_split_index]
country_validation_dataset = shuffled_countries[train_split_index:valid_split_index]
country_test_dataset = shuffled_countries[valid_split_index:]

In [None]:
# Stack the per-country frames of each split back into a single DataFrame.
train_dataset, validation_dataset, test_dataset = (
    pd.concat(country_frames)
    for country_frames in (
        country_train_dataset,
        country_validation_dataset,
        country_test_dataset,
    )
)

In [None]:
# Sorted country labels in the training split. Deduplicate first and sort only
# the unique labels; passing a plain list to pd.unique() is deprecated.
np.sort(train_dataset['Country (label)'].unique())

In [None]:
# Sorted country labels in the validation split. Deduplicate first and sort
# only the unique labels; passing a plain list to pd.unique() is deprecated.
np.sort(validation_dataset['Country (label)'].unique())

In [None]:
# Sorted country labels in the test split. Deduplicate first and sort only the
# unique labels; passing a plain list to pd.unique() is deprecated.
np.sort(test_dataset['Country (label)'].unique())

## Visual Analysis

In [None]:
def plot_covid_deaths(data, country_label):
    """Plot 'new_deaths_smoothed' against 'Date (label)' for one country.

    Args:
        data: DataFrame with at least the columns 'Country (label)',
            'Date (label)' and 'new_deaths_smoothed'.
        country_label: Value of 'Country (label)' that selects the country.

    Draws on the current matplotlib axes and returns nothing. If no row
    matches `country_label`, the plot contains no data.
    """
    # Boolean indexing keeps only the matching rows; unlike where(), it does
    # not build a NaN-filled copy of the whole frame or upcast integer
    # columns. dropna() still discards rows with any missing value.
    is_country = data['Country (label)'] == country_label
    country_data = data[is_country].dropna()

    plt.plot(country_data['Date (label)'], country_data['new_deaths_smoothed'])
    plt.xlabel('Days since 4/1/2020')
    plt.ylabel('new_deaths_smoothed')
    plt.title('Country {}'.format(country_label))
    plt.grid(True)

In [None]:
# Example country (label 150). The function filters by label, so the plot is
# empty if country 150 did not end up in the training split.
plot_covid_deaths(train_dataset,150)

In [None]:
# Pairwise scatter plots (KDE on the diagonal) of the date column, smoothed cases, the target and new tests.
sns.pairplot(train_dataset[['Date (label)', 'new_cases_smoothed', 'new_deaths_smoothed', 'new_tests']], diag_kind='kde')

In [None]:
# Pairwise scatter plots (KDE on the diagonal) of the target against three country-level columns.
sns.pairplot(train_dataset[['new_deaths_smoothed', 'stringency_index', 'population_density', 'gdp_per_capita']], diag_kind='kde')

In [None]:
# Summary statistics of the training split, one row per column.
train_dataset.describe().transpose()

In [None]:
# Separate the regression target ('new_deaths_smoothed') from the inputs in
# each split. The *_dataset frames themselves are left unchanged.
# NOTE(review): 'Country (label)' and 'Date (label)' remain in the feature
# frames and are therefore fed to the models -- confirm this is intended.
train_features = train_dataset.drop(columns='new_deaths_smoothed')
train_labels = train_dataset['new_deaths_smoothed'].copy()

validation_features = validation_dataset.drop(columns='new_deaths_smoothed')
validation_labels = validation_dataset['new_deaths_smoothed'].copy()

test_features = test_dataset.drop(columns='new_deaths_smoothed')
test_labels = test_dataset['new_deaths_smoothed'].copy()

In [None]:
# Mean and standard deviation of every training-split column (the target is still included here).
train_dataset.describe().transpose()[['mean', 'std']]

## Normalization

In [None]:
# Create the normalization layer; its statistics are fitted with adapt() in the next cell.
normalizer = preprocessing.Normalization()

In [None]:
# Fit the layer's statistics on the training features only; validation and test data are not used.
normalizer.adapt(np.array(train_features))

In [None]:
# Show the means the layer stored during adapt().
print(normalizer.mean.numpy())

In [None]:
# Show one training row before and after it passes through the normalization layer.
first = train_features.head(1).to_numpy()

with np.printoptions(precision=2, suppress=True):
  normalized_first = normalizer(first).numpy()
  print('First example:', first)
  print()
  print('Normalized:', normalized_first)

## Linear Regression

In [None]:
# Linear baseline: the adapted normalization layer followed by a single
# one-unit Dense layer.
linear_model = tf.keras.Sequential()
linear_model.add(normalizer)
linear_model.add(layers.Dense(units=1))

In [None]:
# Sanity check: run the not-yet-trained model on the first 10 training rows.
linear_model.predict(train_features[:10])

In [None]:
# Inspect the Dense layer's weights (layers[0] is the normalizer, layers[1] the Dense layer).
linear_model.layers[1].kernel

In [None]:
# Optimise mean absolute error with Adam at a learning rate of 0.1.
linear_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
)

In [None]:
%%time
history = linear_model.fit(
    train_features,
    train_labels,
    # Score every epoch on the separate validation split (its own set of
    # countries), not on a slice of the training data.
    validation_data=(validation_features, validation_labels),
    epochs=100,
    # Suppress per-epoch logging.
    verbose=0,
)

In [None]:
def plot_loss(history, model_str, ylim=(0, 30)):
  """Plot the training loss per epoch and, if recorded, the validation loss.

  Args:
    history: Object whose `history` attribute is a dict holding a 'loss'
      sequence and, optionally, a 'val_loss' sequence (for example the value
      returned by `Model.fit`).
    model_str: Model name used as the prefix of the plot title.
    ylim: (low, high) limits of the y axis. Defaults to (0, 30), the range
      this function always used before the parameter existed.

  Draws on the current matplotlib axes and returns nothing.
  """
  losses = history.history
  plt.plot(losses['loss'], label='loss')
  # 'val_loss' only exists when fit() was given validation data; skip the
  # curve instead of failing with a KeyError when it is absent.
  if 'val_loss' in losses:
    plt.plot(losses['val_loss'], label='val_loss')
  plt.ylim(list(ylim))
  plt.xlabel('Epoch')
  plt.ylabel('Error [new_deaths]')
  plt.title('{} Mean Absolute Error'.format(model_str))
  plt.legend()
  plt.grid(True)

In [None]:
# `history` holds the linear-regression run at this point; the DNN fit cell
# further down rebinds it, so run this cell before that one.
plot_loss(history, 'Linear Regression')

In [None]:
# Start the results table with the linear model's loss on the test split.
# Each model's loss is stored under the model's name for later comparison.
test_results = {
    'linear_model': linear_model.evaluate(test_features, test_labels, verbose=0),
}

## DNN Model

In [None]:
def build_and_compile_model(norm):
  """Return a compiled regression network whose first layer is `norm`.

  The stack is: `norm`, two 16-unit Dense layers with ELU activation, and a
  single-unit Dense output. The model is compiled with mean absolute error as
  the loss and Adam at a learning rate of 0.001.

  Args:
    norm: Layer placed first in the model (the notebook passes its adapted
      normalization layer).

  Returns:
    The compiled `keras.Sequential` model.
  """
  hidden_layers = [layers.Dense(16, activation='elu') for _ in range(2)]
  output_layer = layers.Dense(1)
  model = keras.Sequential([norm, *hidden_layers, output_layer])

  optimizer = tf.keras.optimizers.Adam(0.001)
  model.compile(optimizer=optimizer, loss='mean_absolute_error')
  return model

In [None]:
# Build the DNN around the same normalizer instance the linear model uses,
# then list its layers and parameter counts.
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
%%time
# NOTE: this rebinds `history`; from here on it holds the DNN run rather than
# the linear-regression run.
history = dnn_model.fit(
    train_features,
    train_labels,
    epochs=100,
    verbose=0,
    validation_data=(validation_features, validation_labels),
)

In [None]:
# `history` was rebound by the DNN fit cell, so these are the DNN's loss curves.
plot_loss(history, 'DNN')

In [None]:
# Store the DNN's test-split loss next to the linear model's; test_results
# must already exist from the linear-regression evaluation cell.
test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)

## Performance Analysis

In [None]:
# Table with one row per model showing the test-split loss collected in test_results.
pd.DataFrame(test_results, index=['Mean absolute error [new_deaths]']).T

In [None]:
linear_test_predictions = linear_model.predict(test_features).flatten()

# Predicted vs. true values on the test split. Points on the diagonal are
# perfect predictions; both axes are limited to [0, 500], so larger values
# fall outside the view.
lims = [0, 500]
a = plt.axes(aspect='equal')
a.scatter(test_labels, linear_test_predictions)
a.set_xlabel('True Values [new_deaths]')
a.set_ylabel('Predictions [new_deaths]')
a.set_title('Linear Regression True VS Predicted')
a.set_xlim(lims)
a.set_ylim(lims)
_ = a.plot(lims, lims)

In [None]:
dnn_test_predictions = dnn_model.predict(test_features).flatten()

# Predicted vs. true values on the test split. Points on the diagonal are
# perfect predictions; both axes are limited to [0, 500], so larger values
# fall outside the view.
lims = [0, 500]
a = plt.axes(aspect='equal')
a.scatter(test_labels, dnn_test_predictions)
a.set_xlabel('True Values [new_deaths]')
a.set_ylabel('Predictions [new_deaths]')
a.set_title('DNN True VS Predicted')
a.set_xlim(lims)
a.set_ylim(lims)
_ = a.plot(lims, lims)

In [None]:
# Distribution of the linear model's test-split errors (prediction minus true value).
error = linear_test_predictions - test_labels
plt.hist(error, bins=25)
plt.title('Linear Regression Prediction Error')
plt.xlabel('Prediction Error [new_deaths]')
# The `_ =` assignment has to be on the final statement, otherwise the cell
# echoes a Text(...) repr underneath the figure.
_ = plt.ylabel('Count')


In [None]:
# Distribution of the DNN's test-split errors (prediction minus true value).
error = dnn_test_predictions - test_labels
plt.hist(error, bins=25)
plt.title('DNN Prediction Error')
plt.xlabel('Prediction Error [new_deaths]')
# The `_ =` assignment has to be on the final statement, otherwise the cell
# echoes a Text(...) repr underneath the figure.
_ = plt.ylabel('Count')


In [None]:
def predicted_covid_deaths_then_plot(model, test_dataset, country_id):
    """Plot true and model-predicted 'new_deaths_smoothed' for one country.

    Args:
        model: Object with a `predict(features)` method that returns an array
            with one prediction per input row (e.g. a trained Keras model).
        test_dataset: DataFrame containing 'Country (label)', 'Date (label)',
            'new_deaths_smoothed' and the feature columns `model` expects.
            It is not modified.
        country_id: Value of 'Country (label)' selecting the country to plot.

    Raises:
        ValueError: If `test_dataset` has no complete row for `country_id`.

    Shows the figure with `plt.show()` and returns nothing.
    """
    # Keep only this country's rows. Boolean indexing avoids building a
    # NaN-filled copy of the whole frame; dropna() still discards rows with
    # any missing value.
    is_country = test_dataset['Country (label)'] == country_id
    country_rows = test_dataset[is_country].dropna()
    if country_rows.empty:
        raise ValueError('No rows found for country {}'.format(country_id))

    # Every column except the target is passed to the model.
    country_features = country_rows.drop(columns='new_deaths_smoothed')

    # Predictions are rounded to whole numbers (half to even) before plotting.
    country_predictions = np.rint(model.predict(country_features).flatten())

    dates = country_rows['Date (label)']
    plt.plot(dates, country_rows['new_deaths_smoothed'], label='True Values')
    plt.plot(dates, country_predictions, label='Predicted Values')

    plt.xlabel('Days since 4/1/2020')
    plt.ylabel('new_deaths_smoothed')
    plt.title('Country {}'.format(country_id))
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# One figure per test-split country, in ascending label order, using the
# linear model. Labels are deduplicated before sorting; passing a plain list
# to pd.unique() is deprecated.
for i in np.sort(test_dataset['Country (label)'].unique()):
    predicted_covid_deaths_then_plot(linear_model, test_dataset, i)

In [None]:
# One figure per test-split country, in ascending label order, using the DNN.
# Labels are deduplicated before sorting; passing a plain list to pd.unique()
# is deprecated.
for i in np.sort(test_dataset['Country (label)'].unique()):
    predicted_covid_deaths_then_plot(dnn_model, test_dataset, i)