# Installs and Imports

In [160]:
! pip install geopandas



In [0]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_log_error

from datetime import datetime
from datetime import timedelta

from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import copy

# Global Variables

In [0]:
data_dir = "drive/My Drive/Classes/CSCE_5933_Deep_Learning/HW_12_Kaggle/Data/"
train_file = "%strain.csv"%data_dir
extra_data_file = "%senriched_covid_19_week_2.csv"%data_dir
test_file = "%stest.csv"%data_dir
num_trends = 2

# Load and Process Training Data

In [0]:
# Load base data.
train_df = gpd.read_file(train_file)
train_df["ConfirmedCases"] = train_df["ConfirmedCases"].astype("float")
train_df["Fatalities"] = train_df["Fatalities"].astype("float")
#The country_region got modified in the enriched dataset by @optimo, 
# so we have to apply the same change to this Dataframe to facilitate the merge.
train_df["Country_Region"] = [ row.Country_Region.replace("'","").strip(" ") if row.Province_State=="" else str(row.Country_Region+"_"+row.Province_State).replace("'","").strip(" ") for idx,row in train_df.iterrows()]

In [164]:
train_df.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,geometry
0,1,,Afghanistan,2020-01-22,0.0,0.0,
1,2,,Afghanistan,2020-01-23,0.0,0.0,
2,3,,Afghanistan,2020-01-24,0.0,0.0,
3,4,,Afghanistan,2020-01-25,0.0,0.0,
4,5,,Afghanistan,2020-01-26,0.0,0.0,


In [165]:
#Still using the enriched data from week 2 as there is everything required for the model's training
extra_data_df = gpd.read_file(extra_data_file)
extra_data_df["Country_Region"] = [country_name.replace("'", "") for country_name in extra_data_df["Country_Region"]]
extra_data_df["restrictions"] = extra_data_df["restrictions"].astype("int")
extra_data_df["quarantine"] = extra_data_df["quarantine"].astype("int")
extra_data_df["schools"] = extra_data_df["schools"].astype("int")
extra_data_df["total_pop"] = extra_data_df["total_pop"].astype("float")
extra_data_df["density"] = extra_data_df["density"].astype("float")
extra_data_df["hospibed"] = extra_data_df["hospibed"].astype("float")
extra_data_df["lung"] = extra_data_df["lung"].astype("float")
extra_data_df["total_pop"] = extra_data_df["total_pop"]/max(extra_data_df["total_pop"])
extra_data_df["density"] = extra_data_df["density"]/max(extra_data_df["density"])
extra_data_df["hospibed"] = extra_data_df["hospibed"]/max(extra_data_df["hospibed"])
extra_data_df["lung"] = extra_data_df["lung"]/max(extra_data_df["lung"])
extra_data_df["age_100+"] = extra_data_df["age_100+"].astype("float")
extra_data_df["age_100+"] = extra_data_df["age_100+"]/max(extra_data_df["age_100+"])

extra_data_df = extra_data_df[["Country_Region", 
                               "Date", 
                               "restrictions", 
                               "quarantine", 
                               "schools", 
                               "hospibed", 
                               "lung", 
                               "total_pop", 
                               "density", 
                               "age_100+"
                               ]]
extra_data_df.head()

Unnamed: 0,Country_Region,Date,restrictions,quarantine,schools,hospibed,lung,total_pop,density,age_100+
0,Afghanistan,2020-01-22,0,0,0,0.036232,0.329191,0.027046,0.002278,0.001411
1,Afghanistan,2020-01-23,0,0,0,0.036232,0.329191,0.027046,0.002278,0.001411
2,Afghanistan,2020-01-24,0,0,0,0.036232,0.329191,0.027046,0.002278,0.001411
3,Afghanistan,2020-01-25,0,0,0,0.036232,0.329191,0.027046,0.002278,0.001411
4,Afghanistan,2020-01-26,0,0,0,0.036232,0.329191,0.027046,0.002278,0.001411


In [166]:
# Drop columns that have changing values. (quarantine, schools, restrictions)
changes = set()
for cr in extra_data_df["Country_Region"].unique():
    tmp = extra_data_df.loc[extra_data_df["Country_Region"] == cr]
    for c in tmp.columns:
        if c == "Date":
            continue
        u = tmp[c].unique()
        if len(u) > 1:
            changes.add(c)
for c in changes:
    del extra_data_df[c]
print([x for x in changes])
extra_data_df.head()

['quarantine', 'restrictions', 'schools']


Unnamed: 0,Country_Region,Date,hospibed,lung,total_pop,density,age_100+
0,Afghanistan,2020-01-22,0.036232,0.329191,0.027046,0.002278,0.001411
1,Afghanistan,2020-01-23,0.036232,0.329191,0.027046,0.002278,0.001411
2,Afghanistan,2020-01-24,0.036232,0.329191,0.027046,0.002278,0.001411
3,Afghanistan,2020-01-25,0.036232,0.329191,0.027046,0.002278,0.001411
4,Afghanistan,2020-01-26,0.036232,0.329191,0.027046,0.002278,0.001411


In [167]:
# Merge extra data with training data.
train_df = train_df.merge(extra_data_df, how="left", on=["Country_Region", "Date"]).drop_duplicates()
train_df.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,geometry,hospibed,lung,total_pop,density,age_100+
0,1,,Afghanistan,2020-01-22,0.0,0.0,,0.036232,0.329191,0.027046,0.002278,0.001411
1,2,,Afghanistan,2020-01-23,0.0,0.0,,0.036232,0.329191,0.027046,0.002278,0.001411
2,3,,Afghanistan,2020-01-24,0.0,0.0,,0.036232,0.329191,0.027046,0.002278,0.001411
3,4,,Afghanistan,2020-01-25,0.0,0.0,,0.036232,0.329191,0.027046,0.002278,0.001411
4,5,,Afghanistan,2020-01-26,0.0,0.0,,0.036232,0.329191,0.027046,0.002278,0.001411


In [168]:
# Fill countries/regions not in the extra data with median values.
median_pop = np.median(extra_data_df.total_pop)
median_hospibed = np.median(extra_data_df.hospibed)
median_density = np.median(extra_data_df.density)
median_lung = np.median(extra_data_df.lung)
median_centenarian_pop = np.median(extra_data_df["age_100+"])
#need to replace that with a joint using Pandas
print("The missing countries/region are:")
missing = []
for country_region in train_df.Country_Region.unique():
    if extra_data_df.query("Country_Region=='"+country_region+"'").empty:
        print(country_region)
        
        train_df.loc[train_df["Country_Region"]==country_region, "total_pop"] = median_pop
        train_df.loc[train_df["Country_Region"]==country_region, "hospibed"] = median_hospibed
        train_df.loc[train_df["Country_Region"]==country_region, "density"] = median_density
        train_df.loc[train_df["Country_Region"]==country_region, "lung"] = median_lung
        train_df.loc[train_df["Country_Region"]==country_region, "age_100+"] = median_centenarian_pop
        # train_df.loc[train_df["Country_Region"]==country_region,"restrictions"] = 0
        # train_df.loc[train_df["Country_Region"]==country_region,"quarantine"] = 0
        # train_df.loc[train_df["Country_Region"]==country_region,"schools"] = 0

The missing countries/region are:
Botswana
Burma
Burundi
Canada_Northwest Territories
Canada_Yukon
France_Saint Pierre and Miquelon
Kosovo
MS Zaandam
Malawi
Netherlands_Bonaire, Sint Eustatius and Saba
Sao Tome and Principe
Sierra Leone
South Sudan
United Kingdom_Anguilla
United Kingdom_British Virgin Islands
United Kingdom_Falkland Islands (Malvinas)
United Kingdom_Turks and Caicos Islands
West Bank and Gaza
Western Sahara


In [183]:
train_df.columns

Index(['Id', 'Province_State', 'Country_Region', 'Date', 'ConfirmedCases',
       'Fatalities', 'geometry', 'hospibed', 'lung', 'total_pop', 'density',
       'age_100+'],
      dtype='object')

In [185]:
from sklearn import linear_model

for country in train_df.Country_Region.unique():
    for province in train_df.query(f"Country_Region=='{country}'").Province_State.unique():
        df = pd.DataFrame(train_df.query(f"Country_Region=='{country}' and Province_State=='{province}'"))
        df["Date"] = pd.to_datetime(df["Date"])
        df = df.sort_values(by="Date")
        df = df[["hospibed",
                 "lung", 
                 "total_pop", 
                 "density", 
                 "age_100+",
                 "ConfirmedCases", 
                 "Fatalities", 
                 ]]
        y = df[["ConfirmedCases", "Fatalities"]].values
        cols = list(df.columns)
        x = df.values
        for i, row in enumerate(x):
            if i == 0:
                continue
            x[i][cols.index("ConfirmedCases")] = x[i-1][cols.index("ConfirmedCases")]
            x[i][cols.index("Fatalities")] = x[i-1][cols.index("Fatalities")]

        reg = linear_model.LinearRegression().fit(x, y)
        print(reg.score(x, y))

        # linear_model.LogisticRegression



ValueError: ignored

In [187]:
df

Unnamed: 0,hospibed,lung,total_pop,density,age_100+,ConfirmedCases,Fatalities
0,0.036232,0.329191,0.027046,0.002278,0.001411,0.0,0.0
1,0.036232,0.329191,0.027046,0.002278,0.001411,0.0,0.0
2,0.036232,0.329191,0.027046,0.002278,0.001411,0.0,0.0
3,0.036232,0.329191,0.027046,0.002278,0.001411,0.0,0.0
4,0.036232,0.329191,0.027046,0.002278,0.001411,0.0,0.0
...,...,...,...,...,...,...,...
78,,,,,,0.0,0.0
79,,,,,,0.0,0.0
80,,,,,,0.0,0.0
81,,,,,,0.0,0.0


In [0]:
# Cleanup.
del extra_data_df
del changes
del tmp
del missing

In [0]:
# Create dataframe to store trends.
trend_df = pd.DataFrame(columns={"infection_trend", 
                                 "fatality_trend", 
                                 "quarantine_trend", 
                                 "school_trend", 
                                 "total_population", 
                                 "expected_cases", 
                                 "expected_fatalities"
                                 })

In [0]:
# Drop all dates not in March.
train_df = train_df.query("Date>'2020-02-29'and Date<'2020-04-01'")

In [0]:
days_in_month = len(list(train_df["Date"].unique()))

In [0]:
# Build trends.
trends_all = {}
trend_lengths = [x for x in range(days_in_month) if x%5 == 0 and x != 0]
trend_lengths.append(days_in_month)
iters = len(trend_lengths)*len(list(train_df["Country_Region"].unique()))
count = 0

for d, days_in_sequence in enumerate(trend_lengths):
    trend_list = []
    for c, country in enumerate(train_df.Country_Region.unique()):
        for province in train_df.query(f"Country_Region=='{country}'").Province_State.unique():
            province_df = train_df.query(f"Country_Region=='{country}' and Province_State=='{province}'")
            
            #I added a quick hack to double the number of sequences
            #Warning: This will later create a minor leakage from the 
            # training set into the validation set.
            for i in range(0,len(province_df),int(days_in_sequence/3)):
                if i+days_in_sequence<=len(province_df):
                    #prepare all the temporal inputs
                    infection_trend = [float(x) for x in province_df[i:i+days_in_sequence-1].ConfirmedCases.values]
                    fatality_trend = [float(x) for x in province_df[i:i+days_in_sequence-1].Fatalities.values]
                    # restriction_trend = [float(x) for x in province_df[i:i+days_in_sequence-1].restrictions.values]
                    # quarantine_trend = [float(x) for x in province_df[i:i+days_in_sequence-1].quarantine.values]
                    # school_trend = [float(x) for x in province_df[i:i+days_in_sequence-1].schools.values]

                    #preparing all the demographic inputs
                    total_population = float(province_df.iloc[i].total_pop)
                    density = float(province_df.iloc[i].density)
                    hospibed = float(province_df.iloc[i].hospibed)
                    lung = float(province_df.iloc[i].lung)
                    centenarian_pop = float(province_df.iloc[i]["age_100+"])

                    expected_cases = float(province_df.iloc[i+days_in_sequence-1].ConfirmedCases)
                    expected_fatalities = float(province_df.iloc[i+days_in_sequence-1].Fatalities)

                    trend_list.append({"infection_trend":infection_trend,
                                    "fatality_trend":fatality_trend,
                                    # "restriction_trend":restriction_trend,
                                    # "quarantine_trend":quarantine_trend,
                                    # "school_trend":school_trend,
                                    "demographic_inputs":[total_population,density,hospibed,lung,centenarian_pop],
                                    "expected_cases":expected_cases,
                                    "expected_fatalities":expected_fatalities})
        count += 1
        if count % 100 == 0 or count == iters:
            print("%d/%d"%(count, iters))
    trends_all.update({days_in_sequence: pd.DataFrame(trend_list)})
# trend_df = pd.DataFrame(trend_list)

100/2191
200/2191
300/2191
400/2191
500/2191
600/2191
700/2191
800/2191
900/2191
1000/2191
1100/2191
1200/2191
1300/2191
1400/2191
1500/2191
1600/2191
1700/2191
1800/2191
1900/2191
2000/2191
2100/2191
2191/2191


In [0]:
 # Format trends and shuffle data.
 for trend_l in trends_all:
    trends_all[trend_l]["temporal_inputs"] = [np.asarray([trends["infection_trend"], 
                                                          trends["fatality_trend"], 
                                                        #   trends["restriction_trend"], 
                                                        #   trends["quarantine_trend"], 
                                                        #   trends["school_trend"] , 
                                                          ]) for idx, trends in trends_all[trend_l].iterrows()]
    trends_all[trend_l] = shuffle(trends_all[trend_l])

In [0]:
# Drop all but 25 sequences where the number of cases stays at 0.
# i=0
# temp_df = pd.DataFrame()
# for idx,row in trend_df.iterrows():
#     if sum(row.infection_trend)>0:
#         temp_df = temp_df.append(row)
#     else:
#         if i<25:
#             temp_df = temp_df.append(row)
#             i+=1
# trend_df = temp_df

In [0]:
# Split dataset(s) into training and validation.
training_percentage = 0.9

for trend_l in trends_all:
    sequence_length = trend_l - 1
    trend_df = trends_all[trend_l]

    training_item_count = int(len(trend_df)*training_percentage)
    validation_item_count = len(trend_df)-int(len(trend_df)*training_percentage)
    training_df = trend_df[:training_item_count]
    validation_df = trend_df[training_item_count:]

    X_temporal_train = np.asarray(np.transpose(np.reshape(np.asarray([np.asarray(x) for x in training_df["temporal_inputs"].values]), (training_item_count, num_trends, sequence_length)), (0, 2, 1) )).astype(np.float32)
    X_demographic_train = np.asarray([np.asarray(x) for x in training_df["demographic_inputs"]]).astype(np.float32)
    Y_cases_train = np.asarray([np.asarray(x) for x in training_df["expected_cases"]]).astype(np.float32)
    Y_fatalities_train = np.asarray([np.asarray(x) for x in training_df["expected_fatalities"]]).astype(np.float32)

    X_temporal_val = np.asarray(np.transpose(np.reshape(np.asarray([np.asarray(x) for x in validation_df["temporal_inputs"]]), (validation_item_count, num_trends, sequence_length)), (0, 2, 1)) ).astype(np.float32)
    X_demographic_val = np.asarray([np.asarray(x) for x in validation_df["demographic_inputs"]]).astype(np.float32)
    Y_cases_val = np.asarray([np.asarray(x) for x in validation_df["expected_cases"]]).astype(np.float32)
    Y_fatalities_val = np.asarray([np.asarray(x) for x in validation_df["expected_fatalities"]]).astype(np.float32)

    trends_all[trend_l] = {"train": {"X_temporal_train": X_temporal_train, 
                                    "X_demographic_train": X_demographic_train, 
                                    "Y_cases_train": Y_cases_train, 
                                    "Y_fatalities_train": Y_fatalities_train, 
                                    }, 
                        "val": {"X_temporal_val": X_temporal_val, 
                                "X_demographic_val": X_demographic_val, 
                                "Y_cases_val": Y_cases_val, 
                                "Y_fatalities_val": Y_fatalities_val, 
                                }, 
                        }

# Build Model

In [0]:
# Build model structures.
models_all = []
for train_l in trends_all:
    print(train_l)
    sequence_length = train_l - 1

    #temporal input branch
    temporal_input_layer = Input(shape=(sequence_length, num_trends))
    main_rnn_layer = layers.LSTM(64, return_sequences=True, recurrent_dropout=0.2)(temporal_input_layer)

    #demographic input branch
    demographic_input_layer = Input(shape=(num_trends))
    demographic_dense = layers.Dense(16)(demographic_input_layer)
    demographic_dropout = layers.Dropout(0.2)(demographic_dense)

    #cases output branch
    rnn_c = layers.LSTM(32)(main_rnn_layer)
    merge_c = layers.Concatenate(axis=-1)([rnn_c,demographic_dropout])
    dense_c = layers.Dense(128)(merge_c)
    dropout_c = layers.Dropout(0.3)(dense_c)
    cases = layers.Dense(1, activation=layers.LeakyReLU(alpha=0.1),name="cases")(dropout_c)

    #fatality output branch
    rnn_f = layers.LSTM(32)(main_rnn_layer)
    merge_f = layers.Concatenate(axis=-1)([rnn_f,demographic_dropout])
    dense_f = layers.Dense(128)(merge_f)
    dropout_f = layers.Dropout(0.3)(dense_f)
    fatalities = layers.Dense(1, activation=layers.LeakyReLU(alpha=0.1), name="fatalities")(dropout_f)


    model = Model([temporal_input_layer, demographic_input_layer], [cases,fatalities])

    model.summary()
    print()

    models_all.append(model)

5
10
15
20
25
30
31


In [0]:
# Build callbacks and compile models.
for i, trend_l in enumerate(trends_all):
    trends_all[trend_l].update({"model": models_all[i]})
    
    trends_all[trend_l]["callbacks"] = [ReduceLROnPlateau(monitor='val_loss', patience=4, verbose=1, factor=0.6),
                                        EarlyStopping(monitor='val_loss', patience=20),
                                        ModelCheckpoint(filepath='trend_%d/best_model.h5'%trend_l, monitor='val_loss', save_best_only=True)]
    trends_all[trend_l]["model"].compile(loss=[tf.keras.losses.MeanSquaredLogarithmicError(),tf.keras.losses.MeanSquaredLogarithmicError()], optimizer="adam")

In [0]:
print(trends_all[5]["callbacks"])

[<tensorflow.python.keras.callbacks.ReduceLROnPlateau object at 0x7f623dfa8da0>, <tensorflow.python.keras.callbacks.EarlyStopping object at 0x7f623dfa8240>, <tensorflow.python.keras.callbacks.ModelCheckpoint object at 0x7f623dfa8c88>]


In [0]:
for trend_l in trends_all:
    trends_all["history"] = trends_all[trend_l]["model"].fit([trends_all[trend_l]["train"]["X_temporal_train"], 
                                                             trends_all[trend_l]["train"]["X_demographic_train"]], 
                                                             [trends_all[trend_l]["train"]["Y_cases_train"], 
                                                             trends_all[trend_l]["train"]["Y_fatalities_train"]], 
                                                             epochs = 250, 
                                                             batch_size = 16, 
                                                             validation_data=([trends_all[trend_l]["val"]["X_temporal_val"], 
                                                                               trends_all[trend_l]["val"]["X_demographic_val"]],  
                                                                               [trends_all[trend_l]["val"]["Y_cases_val"], 
                                                                               trends_all[trend_l]["val"]["Y_fatalities_val"]]), 
                                                             callbacks=trends_all[trend_l]["callbacks"])

Epoch 1/250


ValueError: ignored