In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tqdm import tqdm
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import math
from sklearn.model_selection import train_test_split


In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0


In [4]:
print("Null values(train):") 
print(train.isnull().sum())
print("\n")
countries = train['Country_Region'].unique()
print("Number of different countries: ", len(countries))
print("\n")
min_date = train.Date.min()
max_date = train.Date.max()
print("Min date: {0}\nMax date: {1}".format(min_date, max_date))

Null values(train):
Id                    0
Province_State    13376
Country_Region        0
Date                  0
ConfirmedCases        0
Fatalities            0
dtype: int64


Number of different countries:  180


Min date: 2020-01-22
Max date: 2020-04-06


In [77]:
def fillState(state, country):
    if state == "empty":
        return country
    return state

train['Province_State'].fillna("empty", inplace = True)
train['Province_State'] = train.loc[:,['Province_State', 'Country_Region']].apply(lambda x: fillState(x['Province_State'], x['Country_Region']), axis = 1)


# renaming columns
train=train.rename(columns={'ConfirmedCases':'Confirmed'})

num_cols=['Confirmed', 'Fatalities']

# cast to int
for col in num_cols:
    temp=[int(i) for i in train[col]]
    train[col]=temp 
    
train.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,Confirmed,Fatalities
0,1,Afghanistan,Afghanistan,2020-01-22,0,0
1,2,Afghanistan,Afghanistan,2020-01-23,0,0
2,3,Afghanistan,Afghanistan,2020-01-24,0,0
3,4,Afghanistan,Afghanistan,2020-01-25,0,0
4,5,Afghanistan,Afghanistan,2020-01-26,0,0


In [78]:
train['Date'] = pd.to_datetime(train['Date'], infer_datetime_format = True)
train_countries = train.groupby("Country_Region")
train_countries.get_group('Serbia').set_index("Date").head()

Unnamed: 0_level_0,Id,Province_State,Country_Region,Confirmed,Fatalities
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-22,22364,Serbia,Serbia,0,0
2020-01-23,22365,Serbia,Serbia,0,0
2020-01-24,22366,Serbia,Serbia,0,0
2020-01-25,22367,Serbia,Serbia,0,0
2020-01-26,22368,Serbia,Serbia,0,0


In [79]:
Confirmed = pd.pivot_table(train, values = "Confirmed", index = "Date", columns = "Country_Region")
Fatalities = pd.pivot_table(train, values = "Fatalities", index = "Date", columns = "Country_Region")
Confirmed

Country_Region,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,Ukraine,United Arab Emirates,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Zambia,Zimbabwe
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2020-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2020-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-02,273.0,277.0,986.0,428.0,8.0,9.0,1133.0,663.0,639.500,11129.0,...,897.0,1024.0,3417.3,350.0,205.0,146.0,233.0,161.0,39.0,9.0
2020-04-03,281.0,304.0,1171.0,439.0,8.0,15.0,1265.0,736.0,666.250,11524.0,...,1072.0,1264.0,3868.9,369.0,227.0,153.0,237.0,194.0,39.0,9.0
2020-04-04,299.0,333.0,1251.0,466.0,10.0,15.0,1451.0,770.0,693.750,11781.0,...,1225.0,1505.0,4247.6,400.0,266.0,155.0,240.0,217.0,39.0,9.0
2020-04-05,349.0,361.0,1320.0,501.0,14.0,15.0,1451.0,822.0,710.875,12051.0,...,1308.0,1799.0,4843.4,400.0,342.0,159.0,241.0,237.0,39.0,9.0


In [80]:
def split_sequences(sequences, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequences)):
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        if out_end_ix > len(sequences):
            break
            
        seq_x, seq_y = sequences[i:end_ix, :], sequences[end_ix:out_end_ix, :]
        X.append(seq_x)
        y.append(seq_y)
        
    return np.array(X), np.array(y)


In [81]:
Confirmed = Confirmed.values
n_steps_in, n_steps_out = 2, 1
X_confirmed, y_confirmed = split_sequences(Confirmed, n_steps_in, n_steps_out)
print(X_confirmed.shape, y_confirmed.shape)


(74, 2, 180) (74, 1, 180)


### Split the data into train and test sets

In [82]:
X_train_confirmed, X_val_confirmed, y_train_confirmed, y_val_confirmed = train_test_split(X_confirmed, y_confirmed, test_size = 0.1, random_state = 42)

### Define model

In [83]:
n_features = X.shape[2]
epochs = 200
batch_size = 32

from keras.layers import RepeatVector, TimeDistributed

model_confirmed = Sequential()
model_confirmed.add(LSTM(50, activation='relu', input_shape =(n_steps_in, n_features)))
model_confirmed.add(RepeatVector(n_steps_out))
model_confirmed.add(LSTM(150, activation ='relu', return_sequences = True))
model_confirmed.add(TimeDistributed(Dense(n_features)))    

model_confirmed.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_14 (LSTM)               (None, 50)                46200     
_________________________________________________________________
repeat_vector_5 (RepeatVecto (None, 1, 50)             0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 1, 150)            120600    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 1, 180)            27180     
Total params: 193,980
Trainable params: 193,980
Non-trainable params: 0
_________________________________________________________________


In [84]:
model_confirmed.compile(optimizer='adam', loss = 'msle', metrics = ['acc'])
#callbacks = [ReduceLROnPlateau(monitor='val_loss', patience = 5, verbose = 1, factor = 0.6),
 #            EarlyStopping(monitor='val_loss', patience = 20),
  #           ModelCheckpoint(filepath = 'best_model_confirmed.h5', monitor ='val_loss', save_best_only = True)]

# fit the model
model_confirmed.fit(X_train_confirmed, y_train_confirmed, epochs = epochs,  
                    validation_data = (X_val_confirmed, y_val_confirmed), verbose = 0)

<keras.callbacks.callbacks.History at 0x7f0141b4a400>

In [87]:
scores = model_confirmed.evaluate(X_train_confirmed, y_train_confirmed)
print("Model accuracy: %.2f"%(scores[1]*100),"%")

Model accuracy: 83.33 %
