# Data analysis

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0


In [4]:
df.describe()

Unnamed: 0,Id,ConfirmedCases,Fatalities
count,22644.0,22644.0,22644.0
mean,16355.0,655.267002,29.015412
std,9451.983632,5428.632429,382.138505
min,1.0,0.0,0.0
25%,8169.75,0.0,0.0
50%,16355.0,0.0,0.0
75%,24540.25,51.0,0.0
max,32709.0,126168.0,15362.0


In [5]:
df['Country_Region'].value_counts()

US                  3996
China               2442
Canada               888
United Kingdom       740
France               740
                    ... 
Papua New Guinea      74
Timor-Leste           74
Croatia               74
Brazil                74
Albania               74
Name: Country_Region, Length: 180, dtype: int64

In [6]:
df[df['ConfirmedCases']>=10]['Country_Region'].value_counts()

China                  2248
US                     1196
Canada                  208
France                  187
Australia               186
                       ... 
Antigua and Barbuda       2
Guinea-Bissau             2
Fiji                      1
Angola                    1
Liberia                   1
Name: Country_Region, Length: 159, dtype: int64

In [7]:
df[df['ConfirmedCases']>=10]['Date']

52       2020-03-14
53       2020-03-15
54       2020-03-16
55       2020-03-17
56       2020-03-18
            ...    
22565    2020-03-31
22566    2020-04-01
22567    2020-04-02
22568    2020-04-03
22569    2020-04-04
Name: Date, Length: 7736, dtype: object

In [8]:
df['Date'].nunique()

74

In [9]:
df['Country_Region'].nunique()

180

# DATA TRANSFORMATION

NaN values imputation

In [11]:
df['Province_State'] = df['Province_State'].fillna('unknown')

In [12]:
df.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,unknown,Afghanistan,2020-01-22,0.0,0.0
1,2,unknown,Afghanistan,2020-01-23,0.0,0.0
2,3,unknown,Afghanistan,2020-01-24,0.0,0.0
3,4,unknown,Afghanistan,2020-01-25,0.0,0.0
4,5,unknown,Afghanistan,2020-01-26,0.0,0.0


In [31]:
len(df)

22644

In [80]:
df['Country_Region'] = df['Country_Region'].astype('category')
df['Province_State'] = df['Province_State'].astype('category')

In [92]:
mapping_region = dict(zip(df['Country_Region'].values,df['Country_Region'].cat.codes))
mapping_province = dict(zip(df['Province_State'].values,df['Province_State'].cat.codes))

In [210]:
def slice_data(df,length=5):
    X = []
    y1 = []
    y2 = []
    X_cat1 = []
    X_cat2 = []
    for i in range(len(df)-length):
            X1 = df['ConfirmedCases'].values[i:length+i].reshape(-1,1)
            X2 = df['Fatalities'].values[i:length+i].reshape(-1,1)
            X.append(np.hstack((X1,X2)))
            y1.append(df['ConfirmedCases'].values[length+i])
            y2.append(df['Fatalities'].values[length+i])
            X_cat1.append(tf.keras.utils.to_categorical(
                mapping_region[df['Country_Region'].values[i]],len(mapping_region)))
            X_cat2.append(tf.keras.utils.to_categorical(
                mapping_province[df['Province_State'].values[i]],len(mapping_province)))
    return np.array(X),\
np.concatenate((np.array(y1).reshape(-1,1),np.array(y2).reshape(-1,1)),axis=1),\
np.concatenate((np.array(X_cat1), np.array(X_cat2)),axis=1)
        

In [211]:
X_lstm,y,X_dense = slice_data(df,4)

In [212]:
len(X_lstm), len(y),len(X_dense)

(22640, 22640, 22640)

In [213]:
X_lstm

array([[[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       ...,

       [[7., 1.],
        [7., 1.],
        [8., 1.],
        [8., 1.]],

       [[7., 1.],
        [8., 1.],
        [8., 1.],
        [9., 1.]],

       [[8., 1.],
        [8., 1.],
        [9., 1.],
        [9., 1.]]])

In [218]:
X_dense = X_dense.reshape(X_dense.shape[0],1,X_dense.shape[1])

In [219]:
X_lstm.shape

(22640, 4, 2)

In [220]:
y

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [9., 1.],
       [9., 1.],
       [9., 1.]])

In [234]:
y.shape

(22640, 2)

In [221]:
X_dense.shape

(22640, 1, 311)

In [263]:
train_size = int(len(X_dense)*0.9)

In [264]:
train_size

20376

In [265]:
X_dense_train, X_lstm_train, y_train = X_dense[:train_size],X_lstm[:train_size], y[:train_size]

In [266]:
X_dense_test, X_lstm_test, y_test = X_dense[train_size:],X_lstm[train_size:], y[train_size:]

# Building a model

In [335]:
import tensorflow as tf

In [336]:
tf.keras.backend.clear_session()
#first input 
lstm_input = tf.keras.layers.Input((X_lstm.shape[1],X_lstm.shape[2]))
lstm_block1 = tf.keras.layers.LSTM(256,return_sequences=True)(lstm_input)
lstm_block2 = tf.keras.layers.LSTM(128)(lstm_block1)
reshape_layer = tf.keras.layers.Reshape((1,128))(lstm_block2)

#second input
dense_input = tf.keras.layers.Input((X_dense.shape[1],X_dense.shape[2]))
dense_layer1 = tf.keras.layers.Dense(1024,activation='relu')(dense_input)
dropout_layer1 = tf.keras.layers.Dropout(0.3)(dense_layer1)
dense_layer2 = tf.keras.layers.Dense(128,activation='relu')(dropout_layer1)

#embedd everything together
concat_layer = tf.keras.layers.add([reshape_layer,dense_layer2])
flatten_layer = tf.keras.layers.Flatten()(concat_layer)
final_dense = tf.keras.layers.Dense(1024,activation='relu')(flatten_layer)
output_layer = tf.keras.layers.Dense(2,activation='relu')(flatten_layer)

#Model
model = tf.keras.Model(inputs=[lstm_input,dense_input],outputs=output_layer)



In [337]:
model.compile(optimizer='Adam',loss=tf.keras.losses.MeanSquaredLogarithmicError())

In [338]:
callbacks = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=15, verbose=1, factor=0.6),
             tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)]

In [339]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4, 2)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1, 311)]     0                                            
__________________________________________________________________________________________________
unified_lstm (UnifiedLSTM)      (None, 4, 256)       265216      input_1[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 1, 1024)      319488      input_2[0][0]                    
______________________________________________________________________________________________

In [340]:
hist = model.fit(x=[X_lstm_train,X_dense_train],y=y_train, validation_data=[[X_lstm_test,X_dense_test],y_test]
          ,epochs=250,callbacks=callbacks)

Train on 20376 samples, validate on 2264 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/25

Epoch 76/250
Epoch 77/250
Epoch 00077: ReduceLROnPlateau reducing learning rate to 0.0006000000284984708.
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 00099: ReduceLROnPlateau reducing learning rate to 0.0003600000170990825.
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/

Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 00157: ReduceLROnPlateau reducing learning rate to 0.00012960000021848827.
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250


In [347]:
model.predict([X_lstm[150:160],X_dense[150:160]]),y[150:160]

(array([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]], dtype=float32),
 array([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]]))