In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
np.version.version

'1.18.5'

In [3]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [4]:
df =  pd.read_csv('C:/Users/farha/Desktop/uni stage 3/Introduction to AI/COURSEWORK/Github/intro-to-ai-farhan-labi/covid_19_indonesia_time_series_all.csv')

In [5]:
df_regression = df.copy()

In [6]:
#Set date as index this just makes the df easier to work with
df_regression = df_regression.set_index('Date')

In [7]:
#For the NA values that are in int columns, fill them with medians 
med_gf_nc = df_regression['Growth Factor of New Cases'].median()
med_gf_nd = df_regression['Growth Factor of New Deaths'].median()
med_tot_uv = df_regression['Total Urban Villages'].median()
med_tot_rv = df_regression['Total Rural Villages'].median()
med_tot_c = df_regression['Total Cities'].median()

df_regression['Growth Factor of New Cases'] = df_regression['Growth Factor of New Cases'].fillna(med_gf_nc)

df_regression['Growth Factor of New Deaths'] = df_regression['Growth Factor of New Deaths'].fillna(med_gf_nd)

df_regression['Total Urban Villages'] = df_regression['Total Urban Villages'].fillna(med_tot_uv)

df_regression['Total Rural Villages'] = df_regression['Total Rural Villages'].fillna(med_tot_rv)

df_regression['Total Cities'] = df_regression['Total Cities'].fillna(med_tot_c)

In [8]:
#We can drop the following columns, because either they are redundant, have nothing in them (all rows NA) or non relevant
df_regression = df_regression.drop(columns=['City or Regency', 'Province', 'Island', 'Time Zone', 'Special Status', 'Location ISO Code',
                                           'Location Level', 'Country', 'Continent', 'Location', 'Case Fatality Rate', 'Case Recovered Rate'])

In [9]:
#Check datatypes of all remaining columns in dataframe
print(df_regression.dtypes)

New Cases                        int64
New Deaths                       int64
New Recovered                    int64
New Active Cases                 int64
Total Cases                      int64
Total Deaths                     int64
Total Recovered                  int64
Total Active Cases               int64
Total Regencies                  int64
Total Cities                   float64
Total Districts                  int64
Total Urban Villages           float64
Total Rural Villages           float64
Area (km2)                       int64
Population                       int64
Population Density             float64
Longitude                      float64
Latitude                       float64
New Cases per Million          float64
Total Cases per Million        float64
New Deaths per Million         float64
Total Deaths per Million       float64
Total Deaths per 100rb         float64
Growth Factor of New Cases     float64
Growth Factor of New Deaths    float64
dtype: object


In [10]:
#Convert all columns to float32 unless it is float64 (for regression)
#It is already float 64
df_regression[['New Cases', 'New Deaths', 'New Recovered', 'New Active Cases', 'Total Cases',
              'Total Deaths', 'Total Recovered', 'Total Active Cases',
              'Total Regencies', 'Total Districts', 'Area (km2)', 'Population']] = df_regression[['New Cases', 'New Deaths', 'New Recovered', 'New Active Cases', 'Total Cases',
              'Total Deaths', 'Total Recovered', 'Total Active Cases',
              'Total Regencies', 'Total Districts', 'Area (km2)', 'Population']].astype('float32') 

In [11]:
#Remove outliers function from exercises
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) 
                          >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)

In [12]:
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

In [13]:
#Removing outliers for new active cases
print("Before: {}".format(len(df_regression)))
remove_outliers(df_regression, 'New Active Cases', 2)
print("After: {}".format(len(df_regression)))

Before: 21759
After: 15844


In [14]:
X,y = to_xy(df_regression, 'New Active Cases')
#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [15]:
# #Normalize dataset using min max
# sc_x = StandardScaler()
# sc_x.fit(X_train)
# #Transform X
# X_train = sc_x.transform(X_train)
# X_test = sc_x.transform(X_test)

In [17]:
#Minmax give a larger range of numbers so may help into back propogation when training
#Minmax
mms_x = MinMaxScaler()
#Fit x y
mms_x.fit(X_train)
#Transform X
X_train = mms_x.transform(X_train)
X_test = mms_x.transform(X_test)

In [18]:
#Creating 3D input dataset
#This is because LSTM takes a 3D input for back propagation
#Came from https://towardsdatascience.com/predictive-analysis-rnn-lstm-and-gru-to-predict-water-consumption-e6bb3c2b4b02
def create_dataset(X, y, steps = 1):
    Xs, ys = [], []
    for i in range(len(X)-steps):
        v = X[i:i+steps]
        Xs.append(v)
        ys.append(y[i+steps])
    return np.array(Xs), np.array(ys)

In [19]:
#Makes predictions based on the last x number of data
#Since the dataset is large, I chose to predict over a large amount of steps
steps = 24

In [20]:
X_test, y_test = create_dataset(X_test, y_test, steps)
X_train, y_train = create_dataset(X_train, y_train, steps)

print('X_train.shape: ', X_test.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_train.shape)
print(X_test)

X_train.shape:  (3937, 24, 24)
y_train.shape:  (11859, 1)
X_test.shape:  (3937, 24, 24)
y_test.shape:  (11859, 1)
[[[1.47492625e-04 0.00000000e+00 1.91546424e-04 ... 5.21512411e-04
   2.71739140e-02 2.70270277e-02]
  [1.47492625e-04 0.00000000e+00 7.34261237e-04 ... 3.60169500e-01
   1.81521736e-02 2.70270277e-02]
  [3.59882019e-03 1.25156448e-03 8.55574012e-03 ... 1.04302485e-02
   7.39130471e-03 2.70270277e-02]
  ...
  [2.54277289e-02 5.63204009e-03 2.21874602e-02 ... 3.84419829e-01
   1.92391314e-02 6.08108118e-02]
  [8.84955807e-05 0.00000000e+00 1.91546424e-04 ... 3.65058682e-03
   1.63043477e-02 2.70270277e-02]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 3.25945264e-04
   1.08695654e-02 2.70270277e-02]]

 [[1.47492625e-04 0.00000000e+00 7.34261237e-04 ... 3.60169500e-01
   1.81521736e-02 2.70270277e-02]
  [3.59882019e-03 1.25156448e-03 8.55574012e-03 ... 1.04302485e-02
   7.39130471e-03 2.70270277e-02]
  [0.00000000e+00 6.25782239e-04 0.00000000e+00 ... 5.21512411e-04
   

In [21]:
#Create and fit LSTM
model = Sequential()
model.add(LSTM(64, input_shape = [X_train.shape[1], X_train.shape[2]], return_sequences=True)) #Layer1
model.add(Dropout(0.2))
model.add(LSTM(64)) #Layer 2
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid')) #Output

In [22]:
model.compile(loss='mean_squared_error', optimizer='adam')
test = model.fit(X_train, y_train, verbose=2, epochs=20)
model.summary()

Epoch 1/20
371/371 - 5s - loss: 27561.0527
Epoch 2/20
371/371 - 5s - loss: 27559.8828
Epoch 3/20
371/371 - 5s - loss: 27559.8613
Epoch 4/20
371/371 - 5s - loss: 27559.8828
Epoch 5/20
371/371 - 5s - loss: 27559.8926
Epoch 6/20
371/371 - 6s - loss: 27559.8691
Epoch 7/20
371/371 - 5s - loss: 27559.8633
Epoch 8/20
371/371 - 5s - loss: 27559.8691
Epoch 9/20
371/371 - 5s - loss: 27559.8789
Epoch 10/20
371/371 - 5s - loss: 27559.8789
Epoch 11/20
371/371 - 5s - loss: 27559.8711
Epoch 12/20
371/371 - 5s - loss: 27559.8574
Epoch 13/20
371/371 - 6s - loss: 27559.8730
Epoch 14/20
371/371 - 6s - loss: 27559.8789
Epoch 15/20
371/371 - 7s - loss: 27559.8613
Epoch 16/20
371/371 - 6s - loss: 27559.8613
Epoch 17/20
371/371 - 5s - loss: 27559.8730
Epoch 18/20
371/371 - 5s - loss: 27559.8613
Epoch 19/20
371/371 - 5s - loss: 27559.8730
Epoch 20/20
371/371 - 5s - loss: 27559.8652
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape  

In [24]:
#make prediction
y_pred = model.predict(X_test)

In [25]:
print("Shape: {}".format(y_pred.shape))
print("Shape: {}".format(y_test.shape))

Shape: (3937, 1)
Shape: (3937, 1)


In [26]:
print("final RMSE =", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

final RMSE = 177.65662
