In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
df =  pd.read_csv('C:/Users/farha/Desktop/uni stage 3/Introduction to AI/COURSEWORK/Github/intro-to-ai-farhan-labi/covid_19_indonesia_time_series_all.csv')

In [5]:
df_regression = df.copy()

In [6]:
#Set date as index
df_regression = df_regression.set_index('Date')

In [7]:
#For the NA values that are in int columns, fill them with medians 
med_gf_nc = df_regression['Growth Factor of New Cases'].median()
med_gf_nd = df_regression['Growth Factor of New Deaths'].median()
med_tot_uv = df_regression['Total Urban Villages'].median()
med_tot_rv = df_regression['Total Rural Villages'].median()
med_tot_c = df_regression['Total Cities'].median()

df_regression['Growth Factor of New Cases'] = df_regression['Growth Factor of New Cases'].fillna(med_gf_nc)

df_regression['Growth Factor of New Deaths'] = df_regression['Growth Factor of New Deaths'].fillna(med_gf_nd)

df_regression['Total Urban Villages'] = df_regression['Total Urban Villages'].fillna(med_tot_uv)

df_regression['Total Rural Villages'] = df_regression['Total Rural Villages'].fillna(med_tot_rv)

df_regression['Total Cities'] = df_regression['Total Cities'].fillna(med_tot_c)

In [8]:
#We can drop the columns: City or Regency, Province, Island, Time Zone, Special Status.
#This is because we already have Location and Location ISO code, for the whole of Indonesia
df_regression = df_regression.drop(columns=['City or Regency', 'Province', 'Island', 'Time Zone', 'Special Status', 'Location ISO Code',
                                           'Location Level', 'Country', 'Continent', 'Location', 'Case Fatality Rate', 'Case Recovered Rate'])

In [9]:
df_regression

Unnamed: 0_level_0,New Cases,New Deaths,New Recovered,New Active Cases,Total Cases,Total Deaths,Total Recovered,Total Active Cases,Total Regencies,Total Cities,Total Districts,Total Urban Villages,Total Rural Villages,Area (km2),Population,Population Density,Longitude,Latitude,New Cases per Million,Total Cases per Million,New Deaths per Million,Total Deaths per Million,Total Deaths per 100rb,Growth Factor of New Cases,Growth Factor of New Deaths
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
3/1/2020,2,0,0,2,39,20,41,-22,1,5.0,44,267.0,1591.0,664,10846145,16334.31,106.836118,-6.204699,0.18,3.60,0.00,1.84,0.18,0.99,1.0
3/2/2020,2,0,0,2,41,20,41,-20,1,5.0,44,267.0,1591.0,664,10846145,16334.31,106.836118,-6.204699,0.18,3.78,0.00,1.84,0.18,1.00,1.0
3/2/2020,2,0,0,2,2,0,0,2,416,98.0,7230,8488.0,74953.0,1916907,265185520,138.34,113.921327,-0.789275,0.01,0.01,0.00,0.00,0.00,0.99,1.0
3/2/2020,1,0,0,1,2,0,3,-1,10,2.0,169,268.0,1591.0,87024,6074100,69.80,101.805109,0.511648,0.16,0.33,0.00,0.00,0.00,0.99,1.0
3/3/2020,2,0,0,2,43,20,41,-18,1,5.0,44,267.0,1591.0,664,10846145,16334.31,106.836118,-6.204699,0.18,3.96,0.00,1.84,0.18,1.00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12/2/2021,2,0,0,2,34715,1056,33566,93,11,4.0,171,332.0,1507.0,13892,2641884,190.17,124.521240,1.259638,0.76,13140.24,0.00,399.71,39.97,0.99,1.0
12/2/2021,4,0,0,4,89849,2152,87605,92,12,7.0,179,230.0,928.0,42013,5519245,131.37,100.465062,-0.850253,0.72,16279.22,0.00,389.91,38.99,4.00,1.0
12/2/2021,4,0,0,4,59937,3071,56830,36,13,4.0,241,387.0,2853.0,91592,8217551,89.72,104.169465,-3.216212,0.49,7293.78,0.00,373.71,37.37,4.00,1.0
12/2/2021,4,0,1,3,106045,2889,103061,95,25,8.0,450,693.0,5417.0,72981,14874889,203.82,99.051964,2.191894,0.27,7129.13,0.00,194.22,19.42,1.00,1.0


In [10]:
#Check datatypes of all remaining columns in dataframe
print(df_regression.dtypes)

New Cases                        int64
New Deaths                       int64
New Recovered                    int64
New Active Cases                 int64
Total Cases                      int64
Total Deaths                     int64
Total Recovered                  int64
Total Active Cases               int64
Total Regencies                  int64
Total Cities                   float64
Total Districts                  int64
Total Urban Villages           float64
Total Rural Villages           float64
Area (km2)                       int64
Population                       int64
Population Density             float64
Longitude                      float64
Latitude                       float64
New Cases per Million          float64
Total Cases per Million        float64
New Deaths per Million         float64
Total Deaths per Million       float64
Total Deaths per 100rb         float64
Growth Factor of New Cases     float64
Growth Factor of New Deaths    float64
dtype: object


In [22]:
#Convert columns all columns except for target column to float 32 unless 
#It is already float 64
df_regression[['New Cases', 'New Deaths', 'New Recovered', 'Total Cases',
              'Total Deaths', 'Total Recovered', 'Total Active Cases',
              'Total Regencies', 'Total Districts', 'Area (km2)', 'Population']] = df_regression[['New Cases', 'New Deaths', 'New Recovered', 'Total Cases',
              'Total Deaths', 'Total Recovered', 'Total Active Cases',
              'Total Regencies', 'Total Districts', 'Area (km2)', 'Population']].astype('float32') 

In [13]:
#Remove outliers function
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) 
                          >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)

In [14]:
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

In [15]:
remove_outliers(df_regression, 'New Cases', 2)
X,y = to_xy(df_regression, 'New Cases')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, )

In [16]:
sc = StandardScaler()
sc.fit(X_train)
X_train= sc.transform(X_train)
X_test = sc.transform(X_test)

In [23]:
model = Sequential()
model.add(Dense(1024, input_shape=X[1].shape, activation='relu')) # Hidden 1
model.add(Dense(64, activation='relu')) #Hidden 2
model.add(Dense(1)) # Output

In [24]:
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train,y_train,verbose=2,epochs=20)
model.summary()

Epoch 1/20
280/280 - 1s - loss: 64217.9766
Epoch 2/20
280/280 - 1s - loss: 11244.5117
Epoch 3/20
280/280 - 1s - loss: 6956.8262
Epoch 4/20
280/280 - 0s - loss: 5765.9229
Epoch 5/20
280/280 - 1s - loss: 4069.9407
Epoch 6/20
280/280 - 1s - loss: 4115.4707
Epoch 7/20
280/280 - 1s - loss: 2785.9927
Epoch 8/20
280/280 - 1s - loss: 2157.1609
Epoch 9/20
280/280 - 1s - loss: 2171.6597
Epoch 10/20
280/280 - 1s - loss: 1835.4618
Epoch 11/20
280/280 - 1s - loss: 1225.6851
Epoch 12/20
280/280 - 1s - loss: 1088.5065
Epoch 13/20
280/280 - 1s - loss: 855.8039
Epoch 14/20
280/280 - 1s - loss: 754.4539
Epoch 15/20
280/280 - 1s - loss: 890.5895
Epoch 16/20
280/280 - 1s - loss: 526.0109
Epoch 17/20
280/280 - 1s - loss: 579.3994
Epoch 18/20
280/280 - 1s - loss: 444.8954
Epoch 19/20
280/280 - 1s - loss: 428.0251
Epoch 20/20
280/280 - 1s - loss: 503.2889
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
d

In [25]:
y_pred = model.predict(X_test)
print("Shape: {}".format(y_pred.shape))
print("Shape: {}".format(y_test.shape))

Shape: (2981, 1)
Shape: (2981, 1)


In [26]:
print("final RMSE =", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

final RMSE = 15.649598


In [None]:
#Past this point is debug area

In [None]:
#Check type of x,y for debug
print(X_test.dtype)
print(X_train.dtype)
print(y_train.dtype)
print(y_test.dtype)

In [None]:
#Debug
print(y_test)
print(y_pred)