In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [2]:
# Notebook-wide pandas display limits, applied in one pass.
display_limits = {
    'display.max_rows': 50,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [3]:
# Locate the dataset without tying the notebook to one machine's folder layout.
# Lookup order:
#   1. the directory named by the COVID_DATA_DIR environment variable
#      (defaults to the current working directory when the variable is unset);
#   2. the folder this notebook was originally developed in (legacy fallback).
DATA_FILENAME = 'covid_19_indonesia_time_series_all.csv'
LEGACY_DATA_DIR = Path('C:/Users/farha/Desktop/uni stage 3/Introduction to AI/COURSEWORK/Github/intro-to-ai-farhan-labi')

candidate_paths = [
    Path(os.environ.get('COVID_DATA_DIR', '.')) / DATA_FILENAME,
    LEGACY_DATA_DIR / DATA_FILENAME,
]
data_path = next((path for path in candidate_paths if path.is_file()), None)
if data_path is None:
    # Fail with an actionable message instead of a bare path error from pandas.
    raise FileNotFoundError(
        "Could not find {!r}. Looked in: {}. Set COVID_DATA_DIR to the folder "
        "that contains the CSV.".format(
            DATA_FILENAME, ', '.join(str(path.parent) for path in candidate_paths)
        )
    )

df = pd.read_csv(data_path)

In [4]:
# Work on an independent (deep) copy so the frame read from disk stays untouched.
df_regression = df.copy(deep=True)

In [5]:
# Missing values in these columns are replaced by the column's own median.
median_filled_columns = (
    'Growth Factor of New Cases',
    'Growth Factor of New Deaths',
    'Total Urban Villages',
    'Total Rural Villages',
    'Total Cities',
)

# One median per column, unpacked in the same order as the tuple above.
med_gf_nc, med_gf_nd, med_tot_uv, med_tot_rv, med_tot_c = (
    df_regression[column_name].median() for column_name in median_filled_columns
)

column_medians = (med_gf_nc, med_gf_nd, med_tot_uv, med_tot_rv, med_tot_c)
for column_name, column_median in zip(median_filled_columns, column_medians):
    df_regression[column_name] = df_regression[column_name].fillna(column_median)

In [6]:
# City or Regency, Province, Island, Time Zone and Special Status are dropped:
# the Location and Location ISO code columns already identify each row's place
# across the whole of Indonesia.
redundant_columns = ['City or Regency', 'Province', 'Island', 'Time Zone', 'Special Status']
df_regression = df_regression.drop(redundant_columns, axis=1)

In [7]:
# The frame mixes date, string and numeric columns, so every column is replaced
# by the integer codes of its own freshly fitted LabelEncoder.
# NOTE(review): this loop also re-codes columns that were already numeric,
# including the column later used as the prediction target — confirm that is
# intended for a regression task.
for column_name in df_regression.columns:
    encoder = LabelEncoder()
    encoder.fit(df_regression[column_name])
    df_regression[column_name] = encoder.transform(df_regression[column_name])

In [8]:
def to_xy(df, target, task=None):
    """Split a DataFrame into float32 feature and target arrays.

    Every column other than ``target`` is used as a feature and is cast to
    float32, so all feature columns must already be numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing the features and the target column.
    target : str
        Name of the column to predict.
    task : {None, 'classification', 'regression'}, optional
        How to encode the target. ``None`` (the default) infers the task from
        the target's dtype: any integer dtype means classification, anything
        else means regression. Integer targets are treated as class labels —
        this includes integer codes produced by label-encoding a numeric
        column — so pass ``task='regression'`` when the target is a quantity.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``, both float32. ``X`` has one column per feature. For
        classification ``y`` is one-hot encoded (one column per distinct
        target value); for regression ``y`` has a single column.

    Raises
    ------
    ValueError
        If ``task`` is not one of the accepted values.
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if task not in (None, 'classification', 'regression'):
        raise ValueError(
            "task must be None, 'classification' or 'regression', got {!r}".format(task)
        )
    if target not in df.columns:
        raise KeyError("target column {!r} not found in DataFrame".format(target))

    feature_columns = [name for name in df.columns if name != target]

    if task is None:
        target_dtype = df[target].dtypes
        # Duplicate column labels make df[target] a DataFrame whose .dtypes is
        # a Series; use the first entry, selected by position.
        if isinstance(target_dtype, pd.Series):
            target_dtype = target_dtype.iloc[0]
        # Accept every integer width (int8 ... int64, unsigned, nullable), not
        # just int32/int64, so small-int label columns are not mistaken for a
        # regression target.
        if pd.api.types.is_integer_dtype(target_dtype):
            task = 'classification'
        else:
            task = 'regression'

    # TensorFlow works in 32-bit floats.
    features = df[feature_columns].values.astype(np.float32)
    if task == 'classification':
        dummies = pd.get_dummies(df[target])
        return features, dummies.values.astype(np.float32)
    return features, df[[target]].values.astype(np.float32)

In [10]:
# Separate the features from the 'New Active Cases' target, then hold out 25% of
# the rows for testing (seeded so the split is repeatable).
# NOTE(review): the target column holds integer codes at this point, so to_xy
# returns y one-hot encoded (1958 columns in the recorded run) instead of a
# single numeric column — confirm this is what a regression model should fit.
X, y = to_xy(df=df_regression, target='New Active Cases')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [11]:
# Fully connected network: two 64-unit ReLU hidden layers and an output layer
# with no activation.
# The input and output widths are read from X and y instead of being typed in,
# so the model keeps matching the data if the number of feature or target
# columns changes (the output width was previously hard-coded to 1958).
n_outputs = y.shape[1]

model = Sequential()
model.add(Dense(64, input_shape=X.shape[1:], activation='relu'))  # Hidden 1
model.add(Dense(64, activation='relu'))  # Hidden 2
model.add(Dense(n_outputs))  # Output: one unit per column of y

In [12]:
# Fit with the Adam optimizer on mean squared error for 20 epochs over the
# training split, then print the model summary.
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x=X_train, y=y_train, epochs=20, verbose=2)
model.summary()

Epoch 1/20
510/510 - 0s - loss: 480.6641
Epoch 2/20
510/510 - 1s - loss: 1.1689
Epoch 3/20
510/510 - 1s - loss: 0.0579
Epoch 4/20
510/510 - 1s - loss: 0.0209
Epoch 5/20
510/510 - 1s - loss: 0.0054
Epoch 6/20
510/510 - 1s - loss: 0.0017
Epoch 7/20
510/510 - 0s - loss: 7.4939e-04
Epoch 8/20
510/510 - 1s - loss: 5.8696e-04
Epoch 9/20
510/510 - 0s - loss: 5.7165e-04
Epoch 10/20
510/510 - 0s - loss: 5.4754e-04
Epoch 11/20
510/510 - 0s - loss: 5.2636e-04
Epoch 12/20
510/510 - 0s - loss: 5.1077e-04
Epoch 13/20
510/510 - 1s - loss: 5.0706e-04
Epoch 14/20
510/510 - 0s - loss: 5.0590e-04
Epoch 15/20
510/510 - 1s - loss: 5.0518e-04
Epoch 16/20
510/510 - 1s - loss: 5.0466e-04
Epoch 17/20
510/510 - 1s - loss: 5.0437e-04
Epoch 18/20
510/510 - 0s - loss: 5.0407e-04
Epoch 19/20
510/510 - 1s - loss: 5.0394e-04
Epoch 20/20
510/510 - 1s - loss: 5.0385e-04
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   

In [13]:
# Predict on the held-out rows, then print the shape of the predictions followed
# by the shape of the true targets so the two can be compared.
y_pred = model.predict(X_test)
for array in (y_pred, y_test):
    print("Shape: {}".format(array.shape))

Shape: (5440, 1958)
Shape: (5440, 1958)


In [14]:
# Root-mean-squared error between the held-out targets and the predictions.
# NOTE(review): the recorded run shows both arrays with shape (5440, 1958), so
# this value is measured over those 1958 target columns rather than in units of
# cases — confirm that is the metric you want to report.
test_mse = metrics.mean_squared_error(y_test, y_pred)
print("final RMSE =", np.sqrt(test_mse))

final RMSE = 0.024198184
