In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # for plotting
from sklearn.impute import SimpleImputer # for imputing missing values
from sklearn.preprocessing import StandardScaler # for standardizing data
from csvToExcel import toExcel
from tensorflow import keras # for building Neural Networks
from keras.models import Sequential # for creating a linear stack of layers for our Neural Network
from keras import Input # for instantiating a keras tensor
from keras.layers import Dense, SimpleRNN, LSTM # for creating regular densely-connected NN layers and RNN layers

In [102]:
def checkMissingData():
    df = pd.read_csv('DataFolder/data.csv')
    print(df.info())
    
checkMissingData()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38472 entries, 0 to 38471
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Entity                           38472 non-null  object 
 1   Continent                        38472 non-null  object 
 2   Latitude                         38472 non-null  float64
 3   Longitude                        38472 non-null  float64
 4   Average temperature per year     38472 non-null  int64  
 5   Hospital beds per 1000 people    38472 non-null  float64
 6   Medical doctors per 1000 people  38472 non-null  float64
 7   GDP/Capita                       38472 non-null  float64
 8   Population                       38472 non-null  int64  
 9   Median age                       38472 non-null  int64  
 10  Population aged 65 and over (%)  38472 non-null  int64  
 11  Date                             38472 non-null  object 
 12  Daily tests       

In [103]:
# Fill missing values with mean
def missingDataHandler():
    df = pd.read_csv('DataFolder/data.csv')
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    # From the missing data analysis, we know that the columns 'Daily tests', 'Cases', 'Deaths' have missing values
    missingColumns = ['Daily tests', 'Cases', 'Deaths']
    for column in missingColumns:
        df[column] = imputer.fit_transform(df[column].values.reshape(-1, 1))
    print(df.info())
    return df
    
dataFrame = missingDataHandler()
# toExcel(dataFrame, 'preprocessedData')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38472 entries, 0 to 38471
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Entity                           38472 non-null  object 
 1   Continent                        38472 non-null  object 
 2   Latitude                         38472 non-null  float64
 3   Longitude                        38472 non-null  float64
 4   Average temperature per year     38472 non-null  int64  
 5   Hospital beds per 1000 people    38472 non-null  float64
 6   Medical doctors per 1000 people  38472 non-null  float64
 7   GDP/Capita                       38472 non-null  float64
 8   Population                       38472 non-null  int64  
 9   Median age                       38472 non-null  int64  
 10  Population aged 65 and over (%)  38472 non-null  int64  
 11  Date                             38472 non-null  object 
 12  Daily tests       

In [104]:
# Get the data for Greece until 2020-12-31
def getGreeceData(df):
    df = df[df['Entity'] == 'Greece']
    df = df[df['Date'] <= '2020-12-31']
    return df

dataFrame2 = getGreeceData(dataFrame)

In [105]:
# Add new columns for positivity percent and daily cases
def addPositivityPercent(df):
    df['Daily Cases'] = abs(df['Cases'].diff())
    df['Positivity percent'] = df['Daily Cases'] / df['Daily tests']
    df['Date'] = pd.to_datetime(df['Date'])
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    df['Positivity percent'] = imputer.fit_transform(df['Positivity percent'].values.reshape(-1, 1))
    df['Daily Cases'] = imputer.fit_transform(df['Daily Cases'].values.reshape(-1, 1))
    return df

dataset = addPositivityPercent(dataFrame2)
# toExcel(dataset, 'GreeceData')
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 310 entries, 11802 to 12111
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   Entity                           310 non-null    object        
 1   Continent                        310 non-null    object        
 2   Latitude                         310 non-null    float64       
 3   Longitude                        310 non-null    float64       
 4   Average temperature per year     310 non-null    int64         
 5   Hospital beds per 1000 people    310 non-null    float64       
 6   Medical doctors per 1000 people  310 non-null    float64       
 7   GDP/Capita                       310 non-null    float64       
 8   Population                       310 non-null    int64         
 9   Median age                       310 non-null    int64         
 10  Population aged 65 and over (%)  310 non-null    int64  

In [106]:
def allGreeceData(df):
    df = df[df['Entity'] == 'Greece']
    df['Daily Cases'] = abs(df['Cases'].diff())
    df['Positivity percent'] = df['Daily Cases'] / df['Daily tests']
    df['Date'] = pd.to_datetime(df['Date'])
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    df['Positivity percent'] = imputer.fit_transform(df['Positivity percent'].values.reshape(-1, 1))
    df['Daily Cases'] = imputer.fit_transform(df['Daily Cases'].values.reshape(-1, 1))
    return df

dataFrame3 = allGreeceData(dataFrame)
print(dataFrame3.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 11802 to 12170
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   Entity                           369 non-null    object        
 1   Continent                        369 non-null    object        
 2   Latitude                         369 non-null    float64       
 3   Longitude                        369 non-null    float64       
 4   Average temperature per year     369 non-null    int64         
 5   Hospital beds per 1000 people    369 non-null    float64       
 6   Medical doctors per 1000 people  369 non-null    float64       
 7   GDP/Capita                       369 non-null    float64       
 8   Population                       369 non-null    int64         
 9   Median age                       369 non-null    int64         
 10  Population aged 65 and over (%)  369 non-null    int64  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

In [107]:
# Here starts the training of the model
posPer = dataset['Positivity percent'].values
print(posPer.shape)
# plt.plot(posPer)
# plt.show()

(310,)


In [108]:
# Standardize the data
sc_data = StandardScaler()
posPer = sc_data.fit_transform(posPer.reshape(-1, 1))

In [109]:
sequence = posPer.shape[0] - 1
X = []
y = []
for i in range(sequence, len(posPer)):
    X.append(posPer[i-sequence:i])
    y.append(posPer[i])

In [110]:
X = np.array(X)
y = np.array(y)
print(X.shape, y.shape)

(1, 309, 1) (1, 1)


In [111]:
model = Sequential()
model.add(LSTM(32, input_shape=(sequence, 1)))
model.add(Dense(1))

In [112]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Fit keras model on the dataset
model.fit(X, y,  epochs=200, batch_size=1000)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x22704a94c88>

In [113]:
# Predict the positivity percent for the next 3 days
def predictNextDays(date):
    numOfDays = pd.to_datetime(date) - dataFrame3.iloc[posPer.shape[0] - 1]['Date']
    last_sequence = posPer[-sequence:]
    predictions = []
    for _ in range(numOfDays.days):
        input_sequence = last_sequence.reshape(1, sequence, 1)
        prediction = model.predict(input_sequence)
        predictions.append(prediction[0])
        last_sequence = np.concatenate((last_sequence[1:], prediction), axis=0)  
    predictions = sc_data.inverse_transform(predictions)
    return predictions

predictions = predictNextDays('2021-01-03')



In [114]:
print("Predict positivity percent for the next 3 days:\n")
for i, prediction in enumerate(predictions):
    print(f"Date: {dataFrame3.iloc[posPer.shape[0] + i]['Date']}")
    print(f"Prediction {i+1}: {prediction[0]}")
    print(f"Actual value: {dataFrame3.iloc[posPer.shape[0] + i]['Positivity percent']}\n")

Predict positivity percent for the next 3 days:

Date: 2021-01-01 00:00:00
Prediction 1: 0.029469050447493192
Actual value: 0.020845699919689934

Date: 2021-01-02 00:00:00
Prediction 2: 0.02966971193230271
Actual value: 0.05460608586911213

Date: 2021-01-03 00:00:00
Prediction 3: 0.02980879022191189
Actual value: 0.04586077140169332

