In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [2]:
r = requests.get("https://api-pc6dbtrtla-uc.a.run.app/API/timeseries/usa")
response_dict = r.json()
dataset1 = pd.DataFrame.from_dict(response_dict)
dataset1 = dataset1.rename(columns={'Total Results as of Date': 'Date'})
Days = (np.array([i for i in range (len(dataset1.Date))]).reshape(-1,1))+1
dataset1['Days'] = Days 

r2 = requests.get("https://api-pc6dbtrtla-uc.a.run.app/API/timeseries/ita")
response_dict2 = r2.json()
dataset2 = pd.DataFrame.from_dict(response_dict2)
dataset2 = dataset2.rename(columns={'Total Results as of Date': 'Date'})
# Drop the dates with no cases
dataset2 = dataset2[dataset2.Cases != 0]
# Add column Days
Days = (np.array([i for i in range (len(dataset2.Date))]).reshape(-1,1))+1
dataset2['Days'] = Days 

r3 = requests.get("https://api-pc6dbtrtla-uc.a.run.app/API/timeseries/esp")
response_dict3 = r3.json()
dataset3 = pd.DataFrame.from_dict(response_dict3)
dataset3 = dataset3.rename(columns={'Total Results as of Date': 'Date'})
# Drop the dates with no cases
dataset3 = dataset3[dataset3.Cases != 0]
# Add column Days
Days = (np.array([i for i in range (len(dataset3.Date))]).reshape(-1,1))+1
dataset3['Days'] = Days 

r4 = requests.get("https://api-pc6dbtrtla-uc.a.run.app/API/timeseries/gbr")
response_dict4 = r4.json()
dataset4 = pd.DataFrame.from_dict(response_dict4)
dataset4 = dataset4.rename(columns={'Total Results as of Date': 'Date'})
# Drop the dates with no cases
dataset4 = dataset4[dataset4.Cases != 0]
# Add column Days
Days = (np.array([i for i in range (len(dataset4.Date))]).reshape(-1,1))+1
dataset4['Days'] = Days 

r5 = requests.get("https://api-pc6dbtrtla-uc.a.run.app/API/timeseries/fra")
response_dict5 = r5.json()
dataset5 = pd.DataFrame.from_dict(response_dict5)
dataset5 = dataset5.rename(columns={'Total Results as of Date': 'Date'})
# Drop the dates with no cases
dataset5 = dataset5[dataset5.Cases != 0]
# Add column Days
Days = (np.array([i for i in range (len(dataset5.Date))]).reshape(-1,1))+1
dataset5['Days'] = Days 

dataset = pd.concat([dataset1, dataset2, dataset3, dataset4, dataset5])
# dataset

dataset.loc[dataset['Days'] == 50]


Unnamed: 0,ISO3,Country,Date,Cases,Deaths,Recovered,Days
49,USA,US,2020-03-11,1281.0,36.0,8.0,50
58,ITA,Italy,2020-03-20,47021.0,4032.0,4440.0,50
59,ESP,Spain,2020-03-21,25374.0,1375.0,2125.0,50
542,GBR,United Kingdom,2020-03-11,1.0,0.0,1.0,50
504,FRA,France,2020-03-07,949.0,11.0,12.0,50


## -------------------------------- ** ------------------------------------


## Cases

In [3]:
X = dataset[[ "ISO3", "Days" ]].values
y = dataset["Cases"].values
X

array([['USA', 1],
       ['USA', 2],
       ['USA', 3],
       ...,
       ['FRA', 626],
       ['FRA', 627],
       ['FRA', 628]], dtype=object)

In [4]:
# ## Encoding categorical data

# # 0 for iso3 in the first column

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 0.0, 0.0, 1.0, 1],
       [0.0, 0.0, 0.0, 0.0, 1.0, 2],
       [0.0, 0.0, 0.0, 0.0, 1.0, 3],
       ...,
       [0.0, 1.0, 0.0, 0.0, 0.0, 626],
       [0.0, 1.0, 0.0, 0.0, 0.0, 627],
       [0.0, 1.0, 0.0, 0.0, 0.0, 628]], dtype=object)

In [5]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 42)
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [7]:
## Predicting the Test set results

y_pred = regressor.predict(X_test)


# y_pred = regressor.predict([[0,1,0,0,0, 50]])
# y_pred

In [8]:
## Evaluating the Model Performance - R2 Score
r2_score(y_test, y_pred)


0.9088623393477758

In [9]:
## Evaluating the Model Performance - Mean-squared Error
mean_squared_error(y_test, y_pred)

1115492583.08

In [10]:
## Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 89.16 %
Standard Deviation: 5.64 %


## -------------------------------- ** ------------------------------------

## Recovered

In [11]:
X_rec = dataset[[ "ISO3", "Days"]].values
y_rec = dataset["Recovered"].values

In [12]:
# ## Encoding categorical data

# # 0 for iso3 in the first column

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X_rec = np.array(ct.fit_transform(X_rec))

In [13]:
# Splitting the dataset into the Training set and Test set
X_rec_train, X_rec_test, y_rec_train, y_rec_test = train_test_split(X_rec, y_rec, test_size = 0.2, random_state = 42)

In [14]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 42)
regressor.fit(X_rec_train, y_rec_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [15]:
## Predicting the Test set results

y_pred_rec = regressor.predict(X_rec_test)


#y_pred_rec = regressor.predict([[1,0, 100]])
#y_pred_rec

In [16]:
## Evaluating the Model Performance - R2 Score
r2_score(y_rec_test, y_pred_rec)


0.8914163251877718

In [17]:
## Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = regressor, X = X_rec_train, y = y_rec_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 86.11 %
Standard Deviation: 5.96 %


## -------------------------------- ** ------------------------------------


## Deaths

In [18]:
X_death = dataset[[ "ISO3", "Days"]].values
y_death = dataset["Recovered"].values

In [19]:
# ## Encoding categorical data

# # 0 for iso3 in the first column

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X_death = np.array(ct.fit_transform(X_death))


In [20]:
# Splitting the dataset into the Training set and Test set
X_death_train, X_death_test, y_death_train, y_death_test = train_test_split(X_death, y_death, test_size = 0.2, random_state = 42)

In [21]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 42)
regressor.fit(X_death_train, y_death_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [22]:
## Predicting the Test set results

y_pred_death = regressor.predict(X_death_test)

# y_pred_death = regressor.predict([[1,0, 100]])
# y_pred_death

In [23]:
## Evaluating the Model Performance - R2 Score
r2_score(y_death_test, y_pred_death)


0.8914163251877718

In [24]:
## Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = regressor, X = X_death_train, y = y_death_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 86.11 %
Standard Deviation: 5.96 %
