# Model Training/Evaluation

### Student: David Anim-Addo

### Course: IBM Advanced Datascience Capstone

In [2]:
# import the required libraries for defining the model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import seaborn as sns
%matplotlib inline 
from scipy.optimize import curve_fit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## Load and prepare the data for modeling

In [3]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Country_Region,Dates,Global_Confirmed,Global_Recovered,Global_Deaths
0,Afghanistan,2020-01-22,0.0,0.0,0.0
1,Afghanistan,2020-01-23,0.0,0.0,0.0
2,Afghanistan,2020-01-24,0.0,0.0,0.0
3,Afghanistan,2020-01-25,0.0,0.0,0.0
4,Afghanistan,2020-01-26,0.0,0.0,0.0


In [4]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Country/Region
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [5]:
df_covid_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16340 entries, 0 to 16339
Data columns (total 5 columns):
Country_Region      16340 non-null object
Dates               16340 non-null object
Global_Confirmed    16340 non-null float64
Global_Recovered    16340 non-null float64
Global_Deaths       16340 non-null float64
dtypes: float64(3), object(2)
memory usage: 638.4+ KB


In [6]:
# convert values in the dates column to datetime
df_covid_test['Dates'] = pd.to_datetime(df_covid_test['Dates'])
df_covid_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16340 entries, 0 to 16339
Data columns (total 5 columns):
Country_Region      16340 non-null object
Dates               16340 non-null datetime64[ns]
Global_Confirmed    16340 non-null float64
Global_Recovered    16340 non-null float64
Global_Deaths       16340 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 638.4+ KB


In [7]:
# convert the datetime values to matplotlib floats
df_covid_test['Dates'] = dates.date2num(df_covid_test['Dates'].values)
df_covid_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16340 entries, 0 to 16339
Data columns (total 5 columns):
Country_Region      16340 non-null object
Dates               16340 non-null float64
Global_Confirmed    16340 non-null float64
Global_Recovered    16340 non-null float64
Global_Deaths       16340 non-null float64
dtypes: float64(4), object(1)
memory usage: 638.4+ KB


In [8]:
# check for empty cells
df_covid_test.isna().sum()

Country_Region      0
Dates               0
Global_Confirmed    0
Global_Recovered    0
Global_Deaths       0
dtype: int64

In [9]:
df_covid_test['Country_Region'].value_counts().sum()

16340

In [10]:
# set the countries as the index
df_covid_index = df_covid_test.set_index(['Country_Region'])
df_covid_index.head()

Unnamed: 0_level_0,Dates,Global_Confirmed,Global_Recovered,Global_Deaths
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,737446.0,0.0,0.0,0.0
Afghanistan,737447.0,0.0,0.0,0.0
Afghanistan,737448.0,0.0,0.0,0.0
Afghanistan,737449.0,0.0,0.0,0.0
Afghanistan,737450.0,0.0,0.0,0.0


## Train the Gradient Boosted Regressor on the data (1/22/20 to 4/26/20)

In [20]:
# store the errors/scores
errors_mae =[]
errors_rmsle = []
errors_rmse = []
scores_r2 = []

# create a loop that trains the data from each country and stores the metrics
for i in df_covid_country.values:
    df_test = df_covid_index.loc[i]
    
    # set the feature values for x and y
    x_data = df_test['Dates'].values
    y_data = df_test['Global_Deaths'].values
    
    if sum(y_data) == 0:
        continue        

    # Normalize the data before modeling
    x_norm = x_data/max(x_data)
    y_norm = y_data/max(y_data)

    # split the data into train and test samples
    x_train, x_test, y_train, y_test = train_test_split(x_norm, y_norm, test_size=0.2, random_state=4)
      
    # set the parameters
    params = {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'criterion': 'friedman_mse'}

    # reshape the arrays without changing the data
    x_train_reshape = x_train.reshape(-1,1) 
    x_test_reshape = x_test.reshape(-1,1)
    
    # define the model
    gbr = GradientBoostingRegressor(**params)
    
    #print(i, y_train)
  
    # train the model
    gbr.fit(x_train_reshape, y_train)

    # use the model to predict
    y_hat = gbr.predict(x_test_reshape)
    
    # create a dataframe to store the errors/score for each iteration  
    errors_mae.append(y_hat - y_test)
    errors_rmsle.append((np.log(y_hat+1) - np.log(y_test+1))**2)
    errors_rmse.append((y_hat - y_test)**2)
 


## Evaluate the Gradient Boosted Regression Model

In [21]:
errors_mae = pd.DataFrame(errors_mae)
errors_rmsle = pd.DataFrame(errors_rmsle)
errors_rmse = pd.DataFrame(errors_rmse)

# calculate evaluation metrics
mae = (np.absolute(errors_mae.sum(axis=0))).mean()
rmsle = np.sqrt(((errors_rmsle.sum(axis=0)).mean()))
rmse = np.sqrt(((errors_rmse.sum(axis=0)).mean()))

# print the metrics
print(f"Mean Absolute Error: {mae: 0.5f}")
print(f"Root Mean Squared Logarithmic Error: {rmsle: 0.5f}")
print(f"Root Mean Squared Error: {rmse: 0.5f}")


Mean Absolute Error:  1.17165
Root Mean Squared Logarithmic Error:  0.30490
Root Mean Squared Error:  0.43243


In [22]:
eval_matrix = {'GBR Metrics':['Mean Absolute Error', 'Root Mean Squared Logarithmic Error', 'Root Mean Squared Error'], f'{params}':[mae, rmsle, rmse]}

eval_matrix = pd.DataFrame(eval_matrix)

eval_matrix

Unnamed: 0,GBR Metrics,"{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'criterion': 'friedman_mse'}"
0,Mean Absolute Error,1.171655
1,Root Mean Squared Logarithmic Error,0.304899
2,Root Mean Squared Error,0.432432
