## Bias & Variance

In [1]:
# estimate the bias and variance for a regression model
import numpy as np
import pandas as pd
from pandas import read_csv
import operator
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.evaluate import bias_variance_decomp
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import linregress


In [2]:
# load dataset
#url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
#dataframe = read_csv(url, header=None)

#   Read the csv file into a pandas DataFrame
df = pd.read_csv('../CSVs/merged_covid_cases_hosp_data.csv', 
                 parse_dates=['date'])
df


Unnamed: 0.1,Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active,Daily hospital occupancy
0,0,6232,AUS,2020-01-22,0,0,0,0,441.0
1,1,6233,AUS,2020-01-23,0,0,0,0,415.0
2,2,6234,AUS,2020-01-24,0,0,0,0,457.0
3,3,6235,AUS,2020-01-25,0,0,0,0,490.0
4,4,6236,AUS,2020-01-26,4,0,0,4,457.0
...,...,...,...,...,...,...,...,...,...
29597,29597,136320,USA,2022-03-06,79276278,958819,0,78317459,0.0
29598,29598,136321,USA,2022-03-07,79339388,960505,0,78378883,0.0
29599,29599,136322,USA,2022-03-08,79369007,961843,0,78407164,0.0
29600,29600,136323,USA,2022-03-09,79406602,963819,0,78442783,0.0


In [3]:
#   Drop 'Unnamed: 0' - not sure where it came from - it's not in the source CSV file
df.drop('Unnamed: 0', axis=1, inplace=True)
df


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active,Daily hospital occupancy
0,6232,AUS,2020-01-22,0,0,0,0,441.0
1,6233,AUS,2020-01-23,0,0,0,0,415.0
2,6234,AUS,2020-01-24,0,0,0,0,457.0
3,6235,AUS,2020-01-25,0,0,0,0,490.0
4,6236,AUS,2020-01-26,4,0,0,4,457.0
...,...,...,...,...,...,...,...,...
29597,136320,USA,2022-03-06,79276278,958819,0,78317459,0.0
29598,136321,USA,2022-03-07,79339388,960505,0,78378883,0.0
29599,136322,USA,2022-03-08,79369007,961843,0,78407164,0.0
29600,136323,USA,2022-03-09,79406602,963819,0,78442783,0.0


In [4]:
# Use Pandas get_dummies to convert categorical data
df = pd.get_dummies(df)
df


Unnamed: 0,Id,date,confirmed,deaths,recovered,active,Daily hospital occupancy,country_id_AUS,country_id_AUT,country_id_BEL,...,country_id_NOR,country_id_POL,country_id_PRT,country_id_ROU,country_id_SRB,country_id_SVK,country_id_SVN,country_id_SWE,country_id_USA,country_id_ZAF
0,6232,2020-01-22,0,0,0,0,441.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6233,2020-01-23,0,0,0,0,415.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6234,2020-01-24,0,0,0,0,457.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6235,2020-01-25,0,0,0,0,490.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6236,2020-01-26,4,0,0,4,457.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29597,136320,2022-03-06,79276278,958819,0,78317459,0.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29598,136321,2022-03-07,79339388,960505,0,78378883,0.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29599,136322,2022-03-08,79369007,961843,0,78407164,0.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29600,136323,2022-03-09,79406602,963819,0,78442783,0.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
#   Drop 'date' - it's not required in measuring Bias & Variance
df.drop('date', axis=1, inplace=True)
df


Unnamed: 0,Id,confirmed,deaths,recovered,active,Daily hospital occupancy,country_id_AUS,country_id_AUT,country_id_BEL,country_id_BGR,...,country_id_NOR,country_id_POL,country_id_PRT,country_id_ROU,country_id_SRB,country_id_SVK,country_id_SVN,country_id_SWE,country_id_USA,country_id_ZAF
0,6232,0,0,0,0,441.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6233,0,0,0,0,415.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6234,0,0,0,0,457.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6235,0,0,0,0,490.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6236,4,0,0,4,457.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29597,136320,79276278,958819,0,78317459,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29598,136321,79339388,960505,0,78378883,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29599,136322,79369007,961843,0,78407164,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29600,136323,79406602,963819,0,78442783,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# separate into inputs and outputs
data = df.values
X, y = data[:, :-1], data[:, -1]
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)


In [7]:
# define the model
model = LinearRegression()
# estimate bias and variance
mse, bias, var = bias_variance_decomp(model, X_train, y_train, X_test, y_test, loss='mse', num_rounds=200, random_seed=1)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.000
Bias: 0.000
Variance: 0.000
