In [1]:
#Package Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.feature
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sklearn.cluster
from sklearn import svm
from sklearn.metrics import confusion_matrix

pd.options.display.max_rows= 350


In [2]:
#Load CSVs
dataset = pd.read_csv('time_series_covid19_confirmed_global.csv')
recoveries = pd.read_csv('time_series_covid19_recovered_global.csv')
deaths = pd.read_csv('time_series_covid19_deaths_global.csv')
#Lets take a look
#dataset


In [3]:
df = dataset.drop(columns=['Long', 'Lat']).fillna('').groupby('Country/Region').sum()

recoveries = recoveries.drop(columns=['Long', 'Lat']).fillna('').groupby('Country/Region').sum()

deaths = deaths.drop(columns=['Long', 'Lat']).fillna('').groupby('Country/Region').sum()
df

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20,4/14/20
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,349,367,423,444,484,521,555,607,665,714
Albania,0,0,0,0,0,0,0,0,0,0,...,361,377,383,400,409,416,433,446,467,475
Algeria,0,0,0,0,0,0,0,0,0,0,...,1320,1423,1468,1572,1666,1761,1825,1914,1983,2070
Andorra,0,0,0,0,0,0,0,0,0,0,...,501,525,545,564,583,601,601,638,646,659
Angola,0,0,0,0,0,0,0,0,0,0,...,14,16,17,19,19,19,19,19,19,19
Antigua and Barbuda,0,0,0,0,0,0,0,0,0,0,...,15,15,19,19,19,19,21,21,23,23
Argentina,0,0,0,0,0,0,0,0,0,0,...,1451,1554,1628,1715,1795,1975,1975,2142,2208,2277
Armenia,0,0,0,0,0,0,0,0,0,0,...,822,833,853,881,921,937,967,1013,1039,1067
Australia,0,0,0,0,4,5,5,6,9,9,...,5687,5797,5895,6010,6108,6215,6303,6315,6351,6415
Austria,0,0,0,0,0,0,0,0,0,0,...,12051,12297,12639,12942,13244,13555,13806,13945,14041,14226


In [4]:
numpydata = df.to_numpy()
numpydataDeaths= deaths.to_numpy()
numpydataRecoveries = recoveries.to_numpy()
countryList = df.index.tolist()

In [5]:
dateList = list(df.columns)


# Extracting Engineered TimeSeries Based Features

In [6]:
#Calculating features from John Hopkins time series

#Features in result are AvgDailyIncrease/Start/End


newDataframe = []

#Parameters
timespan = 14
target = 21

#Logic
result=np.zeros([numpydata.shape[0],3])
targetCases=np.zeros([numpydata.shape[0]])
targetRecoveries=np.zeros([numpydata.shape[0]])
targetDeaths=np.zeros([numpydata.shape[0]])
#For each country
for x in range(numpydata.shape[0]):
    #For each day
    observationFlag = timespan
    grabLabel = target
    temp=np.zeros([timespan])
    for i in range(numpydata.shape[1]):
        
        #Look for outbreak then calculate single point data over timescale
        if (observationFlag>0 and numpydata[x][i] > 0):
            if(observationFlag == timespan):
                outbreakDate=i
            observationFlag = observationFlag - 1 #observationFlag is a count used to capture the observation period
            temp[observationFlag]=numpydata[x][i]
        #Grabing Label for cases    
        if (grabLabel>-1 and numpydata[x][i] > 0):
            grabLabel = grabLabel - 1
        if (grabLabel == 0):
            targetCases[x]=numpydata[x][i] #Single label for a country
            targetDeaths[x]=numpydataDeaths[x][i] #Single label for a country
            targetRecoveries[x]=numpydataRecoveries[x][i] #Single label for a country
            
            
    #Now we have captured data for the current country we can perform calculations
    rev = temp[::-1]
    if(temp[0]==0):
        #If we do not have data for the enitre period we treat country as an outlier
        result[x][0] = 0
        result[x][1] = 0
        result[x][2] = 0
    else:
        dailyIncrease = 0
        for k in range (rev.shape[0]-1):
            increase = rev[k+1]-rev[k]
            dailyIncrease = increase + dailyIncrease
        result[x][0] = dailyIncrease/(rev.shape[0]-1)
        result[x][1] = rev[0]
        result[x][2] = rev[timespan-1]
    #Else we have sufficient data and can calculate features
    
    
    
    newDataframe.append({'Country/Region': countryList[x],'OutbreakDate':dateList[outbreakDate], 'Average Daily Increase':result[x][0],'Start Cases':result[x][1] ,'End Cases' :result[x][2] ,'Cases Label':targetCases[x],'Deaths Label':targetDeaths[x],'Recoveries Label':targetRecoveries[x]})
    
featuresDataframe = pd.DataFrame(newDataframe)
featuresDataframe

Unnamed: 0,Country/Region,OutbreakDate,Average Daily Increase,Start Cases,End Cases,Cases Label,Deaths Label,Recoveries Label
0,Afghanistan,2/24/20,0.230769,1.0,4.0,16.0,0.0,0.0
1,Albania,3/9/20,6.692308,2.0,89.0,212.0,10.0,33.0
2,Algeria,2/25/20,1.461538,1.0,20.0,54.0,4.0,12.0
3,Andorra,3/2/20,0.0,1.0,1.0,113.0,1.0,1.0
4,Angola,3/20/20,0.538462,1.0,8.0,19.0,2.0,2.0
5,Antigua and Barbuda,3/13/20,0.461538,1.0,7.0,9.0,0.0,0.0
6,Argentina,3/3/20,4.230769,1.0,56.0,301.0,4.0,3.0
7,Armenia,3/1/20,1.307692,1.0,18.0,160.0,0.0,1.0
8,Australia,1/26/20,0.846154,4.0,15.0,15.0,0.0,8.0
9,Austria,2/25/20,9.923077,2.0,131.0,1018.0,3.0,6.0


In [7]:
#Now thats done we join our various health datasets and can compute date differences*

lockdownDatesFrame = pd.read_csv('countryLockdowndates.csv')
popFrame = pd.read_csv('data.csv')

lockdownDatesFrame['Province'].fillna(lockdownDatesFrame['Country/Region'], inplace = True)
lockdownDatesFrame['Country/Region']=lockdownDatesFrame['Province']
featuresDataframe2 = pd.merge(featuresDataframe,lockdownDatesFrame[['Country/Region','Date','Type']],on='Country/Region', how='left')
featuresDataframe2.dtypes

featuresDataframe2['Date'] = pd.to_datetime(featuresDataframe2['Date'],dayfirst=True)
featuresDataframe2['OutbreakDate'] = pd.to_datetime(featuresDataframe2['OutbreakDate'])
featuresDataframe2['DaysUntilLockdown'] = (featuresDataframe2['Date'] -featuresDataframe2['OutbreakDate']).dt.days

popFrame = popFrame.rename(columns={"name": "Country/Region", "pop2020":"Population"})
featuresDataframe2 = pd.merge(featuresDataframe2,popFrame[['Country/Region','density']],on='Country/Region', how='left')
featuresDataframe2['Type'].fillna('None', inplace = True)

#China in Lockdown 23/01/2020

type_dictionary = {'None':0, 'Partial':1,'Full': 2}
featuresDataframe2['Type'] = featuresDataframe2['Type'].apply(lambda x: type_dictionary[x])
featuresDataframe2


healthFrameAll = pd.read_csv('Life Expectancy Data.csv')
healthFrame = healthFrameAll.loc[healthFrameAll['Year'] == 2014]
healthFrame = healthFrame.rename(columns={"Country": "Country/Region"})
featuresDataframe2 = pd.merge(featuresDataframe2,healthFrame[['Country/Region','Population', ' BMI ', 'GDP', 'Total expenditure', 'percentage expenditure','Life expectancy ']],on='Country/Region', how='left')
featuresDataframe2 = featuresDataframe2.rename(columns={" BMI ": "BMI", "Life expectancy ":"Life expectancy"})
featuresDataframe2.dropna(subset=['BMI','Population','GDP','density'], inplace=True)
featuresDataframe2['DaysUntilLockdown'].fillna(target, inplace = True)
#replace days until lockdown with target,as we cant see into the future
featuresDataframe2.loc[featuresDataframe2.DaysUntilLockdown > timespan, 'DaysUntilLockdown'] = target

finalFeatures = featuresDataframe2.drop(columns=['OutbreakDate', 'Date'])
finalFeatures.drop(finalFeatures[finalFeatures['End Cases'] == 0].index, inplace = True)
finalFeatures.drop(finalFeatures[finalFeatures['Cases Label'] == 0].index, inplace = True)
#finalFeatures.drop(finalFeatures[finalFeatures['End Cases'].index == 0 ] , inplace=True)

#If lockdown not yet in effect change type
finalFeatures.loc[finalFeatures.DaysUntilLockdown == target, 'Type'] = 0
#finalFeatures.drop(columns=['index'])
finalFeatures.reset_index().drop(columns=['index'])
finalFeatures.loc[finalFeatures.DaysUntilLockdown < 0, 'DaysUntilLockdown'] = 0
finalFeatures

Unnamed: 0,Country/Region,Average Daily Increase,Start Cases,End Cases,Cases Label,Deaths Label,Recoveries Label,Type,DaysUntilLockdown,density,Population,BMI,GDP,Total expenditure,percentage expenditure,Life expectancy
0,Afghanistan,0.230769,1.0,4.0,16.0,0.0,0.0,0,21.0,59.685,327582.0,18.6,612.696514,8.18,73.523582,59.9
1,Albania,6.692308,2.0,89.0,212.0,10.0,33.0,2,0.0,100.1043,288914.0,57.2,4575.763787,5.88,428.749067,77.5
2,Algeria,1.461538,1.0,20.0,54.0,4.0,12.0,0,21.0,18.4113,39113310.0,58.4,547.8517,7.21,54.237318,75.4
4,Angola,0.538462,1.0,8.0,19.0,2.0,2.0,2,4.0,26.3626,2692466.0,22.7,479.31224,3.31,23.965612,51.7
6,Argentina,4.230769,1.0,56.0,301.0,4.0,3.0,0,21.0,16.2551,42981520.0,62.2,12245.25645,4.79,847.371746,76.2
7,Armenia,1.307692,1.0,18.0,160.0,0.0,1.0,0,21.0,99.6282,29622.0,54.1,3994.712355,4.48,295.608714,74.6
8,Australia,0.846154,4.0,15.0,15.0,0.0,8.0,0,21.0,3.3151,2346694.0,66.1,62214.6912,9.42,10769.36305,82.7
9,Austria,9.923077,2.0,131.0,1018.0,3.0,6.0,0,21.0,107.3839,8541575.0,57.1,51322.63997,11.21,8350.193523,81.4
10,Azerbaijan,0.923077,3.0,15.0,53.0,1.0,11.0,2,1.0,117.0806,953579.0,51.5,7891.299776,6.4,306.182431,72.5
13,Bangladesh,1.692308,3.0,25.0,48.0,5.0,15.0,2,11.0,1116.0086,15945280.0,17.7,184.56543,2.82,10.446403,71.4


### Feature Selection

In [8]:
countryListLabel = list(finalFeatures['Country/Region'])
X = finalFeatures.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])   #Feature Matrix
y = finalFeatures["Cases Label"]          #Target Variable

cor = finalFeatures.drop(columns=['Country/Region','Deaths Label', 'Recoveries Label']).corr()
cor_target = abs(cor["Cases Label"])
#Selecting highly correlated features
relevant_features = cor_target
relevant_features

Average Daily Increase    0.975590
Start Cases               0.952965
End Cases                 0.975161
Cases Label               1.000000
Type                      0.030032
DaysUntilLockdown         0.058428
density                   0.006393
Population                0.024414
BMI                       0.001395
GDP                       0.033250
Total expenditure         0.013200
percentage expenditure    0.005297
Life expectancy           0.079478
Name: Cases Label, dtype: float64

In [9]:
countryListLabel = list(finalFeatures['Country/Region'])
X = finalFeatures.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])   #Feature Matrix
y = finalFeatures["Recoveries Label"]          #Target Variable

cor = finalFeatures.drop(columns=['Country/Region','Deaths Label', 'Cases Label']).corr()
cor_target = abs(cor["Recoveries Label"])
#Selecting highly correlated features
relevant_features = cor_target
relevant_features

Average Daily Increase    0.998611
Start Cases               0.998363
End Cases                 0.998700
Recoveries Label          1.000000
Type                      0.055894
DaysUntilLockdown         0.056957
density                   0.001379
Population                0.017280
BMI                       0.033961
GDP                       0.010832
Total expenditure         0.024626
percentage expenditure    0.038747
Life expectancy           0.038165
Name: Recoveries Label, dtype: float64

In [10]:
countryListLabel = list(finalFeatures['Country/Region'])
X = finalFeatures.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])   #Feature Matrix
y = finalFeatures["Deaths Label"]          #Target Variable

cor = finalFeatures.drop(columns=['Country/Region','Recoveries Label', 'Cases Label']).corr()
cor_target = abs(cor["Deaths Label"])
#Selecting highly correlated features
relevant_features = cor_target
relevant_features

Average Daily Increase    0.992292
Start Cases               0.979485
End Cases                 0.992091
Deaths Label              1.000000
Type                      0.034910
DaysUntilLockdown         0.054034
density                   0.000226
Population                0.016016
BMI                       0.015112
GDP                       0.006177
Total expenditure         0.033522
percentage expenditure    0.036234
Life expectancy           0.052236
Name: Deaths Label, dtype: float64

### Train Test Split 

In [11]:
#Train Test Split before we drop columns
mask = np.random.rand(len(finalFeatures)) < 0.8
train = finalFeatures[mask]
test = finalFeatures[~mask]


### Linear Regression With All Features

In [12]:
from sklearn.linear_model import LinearRegression
#As we only drop colums from now on, not rows, we can grab respective country labels once
countryListLabel_train = list(train['Country/Region'])
X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])   #Setup features
y_train = train["Cases Label"]          #Target

countryListLabel_test = list(test['Country/Region'])
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])  
y_test = test["Cases Label"]          #


reg = LinearRegression()
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)


from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Cases')
print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))
print('R2 score: {}'.format(r2))

Cases
MAE: 136.3491012335779
MSE: 32757.105185347416
R2 score: -1.5567199586445541


In [13]:

X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])  
y_train = train["Recoveries Label"]          


X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])
y_test = test["Recoveries Label"]         

reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Recoveries')

print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))
print('R2 score: {}'.format(r2))

Recoveries
MAE: 5.2658043386822495
MSE: 38.56932193622071
R2 score: -0.2724949500567704


In [14]:
X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])
y_train = train["Deaths Label"]

countryListLabel_test = list(test['Country/Region'])
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])  
y_test = test["Deaths Label"]         

reg.fit(X_train,y_train)

y_pred = reg.predict(X_test)


mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Deaths')

print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))
print('R2 score: {}'.format(r2))


Deaths
MAE: 2.6792980603192427
MSE: 11.506879731352395
R2 score: -3.379402371589875


### Decision Tree Regression With All Features

In [15]:
from sklearn.tree import DecisionTreeRegressor

In [16]:

X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])   #Feature Matrix
y_train = train["Cases Label"]          

X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])  
y_test = test["Cases Label"]         


tree = DecisionTreeRegressor()
tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)


from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Cases')
print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))


Cases
MAE: 78.65
MSE: 14874.65


In [17]:
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])  
y_test = test["Recoveries Label"]         

tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Recoveries')

print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))


Recoveries
MAE: 143.45
MSE: 82095.35


In [18]:
X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])  
y_train = train["Deaths Label"]         
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label'])  
y_test = test["Deaths Label"]          

tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Deaths')

print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))

print(y_test.mean())

Deaths
MAE: 2.25
MSE: 13.65
1.35


### Linear Regression With Feature Selection

In [19]:
X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','percentage expenditure','Total expenditure'])   #Feature Matrix
y_train = train["Cases Label"]          

countryListLabel_test = list(test['Country/Region'])
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','percentage expenditure','Total expenditure'])  
y_test = test["Cases Label"]          


reg = LinearRegression()
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)


from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Cases')
print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))
print('R2 score: {}'.format(r2))

Cases
MAE: 122.63030232317882
MSE: 25316.431518805308
R2 score: -0.9759690418169382


In [20]:
X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP'])
y_train = train["Recoveries Label"]          


X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP'])
y_test = test["Recoveries Label"]         

reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Recoveries')

print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))
print('R2 score: {}'.format(r2))

Recoveries
MAE: 5.297359088701219
MSE: 39.29517763051396
R2 score: -0.29644267999056284


In [21]:
X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP','BMI'])
y_train = train["Deaths Label"]         

countryListLabel_test = list(test['Country/Region'])
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP','BMI'])  
y_test = test["Deaths Label"]         

reg.fit(X_train,y_train)

y_pred = reg.predict(X_test)


mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Deaths')
print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))
print('R2 score: {}'.format(r2))


Deaths
MAE: 2.2773977290728373
MSE: 9.288904896128388
R2 score: -2.5352635189832116


### Decision Tree Regression With Selected Features

In [22]:

X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','percentage expenditure','Total expenditure'])   #Feature Matrix
y_train = train["Cases Label"]          

X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','percentage expenditure','Total expenditure'])  
y_test = test["Cases Label"]         


tree = DecisionTreeRegressor()
tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)


from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Cases')
print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))


Cases
MAE: 95.6
MSE: 31048.3


In [23]:
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP'])  
y_test = test["Recoveries Label"]    
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP'])  
y_test = test["Recoveries Label"]         

tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Recoveries')

print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))


Recoveries
MAE: 125.7
MSE: 44651.0


In [28]:
X_train = train.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP','BMI'])  
y_train = train["Deaths Label"]         
X_test = test.drop(columns=['Country/Region', 'Cases Label','Deaths Label', 'Recoveries Label','density','Population','GDP','BMI'])  
y_test = test["Deaths Label"]          

tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Deaths')

print('MAE: {}'.format(mae))
print('MSE: {}'.format(mse))


Deaths
MAE: 2.8
MSE: 15.4
