In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
# Load the raw training table from the CSV export and preview it.
RAW_CSV_PATH = "train_data.xlsx - Sheet1.csv"
df = pd.read_csv(RAW_CSV_PATH)
df.head()

Unnamed: 0,Lat,Long_,Deaths,Case_Fatality_Ratio
0,33.93911,67.709953,,3.779217
1,41.1533,20.1683,,1.077234
2,28.0339,1.6596,,2.536905
3,42.5063,1.5218,165.0,0.345543
4,-11.2027,17.8739,,1.836434


In [3]:
# Count the missing values in each column of the raw frame.
df.isna().sum()

Lat                      91
Long_                    91
Deaths                 1558
Case_Fatality_Ratio      44
dtype: int64

In [4]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(4016, 4)

In [5]:
# Replace missing coordinates with the median of the respective column.
for coord_col in ('Lat', 'Long_'):
    df[coord_col] = df[coord_col].fillna(df[coord_col].median())


In [6]:
# Rows without a Case_Fatality_Ratio are discarded rather than imputed.
required_columns = ['Case_Fatality_Ratio']
df = df.dropna(subset=required_columns)

In [7]:
# Re-check missing values: only 'Deaths' should still contain gaps here.
df.isna().sum()

Lat                       0
Long_                     0
Deaths                 1556
Case_Fatality_Ratio       0
dtype: int64

In [8]:
# NOTE: despite its name, `null` holds the rows whose 'Deaths' value is
# present; these labelled rows are the training data for the imputer.
deaths_known = df['Deaths'].notna()
null = df[deaths_known]

In [9]:
# Preview the rows that have an observed 'Deaths' value.
null.head()

Unnamed: 0,Lat,Long_,Deaths,Case_Fatality_Ratio
3,42.5063,1.5218,165.0,0.345543
5,-71.9499,23.347,0.0,0.0
6,17.0608,-61.7964,146.0,1.603338
9,-35.4735,149.0124,138.0,0.06136
11,-12.4634,130.8456,84.0,0.081552


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

# Estimators shared by the modelling cells below.
LR = LinearRegression()                      # plain linear baseline
DT = DecisionTreeRegressor(random_state=0)   # seeded decision tree regressor
poly = PolynomialFeatures(include_bias=True) # polynomial feature expansion

**Linear Regression**

In [11]:
# Target is the observed 'Deaths'; every other column is a feature.
Y = null['Deaths']
X = null.drop(columns='Deaths')

In [12]:
# Hold out 20% of the labelled rows for evaluation (seeded for repeatability).
xtrain_null, xtest_null, ytrain_null, ytest_null = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Fit the linear baseline on the training split.
LR.fit(X=xtrain_null, y=ytrain_null)

In [14]:
# Baseline predictions for the held-out rows.
ypred_lr_null = LR.predict(X=xtest_null)

In [15]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

# Report MSE first, then MAE, of the linear baseline on the test split.
mse_lr = mean_squared_error(ytest_null, ypred_lr_null)
mae_lr = mean_absolute_error(ytest_null, ypred_lr_null)
print(mse_lr)
print(mae_lr)

3295.693086249361
48.76168462308372


**Polynomial Regression**

In [16]:
# Fit the polynomial expansion on the training split only, then apply that
# same fitted transformer to the test split. Held-out data must never be
# used to (re-)fit a preprocessing step.
xtrain_poly_null = poly.fit_transform(xtrain_null)
xtest_poly_null = poly.transform(xtest_null)

In [17]:
# Linear regression on top of the polynomial feature expansion.
poly_LR = LinearRegression()
poly_LR.fit(X=xtrain_poly_null, y=ytrain_null)

In [18]:
# Polynomial-model predictions for the held-out rows.
ypred_poly_null_lr = poly_LR.predict(X=xtest_poly_null)

In [19]:
# MAE of the polynomial model on the test split.
mae_poly = mean_absolute_error(ytest_null, ypred_poly_null_lr)
print(mae_poly)

43.88114674741203


**Tuning**

In [20]:
# Base estimator handed to the grid search; both of these options are also
# listed in the search grid.
poly_LR_grid = LinearRegression(
    positive=True,
    fit_intercept=False,
)

In [21]:
from sklearn.model_selection import GridSearchCV

# Search every combination of the two boolean LinearRegression options.
grid_params = dict(
    fit_intercept=[True, False],
    positive=[True, False],
)

# No `scoring` is passed, so candidates are ranked by the estimator's own
# default score rather than by the MAE printed elsewhere in this notebook.
grid = GridSearchCV(estimator=poly_LR_grid, param_grid=grid_params, cv=5)

grid.fit(xtrain_poly_null, ytrain_null)
grid.best_params_

{'fit_intercept': False, 'positive': True}

In [22]:
# Take the estimator chosen by the grid search instead of re-fitting a
# separately hard-coded configuration, so this cell cannot drift out of sync
# with `grid.best_params_`. With GridSearchCV's default refit behaviour the
# best estimator has already been fitted on the whole training split.
poly_LR_grid = grid.best_estimator_
poly_LR_grid

In [23]:
# Predictions of the tuned polynomial model for the held-out rows.
ypred_poly_lr_null_grid = poly_LR_grid.predict(X=xtest_poly_null)

In [24]:
# MAE of the tuned polynomial model on the test split.
mae_poly_tuned = mean_absolute_error(ytest_null, ypred_poly_lr_null_grid)
print(mae_poly_tuned)

48.7350084942183


**Filling the missing death values**

In [25]:
# Expand every row of the full frame (all columns except 'Deaths') with the
# already-fitted polynomial transformer.
all_features = df.drop(columns='Deaths')
df_poly_features = poly.transform(all_features)

In [26]:
# Store the un-tuned polynomial model's prediction for every row in a
# temporary helper column.
predicted_deaths = poly_LR.predict(X=df_poly_features)
df['Deaths predicted'] = predicted_deaths

In [27]:
# Largest 'Deaths' value among the observed (not yet imputed) entries.
df['Deaths'].max()

np.float64(200.0)

In [28]:
# Keep observed 'Deaths' values; use the model prediction only where the
# original value is missing.
has_observed_deaths = df['Deaths'].notna()
df['Deaths'] = df['Deaths'].where(has_observed_deaths, df['Deaths predicted'])

In [29]:
# Deaths are counts, so round the imputed values to whole numbers.
df['Deaths'] = df['Deaths'].round()

In [30]:
# The helper column has served its purpose; remove it in place.
df.drop(columns='Deaths predicted', inplace=True)

In [31]:
# Preview the frame after the missing 'Deaths' values were filled and rounded.
df.head()

Unnamed: 0,Lat,Long_,Deaths,Case_Fatality_Ratio
0,33.93911,67.709953,549.0,3.779217
1,41.1533,20.1683,142.0,1.077234
2,28.0339,1.6596,268.0,2.536905
3,42.5063,1.5218,165.0,0.345543
4,-11.2027,17.8739,246.0,1.836434


In [32]:
# A regression model can predict negative counts; clamp them to zero.
# `<=` (rather than `<`) also normalises -0.0, which rounding a prediction in
# (-0.5, 0) produces and which a strict `< 0` test would leave in the data.
df.loc[df['Deaths'] <= 0, 'Deaths'] = 0

In [33]:
# Intermediate snapshot (row index included). The same path is written again
# by the last cell of this notebook, which overwrites this file.
df.to_csv('cleanedata.csv')

In [34]:
# Case_Fatality_Ratio divided by 100, kept in a temporary 'CFR' column.
df['CFR'] = df['Case_Fatality_Ratio'].div(100)

In [35]:
# Back out confirmed cases as deaths / CFR.
# The quotient is undefined when CFR is 0: 0/0 gives NaN, but a positive
# death count over 0 gives inf, which a later NaN-only fill would not remove.
# Mask every CFR == 0 row to NaN so all undefined results are handled alike.
df['confirmed cases'] = (df['Deaths'] / df['CFR']).where(df['CFR'] != 0)

In [36]:
# Preview the frame with the new 'CFR' and 'confirmed cases' columns.
df.head()

Unnamed: 0,Lat,Long_,Deaths,Case_Fatality_Ratio,CFR,confirmed cases
0,33.93911,67.709953,549.0,3.779217,0.037792,14526.819746
1,41.1533,20.1683,142.0,1.077234,0.010772,13181.912127
2,28.0339,1.6596,268.0,2.536905,0.025369,10564.052898
3,42.5063,1.5218,165.0,0.345543,0.003455,47751.000004
4,-11.2027,17.8739,246.0,1.836434,0.018364,13395.528496


In [37]:
# Confirmed cases are counts as well; round to whole numbers.
df['confirmed cases'] = df['confirmed cases'].round()

In [38]:
# Preview the frame after rounding 'confirmed cases'.
df.head()

Unnamed: 0,Lat,Long_,Deaths,Case_Fatality_Ratio,CFR,confirmed cases
0,33.93911,67.709953,549.0,3.779217,0.037792,14527.0
1,41.1533,20.1683,142.0,1.077234,0.010772,13182.0
2,28.0339,1.6596,268.0,2.536905,0.025369,10564.0
3,42.5063,1.5218,165.0,0.345543,0.003455,47751.0
4,-11.2027,17.8739,246.0,1.836434,0.018364,13396.0


In [39]:
# Remove the temporary 'CFR' column now that 'confirmed cases' is derived.
# Name the axis explicitly: the previous `axis=True` only worked because
# True compares equal to 1.
df.drop('CFR', axis='columns', inplace=True)

In [40]:
# Preview the frame after the temporary 'CFR' column was removed.
df.head()

Unnamed: 0,Lat,Long_,Deaths,Case_Fatality_Ratio,confirmed cases
0,33.93911,67.709953,549.0,3.779217,14527.0
1,41.1533,20.1683,142.0,1.077234,13182.0
2,28.0339,1.6596,268.0,2.536905,10564.0
3,42.5063,1.5218,165.0,0.345543,47751.0
4,-11.2027,17.8739,246.0,1.836434,13396.0


In [41]:
# Undefined results (NaN, e.g. 0 deaths divided by a 0 ratio) are recorded
# as zero confirmed cases.
cases_filled = df['confirmed cases'].fillna(value=0)
df['confirmed cases'] = cases_filled

In [42]:
# Final export (row index included), now with the 'confirmed cases' column.
df.to_csv('cleanedata.csv')