### COVID-19 hospitalization prediction model

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC  
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

In [4]:
# Download the national-history CSV from covidtracking.com and display the frame.
national_history_url = 'https://covidtracking.com/data/download/national-history.csv'
df_Covid_data = pd.read_csv(national_history_url)
df_Covid_data

Unnamed: 0,date,death,deathIncrease,inIcuCumulative,inIcuCurrently,hospitalizedIncrease,hospitalizedCurrently,hospitalizedCumulative,negative,negativeIncrease,onVentilatorCumulative,onVentilatorCurrently,positive,positiveIncrease,recovered,states,totalTestResults,totalTestResultsIncrease
0,2020-12-06,273374.0,1138,31946.0,20145.0,2256,101487.0,585676.0,161986294,1172590,3322.0,7094.0,14534035,176771,5624444.0,56,204063869,1634532
1,2020-12-05,272236.0,2445,31831.0,19950.0,3316,101190.0,583420.0,160813704,1526995,3321.0,7005.0,14357264,211073,5576026.0,56,202429337,2169756
2,2020-12-04,269791.0,2563,31608.0,19858.0,4652,101276.0,580104.0,159286709,1260657,3305.0,6999.0,14146191,224831,5470389.0,56,200259581,1854869
3,2020-12-03,267228.0,2706,31276.0,19723.0,5331,100755.0,575452.0,158026052,1238465,3280.0,6867.0,13921360,210204,5404018.0,56,198404712,1828230
4,2020-12-02,264522.0,2733,31038.0,19680.0,5028,100322.0,570121.0,156787587,982032,3252.0,6855.0,13711156,195796,5322128.0,56,196576482,1459202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,2020-01-26,,0,,,0,,,0,0,,,0,0,,2,2,0
316,2020-01-25,,0,,,0,,,0,0,,,0,0,,2,2,0
317,2020-01-24,,0,,,0,,,0,0,,,0,0,,2,2,0
318,2020-01-23,,0,,,0,,,0,0,,,0,0,,2,2,1


In [2]:
# Read the per-state records from the local CSV file and display the frame.
states_csv_path = "covid-states.csv"
covid = pd.read_csv(states_csv_path)
covid

Unnamed: 0,date,state,dataQualityGrade,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,...,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease
0,2020-12-02,AK,A,122.0,122.0,0,,768.0,768.0,164.0,...,1024643.0,6015,,,,,,0,1024643.0,6015
1,2020-12-02,AL,A,3711.0,3326.0,73,385.0,25821.0,25821.0,1801.0,...,1603523.0,9681,,,73187.0,,1603523.0,9681,,0
2,2020-12-02,AR,A+,2522.0,2312.0,10,210.0,9110.0,9110.0,1088.0,...,1703266.0,11813,,21856.0,,144866.0,,0,1703266.0,11813
3,2020-12-02,AS,D,0.0,,0,,,,,...,1988.0,0,,,,,,0,1988.0,0
4,2020-12-02,AZ,A+,6739.0,6237.0,52,502.0,26312.0,26312.0,2699.0,...,2288204.0,16290,365871.0,,,,2288204.0,16290,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15404,2020-01-24,WA,,,,0,,,,,...,0.0,0,,,,,,0,,0
15405,2020-01-23,MA,,,,0,,,,,...,2.0,1,,,,,,0,2.0,1
15406,2020-01-23,WA,,,,0,,,,,...,0.0,0,,,,,,0,,0
15407,2020-01-22,MA,,,,0,,,,,...,1.0,0,,,,,,0,1.0,1


In [7]:
# Capture the current timestamp and display it.
# NOTE(review): `today` is not referenced by any later cell visible in this notebook.
today = pd.to_datetime('today')
today

Timestamp('2020-12-06 22:21:28.730350')

In [None]:
#print("Size/Shape of the dataset",covid.shape)
#print("Checking for null values:\n",covid.isnull().sum())
#print("Checking Data-type:",covid.dtypes)

In [None]:
pip install seaborn


In [None]:
# Show the column labels of the per-state frame (this cell precedes the column drop).
covid.columns

In [None]:
# Drop the columns that are not used in the analysis.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
# Rebinding `covid` instead of using `inplace=True` leaves later cells with
# the same frame they had before.
unused_columns = [
    'dataQualityGrade', 'deathIncrease',
    'deathProbable', 'hospitalizedCumulative', 'hospitalizedIncrease',
    'negativeTestsAntibody', 'negativeTestsPeopleAntibody',
    'negativeTestsViral', 'onVentilatorCumulative', 'positiveScore',
    'positiveTestsAntibody', 'positiveTestsAntigen',
    'positiveTestsPeopleAntibody', 'positiveTestsPeopleAntigen',
    'totalTestEncountersViralIncrease', 'totalTestResults',
    'totalTestResultsIncrease', 'totalTestsAntibody', 'totalTestsAntigen',
    'totalTestsPeopleAntibody', 'totalTestsPeopleAntigen',
]
covid = covid.drop(columns=unused_columns)

In [None]:
# Show the column labels again to check which columns remain after the drop cell.
covid.columns

In [None]:
# Data analysis for Washington, DC: keep only the rows whose state is "DC".
is_dc_row = covid["state"] == "DC"
dc_data = covid.loc[is_dc_row]
dc_data.head()

In [None]:
# Replace every missing value in the DC subset with 0 and preview the result.
# NOTE(review): this applies to all columns, so missing `death` values become 0 too.
dc_data = dc_data.fillna(value=0)
dc_data.head()

In [None]:
# Drop further columns that are not used as features or target.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
dc_unused_columns = [
    'deathConfirmed', 'hospitalized', 'inIcuCumulative',
    'positiveCasesViral', 'positiveTestsViral',
    'totalTestsViral', 'totalTestsViralIncrease',
]
dc_data = dc_data.drop(columns=dc_unused_columns)

In [None]:
# Export the DC data to CSV. Reviewing the exported file shows 0 deaths reported for DC.
# NOTE(review): possibly a side effect of filling missing values with 0 earlier - TODO confirm.
dc_csv_path = "dc_covid.csv"
dc_data.to_csv(dc_csv_path)

In [None]:
# Use the 'date' column as the row index of the DC frame.
dc_data = dc_data.set_index('date')


In [None]:
# Select the feature columns (X) and the target column (y) from the DC data.
feature_columns = [
    'hospitalizedCurrently',
    'inIcuCurrently',
    'onVentilatorCurrently',
    'positive',
    'totalTestEncountersViral',
    'totalTestsPeopleViral',
]
X = dc_data[feature_columns]
y = dc_data['death']

In [None]:
# X is selected with a list of columns, so it is 2-D: (rows, columns).
# y is a single column selected from dc_data, so it is 1-D and its shape
# has only one entry: (rows,). That is why no second value is printed for y.
print(X.shape, y.shape)

In [None]:
# Pairwise scatter plots of the selected features together with the target.
pairplot_columns = [
    'hospitalizedCurrently', 'inIcuCurrently', 'onVentilatorCurrently', 'positive',
    'totalTestEncountersViral',
    'totalTestsPeopleViral', 'death',
]
data_numeric = dc_data[pairplot_columns]
# sns.pairplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(figsize=...) call only produced an extra empty figure
# and had no effect on the pair plot. Use pairplot's `height`/`aspect`
# arguments if a different size is wanted.
sns.pairplot(data_numeric)
plt.show()

## Model - LinearRegression

In [None]:
# Feature matrix and target for the regression.
# The target is turned into a single-column 2-D array: reshape(-1, 1) is the
# form for data with a single feature, reshape(1, -1) for a single sample.
regression_features = [
    'hospitalizedCurrently',
    'inIcuCurrently',
    'onVentilatorCurrently',
    'positive',
    'totalTestEncountersViral',
    'totalTestsPeopleViral',
]
X = dc_data[regression_features]
death_values = dc_data['death'].values
y = death_values.reshape(-1, 1)

In [None]:
# Split features and target into training and test sets.
# A fixed random_state makes the shuffle, and therefore every score and plot
# that follows, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Create the linear regression estimator (not fitted yet) and display it.
model = LinearRegression()
model

In [None]:
# Fit the model on the unscaled training split.
model.fit(X_train, y_train)

In [None]:
# Report the model's score() on the training split and on the held-out test split.
print(
    f"Training Data Score: {model.score(X_train, y_train)}",
    f"Testing Data Score: {model.score(X_test, y_test)}",
    sep="\n",
)

In [None]:
# Show the fitted parameters: one weight per feature column plus the intercept.
fitted_parameters = (
    ('Weight coefficients:', model.coef_),
    ('y-axis intercept:', model.intercept_),
)
for label, value in fitted_parameters:
    print(label, value)

#### Scikit-learn provides a variety of scaling and normalization options. The two most common are MinMaxScaler and StandardScaler. Use StandardScaler when you don't know anything about your data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit one StandardScaler for the features and one for the target,
# each on the training split only.
X_scaler, y_scaler = (
    StandardScaler().fit(train_part) for train_part in (X_train, y_train)
)

In [None]:
# Apply the fitted scalers to the training and test splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)
y_train_scaled, y_test_scaled = (
    y_scaler.transform(split) for split in (y_train, y_test)
)

In [None]:
# Two side-by-side panels: unscaled data on the left, standardized data on the right.
fig1, (axes1, axes2) = plt.subplots(1, 2, figsize=(12, 6))

axes1.set_title("Original Data")
axes2.set_title("Scaled Data")

# Limits for the left panel are symmetric around 1, sized by the largest
# 'positive' value and the largest target value in the training split.
maxx = X_train["positive"].max()
maxy = y_train.max()
axes1.set_xlim(1 - maxx, maxx + 1)
axes1.set_ylim(1 - maxy, maxy + 1)

# Fixed limits for the standardized panel.
axes2.set_xlim(-2, 2)
axes2.set_ylim(-2, 2)

def set_axes(ax):
    """Draw the axes through the middle of the plot instead of along its edges.

    The left and bottom spines are positioned at 'center', the right and top
    spines get the colour 'none', and the ticks are placed on the bottom
    (x axis) and left (y axis).

    Parameters
    ----------
    ax : object exposing `spines`, `xaxis` and `yaxis`
        The axes to restyle; it is modified in place.
    """
    spine_settings = (
        ('left', 'set_position', 'center'),
        ('right', 'set_color', 'none'),
        ('bottom', 'set_position', 'center'),
        ('top', 'set_color', 'none'),
    )
    for side, setter_name, value in spine_settings:
        getattr(ax.spines[side], setter_name)(value)

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    
set_axes(axes1)
set_axes(axes2)

# Plot the SAME feature on both panels. The left panel shows 'positive', so
# the right panel must use the scaled column at the position of 'positive'.
# Column 0 of the scaled array is 'hospitalizedCurrently', which made the two
# panels compare different features.
positive_col = X_train.columns.get_loc("positive")
axes1.scatter(X_train["positive"], y_train)
axes2.scatter(X_train_scaled[:, positive_col], y_train_scaled[:])

###  Fit the Model to the scaled training data and make predictions using the scaled test data

In [None]:
# Refit the model on the standardized training data.
# NOTE: this replaces the earlier fit on unscaled data, so from here on
# `model` expects standardized inputs.
model.fit(X_train_scaled, y_train_scaled)

# Predict once per split and reuse the results for both coordinates.
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)

# Residual plot: prediction on the x axis, prediction minus actual on the y axis.
plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(test_predictions, test_predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
# The x axis holds predictions, so the zero-residual reference line spans the
# range of the plotted predictions rather than the range of the actual values.
line_start = min(train_predictions.min(), test_predictions.min())
line_end = max(train_predictions.max(), test_predictions.max())
plt.hlines(y=0, xmin=line_start, xmax=line_end)
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit one MinMaxScaler for the features and one for the target,
# each on the training split only.
X_minmax, y_minmax = (
    MinMaxScaler().fit(train_part) for train_part in (X_train, y_train)
)

# Apply the fitted scalers to the training and test splits.
X_train_minmax, X_test_minmax = (
    X_minmax.transform(split) for split in (X_train, X_test)
)
y_train_minmax, y_test_minmax = (
    y_minmax.transform(split) for split in (y_train, y_test)
)

In [None]:
# The x-limit (maxx) is taken from the 'positive' column of X_train because it
# had the highest coefficient value (0.0590678).
# TODO: confirm that this is the right column to use for maxx.

# Two side-by-side panels: unscaled data on the left, min-max scaled data on the right.
fig1, (axes1, axes2) = plt.subplots(1, 2, figsize=(12, 6))

axes1.set_title("Original Data")
axes2.set_title("Min Max Scaled Data")

# Limits for the left panel are symmetric around 1, sized by the largest
# 'positive' value and the largest target value in the training split.
maxx = X_train["positive"].max()
maxy = y_train.max()
axes1.set_xlim(1 - maxx, maxx + 1)
axes1.set_ylim(1 - maxy, maxy + 1)

# Fixed limits for the min-max scaled panel.
axes2.set_xlim(-1, 1)
axes2.set_ylim(-1, 1)

def set_axes(ax):
    """Move the axes of `ax` to the centre of the plot (modifies `ax` in place).

    NOTE(review): this re-declares the helper with the same behaviour as the
    `set_axes` defined in the StandardScaler plotting cell; the later
    definition shadows the earlier one.
    """
    spines = ax.spines
    # Left/bottom spines become the centred axes; right/top spines are hidden.
    spines['left'].set_position('center')
    spines['right'].set_color('none')
    spines['bottom'].set_position('center')
    spines['top'].set_color('none')
    # Keep tick marks on the bottom (x) and left (y) sides only.
    for axis, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
        axis.set_ticks_position(side)
    
set_axes(axes1)
set_axes(axes2)

# Plot the SAME feature on both panels. The left panel shows 'positive', so
# the right panel must use the min-max scaled column at the position of
# 'positive'. Column 0 of the scaled array is 'hospitalizedCurrently', which
# made the two panels compare different features.
positive_col = X_train.columns.get_loc("positive")
axes1.scatter(X_train["positive"], y_train)
axes2.scatter(X_train_minmax[:, positive_col], y_train_minmax[:])

## Quantify the model using scaled data

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the model on the standardized test split: mean squared error of
# its predictions and the model's own score() on the same data.
predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)
r2 = model.score(X=X_test_scaled, y=y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

## Score for non-scaled data

In [None]:
# `model` was last fitted on standardized data, so raw X must not be passed to
# it directly. Standardize the features with the fitted X_scaler, predict, and
# map the predictions back to the original units of y with y_scaler so they
# can be compared against the unscaled target.
predicted = y_scaler.inverse_transform(model.predict(X_scaler.transform(X)))

In [None]:
# Score the predictions against the target with MSE and R2.
mse = mean_squared_error(y_true=y, y_pred=predicted)
r2 = r2_score(y_true=y, y_pred=predicted)

print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

In [None]:
# Overall score for the model on the full data set.
# `model` was last fitted on standardized data, so both the features and the
# target are passed through the fitted scalers before scoring; scoring the raw
# X and y against a model trained on scaled values gives a meaningless result.
model.score(X_scaler.transform(X), y_scaler.transform(y))