In [None]:
#Lince Rumainum
#ISE-5123
#Project: covid-19

In [3]:
#Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# Load the per-state daily COVID-19 table, keeping only the columns used below.
#
# colnames[k] is the label intended for the CSV column at position colPositions[k].
colnames = ["date", "state","positive","negative","pending","hospitalizedCurrently",
            "hospitalizedCumulative","inIcuCurrently","death", 
            "totalTestResults","posNeg", "onVentilatorCurrently"]
colPositions = [0,1,2,3,4,5,6,7,14,17,18,9]

# pandas ignores the element order of an integer `usecols` list and always returns
# the selected columns in file order. `names` therefore has to be given in file
# order as well, otherwise the labels after column 7 end up on the wrong data
# (e.g. the column called "death" would really hold CSV column 9).
labelsInFileOrder = [label for _, label in sorted(zip(colPositions, colnames))]

# NOTE(review): header=1 together with names= drops the first two lines of the
# file (line 0 and the line used as header) -- confirm the CSV really has two
# non-data lines at the top; otherwise header=0 is what is wanted.
dfAll = pd.read_csv('us_states_covid19_daily.csv',
                    usecols=sorted(colPositions),
                    header=1,
                    names=labelsInFileOrder)[colnames]  # restore the column order used downstream

#fill na with 0s
# Missing counts are treated as zero. The filled columns are assigned back to the
# frame instead of calling fillna(inplace=True) on a column selection: in-place
# mutation of `dfAll[col]` is chained assignment, which pandas warns about and
# which does not modify dfAll at all when copy-on-write is active.
countCols = ['positive', 'negative', 'pending', 'hospitalizedCurrently',
             'hospitalizedCumulative', 'inIcuCurrently', 'death',
             'totalTestResults', 'posNeg', 'onVentilatorCurrently']
dfAll[countCols] = dfAll[countCols].fillna(0)

#groupby Date
# Every series below is the sum of one column over all rows sharing a date.
groupedByDate = dfAll.groupby("date")
n_pos_by_date = groupedByDate["positive"].sum()
n_neg_by_date = groupedByDate["negative"].sum()
n_pen_by_date = groupedByDate["pending"].sum()
n_hospCurr_by_date = groupedByDate["hospitalizedCurrently"].sum()
n_hospCumu_by_date = groupedByDate["hospitalizedCumulative"].sum()
n_icu_by_date = groupedByDate["inIcuCurrently"].sum()
n_death_by_date = groupedByDate["death"].sum()
n_totalTest_by_date = groupedByDate["totalTestResults"].sum()
n_posneg_by_date = groupedByDate["posNeg"].sum()
n_onVen_by_date = groupedByDate["onVentilatorCurrently"].sum()

#groupby State
# Every series below is the sum of one column over all rows of a state
# (index = state label, sorted by the groupby).
n_pos_by_state = dfAll.groupby("state")["positive"].sum()
n_neg_by_state = dfAll.groupby("state")["negative"].sum()
# This line previously recomputed n_pen_by_date (copy-paste slip), so the
# per-state pending total was never defined.
n_pen_by_state = dfAll.groupby("state")["pending"].sum()
n_hospCurr_by_state = dfAll.groupby("state")["hospitalizedCurrently"].sum()
n_hospCumu_by_state = dfAll.groupby("state")["hospitalizedCumulative"].sum()
n_icu_by_state = dfAll.groupby("state")["inIcuCurrently"].sum()
n_death_by_state = dfAll.groupby("state")["death"].sum()
n_totalTest_by_state = dfAll.groupby("state")["totalTestResults"].sum()
n_posneg_by_state = dfAll.groupby("state")["posNeg"].sum()
n_onVen_by_state = dfAll.groupby("state")["onVentilatorCurrently"].sum()

FileNotFoundError: File b'us_states_covid19_daily.csv' does not exist

In [None]:
#copy to new data frame
dfAll2 = dfAll.copy()
#change int to strings
dfAll2['date'] = dfAll2['date'].apply(str)

#create month and day columns
# The date string is read positionally: character 0 is the month and
# characters 1-2 are the day.
# NOTE(review): this only works for a single-digit-month "MDD" layout -- confirm
# against the actual CSV (a "YYYYMMDD" value would be parsed incorrectly).
if not (dfAll2['date'].str.len() >= 3).all():
    raise ValueError("every date value needs at least 3 characters (month + 2-digit day)")

# Vectorised slicing replaces the row loop that wrote through
# `dfAll2.month[i] = ...`; that form is chained assignment, which raises
# SettingWithCopyWarning and does not update the frame under copy-on-write.
dfAll2['month'] = dfAll2['date'].str[0].astype(int)
dfAll2['day'] = dfAll2['date'].str[1:3].astype(int)
dfAll2.head()

In [None]:
# import datetime
from datetime import date

#create test date
# Build one datetime.date per row from the month/day columns (year fixed to 2020)
# and assign the whole column at once. The previous per-row
# `dfAll2.testDate[i] = ...` was chained assignment, which is not guaranteed to
# write into dfAll2. int() keeps date() happy whatever integer type the
# month/day columns hold.
dfAll2['testDate'] = [date(2020, int(m), int(d))
                      for m, d in zip(dfAll2['month'], dfAll2['day'])]
dfAll2.head()

In [None]:
#plot data by date
dfColNames = dfAll2.columns.tolist()
# One figure per measurement column: the first two columns (date, state) and the
# last three (month, day, testDate) are skipped. Each figure is saved as
# "<column>.png" before being shown.
for ylbl in map(str, dfColNames[2:len(dfColNames)-3]):
    dfAll2.plot(x='testDate', y=ylbl)
    plt.ylabel(ylbl) #y-label
    plt.gcf().autofmt_xdate() #x-labels: date
    plt.savefig(ylbl + '.png')
    plt.show()

In [None]:
#since there are barely any data before 4 March 2020
#update data
# Keep only rows dated strictly after the cutoff and report how much data remains.
cutoffDate = date(2020,3,4)
isAfterCutoff = dfAll2['testDate'] > cutoffDate
dfStartMarch = dfAll2[isAfterCutoff]

rowsBefore = dfAll2.shape[0]
rowsAfter = dfStartMarch.shape[0]
print("number of rows before:", rowsBefore) #check size
print("number of rows after :", rowsAfter) #check size
print("Percentage from original data: " + 
      str(round(rowsAfter/rowsBefore*100,2)) + "%")

In [None]:
#plot data that start from March
dfColNames = dfStartMarch.columns.tolist()
# Same per-column figures as for the full data, now on the filtered frame; the
# first two and last three columns are skipped and files get a "StartMarch" suffix.
for ylbl in map(str, dfColNames[2:len(dfColNames)-3]):
    dfStartMarch.plot(x='testDate', y=ylbl)
    plt.ylabel(ylbl) #y-label
    plt.gcf().autofmt_xdate() #x-labels: date
    plt.savefig(ylbl + 'StartMarch.png')
    plt.show()

In [None]:
import seaborn as sns

# Pie chart of each state's share of the summed positive counts.
states = dfAll.state
states = np.unique(states)
totalPos = sum(n_pos_by_state)

# n_pos_by_state is indexed by state label, so `n_pos_by_state[i]` with an
# integer relied on pandas' deprecated positional fallback. Dividing the whole
# series gives the same shares, in the same order, without any label lookup.
fractions = (n_pos_by_state / totalPos).tolist()

offsets = [0] * len(states)

fig = plt.figure(figsize=(20,20))
plt.pie(fractions, explode = offsets, labels=states, autopct='%1.1f%%', 
        startangle = 0, colors = sns.color_palette('bright'), 
        textprops={'fontsize': 25}, 
        pctdistance=1.1, labeldistance=0.75)
plt.axis('equal')
plt.rcParams['text.color'] = 'black'
filename = 'allStatesPositiveCases.png'
plt.savefig(filename)
plt.show()

#plot looks awful, continue to update with top 10 states

In [None]:
# Table with one row per state: the state label next to its summed positive count.
states = np.unique(dfAll.state)
statesPosData = dict(state=states, positiveCount=n_pos_by_state)
dfPosStates = pd.DataFrame(statesPosData)
dfPosStates.head()

In [None]:
# Rank states by positive count (largest first), keep the first row per state,
# then take the ten highest.
sortPos = dfPosStates.sort_values(by='positiveCount', ascending=False)
sortPos = sortPos.drop_duplicates(subset=['state'])
top10Pos = sortPos.head(10)
top10Pos

In [None]:
import seaborn as sns

# Pie chart: share of all positives held by each of the ten largest states,
# plus one "Others" wedge for the remainder.
top10states = list(top10Pos.state)

totalPos = sum(n_pos_by_state)

# top10Pos is indexed by state label; `positiveCount[i]` with an integer relied
# on the deprecated positional fallback of Series.__getitem__. Dividing the
# column as a whole keeps the sorted row order and needs no lookup.
fractions = (top10Pos.positiveCount / totalPos).tolist()
fractions.append(1-sum(fractions))  # everything not in the top 10

offsets = [0.1] * len(top10states)
offsets.append(0.5)  # pull the "Others" wedge further out

top10states.append('Others')
fig = plt.figure(figsize=(20,20))
plt.pie(fractions, explode = offsets, labels=top10states, autopct='%1.1f%%', 
        startangle = 0, colors = sns.color_palette('bright'), 
        textprops={'fontsize': 25}, 
        pctdistance=1.1, labeldistance=0.75)
plt.axis('equal')
plt.rcParams['text.color'] = 'black'
filename = 'top10StatesPositiveCases.png'
plt.savefig(filename)
plt.show()

In [None]:
# Table with one row per state: the state label next to its summed
# onVentilatorCurrently value (the name dfPosStates is reused from above).
states = np.unique(dfAll.state)
statesPosData = dict(state=states, ventilatorCount=n_onVen_by_state)
dfPosStates = pd.DataFrame(statesPosData)
dfPosStates.head()

In [None]:
# Rank states by ventilator count (largest first), keep the first row per state,
# then take the fifteen highest.
sortVen = dfPosStates.sort_values(by='ventilatorCount', ascending=False)
sortVen = sortVen.drop_duplicates(subset=['state'])
top15Ven = sortVen.head(15)
top15Ven

In [None]:
import seaborn as sns

# Pie chart: share of the summed ventilator counts held by each of the fifteen
# largest states, plus one "Others" wedge for the remainder.
top15states = list(top15Ven.state)

totalVen = sum(n_onVen_by_state)

# top15Ven is indexed by state label; `ventilatorCount[i]` with an integer
# relied on the deprecated positional fallback of Series.__getitem__. Dividing
# the column as a whole keeps the sorted row order and needs no lookup.
fractions = (top15Ven.ventilatorCount / totalVen).tolist()
fractions.append(1-sum(fractions))  # everything not in the top 15

offsets = [0.1] * len(top15states)
offsets.append(0.25)  # pull the "Others" wedge further out

top15states.append('Others')
fig = plt.figure(figsize=(20,20))
plt.pie(fractions, explode = offsets, labels=top15states, autopct='%1.1f%%', 
        startangle = 0, colors = sns.color_palette('bright'), 
        textprops={'fontsize': 25}, 
        pctdistance=1.1, labeldistance=0.80)
plt.axis('equal')
plt.rcParams['text.color'] = 'black'
# NOTE(review): the file name says "top10" although this chart shows the top 15;
# left unchanged in case the image is referenced elsewhere.
filename = 'top10VentilatorNumbers.png'
plt.savefig(filename)
plt.show()

In [None]:
#get data without date and states
# Only the ten count columns of the filtered frame; the target
# (onVentilatorCurrently) is kept as the last column.
numericCols = [
    "positive",
    "negative",
    "pending",
    "hospitalizedCurrently",
    "hospitalizedCumulative",
    "inIcuCurrently",
    "death",
    "totalTestResults",
    "posNeg",
    "onVentilatorCurrently",
]
dfNums = dfStartMarch[numericCols]

In [None]:
from sklearn import preprocessing

# Split the numeric table into target and features, then standardize the features.
y = dfNums.onVentilatorCurrently #target
x = dfNums.drop('onVentilatorCurrently',axis=1)

# normalize data
# The scaled array has one column per feature in `x`, so it must be labelled
# with x.columns; dfNums.columns still contains the target and has one label
# too many, which makes the DataFrame constructor raise ValueError.
# Re-using x.index keeps the rows aligned with `y`.
X_scaled = pd.DataFrame(preprocessing.scale(x), columns=x.columns, index=x.index)

In [None]:
from sklearn.decomposition import PCA

# Principal component analysis of the standardized features.
# The fit uses X_scaled: X_train is only created by the train/test split in a
# later cell, so referring to it here fails with NameError on a fresh
# Restart & Run All.
pca = PCA()
pca.fit(X_scaled)
cummulativeSum = np.cumsum(pca.explained_variance_ratio_)
# index of the first component at which the running total reaches 95%, as a count
d = np.argmax(cummulativeSum >= 0.95) + 1

print("PCA explained var ratio: ")
print(pca.explained_variance_ratio_)
print("cummulative sum: ")
print(cummulativeSum)
print("dimension needed to reach 95%: ",d)

In [None]:
# Dump components relations with features:
# One row per principal component, one column per feature. The row labels are
# generated from the actual number of components; a hard-coded list of 11 labels
# cannot match the component matrix and makes the DataFrame constructor fail.
pcLabels = ['PC-' + str(k + 1) for k in range(pca.components_.shape[0])]
print (pd.DataFrame(pca.components_, columns=X_scaled.columns, index=pcLabels))

In [None]:
# Loading plot: one arrow per feature showing its weight on the first two
# principal components, drawn inside a unit circle.
PCs = pca.components_
n_features = PCs.shape[1]

fig = plt.figure(figsize=(5,5))

# All arrows start at the origin; x = loading on component 0, y = loading on component 1
origin = np.zeros(n_features)
plt.quiver(origin, origin, PCs[0,:], PCs[1,:],
           angles='xy', scale_units='xy', scale=1)

# Label each arrow tip with the feature's column number, nudged by 0.02
feature_names = np.arange(n_features)
for z in feature_names:
    plt.text(PCs[0, z] + 0.02, PCs[1, z] + 0.02, z, ha='center', va='center')

# Unit circle for reference
circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
plt.gca().add_artist(circle)

# Equal aspect ratio, both axes limited to [-1, 1]
plt.axis('equal')
plt.xlim(-1.0, 1.0)
plt.ylim(-1.0, 1.0)

plt.xlabel('PC 0')
plt.ylabel('PC 1')

plt.show()

In [None]:
# Same loading plot as before, on a wide figure with tighter axis limits
# (x in [-0.5, 1.0], y in [-0.1, 0.1]).
PCs = pca.components_
n_features = PCs.shape[1]

fig = plt.figure(figsize=(10,3))

# All arrows start at the origin; x = loading on component 0, y = loading on component 1
origin = np.zeros(n_features)
plt.quiver(origin, origin, PCs[0,:], PCs[1,:],
           angles='xy', scale_units='xy', scale=1)

# Label each arrow tip with the feature's column number, nudged by 0.02
feature_names = np.arange(n_features)
for z in feature_names:
    plt.text(PCs[0, z] + 0.02, PCs[1, z] + 0.02, z, ha='center', va='center')

# Unit circle for reference
circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
plt.gca().add_artist(circle)

# Equal aspect ratio, then the zoomed limits
plt.axis('equal')
plt.xlim(-0.5, 1.0)
plt.ylim(-0.1, 0.1)

plt.xlabel('PC 0')
plt.ylabel('PC 1')

plt.show()

In [None]:
#Linear model
# Hold out 20% of the rows for testing and fit an ordinary least-squares model
# on the rest.
# `import sklearn.linear_model` is kept because it also binds the name `sklearn`
# that a later cell refers to.
import sklearn.linear_model
# random_state pins the shuffle so the split -- and every score derived from
# it -- is the same on each run (0 matches the seed used for the forests below).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2,
                                                    random_state=0)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

lm = sklearn.linear_model.LinearRegression()
model = lm.fit(X_train, y_train)  # fit() returns the estimator itself
predictions = lm.predict(X_test)

In [None]:
# Predicted vs. true target values for the linear model on the held-out rows
ax = plt.gca()
ax.scatter(y_test, predictions)
ax.set_xlabel("True Values")
ax.set_ylabel("Predictions")
plt.show()

In [None]:
# Score of the linear model on the held-out rows
linearScore = model.score(X_test, y_test)
print("Score:", linearScore)
#linear model is bad model for this data

In [None]:
#non-linear regression --> random forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import make_classification
# The target (onVentilatorCurrently) is a count, and this step is meant as the
# non-linear counterpart of the linear regression above, so a forest *regressor*
# is used. A classifier would treat every distinct count as an unrelated class
# and its score would be accuracy, not comparable with the linear model's R^2.
# The estimator is referenced through the imported name, so this cell no longer
# depends on `sklearn` having been bound by an earlier cell.
rf = RandomForestRegressor(n_estimators = 200, max_depth=1000, 
                           random_state=0, n_jobs = 4)
model2 = rf.fit(X_train, y_train)  # fit() returns the estimator itself
predictions2 = rf.predict(X_test)
plt.scatter(y_test, predictions2)
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.show()

In [None]:
# Score of the fitted forest (model2) on the held-out rows
forestScore = model2.score(X_test, y_test)
print("Score:", forestScore)

In [None]:
#print level of importance
# Every column of dfNums except the last one (the target) is paired with the
# importance `rf` reports at the same position, printed as a percentage.
dfColNames = dfNums.columns.tolist()
for idx, featureName in enumerate(dfColNames[:-1]):
    pct = round(rf.feature_importances_[idx]*100,2)
    print(featureName + ": " + str(pct) + "%")

In [None]:
#random forest
# A much smaller forest (9 trees, depth capped at 45) fitted on the same
# training split; fit() returns the estimator, so clfModel and clf are the same object.
clfParams = dict(n_estimators=9, max_depth=45, random_state=0)
clf = RandomForestClassifier(**clfParams)
clfModel = clf.fit(X_train, y_train)

In [None]:
# Feature importances of the small forest fitted just above.
# Two fixes compared with the earlier version of this cell:
#  * the loop ran over all dfNums columns including the target, one more than
#    there are importances, and raised IndexError on the last iteration;
#  * it read the importances of `rf`, repeating the earlier printout, although
#    this cell sits between fitting and scoring `clf`.
dfColNames = dfNums.columns.tolist()
featureNames = [name for name in dfColNames if name != 'onVentilatorCurrently']
for featureName, importance in zip(featureNames, clf.feature_importances_):
    print(featureName + ": " + 
          str(round(importance*100,2)) + "%")

In [None]:
# Score of the small forest (clfModel) on the held-out rows
smallForestScore = clfModel.score(X_test, y_test)
print("Score:", smallForestScore)