# Exploratory Data Analysis


In [1]:
import numpy as np
import pandas as pd     
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns    

In [2]:
df = pd.read_csv("latimes-county-totals.csv")

In [3]:
df.head()

Unnamed: 0,1/26/20,Alameda,1,0,0.1,Unnamed: 5,Unnamed: 6
0,1/31/20,Alameda,1,0,0,0.0,0.0
1,2/2/20,Alameda,1,0,0,0.0,0.0
2,2/20/20,Alameda,1,0,0,0.0,0.0
3,2/21/20,Alameda,1,0,0,0.0,0.0
4,2/26/20,Alameda,1,0,0,0.0,0.0


In [4]:
df.describe()

Unnamed: 0,1,0,0.1,Unnamed: 5,Unnamed: 6
count,3151.0,3151.0,3151.0,3094.0,3094.0
mean,59.335766,363.686131,12.739765,18.138009,0.739173
std,32.972797,1646.784332,73.915663,82.994716,4.295296
min,1.0,0.0,0.0,-2.0,0.0
25%,29.0,3.0,0.0,0.0,0.0
50%,63.0,20.0,0.0,1.0,0.0
75%,85.0,155.0,3.0,9.0,0.0
max,115.0,26238.0,1260.0,1505.0,76.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3151 entries, 0 to 3150
Data columns (total 7 columns):
1/26/20       3151 non-null object
Alameda       3151 non-null object
1             3151 non-null int64
0             3151 non-null int64
0.1           3151 non-null int64
Unnamed: 5    3094 non-null float64
Unnamed: 6    3094 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 172.4+ KB


In [6]:
df2 = df.copy(deep=True)
df_la = df2[df2['county'] == 'Los Angeles']
df_la.head()

KeyError: 'county'

In [None]:
df_la = df_la.drop(labels=['fips'], axis=1)
print (df_la.info())
df_la.describe()

In [None]:
#replace the only two NAs in the first day of new_confirmed_cases and new_deaths to 0
df_la = df_la.fillna(0)

In [None]:
print(df_la)

In [None]:
days = list(range(1,len(df_la)+1))
df_la['culm_day'] = days

In [None]:
df_la.describe()

In [None]:
df_la.columns

In [None]:
print(df_la)

In [None]:
df_la.corr(min_periods=1)

Correlation between deaths and confirmed_cases is the highest 0.99. Although it makes sense how having a high number of confirmed cases means more people are infected therefore increasing the number of deaths, the correlation is too high and maybe due to having a small dataset, with 100 rows for Los Angeles only.

In [None]:
%matplotlib inline

In [None]:
#only Los Angeles
plot_df_la = df_la.dropna()
sns.pairplot(plot_df_la)

In [None]:
sns.distplot(df_la['confirmed_cases'], hist=False,rug=True, kde_kws={'shade':True})

In [None]:
sns.distplot(df_la['new_confirmed_cases'], hist=False,rug=True, kde_kws={'shade':True})

In [None]:
sns.distplot(df_la['deaths'], hist=False,rug=True, kde_kws={'shade':True})

In [None]:
sns.distplot(df_la['new_deaths'], hist=False)

In [None]:
sns.regplot(x="culm_day", y="confirmed_cases", data=df_la,fit_reg=False).set_title("COVID19 infections from the first confirmed case in LA")

In [None]:
sns.regplot(x="culm_day", y="new_confirmed_cases", data=df_la, fit_reg=False).set_title("COVID19 new confirmed cases from the first confirmed case in LA")

As culmulative days increase, we see the numbers of new confirmed cases increase, dropped a bit and then went back up again, probably due to having enough testing kits.

In [None]:
sns.regplot(x="culm_day", y="deaths", data=df_la,fit_reg=False).set_title("COVID19 deaths from the first confirmed case in LA")

The deaths continues to increase in LA. We do not see a turning point yet.

In [None]:
sns.regplot(x="culm_day", y="new_deaths", data=df_la, fit_reg=False).set_title("COVID19 new deaths from the first confirmed case in LA")

In [None]:
sns.regplot(x="confirmed_cases", y="new_deaths", data=df_la, fit_reg=False).set_title("COVID19 confirmed cases vs new deaths in LA")

In [None]:
sns.regplot(x="confirmed_cases", y="deaths", data=df_la, fit_reg=False).set_title("COVID19 confirmed cases vs deaths in LA")

There is a positive correlation between the total numbers of confirmed cases and deaths. The relationship looks linear from this plot.

In [None]:
sns.regplot(x="new_deaths", y="deaths", data=df_la, fit_reg=False).set_title("COVID19 new deaths vs deaths in LA")

The numbers of cumulative deaths seem to be positively correlated with the number of new_deaths, however, can't tell the shape of relationship between the two. Will need more data points in order to find out the true relationship.

In [None]:
confirmed_total_date = df_la.groupby(['date']).agg({'confirmed_cases':['sum']})
fatalities_total_date = df_la.groupby(['date']).agg({'deaths':['sum']})
total_date = confirmed_total_date.join(fatalities_total_date)

In [None]:
#side by side plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,7))
total_date.plot(ax=ax1)
ax1.set_title("LA total confirmed cases", size=13)
ax1.set_ylabel("Number of confirmed cases", size=13)
ax1.set_xlabel("Date", size=13)
fatalities_total_date.plot(ax=ax2, color='orange')
ax2.set_title("LA total deaths", size=13)
ax2.set_ylabel("Number of deaths", size=13)
ax2.set_xlabel("Date", size=13)

In [None]:
confirmed_total_date = df_la.groupby(['culm_day']).agg({'confirmed_cases':['sum']})
fatalities_total_date = df_la.groupby(['culm_day']).agg({'deaths':['sum']})
total_date = confirmed_total_date.join(fatalities_total_date)

In [None]:
#same as plots above, just trying to use days to replace date for exploration
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,7))
total_date.plot(ax=ax1)
ax1.set_title("LA total confirmed cases", size=13)
ax1.set_ylabel("Number of confirmed cases", size=13)
ax1.set_xlabel("days", size=13)
fatalities_total_date.plot(ax=ax2, color='orange')
ax2.set_title("LA total deaths", size=13)
ax2.set_ylabel("Number of deaths", size=13)
ax2.set_xlabel("days", size=13)

The left plot shows the total confirmed cases in LA compared with the total deaths as of 5/4/2020. We do not see a turning point in LA yet. The right is a zoomed in of the total death in LA, sharing a similar shape as the number of confirmed cases.