# COVID-19 Data Exploration
I use this Jupyter notebook to explore the Ontario COVID-19 testing data and decide which visualizations to include in my Dash dashboard.

### Imports

In [11]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### Load in Data

In [2]:
# The URL points at the live covidtesting.csv distribution on data.ontario.ca,
# so the rows returned can change from one run to the next.
COVID_TESTING_CSV_URL = 'https://data.ontario.ca/dataset/f4f86e54-872d-43f8-8a86-3892fd3cb5e6/resource/ed270bb8-340b-41f9-a7c6-e8ef587e6d11/download/covidtesting.csv'
df = pd.read_csv(COVID_TESTING_CSV_URL)

# Preview the last five rows of the frame
df.tail(5)

Unnamed: 0,Reported Date,Confirmed Negative,Presumptive Negative,Presumptive Positive,Confirmed Positive,Resolved,Deaths,Total Cases,Total patients approved for testing as of Reporting Date,Total tests completed in the last day,...,Number of patients in ICU on a ventilator due to COVID-19,Num. of patients in ICU on a ventilator testing positive,Num. of patients in ICU on a ventilator testing negative,Total Positive LTC Resident Cases,Total Positive LTC HCW Cases,Total LTC Resident Deaths,Total LTC HCW Deaths,Total_Lineage_B.1.1.7,Total_Lineage_B.1.351,Total_Lineage_P.1
435,2021-04-16,,,,39977.0,360742.0,7664.0,408383.0,13422529.0,64304.0,...,480.0,456.0,24.0,15091.0,6890.0,3909.0,10.0,30175.0,99.0,202.0
436,2021-04-17,,,,40694.0,364353.0,7698.0,412745.0,13479381.0,56852.0,...,501.0,471.0,30.0,15095.0,6891.0,3912.0,10.0,32327.0,101.0,207.0
437,2021-04-18,,,,41588.0,367691.0,7716.0,416995.0,13533157.0,53776.0,...,506.0,475.0,31.0,15105.0,6896.0,3912.0,10.0,34112.0,104.0,207.0
438,2021-04-19,,,,42863.0,370844.0,7735.0,421442.0,13576030.0,42873.0,...,516.0,484.0,32.0,15109.0,6901.0,3912.0,10.0,36579.0,104.0,211.0
439,2021-04-20,,,,42941.0,374213.0,7757.0,424911.0,13616626.0,40596.0,...,537.0,507.0,30.0,15111.0,6903.0,3912.0,10.0,39276.0,105.0,211.0


In [3]:
# Collect every column name into a plain Python list and print it
cols = df.columns.tolist()
print(cols)

['Reported Date', 'Confirmed Negative', 'Presumptive Negative', 'Presumptive Positive', 'Confirmed Positive', 'Resolved', 'Deaths', 'Total Cases', 'Total patients approved for testing as of Reporting Date', 'Total tests completed in the last day', 'Percent positive tests in last day', 'Under Investigation', 'Number of patients hospitalized with COVID-19', 'Number of patients in ICU due to COVID-19', 'Number of patients in ICU, testing positive for COVID-19', 'Number of patients in ICU, testing negative for COVID-19', 'Number of patients in ICU on a ventilator due to COVID-19', 'Num. of patients in ICU on a ventilator testing positive', 'Num. of patients in ICU on a ventilator testing negative', 'Total Positive LTC Resident Cases', 'Total Positive LTC HCW Cases', 'Total LTC Resident Deaths', 'Total LTC HCW Deaths', 'Total_Lineage_B.1.1.7', 'Total_Lineage_B.1.351', 'Total_Lineage_P.1']


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,Confirmed Negative,Presumptive Negative,Presumptive Positive,Confirmed Positive,Resolved,Deaths,Total Cases,Total patients approved for testing as of Reporting Date,Total tests completed in the last day,Percent positive tests in last day,...,Number of patients in ICU on a ventilator due to COVID-19,Num. of patients in ICU on a ventilator testing positive,Num. of patients in ICU on a ventilator testing negative,Total Positive LTC Resident Cases,Total Positive LTC HCW Cases,Total LTC Resident Deaths,Total LTC HCW Deaths,Total_Lineage_B.1.1.7,Total_Lineage_B.1.351,Total_Lineage_P.1
count,47.0,12.0,25.0,438.0,428.0,400.0,438.0,434.0,371.0,367.0,...,354.0,384.0,354.0,334.0,330.0,337.0,337.0,82.0,78.0,66.0
mean,7022.148936,22.083333,0.28,8651.086758,99976.140187,3619.35,109650.009132,4643565.0,36397.692722,3.295095,...,133.579096,126.315104,11.929379,9102.191617,3876.739394,2482.237389,8.183976,4493.317073,41.589744,61.469697
std,10246.576459,30.113296,0.541603,9196.884797,109906.871349,2118.996824,119324.874073,4235637.0,16729.572807,2.274503,...,109.877143,103.47973,7.052863,3927.057941,1790.748738,884.342796,1.551299,9339.169816,31.232449,65.402004
min,90.0,0.0,0.0,0.0,1.0,1.0,1.0,139.0,5813.0,0.4,...,9.0,5.0,1.0,4235.0,1456.0,1115.0,3.0,51.0,1.0,1.0
25%,449.0,0.0,0.0,1549.75,23108.75,2638.5,26578.75,705580.5,23997.0,1.2,...,35.0,26.0,7.0,5902.0,2602.0,1800.0,8.0,357.25,9.0,3.0
50%,1665.0,10.0,0.0,4558.5,40688.5,2947.5,44660.5,3384182.0,33440.0,3.1,...,112.0,112.0,11.0,6952.0,2884.5,2024.0,8.0,938.5,42.5,35.0
75%,9860.0,29.75,0.0,12986.0,163219.5,4990.25,184016.0,8037679.0,49156.5,4.6,...,205.0,193.0,17.0,14116.25,5802.5,3462.0,10.0,1873.5,69.75,103.0
max,40630.0,81.0,2.0,42941.0,374213.0,7757.0,424911.0,13616630.0,76472.0,10.5,...,537.0,507.0,35.0,15111.0,6903.0,3912.0,10.0,39276.0,105.0,211.0


## Visualizations
This section explores some visualizations that I think make sense for the dashboard. **Note:** If there were actual stakeholders, I would use that opportunity to understand the value the severity information provides and which decisions it ultimately affects, to further justify the choice of visualizations.

### Total Cases over Time

In [8]:
# Line chart of the "Total Cases" column against the reporting date
fig = px.line(
    data_frame=df,
    x="Reported Date",
    y="Total Cases",
    title="Total number of COVID-19 cases in Ontario (Cumulative)",
)
fig.show()

### Confirmed Active Cases by Day in Ontario

In [10]:
# Line chart of the "Confirmed Positive" column against the reporting date.
# The title uses the plural "Cases" to agree with this section's heading.
fig = px.line(
    df,
    x="Reported Date",
    y="Confirmed Positive",
    title="Confirmed Active Cases in Ontario (Daily)",
)
fig.show()

### Exploring Lineage Data for the Variants

In [21]:
# Keep only rows reported on or after 15 January 2021 to get a proper view of the
# lineage data. "Reported Date" is read from the CSV as ISO-formatted (YYYY-MM-DD)
# text, so this string comparison orders the rows chronologically.
VARIANT_START_DATE = '2021-01-15'
df_variant = df[df['Reported Date'] >= VARIANT_START_DATE]

# Legend label -> source column, listed in the order the traces are drawn
variant_columns = {
    'B.1.1.7': 'Total_Lineage_B.1.1.7',
    'B.1.351': 'Total_Lineage_B.1.351',
    'P.1': 'Total_Lineage_P.1',
}

# One line trace per variant lineage
fig = go.Figure()
for variant_name, column in variant_columns.items():
    fig.add_trace(go.Scatter(x=df_variant["Reported Date"], y=df_variant[column],
                             mode='lines',
                             name=variant_name))

# Title and axis labels so the figure can be read on its own
fig.update_layout(title="COVID-19 variant lineage totals in Ontario",
                  xaxis_title="Reported Date",
                  yaxis_title="Lineage total")

fig.show()

In [23]:
# Closer look at the two smaller variants: leaving B.1.1.7 out lets the y-axis
# scale to the B.1.351 and P.1 totals. Uses df_variant from the previous cell.
small_variant_columns = {
    'B.1.351': 'Total_Lineage_B.1.351',
    'P.1': 'Total_Lineage_P.1',
}

# One line trace per variant lineage, drawn in the order listed above
fig = go.Figure()
for variant_name, column in small_variant_columns.items():
    fig.add_trace(go.Scatter(x=df_variant["Reported Date"], y=df_variant[column],
                             mode='lines',
                             name=variant_name))

# Title and axis labels so the figure can be read on its own
fig.update_layout(title="B.1.351 and P.1 lineage totals in Ontario",
                  xaxis_title="Reported Date",
                  yaxis_title="Lineage total")

fig.show()