# Exploratory Data Analysis on Covid 19

Dataset by [Johns Hopkins University](https://www.jhu.edu/), preprocessed by [Laxmi Kant Tiwari](https://github.com/laxmimerit)

## 1. INSTALLATION AND INITIAL SETUP

In [2]:
# install folium for map based plotting
#!pip3 install folium

# install plotly for dynamic plots
#!pip3 install plotly

^C


In [2]:
# imports
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import folium

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import math
import random
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

# color pallette
cnf = '#393e46'
dth = '#ff2e63'
rec = '#21bf73'
act = '#fe9801'


## 2. DATASET PREPARATION

In [3]:
# setting plotly to offline
import plotly as py
py.offline.init_notebook_mode(connected = True)

In [4]:
import os

In [5]:
try:
    os.system("rm -rf Covid-19-Preprocessed-Dataset")
except:
    print("File does not exist.")

In [6]:
!git clone https://github.com/laxmimerit/Covid-19-Preprocessed-Dataset.git

fatal: destination path 'Covid-19-Preprocessed-Dataset' already exists and is not an empty directory.


In [7]:
df = pd.read_csv('Covid-19-Preprocessed-Dataset/preprocessed/covid_19_data_cleaned.csv', parse_dates = ['Date'])

In [8]:
# show dataframe
# df
# clean missing values ( Province/State in this case )fill with empty string
df['Province/State'] = df['Province/State'].fillna("")
df.head()

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
0,2020-01-22,,Afghanistan,33.0,65.0,0,0,0,0
1,2020-01-23,,Afghanistan,33.0,65.0,0,0,0,0
2,2020-01-24,,Afghanistan,33.0,65.0,0,0,0,0
3,2020-01-25,,Afghanistan,33.0,65.0,0,0,0,0
4,2020-01-26,,Afghanistan,33.0,65.0,0,0,0,0


In [9]:
country_daywise = pd.read_csv('Covid-19-Preprocessed-Dataset/preprocessed/country_daywise.csv', parse_dates = ['Date'])
countrywise = pd.read_csv('Covid-19-Preprocessed-Dataset/preprocessed/countrywise.csv')
daywise = pd.read_csv('Covid-19-Preprocessed-Dataset/preprocessed/daywise.csv', parse_dates = ['Date'])

In [10]:
country_daywise.head()


Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active,New Cases,New Deaths,New Recovered
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0
1,2020-01-22,Albania,0,0,0,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0,0,0,0
4,2020-01-22,Angola,0,0,0,0,0,0,0


In [11]:
countrywise.head()

Unnamed: 0,Country,Confirmed,Deaths,Recovered,Active,New Cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Population,Cases / Million People,Confirmed last week,1 week change,1 week % increase
0,Afghanistan,4033,115,502,3416,255,2.85,12.45,22.91,38928341,104.0,8676,4360,50.25
1,Albania,856,31,627,198,6,3.62,73.25,4.94,2877800,297.0,969,107,11.04
2,Algeria,5558,494,2546,2518,189,8.89,45.81,19.4,43851043,127.0,7728,1269,16.42
3,Andorra,754,48,545,161,2,6.37,72.28,8.81,77265,9759.0,762,1,0.13
4,Angola,43,2,13,28,0,4.65,30.23,15.38,32866268,1.0,58,16,27.59


In [12]:
daywise.head()

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New Cases,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of Countries
0,2020-01-22,555,17,28,510,0,3.06,5.05,60.71,6
1,2020-01-23,654,18,30,606,99,2.75,4.59,60.0,8
2,2020-01-24,941,26,36,879,287,2.76,3.83,72.22,9
3,2020-01-25,1434,42,39,1353,493,2.93,2.72,107.69,11
4,2020-01-26,2118,56,52,2010,684,2.64,2.46,107.69,13


In [13]:
confirmed = df.groupby('Date').sum()['Confirmed'].reset_index()
confirmed

Unnamed: 0,Date,Confirmed
0,2020-01-22,555
1,2020-01-23,654
2,2020-01-24,941
3,2020-01-25,1434
4,2020-01-26,2118
...,...,...
123,2020-05-24,5407613
124,2020-05-25,5495061
125,2020-05-26,5589626
126,2020-05-27,5691790


In [14]:
df.isnull().sum()

Date              0
Province/State    0
Country           0
Lat               0
Long              0
Confirmed         0
Recovered         0
Deaths            0
Active            0
dtype: int64

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36096 entries, 0 to 36095
Data columns (total 9 columns):
Date              36096 non-null datetime64[ns]
Province/State    36096 non-null object
Country           36096 non-null object
Lat               36096 non-null float64
Long              36096 non-null float64
Confirmed         36096 non-null int64
Recovered         36096 non-null int64
Deaths            36096 non-null int64
Active            36096 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 2.5+ MB


In [16]:
df.query('Country == "India"')

Unnamed: 0,Date,Province/State,Country,Lat,Long,Confirmed,Recovered,Deaths,Active
16768,2020-01-22,,India,21.0,78.0,0,0,0,0
16769,2020-01-23,,India,21.0,78.0,0,0,0,0
16770,2020-01-24,,India,21.0,78.0,0,0,0,0
16771,2020-01-25,,India,21.0,78.0,0,0,0,0
16772,2020-01-26,,India,21.0,78.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
16891,2020-05-24,,India,21.0,78.0,138536,57692,4024,76820
16892,2020-05-25,,India,21.0,78.0,144950,60706,4172,80072
16893,2020-05-26,,India,21.0,78.0,150793,64277,4344,82172
16894,2020-05-27,,India,21.0,78.0,158086,67749,4534,85803


## 3. CONFIRMED, RECOVERED AND DEATH TOTAL

In [17]:
confirmed.tail()

Unnamed: 0,Date,Confirmed
123,2020-05-24,5407613
124,2020-05-25,5495061
125,2020-05-26,5589626
126,2020-05-27,5691790
127,2020-05-28,5808946


In [19]:
deaths = df.groupby('Date').sum()['Deaths'].reset_index()
deaths.tail()

Unnamed: 0,Date,Deaths
123,2020-05-24,345058
124,2020-05-25,346231
125,2020-05-26,350452
126,2020-05-27,355628
127,2020-05-28,360308


In [20]:
recovered = df.groupby('Date').sum()['Recovered'].reset_index()
recovered.tail()

Unnamed: 0,Date,Recovered
123,2020-05-24,2168563
124,2020-05-25,2231738
125,2020-05-26,2286956
126,2020-05-27,2350088
127,2020-05-28,2415960


In [27]:
# Scatter plot for the above three categories

fig = go.Figure()
fig.add_trace(go.Scatter(x = confirmed['Date'],
                         y = confirmed['Confirmed'],
                         name = 'Confirmed',
                         mode = 'lines+markers',
                         line = dict(color = 'Orange', width = 1)))
fig.add_trace(go.Scatter(x = recovered['Date'],
                         y = recovered['Recovered'],
                         name = 'Recovered',
                         mode = 'lines+markers',
                         line = dict(color = 'Green', width = 1)))
fig.add_trace(go.Scatter(x = deaths['Date'],
                         y = deaths['Deaths'],
                         name = 'Deaths',
                         mode = 'lines+markers',
                         line = dict(color = 'Red', width = 1)))
fig.update_layout(title = 'Worldwide Covid-19 Cases', xaxis_tickfont_size = 14, yaxis = dict(title = 'Number of Cases'))
fig.show()