## 45 - Prepare your Data for the Analysis

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import os

In [3]:
# Root folder that holds the datasets (machine-specific).
# Alternative location used on another machine:
#path_base = '/home/yeat/Documents/personal/datasets/'
# NOTE(review): absolute local path — consider making this configurable.
path_base = '/home/yeat_fedora/Documents/datasets/'

# os.listdir() returns entries in arbitrary order, while the loading cell picks
# files by position (files[0] ... files[5]). Sorting makes the position of each
# file reproducible across machines and filesystems.
files = sorted(os.listdir(os.path.join(path_base, 'Covid-19')))

In [4]:
def read_data(path: str, filename: str) -> pd.DataFrame:
    """Load one CSV file from a directory into a DataFrame.

    Parameters
    ----------
    path : str
        Directory containing the file; a trailing separator is optional.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, as returned by ``pd.read_csv`` with default options.
    """
    # os.path.join adds a separator only when one is needed, so a directory
    # that already ends in '/' no longer produces 'dir//file'.
    return pd.read_csv(os.path.join(path, filename))

In [5]:
# Directory containing the CSV files listed in `files`.
path = path_base + 'Covid-19/'

# Read the first six entries of `files`, in index order, one DataFrame each.
# NOTE(review): the assignment is purely positional. The column listings shown
# further down suggest the names may not match the contents (for example,
# `usa_country_wise` shows country-level columns such as 'Continent' and
# 'WHO Region') — confirm which file each index refers to.
(
    worldometer_data,
    country_wise_latest,
    day_wise,
    full_grouped,
    covid_19_clean_complete,
    usa_country_wise,
) = [read_data(path, files[idx]) for idx in range(6)]

In [15]:
# Show the column labels of the frame bound to `usa_country_wise`.
# NOTE(review): the listing contains country-level fields ('Continent',
# 'Population', 'WHO Region'); verify this is the file the name implies.
usa_country_wise.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region'],
      dtype='object')

## 46 - Analysing Total Cases, Deaths, Recovered and Active Cases
#### 1. Which country has the maximum total cases, deaths, recovered and active cases?
#### 2. What is the trend of confirmed, deaths, recovered and active cases?

In [6]:
# Show the column labels of the frame bound to `worldometer_data`
# (used below to choose the metrics for the treemaps).
worldometer_data.columns

Index(['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Active',
       'New cases', 'New deaths', 'New recovered', 'Deaths / 100 Cases',
       'Recovered / 100 Cases', 'Deaths / 100 Recovered',
       'Confirmed last week', '1 week change', '1 week % increase',
       'WHO Region'],
      dtype='object')

In [None]:
# Preview the first rows of `worldometer_data`.
worldometer_data.head()

In [5]:
# Metrics drawn as one treemap each.
# NOTE(review): the section heading asks about *total* cases and recoveries,
# while this list uses 'New cases' / 'New recovered' — confirm the intended columns.
columns = ['New cases', 'Deaths', 'New recovered', 'Active']
for metric in columns:
    # Rank by the metric being plotted instead of taking the first 20 rows, so
    # each treemap shows the 20 countries with the highest values for that
    # metric regardless of how the frame happens to be ordered.
    top_countries = worldometer_data.nlargest(20, metric)
    fig = px.treemap(
        top_countries,
        values=metric,
        path=['Country/Region'],
        title='Tree map representation of COVID-19 cases to their {}'.format(metric),
    )
    fig.show()


In [None]:
# Show the column labels of `day_wise` (the frame used for the trend plot below).
day_wise.columns

Index(['Date', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'New cases',
       'New deaths', 'New recovered', 'Deaths / 100 Cases',
       'Recovered / 100 Cases', 'Deaths / 100 Recovered', 'No. of countries'],
      dtype='object')

In [9]:
# Preview the first rows of `day_wise`; each row carries a 'Date' and the counts for that date.
day_wise.head()

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,No. of countries
0,2020-01-22,555,17,28,510,0,0,0,3.06,5.05,60.71,6
1,2020-01-23,654,18,30,606,99,1,2,2.75,4.59,60.0,8
2,2020-01-24,941,26,36,879,287,8,6,2.76,3.83,72.22,9
3,2020-01-25,1434,42,39,1353,493,16,3,2.93,2.72,107.69,11
4,2020-01-26,2118,56,52,2010,684,14,13,2.64,2.46,107.69,13


In [10]:
# Line chart of the four case-count columns of `day_wise` against 'Date';
# plotly draws one line per column listed in `y`.
px.line(
    day_wise,
    x='Date',
    y=['Confirmed', 'Deaths', 'Recovered', 'Active'],
    title='Covid cases to date',
    template='plotly_dark',
)

## 47 - Perform EDA on Data
### 1. Visualize the population-to-tests-done ratio
### 2. The 20 countries that are worst affected by the coronavirus
### 3. Top 20 countries by total confirmed cases and total recovered

In [16]:
# Preview the first rows of `usa_country_wise`; the output shows one row per country.
usa_country_wise.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [21]:
# Re-display the column labels of `usa_country_wise` before computing the ratio below.
usa_country_wise.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region'],
      dtype='object')

In [None]:
# Row-wise ratio of 'Population' to 'TotalTests' (one value per row of the frame).
pop_test_ratio = usa_country_wise['Population'].div(usa_country_wise['TotalTests'])

In [28]:
# Bar chart of the population/test ratio for the first 20 rows of the frame,
# one coloured bar per 'Country/Region'.
fig = px.bar(
    usa_country_wise.head(20),
    x='Country/Region',
    y=pop_test_ratio.head(20),
    color='Country/Region',
    title='Pop test ratio',
)
fig.show()

In [29]:
# Re-display the column labels of `usa_country_wise` to pick the columns for the next chart.
usa_country_wise.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region'],
      dtype='object')

In [30]:
# Bar chart for the first 20 rows of the frame: five count columns per
# 'Country/Region', each column drawn as its own coloured series.
px.bar(
    usa_country_wise.head(20),
    x='Country/Region',
    y=[
        'Serious,Critical',
        'TotalDeaths',
        'TotalRecovered',
        'ActiveCases',
        'TotalCases',
    ],
)

### 4. Worst 20 countries having maximum confirmed cases
### 5. Worst 20 countries having maximum total deaths
### 6. Worst 20 countries having maximum active cases
### 7. Worst 20 countries having maximum recovered cases

In [31]:
# Horizontal bars of 'TotalCases' for the first 20 rows of the frame; bar colour
# and the text label on each bar both come from the same column.
fig = px.bar(
    usa_country_wise.head(20),
    y='Country/Region',
    x='TotalCases',
    color='TotalCases',
    text='TotalCases',
).update_layout(
    template='plotly_dark',
    title_text='Top 20 countries of total confirmed cases',
)
fig.show()

In [33]:
# The 20 rows with the highest 'TotalDeaths', largest first.
usa_country_wise.sort_values('TotalDeaths', ascending=False).head(20)

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
5,Mexico,North America,129066200.0,462690,6590.0,50517.0,819.0,308848.0,4140.0,103325.0,3987.0,3585.0,391.0,1056915.0,8189.0,Americas
11,UK,Europe,67922030.0,308134,,46413.0,,,,,73.0,4537.0,683.0,17515234.0,257873.0,Europe
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
15,Italy,Europe,60452570.0,249204,,35187.0,,201323.0,,12694.0,42.0,4122.0,582.0,7099713.0,117443.0,Europe
19,France,Europe,65288310.0,195633,,30312.0,,82460.0,,82861.0,384.0,2996.0,464.0,3992206.0,61147.0,Europe
9,Spain,Europe,46756650.0,354530,,28500.0,,,,,617.0,7582.0,610.0,7064329.0,151087.0,Europe
6,Peru,South America,33016320.0,455409,,20424.0,,310337.0,,124648.0,1426.0,13793.0,619.0,2493429.0,75521.0,Americas
10,Iran,Asia,84097620.0,320117,,17976.0,,277463.0,,24678.0,4156.0,3806.0,214.0,2612763.0,31068.0,EasternMediterranean


In [34]:
# Horizontal bars of 'TotalDeaths' for the 20 rows with the highest death
# counts; bar colour and text label both come from 'TotalDeaths'.
fig = px.bar(
    usa_country_wise.sort_values('TotalDeaths', ascending=False).head(20),
    y='Country/Region',
    x='TotalDeaths',
    color='TotalDeaths',
    text='TotalDeaths',
).update_layout(
    template='plotly_dark',
    title_text='Top 20 countries of total deaths cases',
)
fig.show()

In [36]:
# First 15 rows of the frame; the head() output earlier in the notebook shows
# the rows in descending 'TotalCases' order.
worst_15 = usa_country_wise.iloc[0:15]
# Slice names: one per country in the selection.
labels = worst_15['Country/Region'].values
cases = ['TotalCases', 'TotalDeaths', 'TotalRecovered', 'ActiveCases']
for case_type in cases:
    # One donut chart per metric. The slices are countries, so the title says
    # so (the previous wording referred to WHO regions, which are not plotted).
    fig = px.pie(
        worst_15,
        values=case_type,
        names=labels,
        hole=0.3,
        title='{} recorded for the 15 worst affected countries'.format(case_type),
    )
    fig.show()