# Covid cases

### Initial Configuration

In [2]:
# Importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as st
import seaborn as sns

In [3]:
# Load the worldwide COVID-19 dataset from the CSV in the working directory
# and preview the first five rows.
df = pd.read_csv('./covid_worldwide.csv')
df.head()

Unnamed: 0,Serial Number,Country,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
0,1,USA,104196861,1132935,101322779,1741147,1159832679,334805269
1,2,India,44682784,530740,44150289,1755,915265788,1406631776
2,3,France,39524311,164233,39264546,95532,271490188,65584518
3,4,Germany,37779833,165711,37398100,216022,122332384,83883596
4,5,Brazil,36824580,697074,35919372,208134,63776166,215353593


In [4]:
# Inspect the raw column names (they contain spaces and capital letters).
df.columns

Index(['Serial Number', 'Country', 'Total Cases', 'Total Deaths',
       'Total Recovered', 'Active Cases', 'Total Test', 'Population'],
      dtype='object')

In [5]:
# Check dtypes and non-null counts: every count column is read as `object`
# (strings), and several of them have missing entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Serial Number    231 non-null    int64 
 1   Country          231 non-null    object
 2   Total Cases      231 non-null    object
 3   Total Deaths     225 non-null    object
 4   Total Recovered  210 non-null    object
 5   Active Cases     212 non-null    object
 6   Total Test       213 non-null    object
 7   Population       228 non-null    object
dtypes: int64(1), object(7)
memory usage: 14.6+ KB


In [6]:
# Convert every header to snake_case: trim surrounding whitespace, turn
# spaces into underscores and lower-case the result.
df.columns = [
    header.strip().replace(' ', '_').lower()
    for header in df.columns
]
df

Unnamed: 0,serial_number,country,total_cases,total_deaths,total_recovered,active_cases,total_test,population
0,1,USA,104196861,1132935,101322779,1741147,1159832679,334805269
1,2,India,44682784,530740,44150289,1755,915265788,1406631776
2,3,France,39524311,164233,39264546,95532,271490188,65584518
3,4,Germany,37779833,165711,37398100,216022,122332384,83883596
4,5,Brazil,36824580,697074,35919372,208134,63776166,215353593
...,...,...,...,...,...,...,...,...
226,227,Diamond Princess,712,13,699,0,,
227,228,Vatican City,29,,29,0,,799
228,229,Western Sahara,10,1,9,0,,626161
229,230,MS Zaandam,9,2,7,0,,


In [7]:
# Confirm the renamed columns; the count columns are still `object` at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   serial_number    231 non-null    int64 
 1   country          231 non-null    object
 2   total_cases      231 non-null    object
 3   total_deaths     225 non-null    object
 4   total_recovered  210 non-null    object
 5   active_cases     212 non-null    object
 6   total_test       213 non-null    object
 7   population       228 non-null    object
dtypes: int64(1), object(7)
memory usage: 14.6+ KB


In [8]:
# Drop 'serial_number': in the preview above it is just a 1-based row counter
# that mirrors the DataFrame index.
# errors='ignore' keeps this cell safe to re-run: a plain drop raises KeyError
# the second time because the column is already gone.
df = df.drop(columns='serial_number', errors='ignore')
df.head()

Unnamed: 0,country,total_cases,total_deaths,total_recovered,active_cases,total_test,population
0,USA,104196861,1132935,101322779,1741147,1159832679,334805269
1,India,44682784,530740,44150289,1755,915265788,1406631776
2,France,39524311,164233,39264546,95532,271490188,65584518
3,Germany,37779833,165711,37398100,216022,122332384,83883596
4,Brazil,36824580,697074,35919372,208134,63776166,215353593


In [9]:
# Remove any ',' characters from the string-typed count columns so that they
# can be cast to integers further down. Missing values stay missing.
for column in ('total_cases', 'total_deaths', 'total_recovered',
               'active_cases', 'total_test', 'population'):
    df[column] = df[column].str.replace(',', '')

In [10]:
# Count the missing values in each column.
df.isna().sum()

country             0
total_cases         0
total_deaths        6
total_recovered    21
active_cases       19
total_test         18
population          3
dtype: int64

In [11]:
# Show every row that has at least one missing value.
df[df.isna().any(axis=1)]

Unnamed: 0,country,total_cases,total_deaths,total_recovered,active_cases,total_test,population
10,Turkey,17042722,101492.0,,,162743369.0,85561976.0
29,DPRK,4772813,74.0,4772739.0,0.0,,25990679.0
65,Tunisia,1150356,29308.0,,,4997373.0,12046656.0
77,Paraguay,806256,19820.0,,,2657506.0,7305843.0
90,China,503302,5272.0,379053.0,118977.0,160000000.0,
94,Honduras,470757,11104.0,,,1619676.0,10221247.0
111,Martinique,229687,1097.0,,,828928.0,374087.0
112,Laos,217973,758.0,,,1233207.0,7481023.0
113,Iceland,208688,229.0,,,1996384.0,345393.0
117,Guadeloupe,201394,1007.0,,,938039.0,399794.0


In [12]:
# Fill the missing values with 0; the integer cast in the next cell would
# fail on NaN.
# NOTE(review): 0 here stands for "not reported", not a true zero. For example,
# China's population and Turkey's recovered/active counts are missing in the
# rows listed above, so any ratio or map built on these columns will understate
# those countries. Consider keeping NaN (e.g. nullable Int64) if that matters.
df = df.fillna(0)

In [13]:
# Cast every count column from string/object to 64-bit integers in a single
# astype call, then print the schema to confirm the new dtypes.
count_columns = ['total_cases', 'total_deaths', 'total_recovered',
                 'active_cases', 'total_test', 'population']
df = df.astype({name: np.int64 for name in count_columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   country          231 non-null    object
 1   total_cases      231 non-null    int64 
 2   total_deaths     231 non-null    int64 
 3   total_recovered  231 non-null    int64 
 4   active_cases     231 non-null    int64 
 5   total_test       231 non-null    int64 
 6   population       231 non-null    int64 
dtypes: int64(6), object(1)
memory usage: 12.8+ KB


In [14]:
# Display the full record (all columns) for Peru.
df.loc[df['country'].eq('Peru')]

Unnamed: 0,country,total_cases,total_deaths,total_recovered,active_cases,total_test,population
34,Peru,4481621,218931,4258688,4002,37754603,33684208


In [19]:
# World choropleth of total confirmed cases per country.
# `px` (plotly.express) comes from the notebook's import cell at the top, so
# it is not re-imported here.
# locationmode='country names' makes plotly match the text in `country`
# against country names rather than ISO codes.
fig = px.choropleth(
    df,
    locations="country",
    locationmode='country names',
    color="total_cases",
    hover_name="country",
    color_continuous_scale=px.colors.sequential.Plasma,
    # Colour scale spans the full observed range of total_cases.
    range_color=[df['total_cases'].min(), df['total_cases'].max()],
    title='Covid-19: Total Cases by Country',
)
fig.show()

In [16]:
# NOTE(review): empty placeholder cell - delete it or add the intended analysis.

In [22]:
# World choropleth of total reported deaths per country.
# `px` (plotly.express) comes from the notebook's import cell at the top, so
# it is not re-imported here.
fig = px.choropleth(
    df,
    locations="country",
    locationmode='country names',
    color="total_deaths",
    hover_name="country",
    color_continuous_scale=px.colors.sequential.Plasma,
    # Colour scale spans the full observed range of total_deaths.
    range_color=[df['total_deaths'].min(), df['total_deaths'].max()],
    title='Covid-19: Total Deaths by Country',
)
fig.show()

In [30]:
# Deaths as a percentage of total cases, per country.
# NOTE: missing death counts were replaced with 0 earlier in the notebook, so
# those countries show 0% here rather than "unknown".
df['percentage_deaths'] = (df['total_deaths'] / df['total_cases'])*100

# `px` (plotly.express) comes from the notebook's import cell at the top.
fig = px.choropleth(
    df,
    locations="country",
    locationmode='country names',
    color="percentage_deaths",
    hover_name="country",
    color_continuous_scale=px.colors.sequential.Plasma,
    # The upper end of the colour scale is capped at the 95th percentile, so
    # the few highest values all share the top colour instead of stretching
    # the scale.
    range_color=[df['percentage_deaths'].min(), df['percentage_deaths'].quantile(0.95)],
    # The title previously read "Total Deaths by Country" (copied from the
    # previous map); it now describes the plotted metric.
    title='Covid-19: Deaths as a Percentage of Total Cases by Country',
)
fig.show()
