## Exploratory Data Analysis (EDA)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Set the seaborn 
sns.set_theme()

In [4]:
covid = pd.read_csv("data/covid_19_clean_complete.csv",
                       parse_dates=True)
world = pd.read_csv("data/worldometer_data.csv")

In [5]:
covid.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region
0,,Afghanistan,33.93911,67.709953,2020-01-22,0,0,0,0,Eastern Mediterranean
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0,0,Europe
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0,0,Africa
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0,0,Europe
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0,0,Africa


In [6]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49068 entries, 0 to 49067
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  14664 non-null  object 
 1   Country/Region  49068 non-null  object 
 2   Lat             49068 non-null  float64
 3   Long            49068 non-null  float64
 4   Date            49068 non-null  object 
 5   Confirmed       49068 non-null  int64  
 6   Deaths          49068 non-null  int64  
 7   Recovered       49068 non-null  int64  
 8   Active          49068 non-null  int64  
 9   WHO Region      49068 non-null  object 
dtypes: float64(2), int64(4), object(4)
memory usage: 3.7+ MB


In [7]:
covid.describe()

Unnamed: 0,Lat,Long,Confirmed,Deaths,Recovered,Active
count,49068.0,49068.0,49068.0,49068.0,49068.0,49068.0
mean,21.43373,23.528236,16884.9,884.17916,7915.713,8085.012
std,24.95032,70.44274,127300.2,6313.584411,54800.92,76258.9
min,-51.7963,-135.0,0.0,0.0,0.0,-14.0
25%,7.873054,-15.3101,4.0,0.0,0.0,0.0
50%,23.6345,21.7453,168.0,2.0,29.0,26.0
75%,41.20438,80.771797,1518.25,30.0,666.0,606.0
max,71.7069,178.065,4290259.0,148011.0,1846641.0,2816444.0


In [8]:
covid.rename(columns = {"Province/State": "State", "Country/Region": "Country"}, inplace=True)

In [9]:
covid.head()

Unnamed: 0,State,Country,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region
0,,Afghanistan,33.93911,67.709953,2020-01-22,0,0,0,0,Eastern Mediterranean
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0,0,Europe
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0,0,Africa
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0,0,Europe
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0,0,Africa


In [10]:
covid["State"].fillna("Unknown", inplace=True)

In [11]:
covid.isna().sum()

State         0
Country       0
Lat           0
Long          0
Date          0
Confirmed     0
Deaths        0
Recovered     0
Active        0
WHO Region    0
dtype: int64

In [12]:
world.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [13]:
world.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country/Region    209 non-null    object 
 1   Continent         208 non-null    object 
 2   Population        208 non-null    float64
 3   TotalCases        209 non-null    int64  
 4   NewCases          4 non-null      float64
 5   TotalDeaths       188 non-null    float64
 6   NewDeaths         3 non-null      float64
 7   TotalRecovered    205 non-null    float64
 8   NewRecovered      3 non-null      float64
 9   ActiveCases       205 non-null    float64
 10  Serious,Critical  122 non-null    float64
 11  Tot Cases/1M pop  208 non-null    float64
 12  Deaths/1M pop     187 non-null    float64
 13  TotalTests        191 non-null    float64
 14  Tests/1M pop      191 non-null    float64
 15  WHO Region        184 non-null    object 
dtypes: float64(12), int64(1), object(3)
memory u

In [14]:
world.rename(columns={"Country/Region": "Country", "Serious,Critical": "Serious"}, inplace=True)

In [15]:
world.head()

Unnamed: 0,Country,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,Serious,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [16]:
world.isna().sum()

Country               0
Continent             1
Population            1
TotalCases            0
NewCases            205
TotalDeaths          21
NewDeaths           206
TotalRecovered        4
NewRecovered        206
ActiveCases           4
Serious              87
Tot Cases/1M pop      1
Deaths/1M pop        22
TotalTests           18
Tests/1M pop         18
WHO Region           25
dtype: int64

In [17]:
world["Continent"].fillna("Asia", inplace=True)

In [18]:
world.isna().sum()

Country               0
Continent             0
Population            1
TotalCases            0
NewCases            205
TotalDeaths          21
NewDeaths           206
TotalRecovered        4
NewRecovered        206
ActiveCases           4
Serious              87
Tot Cases/1M pop      1
Deaths/1M pop        22
TotalTests           18
Tests/1M pop         18
WHO Region           25
dtype: int64

In [19]:
num = int(world["Population"].mean())
world.Population.fillna(num, inplace=True)

In [20]:
world["Population"].astype(int)

0       331198130
1       212710692
2      1381344997
3       145940924
4        59381566
          ...    
204          4992
205         26247
206          3489
207           801
208        598682
Name: Population, Length: 209, dtype: int32

In [21]:
world.isna().sum()

Country               0
Continent             0
Population            0
TotalCases            0
NewCases            205
TotalDeaths          21
NewDeaths           206
TotalRecovered        4
NewRecovered        206
ActiveCases           4
Serious              87
Tot Cases/1M pop      1
Deaths/1M pop        22
TotalTests           18
Tests/1M pop         18
WHO Region           25
dtype: int64

In [22]:
world["NewCases"].value_counts(), world.shape

(6590.0    1
 20.0      1
 30.0      1
 1282.0    1
 Name: NewCases, dtype: int64,
 (209, 16))

In [23]:
world["NewCases"].fillna(0, inplace=True)
world["NewCases"].astype(int)

0      0
1      0
2      0
3      0
4      0
      ..
204    0
205    0
206    0
207    0
208    0
Name: NewCases, Length: 209, dtype: int32

In [24]:
world.isna().sum()

Country               0
Continent             0
Population            0
TotalCases            0
NewCases              0
TotalDeaths          21
NewDeaths           206
TotalRecovered        4
NewRecovered        206
ActiveCases           4
Serious              87
Tot Cases/1M pop      1
Deaths/1M pop        22
TotalTests           18
Tests/1M pop         18
WHO Region           25
dtype: int64

In [25]:
world.head()

Unnamed: 0,Country,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,Serious,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,0.0,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,0.0,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,0.0,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,0.0,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,0.0,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [26]:
world["TotalDeaths"].fillna(0, inplace=True)

In [27]:
world.isna().sum()

Country               0
Continent             0
Population            0
TotalCases            0
NewCases              0
TotalDeaths           0
NewDeaths           206
TotalRecovered        4
NewRecovered        206
ActiveCases           4
Serious              87
Tot Cases/1M pop      1
Deaths/1M pop        22
TotalTests           18
Tests/1M pop         18
WHO Region           25
dtype: int64

In [28]:
world["NewDeaths"].fillna(0, inplace=True)
world["NewRecovered"].fillna(0, inplace=True)
world["TotalRecovered"].fillna(0, inplace=True)

In [29]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          4
Serious             87
Tot Cases/1M pop     1
Deaths/1M pop       22
TotalTests          18
Tests/1M pop        18
WHO Region          25
dtype: int64

In [30]:
world["ActiveCases"].fillna(0, inplace=True)

In [31]:
world.rename(columns={"Country/Region": "Country", "Serious,Critical": "Serious"}, inplace=True)

In [32]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          0
Serious             87
Tot Cases/1M pop     1
Deaths/1M pop       22
TotalTests          18
Tests/1M pop        18
WHO Region          25
dtype: int64

In [33]:
world.loc[world.ActiveCases==0,'Serious'] = world.loc[world.ActiveCases==0,'Serious'].fillna(0)

In [34]:
world.loc[world.ActiveCases==0,'Serious']

9      617.0
11      73.0
29      38.0
40      37.0
157      0.0
166      0.0
167      0.0
176      0.0
181      0.0
193      0.0
199      0.0
201      0.0
203      0.0
206      0.0
207      0.0
Name: Serious, dtype: float64

In [35]:
serious = np.mean((world["ActiveCases"])*0.008)
serious

217.07892822966514

In [36]:
world["Serious"].fillna(serious, inplace=True)

In [37]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          0
Serious              0
Tot Cases/1M pop     1
Deaths/1M pop       22
TotalTests          18
Tests/1M pop        18
WHO Region          25
dtype: int64

In [38]:
world.head()

Unnamed: 0,Country,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,Serious,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,0.0,162804.0,0.0,2576668.0,0.0,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,0.0,98644.0,0.0,2047660.0,0.0,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,0.0,41638.0,0.0,1377384.0,0.0,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,0.0,14606.0,0.0,676357.0,0.0,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,0.0,9604.0,0.0,387316.0,0.0,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [39]:
# If we have 0 in Total Cases we can simply use 0 to fill the Total Cases/1M pop, that makes sense
world[world["TotalCases"] == 0]["Tot Cases/1M pop"]

Series([], Name: Tot Cases/1M pop, dtype: float64)

We got a empty dataframe,so we need to fill this in other way

In [40]:
world["Tot Cases/1M pop"].fillna("None", inplace=True)

In [41]:
index = world[world["Tot Cases/1M pop"] == "None"].index
total_pop = int(world["Population"][index])
total_cases = int(world["TotalCases"][index]) + int(world["NewCases"][index])
total_pop, total_cases

(30415486, 712)

In [42]:
tot_cases_per_1M_pop = (total_cases)/((total_pop)//1000000) 
world.loc[(world["Tot Cases/1M pop"] == "None"), "Tot Cases/1M pop"] = tot_cases_per_1M_pop

In [43]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          0
Serious              0
Tot Cases/1M pop     0
Deaths/1M pop       22
TotalTests          18
Tests/1M pop        18
WHO Region          25
dtype: int64

In [44]:
world.loc[world.TotalDeaths==0,'Deaths/1M pop'] = world.loc[world.TotalDeaths==0,'Deaths/1M pop'].fillna(0)

In [45]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          0
Serious              0
Tot Cases/1M pop     0
Deaths/1M pop        1
TotalTests          18
Tests/1M pop        18
WHO Region          25
dtype: int64

In [46]:
world["Deaths/1M pop"].fillna("None", inplace=True)

In [47]:
index = world[world["Deaths/1M pop"] == "None"].index
total_pop = int(world["Population"][index])
total_deaths = int(world["TotalDeaths"][index]) + int(world["NewDeaths"][index])
total_pop, total_deaths

(30415486, 13)

In [48]:
tot_deaths_per_1M_pop = (total_deaths)/((total_pop)//1000000) 
world.loc[(world["Deaths/1M pop"] == "None"), "Deaths/1M pop"] = tot_deaths_per_1M_pop

In [49]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          0
Serious              0
Tot Cases/1M pop     0
Deaths/1M pop        0
TotalTests          18
Tests/1M pop        18
WHO Region          25
dtype: int64

In [50]:
world.head()

Unnamed: 0,Country,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,Serious,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,0.0,162804.0,0.0,2576668.0,0.0,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,0.0,98644.0,0.0,2047660.0,0.0,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,0.0,41638.0,0.0,1377384.0,0.0,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,0.0,14606.0,0.0,676357.0,0.0,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,0.0,9604.0,0.0,387316.0,0.0,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [51]:
world["TotalTests"].fillna("None", inplace=True)

In [52]:
world.index[world["TotalTests"] == "None"].tolist()

[54,
 82,
 86,
 106,
 108,
 111,
 130,
 138,
 142,
 145,
 149,
 156,
 161,
 163,
 169,
 184,
 207,
 208]

In [53]:
column = world["TotalTests"]
for idx in world.index[world["TotalTests"] == "None"].tolist():
    if column[idx] == "None":
        column[idx] = (world["TotalCases"][idx]) + (world["NewCases"][idx]) + (world["TotalDeaths"][idx])+(world["NewDeaths"][idx])+(world["TotalRecovered"][idx])+(world["NewRecovered"][idx])

In [54]:
world[106:112]

Unnamed: 0,Country,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,Serious,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
106,Nicaragua,North America,6632263.0,3902,0.0,123.0,0.0,2913.0,0.0,866.0,217.078928,588.0,19.0,6938.0,,Americas
107,Hong Kong,Asia,7503041.0,3850,0.0,46.0,0.0,2458.0,0.0,1346.0,39.0,513.0,6.0,692430.0,92287.0,WesternPacific
108,Congo,Africa,5530506.0,3546,0.0,58.0,0.0,1589.0,0.0,1899.0,217.078928,641.0,10.0,5193.0,,Africa
109,Montenegro,Europe,628074.0,3480,0.0,60.0,0.0,2178.0,0.0,1242.0,217.078928,5541.0,96.0,38427.0,61182.0,Europe
110,Thailand,Asia,69817894.0,3330,0.0,58.0,0.0,3148.0,0.0,124.0,1.0,48.0,0.8,749213.0,10731.0,South-EastAsia
111,Somalia,Africa,15933012.0,3227,0.0,93.0,0.0,1728.0,0.0,1406.0,2.0,203.0,6.0,5048.0,,EasternMediterranean


In [55]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          0
Serious              0
Tot Cases/1M pop     0
Deaths/1M pop        0
TotalTests           0
Tests/1M pop        18
WHO Region          25
dtype: int64

In [56]:
world["Tests/1M pop"].fillna("None", inplace=True)

In [57]:
column = world["Tests/1M pop"]
for idx in world.index[world["Tests/1M pop"] == "None"].tolist():
    if column[idx] == "None":
        total_pop = (world["Population"][idx])*10000
        total_tests = world["TotalTests"][idx]
        tot_test_per_1M_pop = (total_tests)/((total_pop)//1000000) 
        column[idx] = tot_test_per_1M_pop / 10000

In [58]:
world[106:112]

Unnamed: 0,Country,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,Serious,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
106,Nicaragua,North America,6632263.0,3902,0.0,123.0,0.0,2913.0,0.0,866.0,217.078928,588.0,19.0,6938.0,1e-05,Americas
107,Hong Kong,Asia,7503041.0,3850,0.0,46.0,0.0,2458.0,0.0,1346.0,39.0,513.0,6.0,692430.0,92287.0,WesternPacific
108,Congo,Africa,5530506.0,3546,0.0,58.0,0.0,1589.0,0.0,1899.0,217.078928,641.0,10.0,5193.0,9e-06,Africa
109,Montenegro,Europe,628074.0,3480,0.0,60.0,0.0,2178.0,0.0,1242.0,217.078928,5541.0,96.0,38427.0,61182.0,Europe
110,Thailand,Asia,69817894.0,3330,0.0,58.0,0.0,3148.0,0.0,124.0,1.0,48.0,0.8,749213.0,10731.0,South-EastAsia
111,Somalia,Africa,15933012.0,3227,0.0,93.0,0.0,1728.0,0.0,1406.0,2.0,203.0,6.0,5048.0,3e-06,EasternMediterranean


In [59]:
world.isna().sum()

Country              0
Continent            0
Population           0
TotalCases           0
NewCases             0
TotalDeaths          0
NewDeaths            0
TotalRecovered       0
NewRecovered         0
ActiveCases          0
Serious              0
Tot Cases/1M pop     0
Deaths/1M pop        0
TotalTests           0
Tests/1M pop         0
WHO Region          25
dtype: int64

In [60]:
world["WHO Region"].fillna("None", inplace=True)

In [61]:
world.isna().sum()

Country             0
Continent           0
Population          0
TotalCases          0
NewCases            0
TotalDeaths         0
NewDeaths           0
TotalRecovered      0
NewRecovered        0
ActiveCases         0
Serious             0
Tot Cases/1M pop    0
Deaths/1M pop       0
TotalTests          0
Tests/1M pop        0
WHO Region          0
dtype: int64

In [62]:
world.to_csv("updated-worldometer")