## Matplotlib: case COVID-19
Matplotlib is the most commonly used Python library for visualizing data. See https://jakevdp.github.io/PythonDataScienceHandbook/, a free online book by Jake Van der Plas, that covers a.o. pandas, matplotlib and machine learning.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# coviddata = pd.read_csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv")
'''
Concerning case Corona: the dataset referenced by the original link (https://opendata.ecdc.europa.eu/covid19/casedistribution/csv) has changed and now only contains weekly averages
We now use instead
coviddata = pd.read_excel ("https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-12-14.xlsx")
These are not the most recent values, but the notebook will then be fully functional again.
'''
coviddata = pd.read_excel("https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-12-14.xlsx")

In [3]:
# Print the first 5 (default value) records
coviddata.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,2020-12-14,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.013779
1,2020-12-13,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.052776
2,2020-12-12,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.868768
3,2020-12-11,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.134266
4,2020-12-10,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.968658


In [4]:
# Print the first 10 records
coviddata.head(10)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,27/10/2020,27,10,2020,199,8,Afghanistan,AF,AFG,38041757.0,Asia,2.902074
1,26/10/2020,26,10,2020,65,3,Afghanistan,AF,AFG,38041757.0,Asia,2.718066
2,25/10/2020,25,10,2020,81,4,Afghanistan,AF,AFG,38041757.0,Asia,2.799555
3,24/10/2020,24,10,2020,61,2,Afghanistan,AF,AFG,38041757.0,Asia,2.586631
4,23/10/2020,23,10,2020,116,4,Afghanistan,AF,AFG,38041757.0,Asia,2.452568
5,22/10/2020,22,10,2020,135,2,Afghanistan,AF,AFG,38041757.0,Asia,2.350049
6,21/10/2020,21,10,2020,88,2,Afghanistan,AF,AFG,38041757.0,Asia,2.173927
7,20/10/2020,20,10,2020,87,5,Afghanistan,AF,AFG,38041757.0,Asia,2.105581
8,19/10/2020,19,10,2020,59,4,Afghanistan,AF,AFG,38041757.0,Asia,2.258045
9,18/10/2020,18,10,2020,68,3,Afghanistan,AF,AFG,38041757.0,Asia,2.218615


In [4]:
# Print the last 5 (default value) records
coviddata.tail()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
61895,2020-03-25,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
61896,2020-03-24,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14645473.0,Africa,
61897,2020-03-23,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
61898,2020-03-22,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
61899,2020-03-21,21,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,


In [6]:
# Print the last 15 records
coviddata.tail(15)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
51668,04/04/2020,4,4,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.054624
51669,03/04/2020,3,4,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.054624
51670,02/04/2020,2,4,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
51671,01/04/2020,1,4,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
51672,31/03/2020,31,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
51673,30/03/2020,30,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
51674,29/03/2020,29,3,2020,2,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
51675,28/03/2020,28,3,2020,2,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
51676,27/03/2020,27,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
51677,26/03/2020,26,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,


In [7]:
# Names of the columns
coviddata.columns

Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2019', 'continentExp',
       'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000'],
      dtype='object')

In [8]:
# What is the index?
coviddata.index

RangeIndex(start=0, stop=51683, step=1)

In [9]:
# This is an easy way to check if there are some columns with missing data.
# E.g. CountryterritoryCode is 51398 and most other values are 51472
coviddata.count()

dateRep                                                       51683
day                                                           51683
month                                                         51683
year                                                          51683
cases                                                         51683
deaths                                                        51683
countriesAndTerritories                                       51683
geoId                                                         51456
countryterritoryCode                                          51608
popData2019                                                   51608
continentExp                                                  51683
Cumulative_number_for_14_days_of_COVID-19_cases_per_100000    48879
dtype: int64

In [10]:
# The dimensions of the dataframe
coviddata.shape

(51683, 12)

In [11]:
# General statistical information on numeric fields
coviddata.describe()

Unnamed: 0,day,month,year,cases,deaths,popData2019,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
count,51683.0,51683.0,51683.0,51683.0,51683.0,51608.0,48879.0
mean,15.885436,6.249966,2019.998704,843.566221,22.46377,41910340.0,42.494551
std,8.739709,2.515775,0.035982,4843.344797,124.707584,155461000.0,100.455527
min,1.0,1.0,2019.0,-8261.0,-1918.0,815.0,-147.419587
25%,8.0,4.0,2020.0,0.0,0.0,1324820.0,0.529216
50%,16.0,6.0,2020.0,12.0,0.0,8082359.0,5.464659
75%,23.0,8.0,2020.0,191.0,3.0,28608720.0,37.063642
max,31.0,12.0,2020.0,97894.0,4928.0,1433784000.0,1761.686598


In [12]:
# Checking the data types of the columns
coviddata.dtypes
# You can see that dateRep is an object and not datetype. We will have to change this.

dateRep                                                        object
day                                                             int64
month                                                           int64
year                                                            int64
cases                                                           int64
deaths                                                          int64
countriesAndTerritories                                        object
geoId                                                          object
countryterritoryCode                                           object
popData2019                                                   float64
continentExp                                                   object
Cumulative_number_for_14_days_of_COVID-19_cases_per_100000    float64
dtype: object

In [13]:
# We have to cast the dateRep from object to datetime
from datetime import datetime
coviddata['dateRep'] = pd.to_datetime(coviddata['dateRep'], format='%d/%m/%Y')
coviddata.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,2020-10-27,27,10,2020,199,8,Afghanistan,AF,AFG,38041757.0,Asia,2.902074
1,2020-10-26,26,10,2020,65,3,Afghanistan,AF,AFG,38041757.0,Asia,2.718066
2,2020-10-25,25,10,2020,81,4,Afghanistan,AF,AFG,38041757.0,Asia,2.799555
3,2020-10-24,24,10,2020,61,2,Afghanistan,AF,AFG,38041757.0,Asia,2.586631
4,2020-10-23,23,10,2020,116,4,Afghanistan,AF,AFG,38041757.0,Asia,2.452568


In [14]:
# Checking the data types of the columns again
coviddata.dtypes
# You can see that dateRep is an object and not datetype. We will have to change this.

dateRep                                                       datetime64[ns]
day                                                                    int64
month                                                                  int64
year                                                                   int64
cases                                                                  int64
deaths                                                                 int64
countriesAndTerritories                                               object
geoId                                                                 object
countryterritoryCode                                                  object
popData2019                                                          float64
continentExp                                                          object
Cumulative_number_for_14_days_of_COVID-19_cases_per_100000           float64
dtype: object

In [15]:
# The column name countriesAndTerritories and Cumulative_number_for_14_days_of_COVID-19_cases_per_100000 are quite long, let's rename both
coviddata.rename(columns={'countriesAndTerritories':'country', 'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000':'cum_number_14_days_cases_per_100000'}, inplace=True)
coviddata.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,country,geoId,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
0,2020-10-27,27,10,2020,199,8,Afghanistan,AF,AFG,38041757.0,Asia,2.902074
1,2020-10-26,26,10,2020,65,3,Afghanistan,AF,AFG,38041757.0,Asia,2.718066
2,2020-10-25,25,10,2020,81,4,Afghanistan,AF,AFG,38041757.0,Asia,2.799555
3,2020-10-24,24,10,2020,61,2,Afghanistan,AF,AFG,38041757.0,Asia,2.586631
4,2020-10-23,23,10,2020,116,4,Afghanistan,AF,AFG,38041757.0,Asia,2.452568


In [16]:
# We have duplicate data: geoId, vs countryterritoryCode => drop geoId
# axis = 1 is necessary to indicate we want to delete columns (use axis = 0 for row based deletion) 
# If you only want to drop more 1 column, you don't really need square brackets
coviddata = coviddata.drop(['geoId'], axis = 1)
coviddata.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
0,2020-10-27,27,10,2020,199,8,Afghanistan,AFG,38041757.0,Asia,2.902074
1,2020-10-26,26,10,2020,65,3,Afghanistan,AFG,38041757.0,Asia,2.718066
2,2020-10-25,25,10,2020,81,4,Afghanistan,AFG,38041757.0,Asia,2.799555
3,2020-10-24,24,10,2020,61,2,Afghanistan,AFG,38041757.0,Asia,2.586631
4,2020-10-23,23,10,2020,116,4,Afghanistan,AFG,38041757.0,Asia,2.452568


In [17]:
# Excercise: drop the columns day and year
coviddata = coviddata.drop(['day','year'], axis = 1)
coviddata.head()

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
0,2020-10-27,10,199,8,Afghanistan,AFG,38041757.0,Asia,2.902074
1,2020-10-26,10,65,3,Afghanistan,AFG,38041757.0,Asia,2.718066
2,2020-10-25,10,81,4,Afghanistan,AFG,38041757.0,Asia,2.799555
3,2020-10-24,10,61,2,Afghanistan,AFG,38041757.0,Asia,2.586631
4,2020-10-23,10,116,4,Afghanistan,AFG,38041757.0,Asia,2.452568


In [18]:
# What are the unique values for country
coviddata['country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba', 'Bosnia_and_Herzegovina',
       'Botswana', 'Brazil', 'British_Virgin_Islands',
       'Brunei_Darussalam', 'Bulgaria', 'Burkina_Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Canada', 'Cape_Verde',
       'Cases_on_an_international_conveyance_Japan', 'Cayman_Islands',
       'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo', 'Costa_Rica', 'Cote_dIvoire', 'Croatia',
       'Cuba', 'Curaçao', 'Cyprus', 'Czechia',
       'Democratic_Republic_of_the_Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',
       'El_Salvador', 'Equatorial_Guinea', 'Eri

In [19]:
# How many unique values for country?
len(coviddata['country'].unique())

212

In [20]:
# FILTERING
# Find all lines from Belgium
coviddata[coviddata['country'] == 'Belgium']

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
4623,2020-10-27,10,8,25,Belgium,BEL,11455519.0,Europe,1390.875437
4624,2020-10-26,10,1485,66,Belgium,BEL,11455519.0,Europe,1398.932689
4625,2020-10-25,10,7187,70,Belgium,BEL,11455519.0,Europe,1373.783239
4626,2020-10-24,10,14560,85,Belgium,BEL,11455519.0,Europe,1314.623982
4627,2020-10-23,10,15085,53,Belgium,BEL,11455519.0,Europe,1240.423939
...,...,...,...,...,...,...,...,...,...
4920,2020-01-04,1,0,0,Belgium,BEL,11455519.0,Europe,
4921,2020-01-03,1,0,0,Belgium,BEL,11455519.0,Europe,
4922,2020-01-02,1,0,0,Belgium,BEL,11455519.0,Europe,
4923,2020-01-01,1,0,0,Belgium,BEL,11455519.0,Europe,


In [21]:
# Find all lines for the Benelux
# Of course you could use the countryterritoryCode of these countries. But what if you want to use country and you don't have countryterritoryCode?
# You could first try to find countries from Europe
coviddata[coviddata['continentExp']=='Europe']

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
292,2020-10-27,10,288,3,Albania,ALB,2862427.0,Europe,135.374631
293,2020-10-26,10,299,4,Albania,ALB,2862427.0,Europe,131.287191
294,2020-10-25,10,302,4,Albania,ALB,2862427.0,Europe,126.710655
295,2020-10-24,10,306,4,Albania,ALB,2862427.0,Europe,121.924507
296,2020-10-23,10,302,3,Albania,ALB,2862427.0,Europe,117.068488
...,...,...,...,...,...,...,...,...,...
49116,2020-01-04,1,0,0,United_Kingdom,GBR,66647112.0,Europe,
49117,2020-01-03,1,0,0,United_Kingdom,GBR,66647112.0,Europe,
49118,2020-01-02,1,0,0,United_Kingdom,GBR,66647112.0,Europe,
49119,2020-01-01,1,0,0,United_Kingdom,GBR,66647112.0,Europe,


In [22]:
# This doesn't help us to find the correct name of The Netherlands, so we try it in another way
coviddata[coviddata['country'].str.contains('etherlands')]

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
33595,2020-10-27,10,10324,26,Netherlands,NLD,17282163.0,Europe,694.097145
33596,2020-10-26,10,10194,27,Netherlands,NLD,17282163.0,Europe,673.960777
33597,2020-10-25,10,8639,55,Netherlands,NLD,17282163.0,Europe,651.822344
33598,2020-10-24,10,9983,45,Netherlands,NLD,17282163.0,Europe,639.404917
33599,2020-10-23,10,9268,46,Netherlands,NLD,17282163.0,Europe,616.120795
...,...,...,...,...,...,...,...,...,...
33892,2020-01-04,1,0,0,Netherlands,NLD,17282163.0,Europe,
33893,2020-01-03,1,0,0,Netherlands,NLD,17282163.0,Europe,
33894,2020-01-02,1,0,0,Netherlands,NLD,17282163.0,Europe,
33895,2020-01-01,1,0,0,Netherlands,NLD,17282163.0,Europe,


In [23]:
# Find all lines for the Benelux
coviddata[coviddata['country'].isin(['Belgium','Netherlands', 'Luxembourg'])]

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
4623,2020-10-27,10,8,25,Belgium,BEL,11455519.0,Europe,1390.875437
4624,2020-10-26,10,1485,66,Belgium,BEL,11455519.0,Europe,1398.932689
4625,2020-10-25,10,7187,70,Belgium,BEL,11455519.0,Europe,1373.783239
4626,2020-10-24,10,14560,85,Belgium,BEL,11455519.0,Europe,1314.623982
4627,2020-10-23,10,15085,53,Belgium,BEL,11455519.0,Europe,1240.423939
...,...,...,...,...,...,...,...,...,...
33892,2020-01-04,1,0,0,Netherlands,NLD,17282163.0,Europe,
33893,2020-01-03,1,0,0,Netherlands,NLD,17282163.0,Europe,
33894,2020-01-02,1,0,0,Netherlands,NLD,17282163.0,Europe,
33895,2020-01-01,1,0,0,Netherlands,NLD,17282163.0,Europe,


In [24]:
# Find all lines where the number of deaths is not 0
coviddata[~(coviddata['deaths'] == 0)]

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
0,2020-10-27,10,199,8,Afghanistan,AFG,38041757.0,Asia,2.902074
1,2020-10-26,10,65,3,Afghanistan,AFG,38041757.0,Asia,2.718066
2,2020-10-25,10,81,4,Afghanistan,AFG,38041757.0,Asia,2.799555
3,2020-10-24,10,61,2,Afghanistan,AFG,38041757.0,Asia,2.586631
4,2020-10-23,10,116,4,Afghanistan,AFG,38041757.0,Asia,2.452568
...,...,...,...,...,...,...,...,...,...
51590,2020-06-21,6,7,2,Zimbabwe,ZWE,14645473.0,Africa,1.413406
51647,2020-04-25,4,0,1,Zimbabwe,ZWE,14645473.0,Africa,0.122905
51662,2020-04-10,4,0,1,Zimbabwe,ZWE,14645473.0,Africa,0.054624
51663,2020-04-09,4,1,1,Zimbabwe,ZWE,14645473.0,Africa,0.054624


In [25]:
# Find all lines where the number of deaths is not 0 and country is Belgium
coviddata[~(coviddata['deaths'] == 0) & (coviddata['country'] == 'Belgium')]

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
4623,2020-10-27,10,8,25,Belgium,BEL,11455519.0,Europe,1390.875437
4624,2020-10-26,10,1485,66,Belgium,BEL,11455519.0,Europe,1398.932689
4625,2020-10-25,10,7187,70,Belgium,BEL,11455519.0,Europe,1373.783239
4626,2020-10-24,10,14560,85,Belgium,BEL,11455519.0,Europe,1314.623982
4627,2020-10-23,10,15085,53,Belgium,BEL,11455519.0,Europe,1240.423939
...,...,...,...,...,...,...,...,...,...
4849,2020-03-15,3,179,5,Belgium,BEL,11455519.0,Europe,11.880736
4850,2020-03-14,3,338,3,Belgium,BEL,11455519.0,Europe,8.930193
4851,2020-03-13,3,250,1,Belgium,BEL,11455519.0,Europe,6.747839
4852,2020-03-12,3,174,3,Belgium,BEL,11455519.0,Europe,5.228921


In [26]:
# Find all lines where countryterritoryCode is empty
coviddata[coviddata['countryterritoryCode'].isnull()]

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
9159,2020-03-10,3,-9,1,Cases_on_an_international_conveyance_Japan,,,Other,
9160,2020-03-02,3,0,0,Cases_on_an_international_conveyance_Japan,,,Other,
9161,2020-03-01,3,0,0,Cases_on_an_international_conveyance_Japan,,,Other,
9162,2020-02-29,2,0,2,Cases_on_an_international_conveyance_Japan,,,Other,
9163,2020-02-28,2,0,0,Cases_on_an_international_conveyance_Japan,,,Other,
...,...,...,...,...,...,...,...,...,...
50848,2020-10-21,10,0,0,Wallis_and_Futuna,,,Oceania,
50849,2020-10-20,10,0,0,Wallis_and_Futuna,,,Oceania,
50850,2020-10-19,10,0,0,Wallis_and_Futuna,,,Oceania,
50851,2020-10-18,10,0,0,Wallis_and_Futuna,,,Oceania,


In [27]:
# Drop all lines where values are missing
# First print the number of values per column before
coviddata.count()


dateRep                                51683
month                                  51683
cases                                  51683
deaths                                 51683
country                                51683
countryterritoryCode                   51608
popData2019                            51608
continentExp                           51683
cum_number_14_days_cases_per_100000    48879
dtype: int64

In [28]:
# Delete all records that possibly contain empty values
coviddata = coviddata.dropna()

In [29]:
# Print the number of values per column afterwards
coviddata.count()

dateRep                                48879
month                                  48879
cases                                  48879
deaths                                 48879
country                                48879
countryterritoryCode                   48879
popData2019                            48879
continentExp                           48879
cum_number_14_days_cases_per_100000    48879
dtype: int64

In [30]:
# What were the values reported yesterday and the day beforeyesterday in Europe?
# First calculate the date of today and yesterday

from datetime import date, timedelta
today = date.today()
yesterday = today - timedelta(days = 1)
daybeforeyesterday = today - timedelta(days = 2)

print(today)
print(yesterday)
print(daybeforeyesterday)


2020-10-27
2020-10-26
2020-10-25


In [31]:
coviddata[(coviddata['dateRep'].isin([yesterday, daybeforeyesterday])) & (coviddata['continentExp'] == 'Europe')]

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
293,2020-10-26,10,299,4,Albania,ALB,2862427.0,Europe,131.287191
294,2020-10-25,10,302,4,Albania,ALB,2862427.0,Europe,126.710655
823,2020-10-26,10,0,0,Andorra,AND,76177.0,Europe,1761.686598
824,2020-10-25,10,0,0,Andorra,AND,76177.0,Europe,1761.686598
1943,2020-10-26,10,973,16,Armenia,ARM,2957728.0,Europe,755.951866
...,...,...,...,...,...,...,...,...,...
45424,2020-10-25,10,0,0,Switzerland,CHE,8544527.0,Europe,504.334529
48296,2020-10-26,10,6079,102,Ukraine,UKR,43993643.0,Europe,187.424806
48297,2020-10-25,10,7014,125,Ukraine,UKR,43993643.0,Europe,184.444830
48820,2020-10-26,10,19790,151,United_Kingdom,GBR,66647112.0,Europe,405.244866


In [32]:
# AGGREGATION

In [33]:
# What is the largest number of new cases in 1 day for each country?
# Maximum value of cases per country
coviddata.groupby('country')['cases'].max()

country
Afghanistan       1063
Albania            306
Algeria            675
Andorra            299
Angola             355
                  ... 
Vietnam             50
Western_Sahara     242
Yemen              116
Zambia             527
Zimbabwe           490
Name: cases, Length: 209, dtype: int64

In [34]:
# Which country has the largest number of new cases in 1 day?
# Maximum value of cases per country, sorting = descending
coviddata.groupby('country')['cases'].max().sort_values(ascending=False)

country
India                          97894
United_States_of_America       85329
Brazil                         69074
Spain                          52188
France                         52010
                               ...  
Montserrat                         3
Saint_Kitts_and_Nevis              2
Greenland                          1
Falkland_Islands_(Malvinas)        1
Anguilla                           0
Name: cases, Length: 209, dtype: int64

In [35]:
# What is the dateRep of the largest number of new cases in 1 day?
coviddata.groupby(['country']).agg({'cases': "max", 'dateRep': "max"}).sort_values(by='cases', ascending=False)

Unnamed: 0_level_0,cases,dateRep
country,Unnamed: 1_level_1,Unnamed: 2_level_1
India,97894,2020-10-27
United_States_of_America,85329,2020-10-27
Brazil,69074,2020-10-27
Spain,52188,2020-10-26
France,52010,2020-10-27
...,...,...
New_Caledonia,3,2020-10-27
Saint_Kitts_and_Nevis,2,2020-10-27
Greenland,1,2020-10-27
Falkland_Islands_(Malvinas),1,2020-10-27


In [36]:
# What is the dateRep of the largest number of new cases in 1 day? Nicely formatted
coviddata.groupby(['country']).agg({'cases': "max", 'dateRep': "max"}).sort_values(by='cases', ascending=False).reset_index()

Unnamed: 0,country,cases,dateRep
0,India,97894,2020-10-27
1,United_States_of_America,85329,2020-10-27
2,Brazil,69074,2020-10-27
3,Spain,52188,2020-10-26
4,France,52010,2020-10-27
...,...,...,...
204,New_Caledonia,3,2020-10-27
205,Saint_Kitts_and_Nevis,2,2020-10-27
206,Greenland,1,2020-10-27
207,Falkland_Islands_(Malvinas),1,2020-10-27


In [37]:
# What is the largest and the total number of new cases for each country?
# Maximum value and sum of values of cases per country
coviddata.groupby('country')['cases'].agg(['max','sum']).sort_values(by='sum',ascending=False)

Unnamed: 0_level_0,max,sum
country,Unnamed: 1_level_1,Unnamed: 2_level_1
United_States_of_America,85329,8704524
India,97894,7946429
Brazil,69074,5409854
Russia,33897,1547774
France,52010,1165278
...,...,...
Montserrat,3,8
Saint_Kitts_and_Nevis,2,8
Greenland,1,7
Falkland_Islands_(Malvinas),1,2


In [38]:
# What is the largest and the total number of new cases for each country? The columns are renamed
# Maximum value and sum of values of cases per country + renaming the columns
coviddata.groupby('country')['cases'].agg(['max','sum']).rename(columns={'max':'maximum_value', 'sum': 'total_number'}).sort_values(by='total_number',ascending=False)

# coviddata.groupby('country')['cases'].agg(['max','sum']).rename(columns={'max':'maximum_value', 'sum': 'total_number'}).sort_values(by=['maximum_value','total_number'],ascending=[False,True])

Unnamed: 0_level_0,maximum_value,total_number
country,Unnamed: 1_level_1,Unnamed: 2_level_1
United_States_of_America,85329,8704524
India,97894,7946429
Brazil,69074,5409854
Russia,33897,1547774
France,52010,1165278
...,...,...
Montserrat,3,8
Saint_Kitts_and_Nevis,2,8
Greenland,1,7
Falkland_Islands_(Malvinas),1,2


In [39]:
# What is the total number of cases for each continent and each country?
coviddata.groupby(['continentExp', 'country'])['cases'].sum()

continentExp  country                 
Africa        Algeria                     56419
              Angola                       9373
              Benin                        2551
              Botswana                     6270
              Burkina_Faso                 2360
                                          ...  
Oceania       Guam                         4276
              New_Caledonia                  11
              New_Zealand                  1585
              Northern_Mariana_Islands       81
              Papua_New_Guinea              587
Name: cases, Length: 209, dtype: int64

In [40]:
# What is the total number of cases for each continent and each country? Nicely formatted
coviddata.groupby(['continentExp', 'country'])['cases'].sum().reset_index()

Unnamed: 0,continentExp,country,cases
0,Africa,Algeria,56419
1,Africa,Angola,9373
2,Africa,Benin,2551
3,Africa,Botswana,6270
4,Africa,Burkina_Faso,2360
...,...,...,...
204,Oceania,Guam,4276
205,Oceania,New_Caledonia,11
206,Oceania,New_Zealand,1585
207,Oceania,Northern_Mariana_Islands,81


In [41]:
# Exercise: What is the top 15 of countries with most Corona deaths in absolute numbers
coviddata.groupby('country')['deaths'].sum().sort_values(ascending=False).reset_index().head(15)


Unnamed: 0,country,deaths
0,United_States_of_America,225735
1,Brazil,157397
2,India,119502
3,Mexico,89171
4,United_Kingdom,44998
5,Italy,37479
6,Spain,35031
7,France,35018
8,Peru,34195
9,Iran,32953


In [42]:
# Exercise: What is the top 15 of countries with most Corona deaths in relative numbers
extra = coviddata.groupby(['country','popData2019'])['deaths'].sum().sort_values(ascending=False).reset_index()
extra.head(15)

Unnamed: 0,country,popData2019,deaths
0,United_States_of_America,329064900.0,225735
1,Brazil,211049500.0,157397
2,India,1366418000.0,119502
3,Mexico,127575500.0,89171
4,United_Kingdom,66647110.0,44998
5,Italy,60359550.0,37479
6,Spain,46937060.0,35031
7,France,67012880.0,35018
8,Peru,32510460.0,34195
9,Iran,82913890.0,32953


In [43]:
extra['rel_nr_of_deaths_per_100000'] = extra['deaths'] * 100000 / extra['popData2019']
extra.sort_values(by=['rel_nr_of_deaths_per_100000'], ascending=False).head(15)

Unnamed: 0,country,popData2019,deaths,rel_nr_of_deaths_per_100000
149,San_Marino,34453.0,42,121.905204
8,Peru,32510462.0,34195,105.181526
17,Belgium,11455519.0,10899,95.141914
133,Andorra,76177.0,72,94.516718
22,Bolivia,11513102.0,8658,75.201279
6,Spain,46937060.0,35031,74.633989
1,Brazil,211049519.0,157397,74.578232
14,Chile,18952035.0,14003,73.886525
16,Ecuador,17373657.0,12573,72.368184
3,Mexico,127575529.0,89171,69.896634


In [44]:
# We don't need the columns popData2019 and deaths in the dataframe extra any more
extra = extra.drop(['popData2019','deaths'], axis=1)
extra.sort_values(by=['rel_nr_of_deaths_per_100000'], ascending=False).head(15)

Unnamed: 0,country,rel_nr_of_deaths_per_100000
149,San_Marino,121.905204
8,Peru,105.181526
17,Belgium,95.141914
133,Andorra,94.516718
22,Bolivia,75.201279
6,Spain,74.633989
1,Brazil,74.578232
14,Chile,73.886525
16,Ecuador,72.368184
3,Mexico,69.896634


In [45]:
# Exercise: Are there countries where people have more chance to die from corona?
# Calculate the percentage of deaths relative to the number of cases
extra = coviddata.groupby(['country']).agg({'deaths': "sum", 'cases': "sum"}).reset_index()
extra['rel_nr_of_deaths_per_nr_of_cases'] = extra['deaths'] / extra['cases']
extra.sort_values(by=['rel_nr_of_deaths_per_nr_of_cases'], ascending=False).head(15)

Unnamed: 0,country,deaths,cases,rel_nr_of_deaths_per_nr_of_cases
206,Yemen,600,2063,0.290839
130,Montserrat,1,8,0.125
125,Mexico,89171,895325,0.099596
96,Isle_of_Man,24,286,0.083916
57,Ecuador,12573,162178,0.077526
67,Fiji,2,28,0.071429
98,Italy,37479,542789,0.069049
39,Chad,96,1434,0.066946
101,Jersey,30,475,0.063158
82,Guernsey,13,206,0.063107


In [46]:
# Excercise: On which day was the first Belgian Covid death reported?
coviddata[(coviddata['country'] == 'Belgium') & (coviddata['deaths'] >= 1)].sort_values(by=['dateRep'])

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000
4853,2020-03-11,3,99,1,Belgium,BEL,11455519.0,Europe,4.364708
4852,2020-03-12,3,174,3,Belgium,BEL,11455519.0,Europe,5.228921
4851,2020-03-13,3,250,1,Belgium,BEL,11455519.0,Europe,6.747839
4850,2020-03-14,3,338,3,Belgium,BEL,11455519.0,Europe,8.930193
4849,2020-03-15,3,179,5,Belgium,BEL,11455519.0,Europe,11.880736
...,...,...,...,...,...,...,...,...,...
4627,2020-10-23,10,15085,53,Belgium,BEL,11455519.0,Europe,1240.423939
4626,2020-10-24,10,14560,85,Belgium,BEL,11455519.0,Europe,1314.623982
4625,2020-10-25,10,7187,70,Belgium,BEL,11455519.0,Europe,1373.783239
4624,2020-10-26,10,1485,66,Belgium,BEL,11455519.0,Europe,1398.932689


In [47]:
# Exercise: What was the last country to report the first Corona deaths?
extra = coviddata[coviddata['deaths'] >= 1]
extra = extra.groupby(['country'])['dateRep'].min().sort_values(ascending=False).reset_index()
extra.head(15)

Unnamed: 0,country,dateRep
0,"Bonaire, Saint Eustatius and Saba",2020-09-15
1,French_Polynesia,2020-09-12
2,Vietnam,2020-08-01
3,Fiji,2020-08-01
4,Papua_New_Guinea,2020-07-29
5,Botswana,2020-07-28
6,Uganda,2020-07-25
7,Namibia,2020-07-11
8,Lesotho,2020-07-10
9,Turks_and_Caicos_islands,2020-07-01


In [48]:
# What is the total number of cases for each European country per month?
# unstack --> Nice visual explanation can be found on 
# http://www.datasciencemadesimple.com/reshape-using-stack-unstack-function-pandas-python/

coviddata[coviddata['continentExp'] == 'Europe'].groupby(['month', 'country'])['cases'].sum().unstack().sort_values(by=['month'])

country,Albania,Andorra,Armenia,Austria,Azerbaijan,Belarus,Belgium,Bosnia_and_Herzegovina,Bulgaria,Croatia,Cyprus,Czechia,Denmark,Estonia,Faroe_Islands,Finland,France,Georgia,Germany,Gibraltar,Greece,Guernsey,Holy_See,Hungary,Iceland,Ireland,Isle_of_Man,Italy,Jersey,Kosovo,Latvia,Liechtenstein,Lithuania,Luxembourg,Malta,Moldova,Monaco,Montenegro,Netherlands,North_Macedonia,Norway,Poland,Portugal,Romania,Russia,San_Marino,Serbia,Slovakia,Slovenia,Spain,Sweden,Switzerland,Ukraine,United_Kingdom
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
1,,,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,,1.0,6.0,0.0,5.0,,0.0,,,,0.0,0.0,,3.0,,,,,0.0,0.0,,,0.0,,0.0,0.0,0.0,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,0.0
2,,,0.0,7.0,1.0,1.0,1.0,,,5.0,,0.0,2.0,1.0,,2.0,51.0,2.0,52.0,,4.0,,,,1.0,0.0,,885.0,,,,,1.0,0.0,,,1.0,,2.0,1.0,6.0,,,3.0,2.0,1.0,,,,54.0,11.0,12.0,,30.0
3,153.0,182.0,532.0,9611.0,272.0,104.0,15295.0,189.0,196.0,785.0,114.0,3002.0,2575.0,714.0,150.0,1310.0,44493.0,96.0,61856.0,,1208.0,,1.0,442.0,1085.0,2910.0,,100851.0,,39.0,305.0,18.0,483.0,1988.0,108.0,218.0,35.0,6.0,11748.0,284.0,4220.0,1817.0,6239.0,1949.0,1834.0,228.0,636.0,213.0,488.0,104213.0,4349.0,15400.0,367.0,29651.0
4,543.0,373.0,1534.0,5746.0,1493.0,13076.0,34643.0,1336.0,1088.0,1272.0,613.0,4577.0,6431.0,951.0,19.0,3593.0,83892.0,419.0,97206.0,72.0,1364.0,191.0,4.0,2283.0,711.0,17343.0,248.0,101852.0,205.0,698.0,473.0,19.0,891.0,1781.0,306.0,3473.0,30.0,231.0,27052.0,1157.0,3441.0,10585.0,18284.0,10026.0,97563.0,334.0,7939.0,1055.0,655.0,110916.0,16608.0,13912.0,9386.0,137469.0
5,356.0,21.0,7216.0,1274.0,3480.0,28477.0,8976.0,804.0,1066.0,184.0,100.0,1651.0,2625.0,199.0,0.0,1920.0,23054.0,240.0,22363.0,20.0,339.0,1.0,2.0,1092.0,9.0,4676.0,23.0,29073.0,22.0,262.0,216.0,0.0,295.0,247.0,155.0,4327.0,1.0,2.0,7455.0,722.0,744.0,10931.0,7511.0,7155.0,297176.0,108.0,2657.0,130.0,55.0,24246.0,17422.0,1438.0,13338.0,87240.0
6,1344.0,91.0,16260.0,1028.0,11722.0,20132.0,2969.0,1850.0,2318.0,479.0,53.0,2575.0,1118.0,122.0,0.0,383.0,12764.0,171.0,12777.0,16.0,475.0,0.0,0.0,278.0,16.0,533.0,0.0,7772.0,11.0,1729.0,52.0,0.0,146.0,240.0,52.0,8259.0,3.0,177.0,3966.0,4060.0,444.0,10583.0,9709.0,7449.0,244581.0,27.0,2907.0,144.0,112.0,9842.0,28669.0,807.0,20424.0,29151.0
7,2731.0,67.0,13008.0,3343.0,14592.0,5875.0,7706.0,7101.0,6589.0,2346.0,88.0,4537.0,974.0,64.0,33.0,214.0,22313.0,232.0,14439.0,10.0,1011.0,0.0,0.0,360.0,50.0,565.0,0.0,6722.0,16.0,5305.0,111.0,6.0,246.0,2360.0,58.0,7590.0,14.0,2572.0,3740.0,4399.0,317.0,10877.0,8956.0,23009.0,193343.0,1.0,10925.0,600.0,554.0,39251.0,9621.0,3364.0,25166.0,18760.0
8,4183.0,202.0,5200.0,6209.0,4749.0,4022.0,15806.0,8345.0,4770.0,5052.0,403.0,8025.0,2975.0,322.0,191.0,654.0,91370.0,327.0,33683.0,98.0,5733.0,0.0,0.0,1456.0,233.0,2733.0,0.0,21060.0,39.0,5230.0,165.0,19.0,812.0,9.0,1137.0,12753.0,47.0,1717.0,16108.0,3304.0,1371.0,21839.0,6900.0,37194.0,155827.0,11.0,6152.0,1611.0,726.0,174336.0,7552.0,6973.0,50280.0,32166.0
9,4138.0,842.0,6151.0,17389.0,3810.0,6573.0,36957.0,7433.0,4357.0,6257.0,256.0,43476.0,10764.0,942.0,52.0,1815.0,272747.0,4379.0,46838.0,106.0,7989.0,4.0,0.0,20500.0,590.0,6980.0,4.0,44793.0,35.0,2240.0,336.0,12.0,1704.0,1806.0,1171.0,15329.0,83.0,5785.0,47349.0,3859.0,3245.0,23092.0,16949.0,38629.0,177479.0,17.0,2114.0,5698.0,2622.0,306330.0,8915.0,10845.0,85858.0,111689.0
10,5927.0,2359.0,30509.0,40441.0,10367.0,15447.0,211271.0,14374.0,19585.0,20828.0,1893.0,200527.0,13948.0,1113.0,27.0,5078.0,614588.0,26261.0,160056.0,279.0,13373.0,10.0,14.0,37181.0,1809.0,22327.0,11.0,229778.0,147.0,2574.0,3028.0,293.0,6371.0,5968.0,2338.0,19782.0,96.0,6334.0,183829.0,9413.0,4120.0,173967.0,46416.0,87078.0,379969.0,125.0,6348.0,35581.0,18593.0,329132.0,17447.0,67929.0,143992.0,448534.0


In [49]:
# Replace NaN in the previous result by 0.0
coviddata[coviddata['continentExp'] == 'Europe'].groupby(['month', 'country'])['cases'].sum().unstack().sort_values(by=['month']).fillna(0)

country,Albania,Andorra,Armenia,Austria,Azerbaijan,Belarus,Belgium,Bosnia_and_Herzegovina,Bulgaria,Croatia,Cyprus,Czechia,Denmark,Estonia,Faroe_Islands,Finland,France,Georgia,Germany,Gibraltar,Greece,Guernsey,Holy_See,Hungary,Iceland,Ireland,Isle_of_Man,Italy,Jersey,Kosovo,Latvia,Liechtenstein,Lithuania,Luxembourg,Malta,Moldova,Monaco,Montenegro,Netherlands,North_Macedonia,Norway,Poland,Portugal,Romania,Russia,San_Marino,Serbia,Slovakia,Slovenia,Spain,Sweden,Switzerland,Ukraine,United_Kingdom
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,7.0,1.0,1.0,1.0,0.0,0.0,5.0,0.0,0.0,2.0,1.0,0.0,2.0,51.0,2.0,52.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,885.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,6.0,0.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,54.0,11.0,12.0,0.0,30.0
3,153.0,182.0,532.0,9611.0,272.0,104.0,15295.0,189.0,196.0,785.0,114.0,3002.0,2575.0,714.0,150.0,1310.0,44493.0,96.0,61856.0,0.0,1208.0,0.0,1.0,442.0,1085.0,2910.0,0.0,100851.0,0.0,39.0,305.0,18.0,483.0,1988.0,108.0,218.0,35.0,6.0,11748.0,284.0,4220.0,1817.0,6239.0,1949.0,1834.0,228.0,636.0,213.0,488.0,104213.0,4349.0,15400.0,367.0,29651.0
4,543.0,373.0,1534.0,5746.0,1493.0,13076.0,34643.0,1336.0,1088.0,1272.0,613.0,4577.0,6431.0,951.0,19.0,3593.0,83892.0,419.0,97206.0,72.0,1364.0,191.0,4.0,2283.0,711.0,17343.0,248.0,101852.0,205.0,698.0,473.0,19.0,891.0,1781.0,306.0,3473.0,30.0,231.0,27052.0,1157.0,3441.0,10585.0,18284.0,10026.0,97563.0,334.0,7939.0,1055.0,655.0,110916.0,16608.0,13912.0,9386.0,137469.0
5,356.0,21.0,7216.0,1274.0,3480.0,28477.0,8976.0,804.0,1066.0,184.0,100.0,1651.0,2625.0,199.0,0.0,1920.0,23054.0,240.0,22363.0,20.0,339.0,1.0,2.0,1092.0,9.0,4676.0,23.0,29073.0,22.0,262.0,216.0,0.0,295.0,247.0,155.0,4327.0,1.0,2.0,7455.0,722.0,744.0,10931.0,7511.0,7155.0,297176.0,108.0,2657.0,130.0,55.0,24246.0,17422.0,1438.0,13338.0,87240.0
6,1344.0,91.0,16260.0,1028.0,11722.0,20132.0,2969.0,1850.0,2318.0,479.0,53.0,2575.0,1118.0,122.0,0.0,383.0,12764.0,171.0,12777.0,16.0,475.0,0.0,0.0,278.0,16.0,533.0,0.0,7772.0,11.0,1729.0,52.0,0.0,146.0,240.0,52.0,8259.0,3.0,177.0,3966.0,4060.0,444.0,10583.0,9709.0,7449.0,244581.0,27.0,2907.0,144.0,112.0,9842.0,28669.0,807.0,20424.0,29151.0
7,2731.0,67.0,13008.0,3343.0,14592.0,5875.0,7706.0,7101.0,6589.0,2346.0,88.0,4537.0,974.0,64.0,33.0,214.0,22313.0,232.0,14439.0,10.0,1011.0,0.0,0.0,360.0,50.0,565.0,0.0,6722.0,16.0,5305.0,111.0,6.0,246.0,2360.0,58.0,7590.0,14.0,2572.0,3740.0,4399.0,317.0,10877.0,8956.0,23009.0,193343.0,1.0,10925.0,600.0,554.0,39251.0,9621.0,3364.0,25166.0,18760.0
8,4183.0,202.0,5200.0,6209.0,4749.0,4022.0,15806.0,8345.0,4770.0,5052.0,403.0,8025.0,2975.0,322.0,191.0,654.0,91370.0,327.0,33683.0,98.0,5733.0,0.0,0.0,1456.0,233.0,2733.0,0.0,21060.0,39.0,5230.0,165.0,19.0,812.0,9.0,1137.0,12753.0,47.0,1717.0,16108.0,3304.0,1371.0,21839.0,6900.0,37194.0,155827.0,11.0,6152.0,1611.0,726.0,174336.0,7552.0,6973.0,50280.0,32166.0
9,4138.0,842.0,6151.0,17389.0,3810.0,6573.0,36957.0,7433.0,4357.0,6257.0,256.0,43476.0,10764.0,942.0,52.0,1815.0,272747.0,4379.0,46838.0,106.0,7989.0,4.0,0.0,20500.0,590.0,6980.0,4.0,44793.0,35.0,2240.0,336.0,12.0,1704.0,1806.0,1171.0,15329.0,83.0,5785.0,47349.0,3859.0,3245.0,23092.0,16949.0,38629.0,177479.0,17.0,2114.0,5698.0,2622.0,306330.0,8915.0,10845.0,85858.0,111689.0
10,5927.0,2359.0,30509.0,40441.0,10367.0,15447.0,211271.0,14374.0,19585.0,20828.0,1893.0,200527.0,13948.0,1113.0,27.0,5078.0,614588.0,26261.0,160056.0,279.0,13373.0,10.0,14.0,37181.0,1809.0,22327.0,11.0,229778.0,147.0,2574.0,3028.0,293.0,6371.0,5968.0,2338.0,19782.0,96.0,6334.0,183829.0,9413.0,4120.0,173967.0,46416.0,87078.0,379969.0,125.0,6348.0,35581.0,18593.0,329132.0,17447.0,67929.0,143992.0,448534.0


In [50]:
# alternative solution:
table = pd.pivot_table(coviddata[coviddata['continentExp'] == 'Europe'], values='cases', index='month',columns='country',aggfunc=np.sum).fillna(0)
table

country,Albania,Andorra,Armenia,Austria,Azerbaijan,Belarus,Belgium,Bosnia_and_Herzegovina,Bulgaria,Croatia,Cyprus,Czechia,Denmark,Estonia,Faroe_Islands,Finland,France,Georgia,Germany,Gibraltar,Greece,Guernsey,Holy_See,Hungary,Iceland,Ireland,Isle_of_Man,Italy,Jersey,Kosovo,Latvia,Liechtenstein,Lithuania,Luxembourg,Malta,Moldova,Monaco,Montenegro,Netherlands,North_Macedonia,Norway,Poland,Portugal,Romania,Russia,San_Marino,Serbia,Slovakia,Slovenia,Spain,Sweden,Switzerland,Ukraine,United_Kingdom
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,7.0,1.0,1.0,1.0,0.0,0.0,5.0,0.0,0.0,2.0,1.0,0.0,2.0,51.0,2.0,52.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,885.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,6.0,0.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,54.0,11.0,12.0,0.0,30.0
3,153.0,182.0,532.0,9611.0,272.0,104.0,15295.0,189.0,196.0,785.0,114.0,3002.0,2575.0,714.0,150.0,1310.0,44493.0,96.0,61856.0,0.0,1208.0,0.0,1.0,442.0,1085.0,2910.0,0.0,100851.0,0.0,39.0,305.0,18.0,483.0,1988.0,108.0,218.0,35.0,6.0,11748.0,284.0,4220.0,1817.0,6239.0,1949.0,1834.0,228.0,636.0,213.0,488.0,104213.0,4349.0,15400.0,367.0,29651.0
4,543.0,373.0,1534.0,5746.0,1493.0,13076.0,34643.0,1336.0,1088.0,1272.0,613.0,4577.0,6431.0,951.0,19.0,3593.0,83892.0,419.0,97206.0,72.0,1364.0,191.0,4.0,2283.0,711.0,17343.0,248.0,101852.0,205.0,698.0,473.0,19.0,891.0,1781.0,306.0,3473.0,30.0,231.0,27052.0,1157.0,3441.0,10585.0,18284.0,10026.0,97563.0,334.0,7939.0,1055.0,655.0,110916.0,16608.0,13912.0,9386.0,137469.0
5,356.0,21.0,7216.0,1274.0,3480.0,28477.0,8976.0,804.0,1066.0,184.0,100.0,1651.0,2625.0,199.0,0.0,1920.0,23054.0,240.0,22363.0,20.0,339.0,1.0,2.0,1092.0,9.0,4676.0,23.0,29073.0,22.0,262.0,216.0,0.0,295.0,247.0,155.0,4327.0,1.0,2.0,7455.0,722.0,744.0,10931.0,7511.0,7155.0,297176.0,108.0,2657.0,130.0,55.0,24246.0,17422.0,1438.0,13338.0,87240.0
6,1344.0,91.0,16260.0,1028.0,11722.0,20132.0,2969.0,1850.0,2318.0,479.0,53.0,2575.0,1118.0,122.0,0.0,383.0,12764.0,171.0,12777.0,16.0,475.0,0.0,0.0,278.0,16.0,533.0,0.0,7772.0,11.0,1729.0,52.0,0.0,146.0,240.0,52.0,8259.0,3.0,177.0,3966.0,4060.0,444.0,10583.0,9709.0,7449.0,244581.0,27.0,2907.0,144.0,112.0,9842.0,28669.0,807.0,20424.0,29151.0
7,2731.0,67.0,13008.0,3343.0,14592.0,5875.0,7706.0,7101.0,6589.0,2346.0,88.0,4537.0,974.0,64.0,33.0,214.0,22313.0,232.0,14439.0,10.0,1011.0,0.0,0.0,360.0,50.0,565.0,0.0,6722.0,16.0,5305.0,111.0,6.0,246.0,2360.0,58.0,7590.0,14.0,2572.0,3740.0,4399.0,317.0,10877.0,8956.0,23009.0,193343.0,1.0,10925.0,600.0,554.0,39251.0,9621.0,3364.0,25166.0,18760.0
8,4183.0,202.0,5200.0,6209.0,4749.0,4022.0,15806.0,8345.0,4770.0,5052.0,403.0,8025.0,2975.0,322.0,191.0,654.0,91370.0,327.0,33683.0,98.0,5733.0,0.0,0.0,1456.0,233.0,2733.0,0.0,21060.0,39.0,5230.0,165.0,19.0,812.0,9.0,1137.0,12753.0,47.0,1717.0,16108.0,3304.0,1371.0,21839.0,6900.0,37194.0,155827.0,11.0,6152.0,1611.0,726.0,174336.0,7552.0,6973.0,50280.0,32166.0
9,4138.0,842.0,6151.0,17389.0,3810.0,6573.0,36957.0,7433.0,4357.0,6257.0,256.0,43476.0,10764.0,942.0,52.0,1815.0,272747.0,4379.0,46838.0,106.0,7989.0,4.0,0.0,20500.0,590.0,6980.0,4.0,44793.0,35.0,2240.0,336.0,12.0,1704.0,1806.0,1171.0,15329.0,83.0,5785.0,47349.0,3859.0,3245.0,23092.0,16949.0,38629.0,177479.0,17.0,2114.0,5698.0,2622.0,306330.0,8915.0,10845.0,85858.0,111689.0
10,5927.0,2359.0,30509.0,40441.0,10367.0,15447.0,211271.0,14374.0,19585.0,20828.0,1893.0,200527.0,13948.0,1113.0,27.0,5078.0,614588.0,26261.0,160056.0,279.0,13373.0,10.0,14.0,37181.0,1809.0,22327.0,11.0,229778.0,147.0,2574.0,3028.0,293.0,6371.0,5968.0,2338.0,19782.0,96.0,6334.0,183829.0,9413.0,4120.0,173967.0,46416.0,87078.0,379969.0,125.0,6348.0,35581.0,18593.0,329132.0,17447.0,67929.0,143992.0,448534.0


In [51]:
# Window functions
# LAG
coviddata['cases_previous_day'] = coviddata.sort_values(by=['dateRep'], ascending=True)['cases'].shift(1)
coviddata.head()

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000,cases_previous_day
0,2020-10-27,10,199,8,Afghanistan,AFG,38041757.0,Asia,2.902074,1.0
1,2020-10-26,10,65,3,Afghanistan,AFG,38041757.0,Asia,2.718066,986.0
2,2020-10-25,10,81,4,Afghanistan,AFG,38041757.0,Asia,2.799555,0.0
3,2020-10-24,10,61,2,Afghanistan,AFG,38041757.0,Asia,2.586631,0.0
4,2020-10-23,10,116,4,Afghanistan,AFG,38041757.0,Asia,2.452568,138.0


In [52]:
# LEAD
coviddata['cases_next_day'] = coviddata.sort_values(by=['dateRep'], ascending=True)['cases'].shift(-1)
coviddata.head()

Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000,cases_previous_day,cases_next_day
0,2020-10-27,10,199,8,Afghanistan,AFG,38041757.0,Asia,2.902074,1.0,
1,2020-10-26,10,65,3,Afghanistan,AFG,38041757.0,Asia,2.718066,986.0,2.0
2,2020-10-25,10,81,4,Afghanistan,AFG,38041757.0,Asia,2.799555,0.0,11.0
3,2020-10-24,10,61,2,Afghanistan,AFG,38041757.0,Asia,2.586631,0.0,2899.0
4,2020-10-23,10,116,4,Afghanistan,AFG,38041757.0,Asia,2.452568,138.0,0.0


In [53]:
# Choose your own countries of interest
some_countries = ['Austria', 'Belgium','France','Germany', 'Italy', 'Netherlands', 'Spain', 'United_Kingdom', 'United_States_of_America']
extra = coviddata[coviddata['country'].isin(some_countries)]
extra.head()



Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000,cases_previous_day,cases_next_day
2756,2020-10-27,10,2512,4,Austria,AUT,8858775.0,Europe,321.229515,0.0,316.0
2757,2020-10-26,10,2766,6,Austria,AUT,8858775.0,Europe,299.736702,509.0,8.0
2758,2020-10-25,10,2989,6,Austria,AUT,8858775.0,Europe,283.165562,12.0,1.0
2759,2020-10-24,10,3442,15,Austria,AUT,8858775.0,Europe,259.595712,1759.0,17.0
2760,2020-10-23,10,2570,17,Austria,AUT,8858775.0,Europe,234.072995,88.0,3.0


In [54]:
# Show for some_countries a ranking (per country) of the days with the most new cases. 
# Show only the top 5 days per country
# In SQL: ROW_NUMBER() over (PARTITION BY country ORDER BY cases DESC) as rank

extra['rank'] = extra.sort_values(by=['cases'], ascending=False).groupby(['country']).cumcount() + 1
extra[extra['rank'] <= 5].sort_values(by=['country','rank'], ascending=True).head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,dateRep,month,cases,deaths,country,countryterritoryCode,popData2019,continentExp,cum_number_14_days_cases_per_100000,cases_previous_day,cases_next_day,rank
2759,2020-10-24,10,3442,15,Austria,AUT,8858775.0,Europe,259.595712,1759.0,17.0,1
2758,2020-10-25,10,2989,6,Austria,AUT,8858775.0,Europe,283.165562,12.0,1.0,2
2757,2020-10-26,10,2766,6,Austria,AUT,8858775.0,Europe,299.736702,509.0,8.0,3
2760,2020-10-23,10,2570,17,Austria,AUT,8858775.0,Europe,234.072995,88.0,3.0,4
2756,2020-10-27,10,2512,4,Austria,AUT,8858775.0,Europe,321.229515,0.0,316.0,5
4629,2020-10-21,10,18574,47,Belgium,BEL,11455519.0,Europe,1046.691992,0.0,1627.0,1
4628,2020-10-22,10,16103,55,Belgium,BEL,11455519.0,Europe,1159.187986,714.0,1542.0,2
4627,2020-10-23,10,15085,53,Belgium,BEL,11455519.0,Europe,1240.423939,0.0,792.0,3
4630,2020-10-20,10,15032,49,Belgium,BEL,11455519.0,Europe,961.318296,31.0,6.0,4
4626,2020-10-24,10,14560,85,Belgium,BEL,11455519.0,Europe,1314.623982,17.0,14714.0,5


In [55]:
# Running Average
# Calculate the average number of cases registered in the past week
# In SQL AVG(cases) OVER (ORDER BY dateRep ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as avg_number_cases_past_week
extra['avg_number_cases_past_week'] = extra.sort_values(by=['dateRep'], ascending=True).groupby('country')['cases'].rolling(6, min_periods = 1).mean()
extra.head()

TypeError: ignored

In [None]:
# We only need the data form Belgium
covid_belgium = coviddata[coviddata['country'] == 'Belgium']
covid_belgium.head()

In [None]:
# Drop the columns  month, country, countryterritoryCode, continentExp
covid_belgium = covid_belgium.drop(['month','country','countryterritoryCode','continentExp'], axis=1)
covid_belgium.head()

In [None]:
belgium = covid[covid['Country/Region']=='Belgium']
belgium

In [None]:
Brazil = covid[covid['Country/Region']=='Brazil']

In [None]:
Brazil.head(20)

In [None]:
# make Series of inhabitants per country (source: wikipedia)
inhabitants = pd.Series()
inhabitants['Belgium'] = 11491346
inhabitants['France'] = 64834000  # metropolitan France only
inhabitants['Netherlands'] = 17424978
inhabitants['Italy'] = 60317546
inhabitants['Spain'] = 46733038
inhabitants['Germany'] = 83149300
inhabitants['China'] = 58500000 #Hubei only
inhabitants['United Kingdom'] = 67545757
inhabitants['US'] = 326625791
inhabitants['Iran'] = 82021564
inhabitants['Sweden'] = 9960487
inhabitants['Denmark'] = 5605948
inhabitants['Finland'] = 5518371 
inhabitants['Norway'] = 5367580
inhabitants['Austria'] = 8754413
inhabitants['Switzerland'] = 8236303
inhabitants['India'] = 1281935911
inhabitants['Brazil'] = 207353391

In [None]:
def DrawCurve(province,country,color):
    if province == '':
        df = covid[(covid['Country/Region']==country) & (covid['Province/State'].isnull())]  # isnull()==True if NaN
    else:
        df = covid[(covid['Province/State']==province) & (covid['Country/Region']==country)]

    df = df.drop(['Province/State','Country/Region','Lat','Long'],axis=1)
    
    df_2 = df.T  # transpose
    
    df_2 = df_2.rename(columns={df_2.columns[0]:'cases'})   

    df_2['new_cases'] = df_2['cases'] - df_2.cases.shift(1)  # shift(1) returns the previous line

    df_2['new_cases'] = np.round(df_2['new_cases']*1000000.0/inhabitants[country],1)  # cases per million inhabitants

    # skip days at beginning as long as new_cases (per million) < 10
    for index,row in df_2.iterrows():
        if row['new_cases'] >= 10:
            break
        else:
            df_2.drop(index,inplace=True)
  
    df_2 = df_2.reset_index() # brengt de index naar een kolom met naam 'index'

    df_2 = df_2.drop(['index'],axis=1)
    df_2 = df_2.reset_index()

    df_2['new_cases'] = df_2['new_cases'].rolling(7).mean() # moving average of last 7 days 
    df_2 = df_2.rename(columns={'index':'dayssincefirst'})
    
    plt.plot(df_2['dayssincefirst'], df_2['new_cases'], color=color, label=country+' '+province, linewidth=3.0)

In [None]:
from datetime import datetime
last_update = covid.columns[-1]
last_update = datetime.strptime(last_update, '%m/%d/%y')
last_update = datetime.strftime(last_update,'%Y-%m-%d')

In [None]:
import matplotlib.pyplot as plt 
%matplotlib inline
# set the size of the diagrams
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 18,9
rcParams['figure.dpi'] = 200
plt.xlabel('Days since threshold of 10 new cases per million inhabitants')
plt.ylim([0,250])
plt.title('Evolution of new daily cases per million inhabitants (moving average of last 7 days) - Last update: ' + last_update)
plt.style.use('classic')

DrawCurve('','Belgium',color='black')

DrawCurve('','Germany',color='brown')
DrawCurve('','France',color='blue')
DrawCurve('','Italy',color='green')
DrawCurve('','Netherlands',color='red')
DrawCurve('','Spain',color='cyan')
# DrawCurve('Hubei','China',color='magenta')
DrawCurve('','United Kingdom',color='#00ff00')
DrawCurve('','Sweden',color='orange')
DrawCurve('','US',color='yellow')
DrawCurve('','Brazil',color='magenta')
DrawCurve('','India',color='grey')

plt.legend(loc='upper left',fontsize='x-small',frameon=True)

plt.show()
    

In [None]:
# show top 10 of countries with most deaths per 1.000.000 inhabitants
# import data
deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',sep=',')

# totaldeaths is value in last column
deaths['totaldeaths'] = deaths[deaths.columns[-1]]

# keep only relevant columns
deaths = deaths[['Country/Region','Province/State','totaldeaths']]
print(deaths.sort_values(by='totaldeaths',ascending=False).head(15))

def inh(c):
    if c in inhabitants:
        return inhabitants[c]
    else:
        return 1000000000
    
deaths['inhabitants'] = deaths['Country/Region'].map(inh)
print(deaths.head(15))
deaths['totaldeathspermillion'] = deaths['totaldeaths']*1000000/deaths['inhabitants']

In [None]:
# sort by totaldeaths
top10 = deaths.sort_values(by='totaldeathspermillion',ascending=False).head(10).reset_index().drop(['index'],axis=1).reset_index()
top10['Rank'] = top10['index'] + 1
top10 = top10[['Rank','Country/Region','totaldeathspermillion']]
top10