In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [73]:
# NOTE(review): %pprint flips pretty printing rather than setting it, so the
# state after this cell depends on the state before it ran (the recorded output
# reports that it ended up ON). Confirm this is still wanted on a fresh kernel.
%pprint

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

# Make every figure larger by default: 16 x 12 inches (width, height).
plt.rcParams['figure.figsize'] = (16, 12)

Pretty printing has been turned ON


To get up-to-date, trustworthy data I have used the website of the European Centre for Disease Prevention and Control (ECDC), which is an agency of the European Union. The link to the download page is below.

(When I have the notebook finished, I will try to pull the data directly from the website)

https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide

In [74]:
# Load the CSV export downloaded from the ECDC website (the file name carries
# the date of the download) and display the resulting frame.
csv_path = "COVID-19Cases_05.07.2020.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp
0,05/07/2020,5,7,2020,348,7,Afghanistan,AF,AFG,38041757.0,Asia
1,04/07/2020,4,7,2020,302,12,Afghanistan,AF,AFG,38041757.0,Asia
2,03/07/2020,3,7,2020,186,33,Afghanistan,AF,AFG,38041757.0,Asia
3,02/07/2020,2,7,2020,319,28,Afghanistan,AF,AFG,38041757.0,Asia
4,01/07/2020,1,7,2020,279,13,Afghanistan,AF,AFG,38041757.0,Asia
...,...,...,...,...,...,...,...,...,...,...,...
27826,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa
27827,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14645473.0,Africa
27828,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa
27829,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa


In [75]:
# Summarise the frame: number of rows, non-null count per column, and each column's dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27831 entries, 0 to 27830
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   dateRep                  27831 non-null  object 
 1   day                      27831 non-null  int64  
 2   month                    27831 non-null  int64  
 3   year                     27831 non-null  int64  
 4   cases                    27831 non-null  int64  
 5   deaths                   27831 non-null  int64  
 6   countriesAndTerritories  27831 non-null  object 
 7   geoId                    27718 non-null  object 
 8   countryterritoryCode     27767 non-null  object 
 9   popData2019              27767 non-null  float64
 10  continentExp             27831 non-null  object 
dtypes: float64(1), int64(5), object(5)
memory usage: 2.3+ MB


As the output above shows, the `dateRep` column is stored as `object` (plain strings) rather than as a datetime type. I will convert it so that the rows can be sorted chronologically instead of alphabetically.

In [76]:
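# Alternative (not used): let pandas parse the dates while reading the spreadsheet.
# The dates in the file are written day-first (dd/mm/yyyy), so dayfirst=True is
# needed; without it, ambiguous dates such as 05/07/2020 can be read as 7 May.
#df = pd.read_csv("COVID-19Cases_05.07.2020.csv", parse_dates=['dateRep'], dayfirst=True)
#df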

In [77]:
# Convert the dateRep column from strings to datetime values with pandas'
# built-in to_datetime. The format is given explicitly because the file writes
# its dates day-first (dd/mm/yyyy).
parsed_dates = pd.to_datetime(df['dateRep'], format='%d/%m/%Y')
df['dateRep'] = parsed_dates
df

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp
0,2020-07-05,5,7,2020,348,7,Afghanistan,AF,AFG,38041757.0,Asia
1,2020-07-04,4,7,2020,302,12,Afghanistan,AF,AFG,38041757.0,Asia
2,2020-07-03,3,7,2020,186,33,Afghanistan,AF,AFG,38041757.0,Asia
3,2020-07-02,2,7,2020,319,28,Afghanistan,AF,AFG,38041757.0,Asia
4,2020-07-01,1,7,2020,279,13,Afghanistan,AF,AFG,38041757.0,Asia
...,...,...,...,...,...,...,...,...,...,...,...
27826,2020-03-25,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa
27827,2020-03-24,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14645473.0,Africa
27828,2020-03-23,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa
27829,2020-03-22,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa


In [78]:
# Check the column dtypes again to confirm that dateRep is now a datetime column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27831 entries, 0 to 27830
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   dateRep                  27831 non-null  datetime64[ns]
 1   day                      27831 non-null  int64         
 2   month                    27831 non-null  int64         
 3   year                     27831 non-null  int64         
 4   cases                    27831 non-null  int64         
 5   deaths                   27831 non-null  int64         
 6   countriesAndTerritories  27831 non-null  object        
 7   geoId                    27718 non-null  object        
 8   countryterritoryCode     27767 non-null  object        
 9   popData2019              27767 non-null  float64       
 10  continentExp             27831 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(5), object(4)
memory usage: 2.3+ MB


In [79]:
# Look at the first five rows to see which columns we have and which ones
# are not needed for the analysis
df.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp
0,2020-07-05,5,7,2020,348,7,Afghanistan,AF,AFG,38041757.0,Asia
1,2020-07-04,4,7,2020,302,12,Afghanistan,AF,AFG,38041757.0,Asia
2,2020-07-03,3,7,2020,186,33,Afghanistan,AF,AFG,38041757.0,Asia
3,2020-07-02,2,7,2020,319,28,Afghanistan,AF,AFG,38041757.0,Asia
4,2020-07-01,1,7,2020,279,13,Afghanistan,AF,AFG,38041757.0,Asia


In [80]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the minimum of both cases and deaths is negative in the output
# below - presumably retrospective corrections by the reporting countries;
# confirm before summing or averaging these columns.
df.describe()

Unnamed: 0,day,month,year,cases,deaths,popData2019
count,27831.0,27831.0,27831.0,27831.0,27831.0,27767.0
mean,15.814811,4.312996,2019.997593,403.925658,19.067515,46406260.0
std,8.994855,1.617419,0.049007,2351.306962,120.833161,166428500.0
min,1.0,1.0,2019.0,-29726.0,-1918.0,815.0
25%,8.0,3.0,2020.0,0.0,0.0,1798506.0
50%,16.0,4.0,2020.0,4.0,0.0,8776119.0
75%,24.0,6.0,2020.0,72.0,1.0,31825300.0
max,31.0,12.0,2020.0,54771.0,4928.0,1433784000.0


In [81]:
# Shape of the data as a (rows, columns) tuple
df.shape

(27831, 11)

In [82]:
# List the column labels of the frame
df.columns

Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2019', 'continentExp'],
      dtype='object')

In [83]:
# Keep only the four columns the analysis uses; every other column is discarded.
# The columns to keep are selected (rather than dropping the unwanted ones) so
# that this cell can be re-run safely: a second df.drop([...]) would raise
# KeyError because the listed columns have already been removed.
columns_to_keep = ['dateRep', 'cases', 'deaths', 'countriesAndTerritories']
df = df[columns_to_keep]

In [84]:
# First five rows of the new, slimmed-down dataframe
df.head()

Unnamed: 0,dateRep,cases,deaths,countriesAndTerritories
0,2020-07-05,348,7,Afghanistan
1,2020-07-04,302,12,Afghanistan
2,2020-07-03,186,33,Afghanistan
3,2020-07-02,319,28,Afghanistan
4,2020-07-01,279,13,Afghanistan


In [85]:
# Final 5 rows of the dataframe
df.tail()

Unnamed: 0,dateRep,cases,deaths,countriesAndTerritories
27826,2020-03-25,0,0,Zimbabwe
27827,2020-03-24,0,1,Zimbabwe
27828,2020-03-23,0,0,Zimbabwe
27829,2020-03-22,1,0,Zimbabwe
27830,2020-03-21,1,0,Zimbabwe


In [86]:
# Build a cases-only frame: the same rows as df, without the deaths column.
df_cases = df.drop(columns=['deaths'])
df_cases.head()

Unnamed: 0,dateRep,cases,countriesAndTerritories
0,2020-07-05,348,Afghanistan
1,2020-07-04,302,Afghanistan
2,2020-07-03,186,Afghanistan
3,2020-07-02,319,Afghanistan
4,2020-07-01,279,Afghanistan


In [87]:
# Preview the frame indexed by report date. The result is only displayed, not
# assigned, so df_cases itself still has dateRep as an ordinary column.
df_cases.set_index('dateRep')

Unnamed: 0_level_0,cases,countriesAndTerritories
dateRep,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-05,348,Afghanistan
2020-07-04,302,Afghanistan
2020-07-03,186,Afghanistan
2020-07-02,319,Afghanistan
2020-07-01,279,Afghanistan
...,...,...
2020-03-25,0,Zimbabwe
2020-03-24,0,Zimbabwe
2020-03-23,0,Zimbabwe
2020-03-22,1,Zimbabwe


In [88]:
# Alternative layout (not used): one row per country and one column per date.
#cases = df_cases.pivot(index='countriesAndTerritories', columns='dateRep', values=['cases'])
#cases

In [89]:
# Reshape from long to wide: one row per report date, one column per country,
# each cell holding that day's case count. Dates for which a country has no row
# come out as NaN.
cases = df_cases.pivot(
    index='dateRep',
    columns='countriesAndTerritories',
    values='cases',
)
cases.head()

countriesAndTerritories,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua_and_Barbuda,Argentina,Armenia,Aruba,...,United_States_Virgin_Islands,United_States_of_America,Uruguay,Uzbekistan,Venezuela,Vietnam,Western_Sahara,Yemen,Zambia,Zimbabwe
dateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31,0.0,,0.0,,,,,,0.0,,...,,0.0,,,,0.0,,,,
2020-01-01,0.0,,0.0,,,,,,0.0,,...,,0.0,,,,0.0,,,,
2020-01-02,0.0,,0.0,,,,,,0.0,,...,,0.0,,,,0.0,,,,
2020-01-03,0.0,,0.0,,,,,,0.0,,...,,0.0,,,,0.0,,,,
2020-01-04,0.0,,0.0,,,,,,0.0,,...,,0.0,,,,0.0,,,,


In [90]:
# Replace the NaN cells left by the pivot (dates with no row for a country)
# with 0 so every country has a value on every date.
cases = cases.fillna(0)
cases

countriesAndTerritories,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua_and_Barbuda,Argentina,Armenia,Aruba,...,United_States_Virgin_Islands,United_States_of_America,Uruguay,Uzbekistan,Venezuela,Vietnam,Western_Sahara,Yemen,Zambia,Zimbabwe
dateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-01,279.0,69.0,336.0,0.0,8.0,0.0,0.0,2262.0,415.0,0.0,...,0.0,43880.0,4.0,329.0,302.0,0.0,172.0,30.0,26.0,17.0
2020-07-02,319.0,45.0,365.0,0.0,7.0,0.0,0.0,2667.0,523.0,0.0,...,8.0,52048.0,7.0,277.0,230.0,0.0,81.0,32.0,38.0,14.0
2020-07-03,186.0,82.0,385.0,0.0,24.0,0.0,2.0,2744.0,593.0,1.0,...,6.0,53399.0,4.0,295.0,211.0,0.0,0.0,31.0,0.0,12.0
2020-07-04,302.0,90.0,413.0,0.0,13.0,0.0,0.0,2845.0,662.0,0.0,...,13.0,54442.0,5.0,301.0,264.0,0.0,58.0,19.0,0.0,8.0


In [91]:
# Keep the most recent 28 report dates for every country.
# NOTE(review): despite the name, this slice holds 28 rows, not 14 - confirm
# whether the extra rows are intended as warm-up for a 14-day rolling window.
cases_14_days = cases.tail(28)
cases_14_days

countriesAndTerritories,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua_and_Barbuda,Argentina,Armenia,Aruba,...,United_States_Virgin_Islands,United_States_of_America,Uruguay,Uzbekistan,Venezuela,Vietnam,Western_Sahara,Yemen,Zambia,Zimbabwe
dateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-08,791.0,14.0,104.0,0.0,5.0,0.0,0.0,774.0,766.0,0.0,...,0.0,22302.0,0.0,171.0,61.0,2.0,0.0,13.0,0.0,3.0
2020-06-09,575.0,17.0,111.0,0.0,1.0,0.0,0.0,826.0,195.0,0.0,...,0.0,18822.0,0.0,96.0,96.0,1.0,0.0,10.0,46.0,5.0
2020-06-10,542.0,36.0,117.0,0.0,4.0,0.0,0.0,1141.0,350.0,0.0,...,0.0,18665.0,1.0,99.0,159.0,0.0,0.0,28.0,0.0,27.0
2020-06-11,684.0,42.0,102.0,0.0,17.0,0.0,0.0,1239.0,428.0,0.0,...,1.0,20614.0,1.0,148.0,106.0,0.0,0.0,36.0,52.0,6.0
2020-06-12,747.0,44.0,105.0,0.0,5.0,0.0,0.0,1373.0,566.0,0.0,...,0.0,22883.0,0.0,124.0,76.0,0.0,0.0,31.0,0.0,12.0
2020-06-13,656.0,31.0,109.0,1.0,12.0,0.0,0.0,1391.0,612.0,0.0,...,0.0,25639.0,0.0,82.0,65.0,1.0,0.0,41.0,69.0,11.0
2020-06-14,556.0,48.0,112.0,0.0,8.0,0.0,0.0,1531.0,723.0,0.0,...,0.0,25540.0,0.0,93.0,25.0,1.0,0.0,73.0,36.0,13.0
2020-06-15,664.0,57.0,109.0,0.0,2.0,0.0,0.0,1282.0,663.0,0.0,...,1.0,19543.0,1.0,109.0,74.0,0.0,0.0,23.0,1.0,27.0
2020-06-16,761.0,69.0,112.0,0.0,2.0,0.0,0.0,1208.0,397.0,0.0,...,0.0,19957.0,0.0,190.0,84.0,0.0,0.0,116.0,24.0,4.0
2020-06-17,783.0,82.0,116.0,1.0,0.0,0.0,0.0,1374.0,425.0,0.0,...,0.0,23705.0,1.0,268.0,88.0,1.0,0.0,45.0,23.0,7.0


In [92]:
# 14-row rolling mean of daily cases, computed per country column. Each value
# averages the current row and the 13 rows before it, so the first 13 rows of
# the result are NaN (the window is not yet full).
rolling_14 = cases_14_days.rolling(window=14)
cases_14_days_rolling_average = rolling_14.mean()
cases_14_days_rolling_average

countriesAndTerritories,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua_and_Barbuda,Argentina,Armenia,Aruba,...,United_States_Virgin_Islands,United_States_of_America,Uruguay,Uzbekistan,Venezuela,Vietnam,Western_Sahara,Yemen,Zambia,Zimbabwe
dateRep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-08,,,,,,,,,,,...,,,,,,,,,,
2020-06-09,,,,,,,,,,,...,,,,,,,,,,
2020-06-10,,,,,,,,,,,...,,,,,,,,,,
2020-06-11,,,,,,,,,,,...,,,,,,,,,,
2020-06-12,,,,,,,,,,,...,,,,,,,,,,
2020-06-13,,,,,,,,,,,...,,,,,,,,,,
2020-06-14,,,,,,,,,,,...,,,,,,,,,,
2020-06-15,,,,,,,,,,,...,,,,,,,,,,
2020-06-16,,,,,,,,,,,...,,,,,,,,,,
2020-06-17,,,,,,,,,,,...,,,,,,,,,,
