In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline

import chart_studio.plotly as py

In [2]:
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [3]:
import cufflinks as cf

In [4]:
# For Notebooks
init_notebook_mode(connected=True)

In [5]:
# For offline use
cf.go_offline()

# COVID-19

## Data Analysis and Visualization 

### Overview

This project is based on a data set with daily information about: 
        - number of new cases of Covid-19 registered
        - number of deaths registered due to Covid-19

for every country in the world, until 29/07/2020.

On the Data Frame there is also each country's population size as of 2019, as well as the cumulative number for 14  days of COVID-19 cases per 100000 people.

### Analysis

On this notebook we look at the data from three different prespectives:
    - World
    - Europe
    - Portugal

There will be analysis and visualizations about the data's behavior for this three different 'regions'.

### Note

With this analysis I am trying to learn more about data science and how to play with pandas and visualization.
There is a long way to go but trying and getting experience never hurt. I am using this data set to develop my skills as an aspiring data scientist since the best way to learn is to practice, not so much to bring new information to the table. There are so many amazing analysis out there about the pandemic's numbers and I am not trying to compare mine to those by any means.

In [6]:
covid_world = pd.read_csv('covid19-world')

covid_world.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,29/07/2020,29,7,2020,103,1,Afghanistan,AF,AFG,38041757.0,Asia,4.550263
1,28/07/2020,28,7,2020,105,1,Afghanistan,AF,AFG,38041757.0,Asia,5.028685
2,27/07/2020,27,7,2020,106,10,Afghanistan,AF,AFG,38041757.0,Asia,4.763187
3,26/07/2020,26,7,2020,121,13,Afghanistan,AF,AFG,38041757.0,Asia,4.484546
4,25/07/2020,25,7,2020,108,35,Afghanistan,AF,AFG,38041757.0,Asia,4.389913


## Portugal

In [7]:
# Create a new data frame only with information for Portugal

data_portugal = pd.DataFrame()
data_portugal = covid_world[covid_world['countriesAndTerritories']=='Portugal']



In [8]:
# Total casos desde 03-03-2020
data_portugal['cases'].sum()

50410

In [9]:
# Total mortes desde 03-03-2020
data_portugal['deaths'].sum()

1722

In [10]:
data_portugal.reset_index(inplace=True)
data_portugal.head()

Unnamed: 0,index,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,24369,29/07/2020,29,7,2020,111,3,Portugal,PT,PRT,10276617.0,Europe,32.685854
1,24370,28/07/2020,28,7,2020,135,2,Portugal,PT,PRT,10276617.0,Europe,33.873015
2,24371,27/07/2020,27,7,2020,209,1,Portugal,PT,PRT,10276617.0,Europe,35.536987
3,24372,26/07/2020,26,7,2020,263,4,Portugal,PT,PRT,10276617.0,Europe,36.334915
4,24373,25/07/2020,25,7,2020,313,7,Portugal,PT,PRT,10276617.0,Europe,39.049816


In [11]:
# Total de novos casos registados em cada mês

data_portugal.groupby('month')['cases'].sum()

month
3     6408
4    18284
5     7511
6     9709
7     8498
Name: cases, dtype: int64

In [12]:
# Plot with the evolution of the new cases per month 

data_portugal.groupby('month')['cases'].sum().iplot(kind='bar', title='New Cases per Month')

In [13]:
# Total de mortes registadas em cada mês

data_portugal.groupby('month')['deaths'].sum()

month
3    140
4    849
5    407
6    172
7    154
Name: deaths, dtype: int64

In [14]:
# Plot with the evolution of deaths per month 

data_portugal.groupby('month')['deaths'].sum().iplot(kind='bar')

In [15]:
# Single day with the most new cases registered

data_portugal[data_portugal['cases'] == data_portugal['cases'].max()]

Unnamed: 0,index,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
109,24478,11/04/2020,11,4,2020,1516,26,Portugal,PT,PRT,10276617.0,Europe,109.024205


In [16]:
# Single day with the most deaths registered

data_portugal[data_portugal['deaths'] == data_portugal['deaths'].max()]

Unnamed: 0,index,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
95,24464,25/04/2020,25,4,2020,918,60,Portugal,PT,PRT,10276617.0,Europe,75.890733


In [17]:
# Create a new column with the growth in percentage in number of cases registered per day

data_portugal['Change'] = data_portugal['cases'].pct_change()
data_portugal.head()

Unnamed: 0,index,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000,Change
0,24369,29/07/2020,29,7,2020,111,3,Portugal,PT,PRT,10276617.0,Europe,32.685854,
1,24370,28/07/2020,28,7,2020,135,2,Portugal,PT,PRT,10276617.0,Europe,33.873015,0.216216
2,24371,27/07/2020,27,7,2020,209,1,Portugal,PT,PRT,10276617.0,Europe,35.536987,0.548148
3,24372,26/07/2020,26,7,2020,263,4,Portugal,PT,PRT,10276617.0,Europe,36.334915,0.258373
4,24373,25/07/2020,25,7,2020,313,7,Portugal,PT,PRT,10276617.0,Europe,39.049816,0.190114


## World

Total registered cases of Covid-19 per country

In [18]:
covid_world.groupby('countriesAndTerritories')['cases'].sum().sort_values(ascending=False)

countriesAndTerritories
United_States_of_America             4351997
Brazil                               2483191
India                                1531669
Russia                                823515
South_Africa                          459761
Mexico                                402697
Peru                                  395005
Chile                                 349800
United_Kingdom                        300692
Iran                                  296273
Spain                                 280610
Pakistan                              276288
Saudi_Arabia                          270831
Colombia                              267385
Italy                                 246488
Bangladesh                            229185
Turkey                                227982
Germany                               206926
France                                183804
Argentina                             167403
Iraq                                  115332
Canada                         

In [19]:
# Day/Country with the highest number of new cases regitered

covid_world[covid_world['cases']==covid_world['cases'].max()]

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
31428,25/07/2020,25,7,2020,78427,1304,United_States_of_America,US,USA,329064917.0,America,281.979619


Plot the total cases per country on a world map

In [20]:
# Creating a new Data Frame with the total by country and with the correpsondent country code

total = pd.DataFrame(columns=['CountryCode', 'CountryName', 'TotalCases'])

total['CountryCode'] = list(covid_world.groupby('countriesAndTerritories')['countryterritoryCode'].unique())
total['CountryName'] = list(covid_world.groupby('countriesAndTerritories')['countriesAndTerritories'].unique())
total['TotalCases'] = list(covid_world.groupby('countriesAndTerritories')['cases'].sum())

total['CountryCode'] = total['CountryCode'].str.get(0)
total['CountryName'] = total['CountryName'].str.get(0)

total.head()

Unnamed: 0,CountryCode,CountryName,TotalCases
0,AFG,Afghanistan,36471
1,ALB,Albania,4997
2,DZA,Algeria,27973
3,AND,Andorra,907
4,AGO,Angola,932


In [21]:
data = dict(type='choropleth',
           locations = total['CountryCode'],
           z = total['TotalCases'],
           colorbar = {'title': 'number of cases'},
        )

In [22]:
layout = dict(title = 'Total Number of COVID-19 Cases',
             geo = dict(showframe=False,
                       projection = {'type': 'mercator'},
                       )
             )

In [23]:
choromap = go.Figure(data = [data], layout = layout)
iplot(choromap)

Plot the total cases per country, divided by the population, on a world map

In [24]:
# Add a column to the total Data Frame called 'Population' with the population number for each country as of 2019

total['Population'] = list(covid_world.groupby('countriesAndTerritories')['popData2019'].unique())

total['Population'] = total['Population'].str.get(0)
total.head()

Unnamed: 0,CountryCode,CountryName,TotalCases,Population
0,AFG,Afghanistan,36471,38041757.0
1,ALB,Albania,4997,2862427.0
2,DZA,Algeria,27973,43053054.0
3,AND,Andorra,907,76177.0
4,AGO,Angola,932,31825299.0


In [25]:
data = dict(type='choropleth',
           locations = total['CountryCode'],
           z = total['TotalCases']/total['Population'],
           colorbar = {'title': 'number of cases'},
        )

In [26]:
layout = dict(title = 'Total Number of COVID-19 Cases considering Population size',
             geo = dict(showframe=False,
                       projection = {'type': 'mercator'},
                       ),
             )

In [27]:
choromap = go.Figure(data = [data], layout = layout)
iplot(choromap)

## Europe

First create a new Data Frame only with european countries

In [28]:
europe = pd.DataFrame()
europe = covid_world[covid_world['continentExp']=='Europe']

europe.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
202,29/07/2020,29,7,2020,117,4,Albania,AL,ALB,2862427.0,Europe,46.464067
203,28/07/2020,28,7,2020,117,6,Albania,AL,ALB,2862427.0,Europe,45.730424
204,27/07/2020,27,7,2020,126,4,Albania,AL,ALB,2862427.0,Europe,45.730424
205,26/07/2020,26,7,2020,67,6,Albania,AL,ALB,2862427.0,Europe,44.228202
206,25/07/2020,25,7,2020,104,5,Albania,AL,ALB,2862427.0,Europe,45.136522


In [29]:
# Create a new Data Frame with the totals for the european countries

europe_total = pd.DataFrame()
europe_total = total[total['CountryName'].isin(europe['countriesAndTerritories'])]

europe_total.sort_values('TotalCases', ascending=False)

Unnamed: 0,CountryCode,CountryName,TotalCases,Population
159,RUS,Russia,823515,145872260.0
198,GBR,United_Kingdom,300692,66647112.0
179,ESP,Spain,280610,46937060.0
99,ITA,Italy,246488,60359546.0
75,DEU,Germany,206926,83019213.0
70,FRA,France,183804,67012883.0
183,SWE,Sweden,79494,10230185.0
17,BLR,Belarus,67366,9452409.0
196,UKR,Ukraine,66575,43993643.0
18,BEL,Belgium,66569,11455519.0


In [30]:
# Day/Country with the highest number of new cases registered on a single day

europe[europe['cases']==europe['cases'].max()]

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
25071,18/07/2020,18,7,2020,12640,310,Russia,RU,RUS,145872260.0,Europe,66.87632


Analysing the single month with the highest number of new cases registered

In [31]:
europe.groupby(by=['countriesAndTerritories', 'countryterritoryCode', 'month'])['cases'].sum().idxmax()

('Russia', 'RUS', 5)

In [32]:
europe.groupby(by=['countriesAndTerritories', 'countryterritoryCode', 'month'])['cases'].sum().max()

297176

This means that the country with the most cases registered in a month (in Europe) was Russia in May with 297176 new cases

In [33]:
europe.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
202,29/07/2020,29,7,2020,117,4,Albania,AL,ALB,2862427.0,Europe,46.464067
203,28/07/2020,28,7,2020,117,6,Albania,AL,ALB,2862427.0,Europe,45.730424
204,27/07/2020,27,7,2020,126,4,Albania,AL,ALB,2862427.0,Europe,45.730424
205,26/07/2020,26,7,2020,67,6,Albania,AL,ALB,2862427.0,Europe,44.228202
206,25/07/2020,25,7,2020,104,5,Albania,AL,ALB,2862427.0,Europe,45.136522
