# Problem 1. (**Covid-19 Data Analysis & Visualization**)

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
%matplotlib inline

from plotly import __version__
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
init_notebook_mode(connected=True)
cf.go_offline()

import warnings
warnings.filterwarnings('ignore')

config = {
  'toImageButtonOptions': {
    'format': 'svg', # one of png, svg, jpeg, webp
    'filename': 'img',
    'height': 500,
    'width':1000,
    'scale': 1 , # Multiply title/legend/axis/canvas sizes by this factor,
    'scrollZoom': True
  }
}

In [38]:
df = pd.read_csv('Covid-19_clean_data.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.head(6)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.0,65.0,2020-01-22,0,0,0
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0
5,,Antigua and Barbuda,17.0608,-61.7964,2020-01-22,0,0,0


In [39]:
print(df.info())
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16055 entries, 0 to 16054
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  4875 non-null   object        
 1   Country/Region  16055 non-null  object        
 2   Lat             16055 non-null  float64       
 3   Long            16055 non-null  float64       
 4   Date            16055 non-null  datetime64[ns]
 5   Confirmed       16055 non-null  int64         
 6   Deaths          16055 non-null  int64         
 7   Recovered       16055 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(2)
memory usage: 1003.6+ KB
None
Province/State    11180
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deaths                0
Recovered             0
dtype: int64


In [40]:
df.describe()

Unnamed: 0,Lat,Long,Confirmed,Deaths,Recovered
count,16055.0,16055.0,16055.0,16055.0,16055.0
mean,22.364044,24.248533,435.497789,16.352102,144.815322
std,24.566311,71.360898,4139.044316,214.426702,2142.53347
min,-41.4545,-135.0,0.0,0.0,0.0
25%,8.538,-15.3101,0.0,0.0,0.0
50%,24.974,21.0059,0.0,0.0,0.0
75%,41.6086,88.0924,20.0,0.0,1.0
max,71.7069,178.065,83836.0,8215.0,61201.0


## Data Pre-processing

In [41]:
import pycountry_convert as pc

def continent(country):
    try:
        country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
        try:
            continent_name = pc.country_alpha2_to_continent_code(country_code)

        except:
            continent_name = "Unknown"
    except:
        continent_name = "Unknown"
    return continent_name

In [42]:
df_clean = df
df_clean["Province/State"].fillna("", inplace=True)
df_clean["Active"] = df_clean["Confirmed"]-df_clean["Recovered"]-df_clean["Deaths"]
df_clean["Active"] = df_clean["Active"].fillna(0)
df_clean["Recovered"] = df_clean["Recovered"].fillna(0)
df_clean["Deaths"] = df_clean["Deaths"].fillna(0)
df_clean.loc[df_clean['Country/Region'] == 'US', 'Country/Region'] = 'USA'

for i in range(len(df_clean)) : 
  df_clean.loc[i,"Continent"]=continent(df_clean.loc[i, "Country/Region"])

df_clean.loc[df_clean['Continent'] == 'AS', 'Continent'] = 'Asia'
df_clean.loc[df_clean['Continent'] == 'EU', 'Continent'] = 'Europe'
df_clean.loc[df_clean['Continent'] == 'AF', 'Continent'] = 'Africa'
df_clean.loc[df_clean['Continent'] == 'NA', 'Continent'] = 'North America'
df_clean.loc[df_clean['Continent'] == 'SA', 'Continent'] = 'South America'
df_clean.loc[df_clean['Continent'] == 'OC', 'Continent'] = 'Australia'


df_clean = df_clean.sort_values(by="Date")
df_clean['time']=df_clean.Date.apply(lambda x: x.date()).apply(str)

df_clean.head()


Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,Continent,time
0,,Afghanistan,33.0,65.0,2020-01-22,0,0,0,0,Asia,2020-01-22
157,,Mauritius,-20.2,57.5,2020-01-22,0,0,0,0,Africa,2020-01-22
158,,Mexico,23.6345,-102.5528,2020-01-22,0,0,0,0,North America,2020-01-22
159,,Moldova,47.4116,28.3699,2020-01-22,0,0,0,0,Europe,2020-01-22
160,,Monaco,43.7333,7.4167,2020-01-22,0,0,0,0,Europe,2020-01-22


In [43]:
df_group_by_country_date = df_clean.groupby(['Date','Country/Region'])['Confirmed','Recovered', 'Deaths', 'Active'].sum().reset_index()

## Data available till 26/3/2020

In [44]:
df_total = df_clean.groupby(['Country/Region', 'Province/State'])['Date','Confirmed', 'Deaths', 'Recovered', 'Active' ].max()
df_total['Date'].max()

Timestamp('2020-03-26 00:00:00')

## Total cases and Death Rate until 26-03-2020

In [45]:
total_cases = df_total['Confirmed'].sum()
total_deaths = df_total['Deaths'].sum()
total_recovered = df_total['Recovered'].sum()
date = df_total['Date'].max()
death_rate = (total_deaths/total_cases)*100
x = pd.DataFrame({'Date':date,'Total Confirmed':total_cases , 'Total Deaths': total_deaths, 'Total Recovered': total_recovered
             ,'Death Rate':death_rate}, index=[0])
x.head()

Unnamed: 0,Date,Total Confirmed,Total Deaths,Total Recovered,Death Rate
0,2020-03-26,529607,23979,122033,4.527697


In [46]:
df_group_by_date = df_clean.groupby(['Date'])['Confirmed','Recovered', 'Deaths', 'Active'].sum().reset_index()
df_group_by_date.head()

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Active
0,2020-01-22,555,28,17,510
1,2020-01-23,654,30,18,606
2,2020-01-24,941,36,26,879
3,2020-01-25,1434,39,42,1353
4,2020-01-26,2118,52,56,2010


# A.  Animated bubble graph: x-axis is the number of deaths, y-axis is the number of recovered cases, the size of each country bubble is the # of its cases.

In [47]:
df1 = df[["Country/Region","Confirmed", "Recovered", "Deaths"]]
df2 = df1.groupby("Country/Region").sum()
df2.rename(index = {"United_States_of_America":"United_States"}, inplace=True)
df2.rename(index = {"South_Korea": "Korea"}, inplace=True)
df2.tail(9)

Unnamed: 0_level_0,Confirmed,Recovered,Deaths
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
United Arab Emirates,3077,615,14
United Kingdom,65970,1238,2905
Uruguay,1266,0,0
Uzbekistan,405,0,0
Venezuela,681,75,0
Vietnam,1990,639,0
West Bank and Gaza,822,119,1
Zambia,45,0,0
Zimbabwe,19,0,4


In [48]:
df2.reset_index(inplace=True)
df2.tail(9)

Unnamed: 0,Country/Region,Confirmed,Recovered,Deaths
166,United Arab Emirates,3077,615,14
167,United Kingdom,65970,1238,2905
168,Uruguay,1266,0,0
169,Uzbekistan,405,0,0
170,Venezuela,681,75,0
171,Vietnam,1990,639,0
172,West Bank and Gaza,822,119,1
173,Zambia,45,0,0
174,Zimbabwe,19,0,4


### Recovered Cases vs Deaths per Country

In [49]:
df2.iplot(x="Deaths", y="Recovered", mode="markers", xTitle="Total Deaths", yTitle="Total Recovered cases", title="Recovered Cases vs Deaths per Country", categories="Country/Region")

In [50]:
df2["death rate"] = df2["Deaths"]/df2["Confirmed"]
df2

Unnamed: 0,Country/Region,Confirmed,Recovered,Deaths,death rate
0,Afghanistan,541,13,9,0.016636
1,Albania,1171,50,37,0.031597
2,Algeria,2154,420,159,0.073816
3,Andorra,1132,14,7,0.006184
4,Angola,18,0,0,0.000000
...,...,...,...,...,...
170,Venezuela,681,75,0,0.000000
171,Vietnam,1990,639,0,0.000000
172,West Bank and Gaza,822,119,1,0.001217
173,Zambia,45,0,0,0.000000


In [51]:
df2[df2["death rate"]==df2["death rate"].max()]

Unnamed: 0,Country/Region,Confirmed,Recovered,Deaths,death rate
150,Sudan,26,0,14,0.538462


In [52]:
df2[df2["Confirmed"]==df2["Confirmed"].max()]

Unnamed: 0,Country/Region,Confirmed,Recovered,Deaths,death rate
33,China,3777808,2008446,129278,0.03422


In [53]:
df2.iloc[-12]

Country/Region          USA
Confirmed            372968
Recovered              2544
Deaths                 5273
death rate        0.0141379
Name: 163, dtype: object

## Bubble graph for Confirmed Cases vs Deaths Per Country

In [54]:
df2.iplot(kind="bubble", x="Deaths", y="Confirmed", size="death rate", xTitle="Total Deaths", yTitle="Total Confirmed Cases", title="Confirmed Cases vs Deaths Per Country, considering death rate", categories = "Country/Region")

In [55]:
## Bubble graph for Recovered Cases vs Deaths Per Country

In [None]:
df2.iplot(kind="bubble", x="Deaths", y="Recovered", size="death rate", xTitle="Total Deaths", yTitle="Total Recovered Cases", title="Recovered Cases vs Deaths Per Country, considering death rate", categories = "Country/Region")

## Total Confirmed cases, deaths, recovered cases and mortality rate

In [26]:
data = {'Parent':  ['Confirmed', 'Confirmed', 'Confirmed'],
        'Child': ['Active', 'Deaths','Recovered'],
         'Cases': [total_cases-total_deaths-total_recovered,total_deaths,total_recovered]
        }

dfx = pd.DataFrame (data, columns = ['Parent','Child','Cases'])
dfx
fig = px.sunburst(dfx, path=['Parent','Child'], values='Cases',
#                   color='Cases',
                  color_continuous_scale="Agsunset_r",
#                    branchvalues="remainder",
#                   insidetextorientation='radial'
                 )
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"
    ),
    title={
        'text': "Breakup of total confirmed cases",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        },
legend_font_size=16
)
fig.data[0].textinfo = 'label+text+value'
fig.show(config=config)

In [27]:
df_latest = df_clean[df_clean['Date'] == max(df_clean['Date'])].reset_index()
df_latest = df_latest.groupby('Country/Region')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()

stats = df_latest[['Country/Region','Confirmed', 'Active', 'Deaths', 'Recovered']]

stats = stats.sort_values(by='Confirmed', ascending=False)
stats = stats.reset_index(drop=True)

stats.style.background_gradient(cmap="Blues", subset=['Confirmed', 'Active'])\
            .background_gradient(cmap="Greens", subset=['Recovered'])\
            .background_gradient(cmap="Reds", subset=['Deaths'])

Unnamed: 0,Country/Region,Confirmed,Active,Deaths,Recovered
0,USA,83836,81946,1209,681
1,China,81782,4310,3291,74181
2,Italy,80589,62013,8215,10361
3,Spain,57786,46406,4365,7015
4,Germany,43938,37998,267,5673
5,France,29551,22898,1698,4955
6,Iran,29406,16715,2234,10457
7,United Kingdom,11812,11082,580,150
8,Switzerland,11811,11489,191,131
9,South Korea,9241,4966,131,4144


In [28]:
stats = stats.sort_values(by='Deaths', ascending=False).reset_index(drop=True)
stats['Death Rate %'] = (stats['Deaths']/stats['Confirmed'])*100
death_stats = stats[['Country/Region','Deaths','Death Rate %']]
death_stats.style.background_gradient(cmap="Reds", subset=['Deaths'])

Unnamed: 0,Country/Region,Deaths,Death Rate %
0,Italy,8215,10.193699
1,Spain,4365,7.553733
2,China,3291,4.024113
3,Iran,2234,7.597089
4,France,1698,5.745998
5,USA,1209,1.442101
6,United Kingdom,580,4.910261
7,Netherlands,435,5.824853
8,Germany,267,0.607674
9,Belgium,220,3.528468


In [29]:
for i in range(len(df_latest)) : 
  df_latest.loc[i,"Continent"]=continent(df_latest.loc[i, "Country/Region"])

df_latest.loc[df_latest['Continent'] == 'AS', 'Continent'] = 'Asia'
df_latest.loc[df_latest['Continent'] == 'EU', 'Continent'] = 'Europe'
df_latest.loc[df_latest['Continent'] == 'AF', 'Continent'] = 'Africa'
df_latest.loc[df_latest['Continent'] == 'NA', 'Continent'] = 'North America'
df_latest.loc[df_latest['Continent'] == 'SA', 'Continent'] = 'South America'
df_latest.loc[df_latest['Continent'] == 'OC', 'Continent'] = 'Australia'

df_latest.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,Continent
0,Afghanistan,94,4,2,88,Asia
1,Albania,174,6,17,151,Europe
2,Algeria,367,25,29,313,Africa
3,Andorra,224,3,1,220,Europe
4,Angola,4,0,0,4,Africa


In [30]:
df_group_by_country_date = df_group_by_country_date.sort_values(by="Date")
df_group_by_country_date['time']=df_group_by_country_date.Date.apply(lambda x: x.date()).apply(str)

for i in range(len(df_group_by_country_date)) : 
  df_group_by_country_date.loc[i,"Continent"]=continent(df_group_by_country_date.loc[i, "Country/Region"])

df_group_by_country_date.loc[df_group_by_country_date['Continent'] == 'AS', 'Continent'] = 'Asia'
df_group_by_country_date.loc[df_group_by_country_date['Continent'] == 'EU', 'Continent'] = 'Europe'
df_group_by_country_date.loc[df_group_by_country_date['Continent'] == 'AF', 'Continent'] = 'Africa'
df_group_by_country_date.loc[df_group_by_country_date['Continent'] == 'NA', 'Continent'] = 'North America'
df_group_by_country_date.loc[df_group_by_country_date['Continent'] == 'SA', 'Continent'] = 'South America'
df_group_by_country_date.loc[df_group_by_country_date['Continent'] == 'OC', 'Continent'] = 'Australia'

## Composition of Cases

In [31]:
import plotly.express as px
import numpy as np

fig = px.sunburst(df_latest, path=['Continent','Country/Region'], values='Confirmed',
                  color='Confirmed',
                  color_continuous_scale=px.colors.diverging.BrBG
                 )
fig.data[0].textinfo = 'label+text+value'

fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"
    ),
    title={
        'text': "Confirmed cases continent-wise and country-wise",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        }
    ,
legend_font_size=16
)
fig.show(config=config)

# B. Animated maps graph: Using some colormap, each country in each frame is assigned a color that indicates the # cases or # number of death cases in this country. The user can click on any country on the map to plot its number of cases/deaths through time. 

In [32]:
fig = go.Figure(data=go.Choropleth(
    locations = df_latest['Country/Region'],
    locationmode = 'country names',
    z = np.log(df_latest["Confirmed"]),
    text = df_latest["Confirmed"],
    hoverinfo ='location+text',
    
    colorscale = 'Thermal',
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    showscale = True,
    colorbar_title = '<b>Confirmed <br>Cases</b> <br>log(Confirmed)',
))

fig.update_layout(
     font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"
    ),
    title={
        'text': "Confirmed cases all over world",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        },
legend_font_size=14
)
config = {
  'toImageButtonOptions': {
    'format': 'svg', # one of png, svg, jpeg, webp
    'filename': '111',
    'height': 500,
    'width':1000,
    'scale': 1 , # Multiply title/legend/axis/canvas sizes by this factor,
    'scrollZoom': True
  }
}
fig.show(config=config)

In [33]:
fig = go.Figure(data=go.Choropleth(
    locations = df_latest['Country/Region'],
    locationmode = 'country names',
    z = np.log(df_latest["Deaths"]),
    text = df_latest["Deaths"],
    hoverinfo ='location+text',
    
    colorscale = 'Thermal',
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    showscale = True,
    colorbar_title = '<b>Deaths <br>Cases</b> <br>log(Deaths)',
))

fig.update_layout(
     font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"
    ),
    title={
        'text': "Deaths cases all over world",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        },
legend_font_size=14
)
config = {
  'toImageButtonOptions': {
    'format': 'svg', # one of png, svg, jpeg, webp
    'filename': '111',
    'height': 500,
    'width':1000,
    'scale': 1 , # Multiply title/legend/axis/canvas sizes by this factor,
    'scrollZoom': True
  }
}
fig.show(config=config)

# C. Animated Sorted chart (i.e. bar rank): each country is represented as bar on the chart. The length of the bar is the #cases or #deaths or both. The bars are always sorted (according to #cases or #deaths.) while running through time. 

In [34]:
import plotly.express as px


fig = px.bar(df_group_by_country_date, x="Continent", y="Confirmed", color="Continent",
  animation_frame="time", animation_group="Country/Region",range_y=[0,300000])
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="black"
    ),
    title={
        'text': "Confirmed Cases continent-wise and shift of epicentre from China (Asia) to Europe",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        },
legend_font_size=16
)

config = {
  'toImageButtonOptions': {
    'format': 'svg', # one of png, svg, jpeg, webp
    'filename': '111',
    'height': 500,
    'width':1000,
    'scale': 1 , # Multiply title/legend/axis/canvas sizes by this factor,
    'scrollZoom': True
  }
}

fig.show(config=config)

In [35]:
import plotly.express as px


fig = px.bar(df_group_by_country_date, x="Continent", y="Deaths", color="Continent",
  animation_frame="time", animation_group="Country/Region",range_y=[0,300000])
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="black"
    ),
    title={
        'text': "Deaths Cases continent-wise and shift of epicentre from China (Asia) to Europe",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        },
legend_font_size=16
)

config = {
  'toImageButtonOptions': {
    'format': 'svg', # one of png, svg, jpeg, webp
    'filename': '111',
    'height': 500,
    'width':1000,
    'scale': 1 , # Multiply title/legend/axis/canvas sizes by this factor,
    'scrollZoom': True
  }
}

fig.show(config=config)

# D. D. Animated bubble graph tailored to an idea of your own. i.e. you can choose the different axis (x, y, bubble size, bubble color) to elaborate or emphasize some specific observation/idea/conclusion based on this data.

## Animate Bubble map graph shows spreading of cases and shift of pandemic epicentre

In [36]:
df_group_by_country_date['size'] = df_group_by_country_date['Confirmed'].pow(0.3)
fig = px.scatter_geo(df_group_by_country_date, locations="Country/Region", locationmode='country names', 
                     color="Confirmed", size='size', hover_name="Country/Region", 
                     range_color= [0, max(df_group_by_country_date['Confirmed'])+2], animation_frame="time", 
                     )
fig.update(layout_coloraxis_showscale=True)
fig.update_layout(
     font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"
    ),
    title={
        'text': "World-wide spread over time",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        },
legend_font_size=14)
config = {
  'toImageButtonOptions': {
    'format': 'svg', # one of png, svg, jpeg, webp
    'filename': '111',
    'height': 500,
    'width':1000,
    'scale': 1 , # Multiply title/legend/axis/canvas sizes by this factor,
    'scrollZoom': True
  }
}
fig.show(config=config)