## This post is inspired by the 2019 Novel Coronavirus COVID-19 (2019-nCoV) situation in China. It is intended for private use and will not be updated frequently. Data sources include JHU CSSE and other official repositories.

Sources: https://github.com/CSSEGISandData/COVID-19

https://github.com/BlankerL/DXY-COVID-19-Data

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import folium # geomap visualization
import folium.plugins as plugins
import numpy as np
import datetime

In [2]:
# Load the case dataset from the platform's input directory.
data = pd.read_csv("/home/kesci/input/2019ncov5600/2019_nCoV_data.csv")
# A bare name on the last line of a cell makes the notebook display the frame.
data

Unnamed: 0,Sno,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020 12:00:00,Anhui,China,01/22/2020 12:00:00,1,0,0
1,2,01/22/2020 12:00:00,Beijing,China,01/22/2020 12:00:00,14,0,0
2,3,01/22/2020 12:00:00,Chongqing,China,01/22/2020 12:00:00,6,0,0
3,4,01/22/2020 12:00:00,Fujian,China,01/22/2020 12:00:00,1,0,0
4,5,01/22/2020 12:00:00,Gansu,China,01/22/2020 12:00:00,0,0,0
5,6,01/22/2020 12:00:00,Guangdong,China,01/22/2020 12:00:00,26,0,0
6,7,01/22/2020 12:00:00,Guangxi,China,01/22/2020 12:00:00,2,0,0
7,8,01/22/2020 12:00:00,Guizhou,China,01/22/2020 12:00:00,1,0,0
8,9,01/22/2020 12:00:00,Hainan,China,01/22/2020 12:00:00,4,0,0
9,10,01/22/2020 12:00:00,Hebei,China,01/22/2020 12:00:00,1,0,0


In [3]:
# Print column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1644 entries, 0 to 1643
Data columns (total 8 columns):
Sno               1644 non-null int64
Date              1644 non-null object
Province/State    1204 non-null object
Country           1644 non-null object
Last Update       1644 non-null object
Confirmed         1644 non-null int64
Deaths            1644 non-null int64
Recovered         1644 non-null int64
dtypes: int64(4), object(4)
memory usage: 102.8+ KB


### Data Cleaning

In [4]:
# Remove the serial-number column 'Sno'; no later cell references it.
data.drop(columns = ['Sno'], inplace = True)

In [5]:
# Replace missing values in the Province/State column with an empty string.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form is chained assignment, which is deprecated and does
# not modify `data` when pandas Copy-on-Write is enabled.
data['Province/State'] = data['Province/State'].fillna('')

In [6]:
# Drop every record that reports zero confirmed cases.
zero_case_rows = data.loc[data['Confirmed'] == 0].index
data.drop(index = zero_case_rows, inplace = True)

In [7]:
# List the distinct country labels that remain, in order of first appearance.
countries = list(data['Country'].unique())
print(countries)

['China', 'US', 'Japan', 'Thailand', 'South Korea', 'Mainland China', 'Hong Kong', 'Macau', 'Taiwan', 'Singapore', 'Vietnam', 'France', 'Australia', 'Nepal', 'Malaysia', 'Canada', 'Cambodia', 'Sri Lanka', 'Germany', 'Finland', 'United Arab Emirates', 'Philippines', 'India', 'Italy', 'Sweden', 'Russia', 'Spain', 'UK', 'Belgium', 'Others', 'Egypt']


In [8]:
## as we can see, 'Mainland China' appears alongside 'China', and Macau and Hong Kong are listed as separate countries
## so we need to merge these labels into 'China'

In [17]:
# Fold 'Mainland China', 'Hong Kong' and 'Macau' into the single label 'China'.
# Assign the result back instead of calling replace(inplace=True) on the selected
# column: the in-place form is chained assignment, which is deprecated and does
# not modify `data` when pandas Copy-on-Write is enabled.
data['Country'] = data['Country'].replace(
    {'Mainland China': 'China', 'Hong Kong': 'China', 'Macau': 'China'}
)
# Recompute the list of countries that still have at least one confirmed case.
countries = data[data['Confirmed']!=0]['Country'].unique().tolist()
print('Countries that have confirmed cases：', countries)
print('\n')
print('Number of countries that have confirmed cases：', len(countries))

Countries that have confirmed cases： ['China', 'US', 'Japan', 'Thailand', 'South Korea', 'Taiwan', 'Singapore', 'Vietnam', 'France', 'Australia', 'Nepal', 'Malaysia', 'Canada', 'Cambodia', 'Sri Lanka', 'Germany', 'Finland', 'United Arab Emirates', 'Philippines', 'India', 'Italy', 'Sweden', 'Russia', 'Spain', 'UK', 'Belgium', 'Others', 'Egypt']


Number of countries that have confirmed cases： 28


In [11]:
# we can see the data is not sorted by time
# but we are interested in the latest data for each country to compare

In [14]:
# Convert the Date column from strings to pandas datetime values.
# pd.to_datetime is applied element by element, so each value is parsed on its own.
data['Date'] = data['Date'].apply(pd.to_datetime)
data.head()

Unnamed: 0,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
0,2020-01-22 12:00:00,Anhui,China,01/22/2020 12:00:00,1,0,0
1,2020-01-22 12:00:00,Beijing,China,01/22/2020 12:00:00,14,0,0
2,2020-01-22 12:00:00,Chongqing,China,01/22/2020 12:00:00,6,0,0
3,2020-01-22 12:00:00,Fujian,China,01/22/2020 12:00:00,1,0,0
5,2020-01-22 12:00:00,Guangdong,China,01/22/2020 12:00:00,26,0,0


In [15]:
# For every (country, province) pair keep only the record with the latest Date.
group_keys = ['Country', 'Province/State']
grouped = data.groupby(group_keys)
# idxmax returns, per group, the index label of the row holding the maximum Date.
latest = grouped['Date'].idxmax()
data_latest = data.loc[latest]
data_latest

Unnamed: 0,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
211,2020-01-26 23:00:00,,Australia,01/26/2020 23:00:00,4,0,0
1614,2020-02-16 20:44:00,New South Wales,Australia,2020-02-13T17:53:03,4,0,4
1613,2020-02-16 20:44:00,Queensland,Australia,2020-02-09T19:33:02,5,0,0
1620,2020-02-16 20:44:00,South Australia,Australia,2020-02-02T22:33:07,2,0,0
1615,2020-02-16 20:44:00,Victoria,Australia,2020-02-13T17:53:03,4,0,4
1628,2020-02-16 20:44:00,,Belgium,2020-02-04T15:43:02,1,0,0
1629,2020-02-16 20:44:00,,Cambodia,2020-02-12T07:43:02,1,0,1
1616,2020-02-16 20:44:00,British Columbia,Canada,2020-02-07T05:43:03,4,0,0
1630,2020-02-16 20:44:00,"London, ON",Canada,2020-02-12T18:53:03,1,0,1
608,2020-02-02 21:00:00,Ontario,Canada,2020/1/2 18:12,3,0,0


In [19]:
# Find rows whose province is blank although the same country also has other
# rows (e.g. per-province rows); those blank-province rows are dropped next.
# Count the rows per country once, instead of recomputing value_counts() for
# every row inside a lambda.
country_counts = data_latest['Country'].value_counts()
has_other_rows = data_latest['Country'].map(country_counts) > 1
province_missing = data_latest['Province/State'] == ''

useless = data_latest[province_missing & has_other_rows].index.to_list()

useless

[211, 1607]

In [20]:
# Remove the blank-province rows collected in `useless` from the latest snapshot.
data_latest.drop(index = useless, inplace = True)

### Visualization

In [22]:
# Total confirmed / deaths / recovered per country, largest outbreak first.
# Select the columns with a list: indexing a groupby with a bare tuple of
# column names is deprecated and raises in pandas 2.x.
world_cases = (
    data_latest.groupby(['Country'])[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .sort_values(by = 'Confirmed', ascending = False)
)
world_cases

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,70514,1766,10755
Others,416,0,0
Singapore,75,0,18
Japan,59,1,12
Thailand,34,0,14
South Korea,29,0,9
US,23,0,3
Malaysia,22,0,7
Taiwan,20,1,2
Vietnam,16,0,7


In [23]:
# Horizontal bar chart of confirmed cases per country.
# The first row of world_cases (the largest total) is skipped.
rest_of_world = world_cases[1:]
fig, ax = plt.subplots(figsize = (12, 8))
ax.barh(y = rest_of_world.index, width = rest_of_world['Confirmed'], color = 'lightcoral')

<BarContainer object of 27 artists>

In [25]:
# Load the per-country coordinate table (the output below shows Code, Country, latitude, longitude).
world_coordinates = pd.read_csv('/home/kesci/input/2019ncov5600/world_coordinates.csv')

In [26]:
# Attach coordinates to the per-country totals and order by confirmed cases.
# The default inner join keeps only countries present in both tables.
merged = world_coordinates.merge(world_cases, on = 'Country')
world_data = merged.sort_values(by = 'Confirmed', ascending = False)
world_data

Unnamed: 0,Code,Country,latitude,longitude,Confirmed,Deaths,Recovered
4,CN,China,35.86166,104.195397,70514,1766,10755
22,SG,Singapore,1.352083,103.819836,75,0,18
13,JP,Japan,36.204824,138.252924,59,1,12
23,TH,Thailand,15.870032,100.992541,34,0,14
15,KR,South Korea,35.907757,127.766922,29,0,9
25,US,US,37.09024,-95.712891,23,0,3
17,MY,Malaysia,4.210484,101.975766,22,0,7
24,TW,Taiwan,23.69781,120.960515,20,1,2
26,VN,Vietnam,14.058324,108.277199,16,0,7
1,AU,Australia,-25.274398,133.775136,15,0,8


In [27]:
# Initialize the world map with folium, centered on [35, 0] and zoomed out.
world_map = folium.Map(location = [35, 0], zoom_start = 2.3)
world_map

In [28]:
# Mark the first row of world_data (the largest total; labelled China in the
# popup) with a fixed-size circle, since scaling it by case count like the
# other countries would make it far too large.
top_row = world_data.iloc[0]
lat = top_row['latitude']
lon = top_row['longitude']

china_popup = ('<strong>Country</strong>: China<br>'
               '<strong>Confirmed</strong>: ' + str(top_row['Confirmed']) + '<br>')
china_marker = folium.CircleMarker(
    location = [lat, lon],
    radius = 20,
    color = 'red',
    fill_color = 'red',
    fill_opacity = 0.7,
    popup = china_popup,
)
china_marker.add_to(world_map)
world_map

In [29]:
# Plot every remaining country with a circle whose radius scales with its
# confirmed-case count (the first row was drawn separately above).
other_countries = world_data.iloc[1:]

for lat, lon, value, name in zip(
    other_countries['latitude'],
    other_countries['longitude'],
    other_countries['Confirmed'],
    other_countries['Country']):

    # Use the country name as-is: str.capitalize() lowercases everything after
    # the first character and would turn 'South Korea' into 'South korea' and
    # 'US' into 'Us'.
    folium.CircleMarker([lat, lon],
                        radius = value*0.2,
                        color ='red',
                        fill_color ='red',
                        popup = ('<strong>Country</strong>: ' + str(name) + '<br>'
                                '<strong>Confirmed</strong>: ' + str(value) + '<br>')
                        ).add_to(world_map)
world_map

In [33]:
# Latest per-province records for China, ordered by confirmed cases (largest first).
is_china = data_latest['Country'] == 'China'
china_cases = data_latest[is_china].sort_values(by = 'Confirmed', ascending = False)
china_cases.head()

Unnamed: 0,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
1569,2020-02-16 20:44:00,Hubei,China,2020-02-16T23:53:01,58182,1696,6639
1570,2020-02-16 20:44:00,Guangdong,China,2020-02-16T12:03:06,1316,2,465
1571,2020-02-16 20:44:00,Henan,China,2020-02-16T13:53:03,1231,13,440
1572,2020-02-16 20:44:00,Zhejiang,China,2020-02-16T09:33:02,1167,0,456
1573,2020-02-16 20:44:00,Hunan,China,2020-02-16T13:13:04,1004,3,464


In [34]:
# Pie chart of the case distribution outside Hubei Province.
# china_cases is sorted by 'Confirmed' descending, so skipping its first row
# leaves every province except the largest one (Hubei in the table above).
outside_hubei = china_cases[1:]
n_slices = len(outside_hubei)

# Keep the (up to) ten largest slices attached and push the remaining ones
# progressively outward, presumably to keep the small slices' labels readable.
# Sizes are derived from the data so the cell also works when fewer than
# eleven provinces are present.
n_attached = min(10, n_slices)
explode = np.hstack((np.zeros(n_attached), np.linspace(0, 3, n_slices - n_attached)))

plt.figure(figsize = (12, 8))
plt.pie(outside_hubei['Confirmed'],
        labels = outside_hubei['Province/State'],
        autopct = '%1.1f%%',explode = explode);

In [35]:
# Confirmed (red) and recovered (green) cases per province, excluding the first
# row of china_cases (Hubei in the table above). The bars are overlaid.
outside_hubei = china_cases[1:]
fig, ax = plt.subplots(figsize = (12, 8))
ax.barh(y = "Province/State", width = "Confirmed", data = outside_hubei, color = "lightcoral")
ax.barh(y = "Province/State", width = "Recovered", data = outside_hubei, color = "palegreen");

In [36]:
# Load the province-level coordinate table for China.
china_coordinates = pd.read_csv("/home/kesci/input/2019ncov5600/china_Province_coordinates.csv")
china_coordinates.head()

Unnamed: 0,id,name,name2,lon,lat,name3
0,110000,北京市,北京,116.395645,39.929986,beijing
1,120000,天津市,天津,117.210813,39.14393,tianjin
2,310000,上海市,上海,121.4879,31.249162,shanghai
3,500000,重庆市,重庆,106.530635,29.544606,chongqing
4,130000,河北省,河北,115.661434,38.61384,hebei


In [37]:
# Give both tables the same join key: the coordinates file keeps the lower-case
# English province name in 'name3', so rename that column and lower-case the
# province names in the case table.
# NOTE(review): multi-word province names may not match 'name3' exactly — verify.
china_coordinates.rename(columns = {'name3': 'Province/State'}, inplace = True)
china_cases['Province/State'] = china_cases['Province/State'].str.lower()

In [38]:
# Inner-join cases with coordinates on the only column the two tables share.
china_data = pd.merge(china_cases, china_coordinates, on = 'Province/State')
china_data.head()

Unnamed: 0,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered,id,name,name2,lon,lat
0,2020-02-16 20:44:00,hubei,China,2020-02-16T23:53:01,58182,1696,6639,420000,湖北省,湖北,112.410562,31.209316
1,2020-02-16 20:44:00,guangdong,China,2020-02-16T12:03:06,1316,2,465,440000,广东省,广东,113.394818,23.408004
2,2020-02-16 20:44:00,henan,China,2020-02-16T13:53:03,1231,13,440,410000,河南省,河南,113.486804,34.157184
3,2020-02-16 20:44:00,zhejiang,China,2020-02-16T09:33:02,1167,0,456,330000,浙江省,浙江,119.957202,29.159494
4,2020-02-16 20:44:00,hunan,China,2020-02-16T13:13:04,1004,3,464,430000,湖南省,湖南,111.720664,27.695864


In [39]:
# Base map for the distribution within China.
# latitude / longitude set the map centre and are reused for the dynamic map below.
latitude, longitude = 39.91666667, 116.383333

china_map1 = folium.Map(location = [latitude, longitude], zoom_start = 4)
china_map1

In [40]:
# Plot Hubei province first (row 0 of china_data) with a fixed-size circle,
# since scaling it by case count like the other provinces would make it far too large.
hubei_row = china_data.iloc[0]
hubei_popup = ('<strong>Province</strong>: 湖北<br>'
               '<strong>Confirmed: </strong>' + str(hubei_row['Confirmed']) + '<br>')
hubei_marker = folium.CircleMarker(
    location = [hubei_row['lat'], hubei_row['lon']],
    radius = 20,
    color = 'red',
    fill_color = 'red',
    fill_opacity = 0.7,
    popup = hubei_popup,
)
hubei_marker.add_to(china_map1)
china_map1

In [41]:
# Plot the remaining provinces; the circle radius is 1% of the confirmed count.
other_provinces = china_data.iloc[1:]
for lat, lon, value, name in zip(
    other_provinces['lat'],
    other_provinces['lon'],
    other_provinces['Confirmed'],
    other_provinces['name2']):

    popup_html = ('<strong>Province/Municipal City</strong>: ' + str(name) + '<br>'
                  '<strong>Confirmed</strong>: ' + str(value) + '<br>')
    marker = folium.CircleMarker(
        location = [lat, lon],
        radius = value*0.01,
        color = 'red',
        fill_color = 'red',
        popup = popup_html,
    )
    marker.add_to(china_map1)
china_map1

### More Visualization - Dynamic Map

In [42]:
# Load the DXY dataset, which has city-level counts with an update timestamp per row.
china_history=pd.read_csv('/home/kesci/input/ncov6321/DXYArea.csv')
china_history.head()

Unnamed: 0,provinceName,provinceEnglishName,province_zipCode,cityName,cityEnglishName,city_zipCode,province_confirmedCount,province_suspectedCount,province_curedCount,province_deadCount,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount,updateTime
0,广东省,Guangdong,440000,深圳,Shenzhen,440300.0,1322,0,494,4,415,0,131,2,2020-02-17 17:09:18.718
1,广东省,Guangdong,440000,广州,Guangzhou,440100.0,1322,0,494,4,339,0,131,0,2020-02-17 17:09:18.718
2,广东省,Guangdong,440000,东莞,Dongguan,441900.0,1322,0,494,4,89,0,15,1,2020-02-17 17:09:18.718
3,广东省,Guangdong,440000,佛山,Foshan,440600.0,1322,0,494,4,84,0,23,0,2020-02-17 17:09:18.718
4,广东省,Guangdong,440000,珠海,Zhuhai,440400.0,1322,0,494,4,97,0,41,0,2020-02-17 17:09:18.718


In [43]:
# Preprocessing: only city-level counts are used below, so drop the
# province-level aggregate columns.
province_level_columns = [
    'province_confirmedCount',
    'province_suspectedCount',
    'province_curedCount',
    'province_deadCount',
]
china_history.drop(columns = province_level_columns, inplace = True)

In [44]:
# Parse updateTime and keep only the calendar date (time of day is discarded),
# so all updates made on the same day share one value.
china_history['updateTime'] = china_history['updateTime'].apply(pd.to_datetime).dt.date

In [45]:
# Load the detailed (city/district-level) coordinate table.
# header = None: the first line is read as data and columns are numbered 0..3.
china_coor2 = pd.read_csv('/home/kesci/input/chinacoor9160/china_coordinates.csv',header = None)
china_coor2.head()

Unnamed: 0,0,1,2,3
0,110000,北京市,116.395645,39.929986
1,110101,东城区,113.612838,37.857865
2,110102,西城区,113.612838,37.857865
3,110105,朝阳区,116.521695,39.958953
4,110106,丰台区,116.25837,39.841938


In [47]:
# Give the numbered columns descriptive names; 'cityName' matches the key
# column used in china_history.
column_names = {0: 'code', 1: 'cityName', 2: 'lon', 3: 'lat'}
china_coor2.rename(columns = column_names, inplace = True)

In [48]:
# Drop the last character of every name (e.g. 北京市 -> 北京) so the names line
# up with the short city names in china_history.
# NOTE(review): this removes the final character unconditionally, whatever it is — verify
# that every name in the file ends with a one-character suffix.
china_coor2['cityName'] = china_coor2['cityName'].str[:-1]
china_coor2.head()

Unnamed: 0,code,cityName,lon,lat
0,110000,北京,116.395645,39.929986
1,110101,东城,113.612838,37.857865
2,110102,西城,113.612838,37.857865
3,110105,朝阳,116.521695,39.958953
4,110106,丰台,116.25837,39.841938


In [49]:
# Inner-join the city history with the coordinates on 'cityName', the only
# column the two tables share.
# NOTE(review): the join is by name, not by zip code; identical names in
# different provinces would produce extra rows — verify.
china_history_data = pd.merge(china_history, china_coor2, on = 'cityName')
china_history_data.head()

Unnamed: 0,provinceName,provinceEnglishName,province_zipCode,cityName,cityEnglishName,city_zipCode,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount,updateTime,code,lon,lat
0,广东省,Guangdong,440000,深圳,Shenzhen,440300.0,415,0,131,2,2020-02-17,440300,114.025974,22.546054
1,广东省,Guangdong,440000,深圳,Shenzhen,440300.0,415,0,131,2,2020-02-17,440300,114.025974,22.546054
2,广东省,Guangdong,440000,深圳,Shenzhen,440300.0,415,0,131,2,2020-02-17,440300,114.025974,22.546054
3,广东省,Guangdong,440000,深圳,Shenzhen,440300.0,415,0,131,2,2020-02-17,440300,114.025974,22.546054
4,广东省,Guangdong,440000,深圳,Shenzhen,440300.0,415,0,131,2,2020-02-17,440300,114.025974,22.546054


To create a dynamic map, we need a three-dimensional array like [[[Y1,X1,W1],...,[Yn,Xn,Wn]], [[Y1,X1,W1],...,[Yn,Xn,Wn]], ..., [[Y1,X1,W1],...,[Yn,Xn,Wn]]]: the outermost dimension is time, and each time step holds a list of points given as latitude (Y), longitude (X) and value (W).

In [50]:
# Collect the distinct update dates, in order of first appearance in the table.
update_time = china_history_data['updateTime'].unique()

In [51]:
# Build the 3-dim array for the dynamic map: one entry per date, each holding
# that date's [lat, lon, city_confirmedCount] points.
# groupby sorts its keys, so the frames come out in chronological order
# regardless of row order, and the table is split in a single pass instead of
# being re-filtered once per date.
data_move = [
    day_frame[['lat', 'lon', 'city_confirmedCount']].values.tolist()
    for _, day_frame in china_history_data.groupby('updateTime')
]

In [53]:
# Create the dynamic map with folium, with auto_play set to True.
dynamic_map = folium.Map(location = [latitude, longitude], zoom_start = 4)
# Attach the first 15 time steps to the map created above (the original passed
# the undefined name `m`, which raises NameError).
plugins.HeatMapWithTime(data_move[:15],auto_play = True).add_to(dynamic_map)
dynamic_map