In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pycountry_convert as pc
import os
import geoplot
import geopandas
import pycountry
from github import Github
from datetime import datetime
from functools import reduce

# Default size, in inches (width, height), for every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (16, 10)

In [2]:
# Input directories, relative to the notebook's working directory; each is listed
# with os.listdir and its files are read as CSV, one file per day.
who_daily_reports_dir_name = "who_daily_reports/"
twitter_daily_reports_dir_name = "twitter_daily_data/"
# Output file that receives the combined WHO + Twitter dataframe (written with DataFrame.to_csv).
combined_dataframe_name = "combined_df.csv"

# WHO daily reports parsing

In [3]:
# Parse every WHO daily report (one CSV per day, named MM-DD-YYYY.csv) into a
# (date, dataframe) pair and collect the pairs, oldest first, in `all_who_reports`.
#
# Two header layouts are supported: the one using 'Province/State' and
# 'Country/Region', and the one using 'Province_State' and 'Country_Region'.
# The second layout is renamed to the first so later cells see a single schema.
column_names = ['Province/State', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered']
new_column_names = ['Province_State', 'Country_Region', 'Confirmed', 'Deaths', 'Recovered']

reports_files_names = sorted(os.listdir(who_daily_reports_dir_name))
reports = []
for report_file_name in reports_files_names:
    file_stem, extension = os.path.splitext(report_file_name)
    if extension.lower() != '.csv':
        # Skip anything that is not a report (README, .gitignore, ...) instead of
        # crashing in strptime.
        continue

    date = datetime.strptime(file_stem, '%m-%d-%Y')
    dataframe = pd.read_csv(os.path.join(who_daily_reports_dir_name, report_file_name))

    try:
        dataframe = dataframe[column_names]
    except KeyError:
        dataframe = dataframe[new_column_names].rename(
            columns={'Province_State': 'Province/State', 'Country_Region': 'Country/Region'}
        )

    # Fill through the frame rather than with a chained in-place call on the column:
    # the chained form is a no-op under pandas Copy-on-Write, which would let the
    # following fillna(0) turn missing provinces into 0.
    dataframe = dataframe.fillna({'Province/State': "NotSpecified"}).fillna(0)
    dataframe = dataframe.astype({'Confirmed': 'int32', 'Deaths': 'int32', 'Recovered': 'int32'})

    reports.append((date, dataframe))

# File names start with the month, so their lexical order is only chronological
# within a single year; order by the parsed date instead.
reports.sort(key=lambda report: report[0])

# Fill the (n_reports, 2) object array cell by cell. np.array() on
# (datetime, DataFrame) pairs is a ragged construction that recent NumPy rejects.
all_who_reports = np.empty((len(reports), 2), dtype=object)
for row, (date, dataframe) in enumerate(reports):
    all_who_reports[row, 0] = date
    all_who_reports[row, 1] = dataframe
all_who_reports.shape

(94, 2)

# Twitter daily reports parsing

In [4]:
# Parse every Twitter daily report (one CSV per day, named DD-MM-YYYY.csv; note the
# day-first order, unlike the WHO files) into a (date, dataframe) pair and collect
# the pairs, oldest first, in `all_twitter_reports`.
reports_files_names = os.listdir(twitter_daily_reports_dir_name)
twitter_reports = []
for report_file_name in reports_files_names:
    file_stem, extension = os.path.splitext(report_file_name)
    if extension.lower() != '.csv':
        # Skip directory entries that are not reports instead of crashing in strptime.
        continue

    date = datetime.strptime(file_stem, '%d-%m-%Y')
    dataframe = pd.read_csv(os.path.join(twitter_daily_reports_dir_name, report_file_name), index_col=0)
    dataframe = dataframe.astype({'tweets': 'int64'})
    twitter_reports.append((date, dataframe))

# os.listdir returns entries in arbitrary order, so order by the parsed date.
twitter_reports.sort(key=lambda report: report[0])

# Fill the (n_reports, 2) object array cell by cell. np.array() on
# (datetime, DataFrame) pairs is a ragged construction that recent NumPy rejects.
all_twitter_reports = np.empty((len(twitter_reports), 2), dtype=object)
for row, (date, dataframe) in enumerate(twitter_reports):
    all_twitter_reports[row, 0] = date
    all_twitter_reports[row, 1] = dataframe
all_twitter_reports.shape

(37, 2)

# Finding the common date range of WHO and Twitter reports

In [5]:
# The shared window is bounded by the dates of the second and the second-to-last
# Twitter reports; the very first and very last Twitter reports are left out.
start_twitter_date, end_twitter_date = all_twitter_reports[1, 0], all_twitter_reports[-2, 0]

# Row of each boundary date inside the WHO reports. Indexing the argwhere result
# with [0, 0] raises IndexError when a boundary date has no WHO report.
start_who_index, end_who_index = (
    np.argwhere(all_who_reports[:, 0] == boundary_date)[0, 0]
    for boundary_date in (start_twitter_date, end_twitter_date)
)

# Cut both series down to the window (the end index is inclusive on the WHO side).
twitter_reports = all_twitter_reports[1:-1, :]
who_reports = all_who_reports[start_who_index:end_who_index + 1, :]
print(twitter_reports.shape, who_reports.shape)

(35, 2) (35, 2)


# Creating WHO and Twitter data time series

In [6]:
# 'Country/Region' labels that are dropped from each WHO report before the
# per-country series are filled.
unrecognizable_countries = [
    'Others',
    'Diamond Princess',
    'MS Zaandam',
    'Cruise Ship',
]

In [7]:
# One entry per pycountry country name. Every statistic starts as a fresh list
# holding a single placeholder 0, so later cells can always read "the previous
# value" with [-1].
countries_data = {
    country.name: {
        'confirmed': [0],
        'deaths': [0],
        'recovered': [0],
        'active': [0],
        'tweets': [0],
    }
    for country in pycountry.countries
}

In [8]:
# Append one value per report day to every country's series in `countries_data`.
#
# This cell only appends, so it must be run exactly once after the cell that
# initialises `countries_data`; running it again duplicates every series.
#
# NOTE(review): countries are matched by exact pycountry name. A report that labels
# a country differently from pycountry is treated as "country absent" and silently
# carries the previous value forward. Confirm the naming against the input data.
count_columns = ['Confirmed', 'Deaths', 'Recovered']

for report in who_reports:
    # Sum the provinces of each country. Only the numeric count columns are
    # aggregated, and the result is indexed by country for direct lookup.
    rep = report[1].groupby('Country/Region')[count_columns].sum()
    rep = rep.loc[~rep.index.isin(unrecognizable_countries)]

    for country in pycountry.countries:
        country_series = countries_data[country.name]
        if country.name not in rep.index:
            # duplicate last known info if certain country is not present in a report
            for statistic in ['confirmed', 'deaths', 'recovered', 'active']:
                country_series[statistic].append(country_series[statistic][-1])
        else:
            confirmed, deaths, recovered = rep.loc[country.name, count_columns]

            country_series['confirmed'].append(confirmed)
            country_series['deaths'].append(deaths)
            country_series['recovered'].append(recovered)
            country_series['active'].append(confirmed - deaths - recovered)

for report in twitter_reports[:, 1]:
    # Tweets per lower-cased country name; the first row wins if a name repeats.
    tweets_by_country = report.drop_duplicates('country').set_index('country')['tweets']

    for country in pycountry.countries:
        # A country missing from a report contributes 0 new tweets instead of
        # raising IndexError; the stored series is the running total.
        tweets = tweets_by_country.get(country.name.lower(), 0)
        countries_data[country.name]['tweets'].append(tweets + countries_data[country.name]['tweets'][-1])

In [9]:
# Drop the placeholder 0 that sits at the beginning of every series.
# Each run removes one leading element, so run this cell exactly once.
for country in pycountry.countries:
    series_by_statistic = countries_data[country.name]
    for statistic in ('confirmed', 'deaths', 'recovered', 'active', 'tweets'):
        series_by_statistic[statistic] = series_by_statistic[statistic][1:]

In [10]:
# Display the assembled per-country series (the whole dict, so the output is long).
countries_data

{'Aruba': {'confirmed': [0,
   0,
   0,
   0,
   0,
   2,
   2,
   2,
   2,
   3,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4],
  'deaths': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'recovered': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'active': [0,
   0,
   0,
   0,
   0,
   2,
   2,
   2,
   2,
   3,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4,
   4],
  'tweets': [105,
   227,
   318,
   396,
   500,
   607,
   696,
   796,
   895,
   1001,
   1071,
 

# Creating a dataframe containing both Twitter and WHO data

In [11]:
# Row labels for the combined table, in pycountry iteration order, and one empty
# row buffer per label (each buffer is a distinct list).
countries_names_list = [country.name for country in pycountry.countries]
data = [[] for _ in countries_names_list]

In [12]:
# Build the column labels and fill the row buffers day by day: each date contributes
# five (date, statistic) columns, and every country's row receives the five matching
# values. The rows in `data` are appended to, so run this cell once per fresh `data`.
columns = []

for day_idx, date in enumerate(who_reports[:, 0]):
    columns.extend(
        (date, statistic)
        for statistic in ('confirmed', 'deaths', 'recovered', 'active', 'tweets')
    )

    for country_row, country_name in zip(data, countries_names_list):
        country_series = countries_data[country_name]
        country_row.extend(
            country_series[statistic][day_idx]
            for statistic in ('confirmed', 'deaths', 'recovered', 'active', 'tweets')
        )

In [13]:
# Combined table: one row per country, two-level columns of (date, statistic).
df = pd.DataFrame(
    data,
    index=countries_names_list,
    columns=pd.MultiIndex.from_tuples(columns),
)
df

Unnamed: 0_level_0,2020-03-08,2020-03-08,2020-03-08,2020-03-08,2020-03-08,2020-03-09,2020-03-09,2020-03-09,2020-03-09,2020-03-09,...,2020-04-10,2020-04-10,2020-04-10,2020-04-10,2020-04-10,2020-04-11,2020-04-11,2020-04-11,2020-04-11,2020-04-11
Unnamed: 0_level_1,confirmed,deaths,recovered,active,tweets,confirmed,deaths,recovered,active,tweets,...,confirmed,deaths,recovered,active,tweets,confirmed,deaths,recovered,active,tweets
Aruba,0,0,0,0,105,0,0,0,0,227,...,4,0,0,4,2752,4,0,0,4,2817
Afghanistan,4,0,0,4,252,4,0,0,4,458,...,521,15,32,474,13636,555,18,32,505,14158
Angola,0,0,0,0,191,0,0,0,0,360,...,19,2,2,15,10614,19,2,4,13,10850
Anguilla,0,0,0,0,15,0,0,0,0,22,...,0,0,0,0,354,0,0,0,0,389
Åland Islands,0,0,0,0,24,0,0,0,0,40,...,0,0,0,0,708,0,0,0,0,755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Samoa,0,0,0,0,25,0,0,0,0,50,...,0,0,0,0,963,0,0,0,0,991
Yemen,0,0,0,0,209,0,0,0,0,448,...,1,0,0,1,12257,1,0,0,1,12630
South Africa,3,0,0,3,14026,3,0,0,3,31201,...,2003,24,410,1569,510583,2028,25,410,1593,525729
Zambia,0,0,0,0,179,0,0,0,0,415,...,40,2,25,13,12273,40,2,28,10,12598


In [14]:
# Write the combined table to disk. The two-level column header is stored as two
# header rows, so read it back with pd.read_csv(..., header=[0, 1], index_col=0).
df.to_csv(combined_dataframe_name)

In [15]:
# Sanity check: Aruba's tweet series, one value per date.
# Take the 'tweets' entries of the second column level from Aruba's row.
df.loc['Aruba'].xs('tweets', level=1)

2020-03-08     105
2020-03-09     227
2020-03-10     318
2020-03-11     396
2020-03-12     500
2020-03-13     607
2020-03-14     696
2020-03-15     796
2020-03-16     895
2020-03-17    1001
2020-03-18    1071
2020-03-19    1148
2020-03-20    1230
2020-03-21    1309
2020-03-22    1392
2020-03-23    1487
2020-03-24    1567
2020-03-25    1638
2020-03-26    1704
2020-03-27    1773
2020-03-28    1860
2020-03-29    1955
2020-03-30    2026
2020-03-31    2079
2020-04-01    2145
2020-04-02    2232
2020-04-03    2303
2020-04-04    2374
2020-04-05    2441
2020-04-06    2499
2020-04-07    2563
2020-04-08    2637
2020-04-09    2698
2020-04-10    2752
2020-04-11    2817
Name: Aruba, dtype: int64