# Libraries

In [None]:
!pip install geopandas --quiet

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import statsmodels.formula.api as smf
import geopandas as gpd

from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import datetime

# Load data

In [None]:
# Directory containing the input CSV files read by the loading cells below.
inputPath = '../input/covid-data-worldometer'
# Presumably the directory where outputs would be written (the current
# working directory); it is not used by any cell visible here.
outputPath = './'

In [None]:
# Read the per-country daily time series, parsing 'Date' as datetimes so it
# can be filtered and plotted as a time axis later on.
time_series_file = os.path.join(inputPath, 'time_series.csv')
df_data = pd.read_csv(time_series_file, parse_dates=['Date'])
df_data.head()

In [None]:
# Read the lookup table that is joined onto the time series by the
# 'Country,Other' column.
continent_file = os.path.join(inputPath, 'continent.csv')
df_continent = pd.read_csv(continent_file)
df_continent.head()

# Join tables

In [None]:
# Attach the continent table's columns to every time-series row, matching on
# the 'Country,Other' name (left join: every time-series row is kept).
continent_lookup = df_continent.set_index('Country,Other')
df = df_data.join(continent_lookup, on='Country,Other')
df.head()

# Summary

In [None]:
# Print column names, dtypes and non-null counts of the joined table.
df.info()

In [None]:
# Show descriptive statistics of the joined table.
df.describe()

# Latest date EDA

## Get latest date

In [None]:
# Snapshot date analysed throughout this section. A pinned date keeps the
# figures reproducible between runs.
selected_date = '2022-06-10'
rows_on_date = df['Date'] == selected_date
if not rows_on_date.any():
    # The pinned date is absent from this dataset: fall back to the most
    # recent date instead of silently continuing with an empty frame.
    selected_date = df['Date'].max()
    rows_on_date = df['Date'] == selected_date

# Take an explicit copy: later cells add columns to df_single, and those
# writes must not target a slice of df.
df_single = df[rows_on_date].copy()
df_single.head()

In [None]:
# Number of non-missing values per column in the single-date snapshot.
df_single.count()

## Top 10 total cases

In [None]:
# Ten rows with the largest TotalCases. sort_values keeps missing values at
# the end, so a row without a case count can never displace a real country
# (reversing an ascending argsort would have put NaN rows first).
top10_total = df_single.sort_values('TotalCases', ascending=False).head(10)

plt.figure(figsize=(15,10))
sns.barplot(data=top10_total,x='TotalCases',y='Country,Other', color='royalblue')
plt.title("Top 10 Countries With The Most Covid-19 Cases")
plt.ylabel("Country")
plt.xlabel("Total cases")
plt.show()

## Top 10 ratios of population to total cases

In [None]:
# Inhabitants per recorded case. Rows with no population or no cases yield
# NaN/inf, which would otherwise sort to the top of the ranking, so they are
# normalised to NaN and excluded.
df_rate_case_pop = (
    df_single['Population'] / df_single['TotalCases']
).replace([np.inf, -np.inf], np.nan)

# assign() returns a new frame, so the 'Rate' column is never written onto a
# slice of df_single.
top10_rate_case_pop = (
    df_single.assign(Rate=df_rate_case_pop)
    .dropna(subset=['Rate'])
    .sort_values('Rate', ascending=False)
    .head(10)
)

plt.figure(figsize=(15,10))
sns.barplot(data=top10_rate_case_pop,y='Country,Other',x='Rate', color="royalblue")
# The value is population divided by cases, not a percentage.
plt.xlabel("Population per case")
plt.ylabel("Country")
plt.title("Rate total population respected to total cases over countries")
plt.show()

## Create dataframe to group by continents

In [None]:
# Per-continent totals for the snapshot date. numeric_only=True restricts the
# sum to numeric columns; without it the datetime 'Date' column and the
# country-name strings are fed to sum() as well.
# NOTE: this rebinds df_continent, which until now held the table loaded
# from continent.csv.
df_continent = df_single.groupby("Continent").sum(numeric_only=True)
df_continent = df_continent.reset_index()
df_continent.head()

## Comparing the number of cases between continents

In [None]:
fig, ax = plt.subplots(1,2,figsize=(20,10))

# Tests performed per inhabitant for each continent.
rate_test_pop_continent = df_continent['TotalTests']/df_continent['Population']

# Add the ratio with assign() (no write onto a slice) and order continents
# from the highest to the lowest ratio; alignment is by index label, so it
# does not depend on df_continent having a 0..n-1 index.
df_rate_test_pop_continent = (
    df_continent.assign(**{'Test/pop': rate_test_pop_continent})
    .sort_values('Test/pop', ascending=False)
)

# Left panel: each continent's share of the total case count.
ax[0].pie(
    x=df_rate_test_pop_continent['TotalCases'],
    labels=df_rate_test_pop_continent['Continent'],
    autopct='%.0f%%',
    wedgeprops = {'edgecolor':'k', 'linestyle': 'dashed'},
)
ax[0].set_title("Total case over continents")

# Right panel: tests per inhabitant.
sns.barplot(
    ax=ax[1],
    y=df_rate_test_pop_continent['Test/pop'],
    x=df_rate_test_pop_continent['Continent']
)
ax[1].set_title("Rate test respected to population over continents")
ax[1].set_xlabel("Continent")
# The plotted value is a plain ratio (tests / population), not a percentage.
ax[1].set_ylabel("Tests per person")

plt.show()

## Correlation between variables

In [None]:
# Columns compared pairwise; corner=True draws only the lower triangle of
# the scatter-plot grid.
pairplot_columns = [
    'TotalCases',
    'NewCases',
    'TotalDeaths',
    'NewDeaths',
    'TotalRecovered',
    'NewRecovered',
    'ActiveCases',
    'Serious,Critical',
]
sns.pairplot(data=df_single, vars=pairplot_columns, corner=True)
plt.show()

In [None]:
plt.figure(figsize=(10,8))
plt.title("Pearson's correlation")

# Correlate numeric columns only: the snapshot also holds text and datetime
# columns, which DataFrame.corr() rejects on recent pandas versions.
corr = df_single.select_dtypes(include='number').corr()
ax = sns.heatmap(
    corr, 
    cmap='Blues', 
    fmt='.2f', 
    annot=True
)
plt.show()

In [None]:
# Correlation of every other variable with TotalCases, strongest first.
# Undefined (NaN) correlations are dropped so they cannot head the ranking,
# and only the ten largest are kept so the chart matches its title.
corr_total_cases = (
    corr['TotalCases']
    .drop('TotalCases')
    .dropna()
    .sort_values(ascending=False)
    .head(10)
    .round(2)
)

plt.figure(figsize=(10,8))
ax = sns.barplot(
    y=corr_total_cases.index,
    x=corr_total_cases,
    color='royalblue'
)
plt.title("Top 10 correlation compared to Total cases")
# Print the rounded coefficient next to each bar.
ax.bar_label(ax.containers[0])
plt.show()

## Histogram of deaths per 1 million population

In [None]:
# Distribution of deaths per million inhabitants, with the mean (dashed) and
# the median (solid) marked as vertical lines.
deaths_per_million = df_single['Deaths/1M pop']
mean = deaths_per_million.mean()
median = deaths_per_million.median()

plt.figure(figsize=(10,8))
plt.hist(deaths_per_million, color='royalblue')

for position, style, name in ((mean, '--', "Mean"), (median, '-', "Median")):
    plt.axvline(position, color='red', linestyle=style, label=name)

plt.legend()
plt.xlabel('Deaths/1M population Histogram')
plt.show()

## Relationship between serious cases and deaths

In [None]:
def z_score(x):
    """Standardize ``x``: subtract its mean and divide by its standard deviation.

    ``x`` must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic (for example a pandas Series). The result has the same shape
    as ``x``.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

In [None]:
# log(0) is -inf, and a single -inf makes the mean -inf and the standard
# deviation NaN, turning every z-score into NaN. Non-positive counts are
# therefore masked to NaN before taking the logarithm.
serious_positive = df_single['Serious,Critical'].where(df_single['Serious,Critical'] > 0)
deaths_positive = df_single['TotalDeaths'].where(df_single['TotalDeaths'] > 0)

# Standardized log counts, so both axes are on a comparable scale.
df_single['Serious_Log_Std'] = z_score(np.log(serious_positive))
df_single['TotalDeaths_Log_Std'] = z_score(np.log(deaths_positive))

plt.figure(figsize=(10,8))

# Scatter plot with a fitted linear regression line.
sns.regplot(
    data=df_single,
    x='Serious_Log_Std',
    y='TotalDeaths_Log_Std'
)
plt.xlabel("Log(Serious)")
plt.ylabel("Log(Death)")
plt.title("Relationship between log-log of death and serious case")
plt.show()

## Death rate versus recovery rate

In [None]:
# Share of total cases that ended in death / recovery, per country. The
# values are scaled by 100 because the melted column is named 'Rate (%)'.
df_rate = pd.DataFrame({
    'Country,Other': df_single['Country,Other'],
    'Dead rate': 100 * df_single['TotalDeaths'] / df_single['TotalCases'],
    'Recovered rate': 100 * df_single['TotalRecovered'] / df_single['TotalCases'],
})

plt.figure(figsize=(10,8))

# Long format: one row per (country, rate type) so both rates can share one
# categorical axis.
df_rate = df_rate.melt(
    'Country,Other',
    var_name='Rate type', 
    value_name='Rate (%)'
)
sns.swarmplot(
    data=df_rate,
    y='Rate (%)',
    x='Rate type',
    color='royalblue'
)
plt.title("Dead and recovered rate")
plt.show()

## Create world maps

In [None]:
# Load the low-resolution Natural Earth country polygons bundled with
# geopandas; the 'name' column of this table is used below to match
# country names.
# NOTE(review): the geopandas.datasets module appears to be deprecated in
# geopandas 0.14 and removed in 1.0 — confirm against the installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.head()

## Correct the country names and map them to the world map

In [None]:
def fuzzy_match(findStr, listAC, nElement=1):
    """Return the best fuzzy matches for ``findStr`` among ``listAC``.

    Thin wrapper around ``fuzzywuzzy.process.extract`` using the
    ``token_sort_ratio`` scorer.

    Parameters
    ----------
    findStr : str
        The string to look up.
    listAC : collection of str
        Candidate strings to match against.
    nElement : int, default 1
        Maximum number of matches to return.

    Returns
    -------
    list of tuple
        Matches as returned by ``process.extract``; element 0 of each tuple
        is the matched candidate and element 1 its score.
    """
    return process.extract(
        findStr,
        listAC,
        limit=nElement,
        scorer=fuzz.token_sort_ratio
    )


# Minimum score for a fuzzy match to be accepted as the same country.
acceptPercentage = 76

# Abbreviations that are too short for fuzzy matching; they are mapped
# explicitly, before the fuzzy pass, so they can never be mis-matched.
manual_names = {
    "USA": "United States of America",
    "UAE": "United Arab Emirates",
    "UK": "United Kingdom",
}

# replace() returns a new Series, so df_single['Country,Other'] is untouched.
countries = df_single['Country,Other'].replace(manual_names)

# Collect corrections per distinct name first and apply them afterwards.
# Writing countries[i] with an enumerate() position would address rows by
# index *label* and hit the wrong row (or append a new one), because
# df_single keeps the row labels of the full table.
corrections = {}
for country in countries.dropna().unique():
    candidates = fuzzy_match(country, world['name'])
    if not candidates:
        continue
    percentage = candidates[0][1]
    # A score of 100 is already an exact match; below the threshold the
    # candidate is considered a different country.
    if acceptPercentage <= percentage < 100:
        print(country, '\t\t', candidates)
        corrections[country] = candidates[0][0]

countries = countries.map(lambda name: corrections.get(name, name))

# assign() builds a new frame instead of writing columns onto a slice.
df_single = df_single.assign(
    Country=countries,
    Rate_case=df_single['Population'] / df_single['TotalCases'],
)

# Left merge keeps every map polygon, even without matching statistics.
world_country = world.merge(
    df_single,
    how='left',
    left_on='name',
    right_on='Country'
)

In [None]:
# Choropleth of total cases per country; polygons without data stay unfilled.
blues_cmap = sns.color_palette("Blues", as_cmap=True)
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
world_country.plot(column='TotalCases', ax=ax, legend=True, cmap=blues_cmap, edgecolors='grey')
plt.title("Total cases over countries")
plt.show()

# Time-series EDA

## Total cases over dates of Asia

In [None]:
# Countries shown in the Asia time-series plots.
# NOTE: 'Philippines' was previously misspelled 'Philipines', which can only
# match if the dataset carries the same misspelling; otherwise the country
# is silently left out of the plots.
countryAsia = [
    'Vietnam',
    'Timor-Leste',
    'Thailand',
    'Singapore',
    'Philippines',
    'Myanmar',
    'Malaysia',
    'Laos',
    'Indonesia',
    'Cambodia',
    'Brunei',
]
# Series.isin replaces np.in1d, which NumPy has deprecated.
df_asia = df[df['Country,Other'].isin(countryAsia)]

plt.figure(figsize=(16,8))
sns.lineplot(
    data=df_asia,
    x='Date',
    y='TotalCases',
    hue='Country,Other'
)
plt.title('Total cases over dates of Asia')
plt.show()

## Active cases over dates of Asia

In [None]:
# One line per selected country: active cases over time.
plt.figure(figsize=(16, 8))
active_cases_axes = sns.lineplot(data=df_asia, x='Date', y='ActiveCases', hue='Country,Other')
active_cases_axes.set_title('Active cases over dates of Asia')
plt.show()

## Group by continents

In [None]:
# Daily totals per continent. numeric_only=True keeps the sum to numeric
# columns, so the country-name strings are not fed to sum().
# NOTE: this rebinds df_continent to the per-continent, per-date totals.
df_continent = (
    df.groupby(by=['Continent','Date'])
    .sum(numeric_only=True)
    .reset_index()
)
df_continent.head()

## Total cases by continents

In [None]:
# Total cases over time, one line per continent.
plt.figure(figsize=(16, 8))
sns.lineplot(data=df_continent, x='Date', y='TotalCases', hue='Continent')
plt.show()

## Total recovered by continents

In [None]:
# Total recovered over time, one line per continent.
plt.figure(figsize=(16, 8))
sns.lineplot(data=df_continent, x='Date', y='TotalRecovered', hue='Continent')
plt.show()

## Total recovered of North America until middle of April

In [None]:
# Rows for North American countries strictly before 2022-04-15.
condition1 = df['Continent'].eq('North America')
condition2 = (df['Date'] < '2022-04-15')
df_north_america = df.loc[condition1 & condition2]

# Total recovered over time, one line per country; the legend is laid out
# in two columns.
plt.figure(figsize=(16, 8))
sns.lineplot(data=df_north_america, x='Date', y='TotalRecovered', hue='Country,Other')
plt.legend(ncol=2)
plt.show()

## New cases of USA until middle of April

In [None]:
# Rows for the USA strictly before 2022-04-15.
condition1 = df['Country,Other'].eq('USA')
condition2 = (df['Date'] < '2022-04-15')
df_usa = df.loc[condition1 & condition2]

# Daily new cases over time.
plt.figure(figsize=(16, 8))
sns.lineplot(data=df_usa, x='Date', y='NewCases')
plt.show()