## Import the necessary libraries

In [13]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re

import pandas as pd
import numpy as np

from datetime import date

---

## Crawling data from web
The dataset used for the analysis is collected from [Worldometer](https://www.worldometers.info/coronavirus/).

- Get chrome driver to browse in `Chrome`

In [14]:
# The chromedriver location is whatever ChromeDriverManager().install() returns;
# it is passed to Selenium through a Service object rather than a hard-coded
# local executable path.
browser = webdriver.Chrome(
    service=Service(
        ChromeDriverManager().install()
    )
)

In [15]:
# Load the Worldometer page and snapshot its HTML. The browser is shut down in
# a `finally` block so a failed page load or parse does not leave a Chrome
# process running.
try:
    browser.get("https://www.worldometers.info/coronavirus/")
    # NOTE(review): an implicit wait configures element look-ups; no element
    # look-up happens in this cell, so it likely does not delay the
    # `page_source` read below -- confirm whether an explicit wait is needed.
    browser.implicitly_wait(20)
    browser.minimize_window()

    # Parse the page source so that later cells do not need the live browser.
    html_text = BeautifulSoup(browser.page_source, "html.parser")
finally:
    browser.quit()  # Close the Chrome browser, even if something above failed

In [16]:
def get_string(row):
    """Turn one scraped table row into a list of plain cell values.

    The list is modified in place and also returned.

    Steps:
      1. Drop the first element (the row id).
      2. If 17 elements remain, drop the third-from-last one (a redundant
         empty value); if 16 elements remain after that, drop the element at
         index 1 (the duplicated country name).
      3. Replace every remaining element by its ``.string`` value, then:
         * a single-space string becomes ``np.nan``;
         * a string that parses as an integer once its commas are removed
           becomes an ``int``;
         * anything else (other text, or ``None`` when ``.string`` is
           ``None``) is left unchanged.

    Parameters
    ----------
    row : list
        Mutable list of objects exposing a ``.string`` attribute (the caller
        in this notebook passes the result of ``find_all(['td', 'a'])``).

    Returns
    -------
    list
        The same ``row`` object, now holding ``int``, ``str``, ``None`` or
        ``np.nan`` values.

    Raises
    ------
    IndexError
        If ``row`` is empty (there is no id element to remove).
    """
    row.pop(0)  # remove id value
    if len(row) == 17:
        row.pop(-3)  # remove redundant empty value
    if len(row) == 16:
        row.pop(1)  # remove duplicated country name

    for i, cell in enumerate(row):
        text = cell.string  # pure get data
        if text == ' ':
            row[i] = np.nan
            continue

        row[i] = text
        if not isinstance(text, str):
            # e.g. None: nothing to convert, keep the value as it is.
            continue

        try:
            row[i] = int(text.replace(',', ''))  # astype numeric values
        except ValueError:
            # Not an integer (country name, "N/A", ...): keep the text.
            pass

    return row

In [17]:
# Select the <tr> elements whose style attribute is not exactly
# 'display: none' and keep the first 233 of them (hard-coded row count).
rows = html_text.select("tr:not([style='display: none'])")[:233]

# Header labels: every run of word characters, digits, commas, slashes or
# spaces found directly between '>' and '<' in the first row's HTML.
# A raw string is used because '\w' and '\d' are not valid escape sequences in
# a normal string literal.
columns = re.findall(r'>([\w\d,/ ]*)<', str(rows[0]))
# Work on the repr of the fragment list: an empty fragment between two others
# marks a column boundary ('|'), while directly adjacent fragments belong to
# the same header and are joined with a space.
columns = str(columns[2:-1])[2:-2].replace("', '', '",'|').replace("', '",' ').split('|')
print(columns, len(columns))

# Get the <td>/<a> elements of each row, except the first two rows, and
# convert every row to plain values.
rows = [get_string(line.find_all(['td', 'a'])) for line in rows[2:]]

['Country, Other', 'Total Cases', 'New Cases', 'Total Deaths', 'New Deaths', 'Total Recovered', 'New Recovered', 'Active Cases', 'Serious, Critical', '1M pop', 'Deaths/ 1M pop', 'Total Tests', 'Tests/ 1M pop', 'Population', 'Continent'] 15


In [18]:
# Build the table from the converted rows, then stamp every record with
# today's date.
df = pd.DataFrame(data=rows, columns=columns)
df['date'] = [date.today()] * len(df)

## Pre-processing

- Missing ratio of attributes

In [19]:
# Fraction of missing values in each column.
df.isna().mean()

Country, Other       0.000000
Total Cases          0.000000
New Cases            0.991342
Total Deaths         0.025974
New Deaths           0.995671
Total Recovered      0.008658
New Recovered        0.874459
Active Cases         0.000000
Serious, Critical    0.406926
1M pop               0.008658
Deaths/ 1M pop       0.034632
Total Tests          0.077922
Tests/ 1M pop        0.077922
Population           0.008658
Continent            0.008658
date                 0.000000
dtype: float64

`Comment:` Because the COVID situation is no longer serious, the attributes 'New Cases', 'New Deaths' and 'New Recovered' are not updated regularly (they have a high missing ratio). They will be removed.

In [20]:
# Remove the three "New ..." columns from the frame (in place), then show it.
df.drop(
    columns=['New Cases', 'New Deaths', 'New Recovered'],
    inplace=True,
)
df

Unnamed: 0,"Country, Other",Total Cases,Total Deaths,Total Recovered,Active Cases,"Serious, Critical",1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop,Population,Continent,date
0,USA,106475031,1158255.0,104340108,976668,1594,318021.0,3459.0,1.176056e+09,3512657.0,3.348053e+08,North America,2023-04-18
1,India,44827226,531141.0,44235772,60313,,31868.0,378.0,9.240577e+08,656929.0,1.406632e+09,Asia,2023-04-18
2,France,39903419,166024.0,39605680,131715,869,608427.0,2531.0,2.714902e+08,4139547.0,6.558452e+07,Europe,2023-04-18
3,Germany,38385526,172086.0,38162100,51340,,457605.0,2051.0,1.223324e+08,1458359.0,8.388360e+07,Europe,2023-04-18
4,Brazil,37358092,700811.0,36249161,408120,,173473.0,3254.0,6.377617e+07,296146.0,2.153536e+08,South America,2023-04-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,Diamond Princess,712,13.0,699,0,,,,,,,,2023-04-18
227,Vatican City,29,,29,0,,36295.0,,,,7.990000e+02,Europe,2023-04-18
228,Western Sahara,10,1.0,9,0,,16.0,2.0,,,6.261610e+05,Africa,2023-04-18
229,MS Zaandam,9,2.0,7,0,,,,,,,,2023-04-18


In [21]:
# Save the snapshot under a file name built from today's date
# (for example "2023-04-18.csv").
df.to_csv(path_or_buf=f"{date.today()}.csv")

In [22]:
from datetime import datetime

# Re-stamp a previously saved snapshot: load "<day>.csv", set its `date`
# column to that day, and write the file back under the same name.
day = datetime.strptime('2023-02-20','%Y-%m-%d').date()
print(day, type(day))

csv_name = str(day)+'.csv'
df = pd.read_csv(csv_name)
df = df.drop(columns=df.columns[0])  # drop the first column of the file
df['date'] = [day] * len(df)
print(df.head())
df.to_csv(csv_name)

2023-02-20 <class 'datetime.date'>
  Country, Other  Total Cases  Total Deaths  Total Recovered  Active Cases  \
0            USA    105665981     1149253.0      103319965.0     1196763.0   
1          India     44691956      530789.0       44156970.0        4197.0   
2         France     39667102      165240.0       39435308.0       66554.0   
3        Germany     38276190      169345.0       37906300.0      200545.0   
4         Brazil     37085520      699310.0       36249161.0      137049.0   

   Serious, Critical    1M pop  Deaths/ 1M pop   Total Tests  Tests/ 1M pop  \
0             2157.0  315604.0          3433.0  1.169268e+09      3492381.0   
1                NaN   31772.0           377.0  9.198164e+08       653914.0   
2              869.0  604824.0          2519.0  2.714902e+08      4139547.0   
3                NaN  456301.0          2019.0  1.223324e+08      1458359.0   
4                NaN  172208.0          3247.0  6.377617e+07       296146.0   

     Population      

In [23]:
# Load "<day>.csv" again and discard its first column.
df = pd.read_csv(str(day)+'.csv')
df = df.drop(columns=df.columns[0])

In [24]:
# Python type of the `date` value in the row labelled 0 after the CSV round trip.
type(df.loc[0, 'date'])

str