## Imports

In [33]:
import os
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from datetime import datetime

#turn off annoying warnigns
pd.options.mode.chained_assignment = None  # default='warn'

## Function 1: Data Collection

In [34]:
url = "https://www.worldometers.info/coronavirus/"
htmlContent = requests.get(url).text

soup = BeautifulSoup(htmlContent, 'html.parser')
covidTable = soup.find("table",attrs={"id": "main_table_countries_today"})

## Function 2: Main Data Extract

In [35]:
# extract the table headings from the soup
head = covidTable.thead.find_all("tr")
headings = []

for th in head[0].find_all("th"):
    headings.append(th.text.replace("\n", "").strip())

# extract actual data from the soup
body = covidTable.tbody.find_all("tr")
data = []

# iterate through every row in the html
for r in range(1,len(body)):
    row = []
    # find all column entries in that particular row
    for tr in body[r].find_all("td"):
        row.append(tr.text.replace("\n","").strip())
    len(row)
    data.append(row)

df = pd.DataFrame(data,columns = headings)
df.head(20)

######################################################################
#                         CLEAN DATA TABLE                           #
######################################################################

# define nan object
NaN = np.nan

# recover appropriate columns
df = df.filter(['Continent', 'Country,Other','TotalCases','TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths']) 

# strip artefacts
df = df.replace(',','', regex=True)
df = df.replace('\+','', regex=True)
df = df.replace('', NaN, regex=True)

# convert appropriate columns to integer type
df[['TotalCases', 'TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths']] = df[['TotalCases', 'TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths']].apply(pd.to_numeric, downcast = 'float', errors = 'coerce')
df.head(20)

Unnamed: 0,Continent,"Country,Other",TotalCases,TotalRecovered,"Serious,Critical",ActiveCases,TotalDeaths
0,Asia,Asia,55146240.0,52532612.0,26321.0,1833540.0,780090.0
1,South America,South America,32359664.0,29279598.0,31062.0,2090698.0,989368.0
2,Europe,Europe,47700040.0,45208400.0,7686.0,1394132.0,1097509.0
3,Africa,Africa,5398550.0,4737478.0,4491.0,520505.0,140567.0
4,Australia/Oceania,Oceania,72891.0,68497.0,10.0,3121.0,1273.0
5,,,721.0,706.0,0.0,0.0,15.0
6,All,World,181159472.0,165746400.0,80549.0,11488667.0,3924404.0
7,North America,USA,34481764.0,28896472.0,3875.0,4966157.0,619135.0
8,Asia,India,30182468.0,29185560.0,8944.0,602386.0,394524.0
9,South America,Brazil,18322760.0,16548159.0,8318.0,1263329.0,511272.0


## Function 3: Extract Tables

In [36]:
############# world
world_df = df.loc[df['Country,Other'] == "World"]

# record worldwide cases
worldwideCases = world_df.iloc[0]['TotalCases']

# select our data for plotting:
world_df = world_df.filter(['TotalCases','TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths'])

# this is the correct way to add a new column from existing columns!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
world_df = world_df.assign(NonCritical_Active = float(world_df['ActiveCases']) - float(world_df['Serious,Critical']))
world_df.rename(columns = {'NonCritical_Active':'Non-Critical, Active', 'Serious,Critical':'Critical, Active'}, inplace = True)

# add in percentage columns
world_df = world_df.assign(TotalRecoveredPerc = (int(world_df['TotalRecovered']) / int(world_df['TotalCases'])) * 100)
world_df = world_df.assign(CritPerc = (int(world_df['Critical, Active']) / int(world_df['TotalCases'])) * 100)
world_df = world_df.assign(DeathPerc = (int(world_df['TotalDeaths']) / int(world_df['TotalCases'])) * 100)
world_df = world_df.assign(NonCritPerc = (int(world_df['Non-Critical, Active']) / int(world_df['TotalCases'])) * 100)

# drop redundant cols
world_df = world_df.drop(columns = ['ActiveCases']).reset_index(drop=True)

############ continent 
continents_df = df.iloc[0:5]
continents_df.drop(columns=['Country,Other'], inplace=True)

na = df.loc[df['Continent'] == "North America"]
del na["Continent"]

na = na.replace('','0', regex=True)

# create a row of data to append to the continents for North America
row = []
row.append("North America")

for column in na.columns[1:]:
    na[column] = pd.to_numeric(na[column], errors='coerce')
    row.append(round(na[column].sum()))
row = [row]

continents_df = continents_df.append(pd.DataFrame( row, columns=continents_df.columns), ignore_index = True)
continents_df = continents_df.sort_values(by=['TotalCases'], ascending=False)

############ country
countries_df = df.loc[7:len(df)]
countries_df = countries_df.reset_index()
countries_df = countries_df.drop(columns = ['index', 'Continent'])

# sort the data by total cases, as we did with continents:
countries_df = countries_df.sort_values(by=['TotalCases'],ascending=False)



## Function 4: Create CSVs

In [42]:
# create a folder for the csv data extracts if there isn't aready one using the os module

path = os.getcwd()
directory = 'csv_extracts'

if os.path.isdir(directory):
    print("Extracts directory found at '%s'" % directory)
    path = os.path.join(path, directory)
else:
    print('Extracts directory not found, creating csv extracts directory and files...')
    path = os.path.join(path, directory)
    os.mkdir(path)
    if os.path.isdir(directory):
        print("Directory '% s' successfully created" % directory)
    else:
        print("Could not create directory '% s'" % directory)
        
# create csv files from extracts

# dd/mm/YY H:M:S
dt_string = now.strftime("%d%m%Y")

# world df
csv_name = path + "/world_csv_" + dt_string + ".csv"
print(csv_name)
world_df.to_csv(csv_name)

# continents df
csv_name = path + "/continent_csv_" + dt_string + ".csv"
countries_df.to_csv(csv_name)

# countries df
csv_name = path + "/country_csv_" + dt_string + ".csv"
world_df.to_csv(csv_name)

Extracts directory not found, creating csv extracts directory and files...
Directory 'csv_extracts' successfully created
c:\Users\marja\Documents\Repositories\Personal Code\Python-Projects-and-Challenges\Python Covid Tracker\covid-task-scheduler\csv_extracts/world_csv_25062021.csv
