## Imports

In [1]:
import os
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from matplotlib import pyplot as plt
from datetime import datetime

#turn off annoying warnigns
pd.options.mode.chained_assignment = None  # default='warn'

## Function 1: Data Collection

In [2]:
url = "https://www.worldometers.info/coronavirus/"
htmlContent = requests.get(url).text

soup = BeautifulSoup(htmlContent, 'html.parser')
covidTable = soup.find("table",attrs={"id": "main_table_countries_today"})

## Function 2: Main Data Extract

In [3]:
# extract the table headings from the soup
head = covidTable.thead.find_all("tr")
headings = []

for th in head[0].find_all("th"):
    headings.append(th.text.replace("\n", "").strip())

# extract actual data from the soup
body = covidTable.tbody.find_all("tr")
data = []

# iterate through every row in the html
for r in range(1,len(body)):
    row = []
    # find all column entries in that particular row
    for tr in body[r].find_all("td"):
        row.append(tr.text.replace("\n","").strip())
    len(row)
    data.append(row)

df = pd.DataFrame(data,columns = headings)
df.head(20)

######################################################################
#                         CLEAN DATA TABLE                           #
######################################################################

# define nan object
NaN = np.nan

# recover appropriate columns
df = df.filter(['Continent', 'Country,Other','TotalCases','TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths']) 

# strip artefacts
df = df.replace(',','', regex=True)
df = df.replace('\+','', regex=True)
df = df.replace('', NaN, regex=True)

# convert appropriate columns to integer type
df[['TotalCases', 'TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths']] = df[['TotalCases', 'TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths']].apply(pd.to_numeric, downcast = 'float', errors = 'coerce')
df.head(20)

# add date of processing to all rows
now = datetime.now()
dt_string = now.strftime("%d%m%Y")
df['DateProcessed'] = dt_string
df

Unnamed: 0,Continent,"Country,Other",TotalCases,TotalRecovered,"Serious,Critical",ActiveCases,TotalDeaths,DateProcessed
0,Asia,Asia,56139468.0,53482132.0,26559.0,1860731.0,796603.0,03072021
1,South America,South America,33160028.0,30137216.0,29368.0,2010869.0,1011943.0,03072021
2,Europe,Europe,48134376.0,45512156.0,6601.0,1517479.0,1104742.0,03072021
3,Africa,Africa,5639668.0,4893587.0,4583.0,601274.0,144807.0,03072021
4,Australia/Oceania,Oceania,75390.0,68913.0,29.0,5193.0,1284.0,03072021
...,...,...,...,...,...,...,...,...
224,Australia/Oceania,Marshall Islands,4.0,4.0,,0.0,,03072021
225,Australia/Oceania,Samoa,3.0,3.0,,0.0,,03072021
226,Africa,Saint Helena,2.0,2.0,,0.0,,03072021
227,Australia/Oceania,Micronesia,1.0,1.0,,0.0,,03072021


## Function 3: Extract Tables

In [4]:
############# world
world_df = df.loc[df['Country,Other'] == "World"]

# record worldwide cases
worldwideCases = world_df.iloc[0]['TotalCases']

# select our data for plotting:
world_df = world_df.filter(['TotalCases','TotalRecovered','Serious,Critical','ActiveCases','TotalDeaths', 'DateProcessed'])

# this is the correct way to add a new column from existing columns!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
world_df = world_df.assign(NonCritical_Active = float(world_df['ActiveCases']) - float(world_df['Serious,Critical']))
world_df.rename(columns = {'NonCritical_Active':'Non-Critical, Active', 'Serious,Critical':'Critical, Active'}, inplace = True)

# add in percentage columns
world_df = world_df.assign(TotalRecoveredPerc = (int(world_df['TotalRecovered']) / int(world_df['TotalCases'])) * 100)
world_df = world_df.assign(CritPerc = (int(world_df['Critical, Active']) / int(world_df['TotalCases'])) * 100)
world_df = world_df.assign(DeathPerc = (int(world_df['TotalDeaths']) / int(world_df['TotalCases'])) * 100)
world_df = world_df.assign(NonCritPerc = (int(world_df['Non-Critical, Active']) / int(world_df['TotalCases'])) * 100)

# drop redundant cols
world_df = world_df.drop(columns = ['ActiveCases']).reset_index(drop=True)

############ continent 
continents_df = df.iloc[0:5]
continents_df.drop(columns=['Country,Other'], inplace=True)

na = df.loc[df['Continent'] == "North America"]
del na["Continent"]

na = na.replace('','0', regex=True)

# create a row of data to append to the continents for North America
row = []
row.append("North America")

for column in na.columns[1:]:
    na[column] = pd.to_numeric(na[column], errors='coerce')
    row.append(round(na[column].sum()))
row = [row]

continents_df = continents_df.append(pd.DataFrame( row, columns=continents_df.columns), ignore_index = True)
continents_df = continents_df.sort_values(by=['TotalCases'], ascending=False)

############ country
countries_df = df.loc[7:len(df)]
countries_df = countries_df.reset_index()
countries_df = countries_df.drop(columns = ['index', 'Continent'])

# sort the data by total cases, as we did with continents:
countries_df = countries_df.sort_values(by=['TotalCases'],ascending=False)


## Function 4: Create CSVs

In [5]:
# create a folder for the csv data extracts if there isn't aready one using the os module

path = os.getcwd()
directory = 'csv_extracts'

if os.path.isdir(directory):
    print("Extracts directory found at '%s'" % directory)
    path = os.path.join(path, directory)
    # for File in os.listdir('./csv_extracts'):
    #         os.remove(File)
    #         print("removed '%s'" % File)
else:
    print('Extracts directory not found, creating csv extracts directory and files...')
    path = os.path.join(path, directory)
    os.mkdir(path)
    if os.path.isdir(directory):
        print("Directory '% s' successfully created" % directory)
    else:
        print("Could not create directory '% s'" % directory)
        
# create csv files from extracts

# world df
csv_name = path + "/world_csv_" + dt_string + ".csv"
print(csv_name)
world_df.to_csv(csv_name)

# continents df
csv_name = path + "/continent_csv_" + dt_string + ".csv"
print(csv_name)
continents_df.to_csv(csv_name)

# countries df
csv_name = path + "/country_csv_" + dt_string + ".csv"
print(csv_name)
countries_df.to_csv(csv_name)

world_df

Extracts directory not found, creating csv extracts directory and files...
Directory 'csv_extracts' successfully created
/Users/markdunbar/Documents/Repositories/Python-Projects-and-Challenges/Python Covid Tracker/src/covid-task-scheduler/csv_extracts/world_csv_03072021.csv
/Users/markdunbar/Documents/Repositories/Python-Projects-and-Challenges/Python Covid Tracker/src/covid-task-scheduler/csv_extracts/continent_csv_03072021.csv
/Users/markdunbar/Documents/Repositories/Python-Projects-and-Challenges/Python Covid Tracker/src/covid-task-scheduler/csv_extracts/country_csv_03072021.csv


Unnamed: 0,TotalCases,TotalRecovered,"Critical, Active",TotalDeaths,DateProcessed,"Non-Critical, Active",TotalRecoveredPerc,CritPerc,DeathPerc,NonCritPerc
0,183825696.0,168259008.0,78064.0,3979135.0,3072021,11509491.0,91.531822,0.042466,2.164624,6.261089


In [6]:
engine = create_engine('sqlite:///../application/site.db', echo=True)
sqlite_connection = engine.connect()

2021-07-03 00:30:55,215 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-07-03 00:30:55,216 INFO sqlalchemy.engine.base.Engine ()
2021-07-03 00:30:55,218 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-07-03 00:30:55,222 INFO sqlalchemy.engine.base.Engine ()
