In [1]:
#from git repo
from bs4 import BeautifulSoup
import requests, os
import pandas as pd
from datetime import datetime
from time import sleep
import time

# #date object and string from it to use as files name for order
# #Set time to Nigeria time
# os.environ["TZ"] = "Africa/Lagos"
# time.tzset()


def d_time():
    dateTimeObj = datetime.now()
    timestampStr = dateTimeObj.strftime("%d-%b-%Y %H:%M:%S")

    return timestampStr

# define our clear function
def clear():
    """
    For a neat console, clear console depending on the os.
    """
    # for windows
    if os.name == 'nt':
        _ = os.system('cls')

    # for mac and linux(here, os.name is 'posix')
    else:
        _ = os.system('clear')

def covidHelp():
    """
    Scrape data from covid.help website
    """
    #site address
    url = 'https://corona.help/#countries-nav'

    print("Reading the web page ...")
    #get the page with requests
    r = requests.get(url)

    #make into a bs4 four item to lend itself to html ta searching
    soup = BeautifulSoup(r.content, "lxml")

    #From eyeballing, 'td' tags hold our data of interest .find all td tags.
    tables = soup.find_all('td')

    #Get text of each line of 'tables' and strip whitespaces
    our_list = [line.text.strip() for line in tables]

    #From observation, each country, including its name, has 5 consecutive rows
    #Make arrays to match this order
    countries = our_list[::5] #step five places through lines to get countries.
    infected = our_list[1::5]
    deaths = our_list[2::5]
    recovered = our_list[3::5]
    active = our_list[4::5]

    #Convert our array of numbers to integer and add total
    infected = [int(line) for line in infected]
    infected.append(sum(infected))

    deaths = [int(line) for line in deaths]
    deaths.append(sum(deaths))

    recovered = [int(line) for line in recovered]
    recovered.append(sum(recovered))

    active = [int(line) for line in active]
    active.append(sum(active))

    #Add an extra line/row 'Total' to countries array to match len of other arrays in anticipation of a dataframe
    countries.append("Total")

    clear()
    print("Data retrieved, cleaned.\nMaking into a dataframe ...")
    sleep(2)
    #Make empty df and snap in the data
    df = pd.DataFrame()
    df['countries'] = countries
    df['infected'] = infected
    df['deaths'] = deaths
    df['recovered'] = recovered
    df['active'] = active

    clear()
    print("Calculating recovery and death rates...")
    sleep(3)
    #Rating calculator
    #Get rate of each event(recovery/death) relative to total occurences
    #Round to a single decimal place
    rating = lambda x,y: round((x / y) * 100, 1)

    #Add rates to df with rating ;lambda
    try:
        df['death_rate'] = rating(df['deaths'], df['infected'])
        df['recovery_rate'] = rating(df['recovered'], df['infected'])
    except:
        infected_no = df['infected'].tolist()
        death_no = df['deaths'].tolist()
        recovs = df['recovered'].tolist()

        death_rate,  recovery_rate = [], []
        for i_no, det, rec in zip(infected_no, death_no, recovs):
            death_rate.append(rating(float(det), i_no))
            recovery_rate.append(rating(float(rec), i_no))

        df['death_rate'] = death_rate
        df['recovery_rate'] = recovery_rate


    clear()
    print("We will be writing the data to a folder called 'Data'.\nLet's see if folder exists...")
    sleep(2)
    #Check if data foler exists, if not create one
    # if not os.path.exists('my_folder'):
    #     os.makedirs('my_folder')
    try:
        os.makedirs('data')
        clear()
        print("We just made a 'Data' folder as it didn't exist before")
        #Save to data folder.
        try:
            timestampStr = d_time()
            df.to_csv('/home/fesh/data/' + 'covid_help_data_' + timestampStr + '.csv', index=False)
            clear()
        except:
            timestampStr = d_time()
            df.to_csv('/home/fesh/data/' + 'covid_help_data_' + timestampStr + '.csv', encoding = 'utf-8', index=False)
            clear()
        print("Data written to data folder.")
        sleep(2)
        clear()
        timestampStr = d_time()
        print("\nThe current covid-19 global death rate according to this data is ~{}%".format(df.iloc[-1]['death_rate']))
    except:
        #Save to data folder.
        clear()
        print("The folder exists!")
        sleep(2)
        timestampStr = d_time()
        df.to_csv('/home/fesh/data/' + 'covid_help_data_' + timestampStr + '.csv', encoding = 'utf-8', index=False)
        clear()
        print("Data written to data folder.")
        sleep(2)
        clear()
        print("\nThe current covid-19 global death rate according to this data is ~{}%".format(df.iloc[-1]['death_rate']))

    return countries #for use in the collecting country by country

def oya(line, prefix = 'https://corona.help/country/'):
    url = prefix + line

    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    target = soup.find_all('div', {'class':'col-xl-2 col-md-4 col-sm-6'})

    count = []

    for item in target:
        hold = item.text.strip().replace('\n', '|')
        x = hold.split('|')
        count.append(x[0])
    count.insert(0, line)

    df = pd.DataFrame()

    df['Country'] = [count[0]]
    df['Total confirmed cases'] = [count[1]]
    df['Total deaths'] = [count[2]]
    df['Confirmed recoveries'] = [count[3]]
    df['Cases confirmed today'] = [count[4]]
    df['Deaths today'] = [count[5]]
    df['Recoveries confirmed today'] = [count[6]]
    return df

# clear()
print("Commencing covid-19 global data from covid.help...")
sleep(2)
countries = covidHelp()
clear()
print("Done!")

#All countries
countries_x = []
for line in countries:
    line = line.lower()
    x = line.split()
    if len(x) == 1:
        countries_x.append(line)
    elif len(x) == 2:
        countries_x.append(x[0].replace('.', '') + '-' + x[1].replace('(', '').replace(')', ''))
    elif len(x) == 3:
        countries_x.append(x[0].replace('.', '') + '-' + x[1] + '-' + x[2])

countries_x = countries_x[:-1]

#Execuation for all countries table
chai = []

for line in countries_x:
    try:
        df = oya(line)
        chai.append(df)
    except:
        try:
            print("Failed at {}\nRetrying...".format(line.encode('utf-8')))
            sleep(2)
            df = oya(line)
            chai.append(df)
        except:
            print("Gave up after 2 tries, so ...")
    print("done.")
    sleep(2)
    clear()

combo = pd.concat(chai).reset_index(drop=True)
timestampStr = d_time()
combo.to_csv('/home/fesh/data/' + 'worldwide_' + timestampStr + '.csv', encoding = 'utf-8', index=False)

Commencing covid-19 global data from covid.help...
Reading the web page ...


ValueError: invalid literal for int() with base 10: '368,441'