Step 0: Importing Relevant Packages (0%)

In [1]:
# Web Scraping Libraries
from bs4 import BeautifulSoup  # for scraping
import requests  # for sending HTTP requests
# Data Processing Libraries
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Step 1: Crawl a Real World Data Set (2%)

In [3]:
# Download the Worldometers coronavirus page and dump the cells of its first HTML table into Live_Covid_Data.csv
url = "https://www.worldometers.info/coronavirus/#main_table"  # Web address of data

# A timeout stops the cell from hanging forever if the site does not answer
response = requests.get(url, timeout=30)  # Performing a get request for the data at the url location
response.raise_for_status()  # Fail loudly on an HTTP error instead of parsing an error page as if it were data

# Naming the parser explicitly silences bs4's "no parser was explicitly specified" warning and
# stops the result depending on which optional parser libraries happen to be installed
soup_object = BeautifulSoup(response.content, "html.parser")
covid_table = soup_object.find("table")  # Searching the soup_object for the first table

# Check before opening the CSV for writing, so a failed scrape does not truncate an existing file
if covid_table is None:
    raise ValueError(f"No <table> element found at {url}")

# newline="" lets the csv module control line endings itself (cell text may contain "\n")
with open("Live_Covid_Data.csv", "w", newline="") as csv_file:  # Opening a CSV file to be written into
    writer = csv.writer(csv_file)  # Defining the writer as a csv writer that writes to the opened file

    for row in covid_table.find_all("tr"):  # Iterate through the rows of the table, "tr" points to a row in the table
        cells = row.find_all("td")  # Get the cells in the row, "td" points to a cell in the row

        if cells:  # Rows that contain no "td" cells give an empty list and are skipped
            values = [cell.text for cell in cells]  # Values is a list of values for each cell in the row of cells
            writer.writerow(values)  # Write the values to the CSV


Step 2: Data Preparation and Cleaning (5%)

In [104]:
# Load the scraped CSV into a DataFrame, tidy the continent names, index by Country/Continent
# and move Population to the front.

# Creating list of column names
col_names = ["Country", "Total Cases", "New Cases", "Total Deaths", "New Deaths", "Total Recovered", "New Recovered", "Active Cases", "Critical Cases",
             "Total Cases / 1M People", "Total Deaths / 1M People", "Total Tests", "Total Tests / 1M People", "Population", "Continent", "One Case / n People",
             "One Death / n People", "One Test / n People", "New Cases / 1M People", "New Deaths / 1M People", "Active Cases / 1M People"]

# Reading the csv to a dataframe with the column names specified, encoding with "latin-1" to get around "utf-8" decoding issue.
# NOTE(review): "latin-1" never raises, but it will garble any non-ASCII names if the file was actually written
# as UTF-8 - confirm the encoding the CSV is written with.
df = pd.read_csv("Live_Covid_Data.csv", names=col_names, header=None, encoding="latin-1")

# Tidying up the names of the continent by removing the "\n"'s on it, i.e. "\nNorth America\n"
N_CONTINENT_ROWS = 7  # Number of leading rows treated as continent rows
old_continent_list = df.iloc[0:N_CONTINENT_ROWS]["Country"].tolist()  # Locating the continent names and putting them into a list
# A name made up only of "\n\n" is labelled "Antarctica"; every other name just has its "\n"'s removed
new_continent_list = [
    "Antarctica" if old_continent == "\n\n" else old_continent.replace("\n", "")
    for old_continent in old_continent_list
]

# Changing the index to the name of the Country/Continent, then swapping the raw continent labels for the
# re-formatted ones. set_value has been deprecated, and before the index was set to Country the continent rows
# had an index of NaN, so at[row, col] couldn't be used to assign values either. Therefore the re-formatted
# names have to be applied after set_index, which a single rename with an {old: new} mapping does in one pass.
continent_renames = dict(zip(old_continent_list, new_continent_list))
df = df.set_index("Country").rename(index=continent_renames)

# Moving country population to first column as it is important for comparison to total cases etc
country_pop_col = df.pop("Population")
df.insert(0, "Population", country_pop_col)

# TODO: Remove the "Total:" rows from the DF as well as the cruise ships "MS Zaandam" and "Diamond Princess"
# (not implemented yet - the "Total:" rows are still present in the frame displayed below)


# TODO: Calculate the population of each of the continent groups based on the countries assigned to that group
# NA_pop = df.loc[df["Continent"] == "North America", "Population"].sum()
# print(NA_pop)

df

Unnamed: 0_level_0,Population,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,Critical Cases,Total Cases / 1M People,Total Deaths / 1M People,Total Tests,Total Tests / 1M People,Continent,One Case / n People,One Death / n People,One Test / n People,New Cases / 1M People,New Deaths / 1M People,Active Cases / 1M People
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
North America,,121262081,+5968,1578160,28.0,116064929,+5895,3618992,9463,,,,,North America,\n,,,,,
Asia,,206595856,+323654,1511157,635.0,193360313,+100774,11724386,11331,,,,,Asia,\n,,,,,
Europe,,242719800,+15247,1980119,110.0,236784979,+66776,3954702,7569,,,,,Europe,\n,,,,,
South America,,66758003,+3145,1341406,1.0,64253133,+23085,1163464,10341,,,,,South America,\n,,,,,
Oceania,,13582848,+2758,23298,24.0,13232703,+20,326847,161,,,,,Australia/Oceania,\n,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Total:,,66758003,+3145,1341406,1.0,64253133,+23085,1163464,10341,,,,,South America,,,,,,
Total:,,13582848,+2758,23298,24.0,13232703,+20,326847,161,,,,,Australia/Oceania,,,,,,
Total:,,12739722,,258339,,12037414,,443969,843,,,,,Africa,,,,,,
Total:,,721,,15,,706,,0,0,,,,,,,,,,,


Step 3: Exploratory Data Analysis (8%)

Step 4a: Question 1 (10%)

Step 4b: Question 2 (10%)

Step 4c: Question 3 (10%)

Step 5: Summary and Conclusion (5%)

TODO: Write the summary and conclusion here.