Step 0: Importing Relevant Packages (0%)

In [32]:
# Web Scraping Libraries
from bs4 import BeautifulSoup  # for scraping
import requests  # for sending HTTP requests
# Data Processing Libraries
import csv
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Step 1: Crawl a Real World Data Set (2%)

In [3]:
url = "https://www.worldometers.info/coronavirus/#main_table"  # Web address of data
response = requests.get(url)  # Performing a get request for the data at the url location
soup_object = BeautifulSoup(response.content)  # Parsing the request response to create a soup_object that can be searched for a table
covid_table = soup_object.find("table")  # Searching the soup_object for a table

with open("Live_Covid_Data.csv", "w", newline="") as csv_file:  # Opening a CSV file to be written into
    writer = csv.writer(csv_file)  # Defining the writer as a csv writer that writes to the opened file

    for row in covid_table.find_all("tr"):  # Iterate through the rows of the table, "tr" points to a row in the table
        cells = row.find_all("td")  # Get the cells in the row, "td" points to a cell in the row

        if cells:  # If there are cells...
            values = [cell.text for cell in cells]  # Values is a list of values for each cell in the row of cells
            writer.writerow(values)  # Write the values to the CSV


Step 2: Data Preparation and Cleaning (5%)

In [105]:
# Creating list of column names
col_names = ["Country", "Total Cases", "New Cases", "Total Deaths", "New Deaths", "Total Recovered", "New Recovered", "Active Cases", "Critical Cases",
             "Total Cases / 1M People", "Total Deaths / 1M People", "Total Tests", "Total Tests / 1M People", "Population", "Continent", "One Case / n People",
             "One Death / n People", "One Test / n People", "New Cases / 1M People", "New Deaths / 1M People", "Active Cases / 1M People"]

# Reading the csv to a dataframe with the column names specified, encoding with "latin-1" to get around "utf-8" decoding issue.
df = pd.read_csv("Live_Covid_Data.csv", names=col_names, header=None, encoding="latin-1")

# Tidying up the names of the continent by removing the "\n"'s on it, i.e. "\nNorth America\n"
old_continent_list = df.iloc[0:7]["Country"].tolist()  # Locating the continent names and putting them into a list
new_continent_list = []  # Defining a list for the formatted continent names

for old_continent in old_continent_list:  # Iterate through current continent names
     new_continent = "Antarctica" if old_continent == "\n\n" else old_continent.replace("\n", "")  # Removing \n from continent names/identifying Antarctica
     if new_continent == "Oceania":  # In "Continent" column countries are assigned Australia/Oceania not just Oceania
         new_continent = "Australia/Oceania"  # Renaming continent
     new_continent_list.append(new_continent)  # Addding the renamed continenet to the new list

# Changing the index to the name of the Country/Continent
df.set_index("Country", inplace=True)  # inplace=True updates the original datafram immediately

# Changing the index name to the re-formatted continents. set_value has been depricated, also before setting the index as Country the continent rows had
# an index of NaN so at[row, col] couldn't be used to assign values either. Therefore, application of the reformatted names had to after set_index was done.
n = 0
for new_continent in new_continent_list:  # Iterate through the new continent names
    df.rename(index={old_continent_list[n]: new_continent}, inplace=True)  # Replace the old continent name with corresponding new continent name
    n += 1

# Moving country population to first column as it is important for comparison to total cases etc
country_pop_col = df.pop("Population")  # Removes the column from DF while saving it to a variable
df.insert(0, "Population", country_pop_col)  # Replacing the column in first position using previously assigned variable

# Removing non-ccountries/non-territories from the DF
drop_list = ["Diamond Princess", "MS Zaandam", "Total:"]  # List of unwanted non-countries/non-territories
for item in drop_list:  # Iterate through list
    df.drop(index=item, axis=0, inplace=True)  # Permanently delete the corresponding row in place

# Calculating the population of each continent and the world
global_population = 0  # Defining global population as 0
for continent in new_continent_list:  # iterate through the continents
    # If a row has continent "continent" and is not the row for the continent itself, population is added to the corresponding list
    population_list = df.loc[(df["Continent"] == continent) & (df.index != continent), "Population"].tolist()
    continent_population = 0  # Defining the continent's population as 0

    for population in population_list:  # Iterate through all populations in the list
        new_population = re.sub(",", "", population)  # Removing the commas from the strings seperating 1000's
        continent_population += int(new_population)  # Adding the population to the continent populatuon

    global_population += continent_population  # Adding the continent population to the global population
    df.at[continent, "Population"] = continent_population  # Replacing the continents NaN population with the calculated population
df.at["World", "Population"] = global_population  # Replacing the world's NaN population with the calculated population

# Dropping Antarctica
df.drop(index="Antarctica", axis=0, inplace=True)

# Remove commas from all columns and convert all columns to floats while avoiding non-numeric data
df = df.replace(",", "", regex=True)  # Removing commas from entire dataframe, regex=True automates searching and replacing
for column in df:  # Iterate through all columns in the dataframe
    try:  # Try the following experssion:
        df[column] = df[column].astype(float)  # Convert the column to floats
    except ValueError:  # If this error is caused, raise an exception for the Nan/string value
        pass  # Skip the conversion if the column contains non-numeric data

# Dropping columns with the format "New..." as "New" is ambiguous and not described on the website, they also contain a lot of NaN.
for column in df:
    if "New" in column:
        df.drop(columns=column, inplace=True)

# Handling missing values in the "Total Deaths" column
for country in df.index:
    print(country)
    if df.loc[country, "Total Deaths"] == "":
        df.at[country, "Total Deaths"] = np.nan

df

North America
Asia
Europe
South America
Australia/Oceania
Africa
World
USA
India
France
Germany
Brazil
Japan
S. Korea
Italy
UK
Russia
Turkey
Spain
Vietnam
Australia
Argentina
Taiwan
Netherlands
Iran
Mexico
Indonesia
Poland
Colombia
Austria
Portugal
Greece
Ukraine
Malaysia
Chile
DPRK
Israel
Thailand
Belgium
Czechia
Canada
Peru
Switzerland
Philippines
South Africa
Romania
Denmark
Sweden
Hong Kong
Iraq
Serbia
Singapore
Hungary
New Zealand
Bangladesh
Slovakia
Georgia
Jordan
Ireland
Pakistan
Norway
Finland
Kazakhstan
Slovenia
Bulgaria
Lithuania
Morocco
Croatia
Lebanon
Guatemala
Costa Rica
Bolivia
Tunisia
Cuba
UAE
Ecuador
Panama
Uruguay
Mongolia
Nepal
Belarus
Latvia
Saudi Arabia
Azerbaijan
Paraguay
Bahrain
Sri Lanka
Kuwait
Dominican Republic
Myanmar
Cyprus
Palestine
Estonia
Moldova
Venezuela
Egypt
Libya
Ethiopia
Qatar
Réunion
Honduras
Armenia
Bosnia and Herzegovina
Oman
North Macedonia
Kenya
Zambia
Albania
Botswana
Luxembourg
Montenegro
Algeria
Nigeria
Brunei 
Zimbabwe
Uzbekistan
Mozambique


Unnamed: 0_level_0,Population,Total Cases,Total Deaths,Total Recovered,Active Cases,Critical Cases,Total Cases / 1M People,Total Deaths / 1M People,Total Tests,Total Tests / 1M People,Continent,One Case / n People,One Death / n People,One Test / n People,Active Cases / 1M People
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
North America,5.981409e+08,121262081.0,1578160,116064929.0,3618992.0,9463.0,,,,,North America,\n,,,
Asia,4.711357e+09,206595856.0,1511157,193360313.0,11724386.0,11331.0,,,,,Asia,\n,,,
Europe,7.475438e+08,242719800.0,1980119,236784979.0,3954702.0,7569.0,,,,,Europe,\n,,,
South America,4.376944e+08,66758003.0,1341406,64253133.0,1163464.0,10341.0,,,,,South America,\n,,,
Australia/Oceania,4.346903e+07,13582848.0,23298,13232703.0,326847.0,161.0,,,,,Australia/Oceania,\n,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wallis and Futuna,1.098200e+04,761.0,7,438.0,316.0,,69295.0,637.0,20508.0,1867419.0,Australia/Oceania,14,1569.0,1.0,28774.0
Niue,1.622000e+03,458.0,,290.0,168.0,,282367.0,,,,Australia/Oceania,4,,,103576.0
Vatican City,7.990000e+02,29.0,,29.0,0.0,,36295.0,,,,Europe,28,,,
Western Sahara,6.261610e+05,10.0,1,9.0,0.0,,16.0,2.0,,,Africa,62616,626161.0,,


Step 3: Exploratory Data Analysis (8%)

Step 4a: Question 1 (10%)

Step 4b: Question 2 (10%)

Step 4c: Question 3 (10%)

Step 5: Summary and Conclusion (5%)

Write summary/conc in here