# Introduction

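What we want to achieve: scrape Wikipedia's list of old-growth forests, clean the scraped tables, and use them to answer a few questions about the data.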

## Checking if we can scrape

In [1]:
import urllib.robotparser

In [2]:
def can_scrape(url: str, user_agent: str = "*") -> bool:
    """Return whether the site's ``robots.txt`` lets *user_agent* fetch *url*.

    The robots file is looked up at the root of the site that serves *url*,
    so *url* may be a bare origin ("https://example.org") or any page below
    it.

    Args:
        url: Absolute URL of the page (or site root) to be scraped.
        user_agent: User-agent name to check the rules for.

    Returns:
        The result of ``RobotFileParser.can_fetch`` for *user_agent* and
        *url*.
    """
    from urllib.parse import urljoin

    # create an instance of robot parser
    rp = urllib.robotparser.RobotFileParser()

    # The leading "/" makes urljoin discard any path or query in *url*, so
    # the robots file is always requested from the site root instead of from
    # "<page>/robots.txt".
    rp.set_url(urljoin(url, "/robots.txt"))
    rp.read()  # performs the network request and parses the file

    # check if scraping is allowed for the given user agent
    return rp.can_fetch(user_agent, url)

In [None]:
# Site to check and the user agent to check it for ("*" is also the default
# of can_scrape).
# NOTE(review): this checks www.wikipedia.org, while the page downloaded
# later in the notebook is served from en.wikipedia.org.
website_url = "https://www.wikipedia.org"
user_agent = "*"

# Last expression of the cell, so the notebook displays the returned bool.
can_scrape(website_url, user_agent)

Importing libraries

In [4]:
import requests
from bs4 import BeautifulSoup
import re

Define a helper that downloads a page's HTML and saves it to disk, then call it on the Wikipedia URL.

In [5]:
def get_html(url, path, timeout=30):
    """Download *url* and save the response body to *path* as UTF-8 text.

    Args:
        url: Address of the page to download.
        path: Destination file. Missing parent directories are created.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with an error status. This
            prevents an error page from being saved as if it were the
            requested document.
    """
    import os

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # open() does not create directories, so make sure the target one exists.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        f.write(response.text)

# Page to scrape.
url = "https://en.wikipedia.org/wiki/List_of_old-growth_forests"

# Save the page locally; the next cell reads this same file back.
get_html(url, path="HTML/wiki_old_forrest.html")

Create a `BeautifulSoup` object and extract all tables whose class is `wikitable sortable`.

In [6]:
# Read back the HTML file written by the get_html call above.
with open("HTML/wiki_old_forrest.html", encoding="utf-8") as f:
    html = f.read()

# Parse the page and collect every <table> whose class attribute is
# "wikitable sortable".
soup = BeautifulSoup(html, features="html.parser")
tables = soup.find_all("table", class_="wikitable sortable")

Name each table after the nearest preceding `h2`/`h3` heading in the HTML.

In [None]:
# Map heading text -> table tag.
data = {}
for table in tables:
    # The closest <h2>/<h3> that appears before the table is used as its name.
    headings = table.find_previous(["h2", "h3"]).text
    print(headings)
    # NOTE(review): tables that share the same preceding heading overwrite
    # each other here, so only the last one is kept.
    data[headings] = table

Extract the column headers, using the Australia table as the template.

In [None]:
# The first <tr> of the Australia table supplies the column names that are
# reused for every table.
table = data["Australia"]
first_row = table.tr

columns = []
# Iterate over the children of the header row (named `td` here, whatever
# their actual tag is) and keep the stripped text of every non-blank one.
for td in first_row:
    if td.text.strip() != "":
        columns.append(td.text.strip())

# Last expression of the cell: displays the collected column names.
columns

One row of the United States table is missing a `<td>`, so in order for everything to work we pad short rows with empty cells.

In [9]:
def extract_row_data(columns, row):
    """Map each column name to the matching ``<td>`` tag of a table row.

    The cell tags themselves are returned, not their text, because
    ``clean_row_data`` and the analysis cells call ``.text``, ``.find_all``,
    ``.parent`` and ``.a`` on the values.

    Args:
        columns: Column names, in table order.
        row: The ``<tr>`` tag to read. Rows with fewer ``<td>`` cells than
            *columns* are padded in place with empty ``<td>`` tags.

    Returns:
        A dict with one entry per column name. Cells beyond the last column
        are ignored.
    """
    table_cells = row.find_all("td")

    # Add missing <td> cells if the number of columns is greater than
    # table_cells.
    missing = len(columns) - len(table_cells)
    if missing > 0:
        # An empty document is used purely as a tag factory, so the function
        # does not depend on a notebook-global soup object.
        factory = BeautifulSoup("", "html.parser")
        for _ in range(missing):
            empty_td = factory.new_tag("td")
            row.append(empty_td)          # keep the parsed tree consistent
            table_cells.append(empty_td)  # and the local cell list aligned

    # zip stops at the shorter sequence, which is `columns` after padding.
    return dict(zip(columns, table_cells))

In [10]:
def clean_row_data(row: dict):
    """Tidy the cells of one extracted table row in place.

    For every cell tag in *row*:

    * a cell with no visible text is replaced by the string "No data";
    * links whose href contains "cite" are removed together with their
      wrapping element;
    * links whose title contains "(page does not exist)" are replaced by
      their plain text;
    * the "Old-growth extent" cell is replaced by a fresh ``<td>`` holding
      only the first "<number> <unit>" found in it, or by "No data" when no
      such value is present.

    Args:
        row: Mapping of column name to the row's ``<td>`` tag. Values that
            are already strings (e.g. "No data") are left untouched, so the
            function can safely be applied twice.

    Returns:
        The same dict; its values are now either tags or "No data".
    """
    # 2,000, 7,800,000
    extent_pattern = (
        r"\d+(?:,\d{3})*(?:\.\d*)? (?:hectares|square kilometres|ha|acres)"
    )

    for key in row:
        cell = row[key]

        if isinstance(cell, str):
            continue

        # strip() also covers completely empty cells, such as padding cells
        # added for short rows, which a leading-whitespace regex would miss.
        if not cell.text.strip():
            row[key] = "No data"
            continue

        for link in cell.find_all("a"):
            # Skip links that are no longer attached, e.g. because they were
            # removed together with an earlier footnote wrapper.
            if link.parent is None:
                continue

            href = link.get("href") or ""
            title = link.get("title") or ""

            if "cite" in href:
                # Remove the element wrapping the footnote link, but never
                # the cell itself when the link is its direct child.
                wrapper = link.parent
                (link if wrapper is cell else wrapper).decompose()
            elif "(page does not exist)" in title:
                link.replace_with(link.text)

        if key != "Old-growth extent":
            continue

        text = cell.text.strip().replace("\xa0", " ")
        match = re.search(extent_pattern, text)
        if match is None:
            # Nothing usable as an area: treat it like a missing value so the
            # analysis cells can filter it out with `!= "No data"`.
            row[key] = "No data"
            continue

        new_tag = BeautifulSoup("", "html.parser").new_tag("td")
        new_tag.string = match.group()

        # Swap the cell in place so the column order of the row is kept.
        if cell.parent is not None:
            cell.replace_with(new_tag)

        row[key] = new_tag

    return row

In [14]:
def prepare_table_data(columns, table):
    """Turn one table into a list of cleaned row dicts.

    Every ``<tr>`` of *table* except the first is passed through
    ``extract_row_data`` and then ``clean_row_data``.

    Args:
        columns: Column names handed to ``extract_row_data``.
        table: Object whose ``find_all("tr")`` returns the table rows.

    Returns:
        A list with one cleaned row dict per remaining ``<tr>``, in order.
    """
    body_rows = table.find_all("tr")
    del body_rows[0]  # the first row is dropped before extraction

    return [
        clean_row_data(extract_row_data(columns=columns, row=tr))
        for tr in body_rows
    ]

In [12]:
def prepare_all_tables(columns, data):
    """Replace every table in *data* with its list of cleaned rows.

    The dict is modified in place: each value is overwritten with the result
    of ``prepare_table_data`` for that table.

    Args:
        columns: Column names shared by all tables.
        data: Mapping of table name to table tag.

    Returns:
        The same *data* object that was passed in.
    """
    for name, table in data.items():
        # Only existing keys are reassigned, so iterating items() is safe.
        data[name] = prepare_table_data(columns, table)

    return data

In [15]:
# prepare_all_tables rewrites `data` in place and returns it, so
# `cleaned_data` and `data` are the same dict after this line.
cleaned_data = prepare_all_tables(columns, data)

In [None]:
# Debugging aid: all <tr> tags of the United States table.
# NOTE(review): only works while data["United States"] is still a table tag,
# i.e. before prepare_all_tables has replaced it with a list of rows.
rows = data["United States"].find_all("tr")

In [None]:
# Debugging aid: fetch the rows again and drop the first one, mirroring what
# prepare_table_data does.
rows = data["United States"].find_all("tr")
rows.pop(0)
# Displays the <td> cells of the row at index 167.
rows[167].find_all("td")

In [None]:
# Displays the <td> cells of the row at index 165, for comparison.
rows[165].find_all("td")

In [None]:
# Debugging aid: column name -> stripped cell text for the row at index 165.
row_data = {}
table_cells = rows[165].find_all("td")
for i, cell in enumerate(table_cells):
    row_data[columns[i]] = cell.text.strip()

row_data

In [None]:
# Debugging aid: column name -> stripped cell text for the row at index 167.
row_data = {}
table_cells = rows[167].find_all("td")
for i, cell in enumerate(table_cells):
    row_data[columns[i]] = cell.text.strip()

row_data

In [None]:
# Debugging aid: run the extract + clean pipeline by hand on the United
# States table (same steps as prepare_table_data).
# NOTE(review): needs data["United States"] to still be a table tag.
rows = data["United States"].find_all("tr")
rows.pop(0)

table_data = []


for row in rows:
    r = extract_row_data(columns, row)
    rr = clean_row_data(r)
    table_data.append(rr)

# Last expression of the cell: displays the cleaned rows.
table_data

# Data Analytics

In [None]:
# Spot check: text of the cleaned extent cell of the Australia row at
# index 3.
cleaned_data["Australia"][3]["Old-growth extent"].text

In [None]:
# How many of the listed forests are in France?
europe = data["Europe"]
# clean_row_data swaps empty cells for the plain string "No data", which has
# no .text attribute, so such rows are skipped instead of raising.
france = [
    r for r in europe
    if r["Country"] != "No data" and "France" in r["Country"].text
]
len(france)

In [None]:
# How many of the listed forests are in Tasmania?
# Rows whose Area is the "No data" placeholder (a plain string without
# .text) are skipped instead of raising.
len([
    r for r in data["Australia"]
    if r["Area"] != "No data" and "Tasmania" in r["Area"].text
])

In [None]:
# In Tasmania, of those forests that have data, what is their total area?
australia = data["Australia"]
# "No data" placeholders are plain strings without .text, so guard for them.
tasmania = [
    r for r in australia
    if r["Area"] != "No data" and "Tasmania" in r["Area"].text
]
tasmania_area_data = [r for r in tasmania if r["Old-growth extent"] != "No data"]

# Hectares per unit, for every unit that clean_row_data keeps in the cell.
hectares_per_unit = {
    "hectares": 1.0,
    "ha": 1.0,
    "square kilometres": 100.0,
    "acres": 0.40468564224,
}

total = 0
for r in tasmania_area_data:
    area = r["Old-growth extent"].text

    area = area.replace(",", "")
    # Capture the full number (including decimals) together with its unit,
    # so acres are converted and fractions are not truncated.
    found = re.search(
        r"(\d+(?:\.\d*)?)\s*(hectares|square kilometres|ha|acres)", area
    )
    if found is None:
        print("Skipping unparseable extent: ", area)
        continue

    val = float(found.group(1)) * hectares_per_unit[found.group(2)]

    total += val

print("Total area for Tasmania forests: ", total, " ha")

## From the data on Bulgaria's forests, what proportion of Bulgaria's total area is covered by them?

In [None]:
# Collect the Europe rows whose Country cell reads exactly "Bulgaria".
bulgaria_rows = []
for row in data["Europe"]:
    country = row["Country"]
    # "No data" placeholders are plain strings without .text; skip them.
    if country != "No data" and country.text.strip() == "Bulgaria":
        bulgaria_rows.append(row)

In [None]:
# get the link of the bulgaria article
from urllib.parse import urljoin

# Resolve the scraped href against the site the list page was downloaded
# from; urljoin also copes with hrefs that are already absolute.
bulgaria_link = urljoin(
    "https://en.wikipedia.org", bulgaria_rows[0]["Country"].a["href"]
)

In [None]:
# Download the Bulgaria article and save it next to the list page.
get_html(bulgaria_link, path="HTML/bulgaria.html")

In [None]:
# Read the saved Bulgaria article back from disk.
with open("HTML/bulgaria.html", "r", encoding = "utf-8") as f:
    html_bulgaria = f.read()

In [None]:
# Parse the article; showing the <title> is a quick check that the right
# page was loaded.
bulgaria_soup = BeautifulSoup(html_bulgaria, "html.parser")
bulgaria_soup.title

In [None]:
def get_bulgaria_area(tag):
    """Filter for ``find_all``: pick the ``<td>`` that holds the total area.

    A tag matches when it is a ``<td>``, its text contains "km", and the
    text of its parent contains "Total".

    Args:
        tag: Element to test; needs ``name``, ``text`` and ``parent``.

    Returns:
        True if all three conditions hold, otherwise False.
    """
    if tag.name != "td":
        return False
    if 'km' not in tag.text:
        return False
    # The parent is only inspected once the cheaper checks have passed.
    return 'Total' in tag.parent.text

In [None]:
# Text of every tag accepted by get_bulgaria_area, in document order.
km_tags = [t.text for t in bulgaria_soup.find_all(get_bulgaria_area)]

In [None]:
# Use the first match; despite its name, `area_tag` is the cell's text (a
# string), not a tag. Raises IndexError if nothing matched.
area_tag = km_tags[0]
area_tag

In [None]:
# Pull the first number (thousands separators and decimals allowed) out of
# the area text. A raw string avoids the invalid "\d" escape warning.
area_match = re.search(r"\d+(?:,\d{3})*(?:\.\d*)?", area_tag)
if area_match is None:
    raise ValueError(f"No numeric area found in: {area_tag!r}")
b_area = float(area_match.group().replace(',', ''))

In [None]:
# Presumably converts square kilometres to hectares (the same factor of 100
# is used for "square kilometres" in the Tasmania cell).
# NOTE(review): re-running this cell on its own multiplies b_area again.
b_area = b_area * 100

In [None]:
# Sum the old-growth extent of all Bulgarian rows in hectares, the unit
# b_area is converted to above.
hectares_per_unit = {
    "hectares": 1.0,
    "ha": 1.0,
    "square kilometres": 100.0,
    "acres": 0.40468564224,
}

forest_total = 0
for row in bulgaria_rows:
    extent = row['Old-growth extent']
    # Rows without a usable extent hold the string "No data" (no .text).
    if extent == "No data":
        continue

    found = re.search(
        r"(\d+(?:,\d{3})*(?:\.\d*)?) ?(hectares|square kilometres|ha|acres)?",
        extent.text,
    )
    if found is None:
        continue

    forest_data = float(found.group(1).replace(',', ''))
    # A value without a recognised unit is taken as hectares, as before.
    forest_data *= hectares_per_unit.get(found.group(2), 1.0)

    forest_total += forest_data

forest_total

In [None]:
# Share of b_area covered by the summed forest extents, as a percentage
# rounded to 5 decimals.
# NOTE(review): b_area comes from a row containing "Total", so this is
# presumably relative to total area rather than land area; verify.
print(f'Percentage of bulgarian land area accounted for old growth: {round((forest_total / b_area)*100, 5)}%')

### How many US states have forests with some variety of oak tree?

In [None]:
# Cleaned rows of the United States table, and the set that a later cell
# fills with the names of states that have oak forests.
us_table = data['United States']
states = set()

In [None]:
# Rows whose forest type mentions oak. The match is case-insensitive, like
# the state loop below, and "No data" placeholders (plain strings without
# .text) are skipped.
us_oak = [
    r for r in us_table
    if r["Old-growth forest type"] != 'No data'
    and "oak" in r["Old-growth forest type"].text.lower()
]
len(us_oak)

In [None]:
# Collect the distinct values of the first column (named "Country" after
# the Australia template) for every row whose forest type mentions oak.
for r in us_table:
    f_type = r["Old-growth forest type"]
    state = r["Country"]
    # Either cell may be the "No data" placeholder, which has no .text.
    if f_type is None or f_type == 'No data' or state == 'No data':
        continue
    if 'oak' in f_type.text.lower():
        states.add(state.text.strip())
states