In [79]:
import requests
from bs4 import BeautifulSoup
import re

In [80]:
def get_html(url, path):
    response = requests.get(url)
    with open(path, "w", encoding = "utf-8") as f:
        f.write(response.text)

In [81]:
with open("wiki_test.html", "r", encoding = "utf-8") as f:
    html = f.read()

In [82]:
soup = BeautifulSoup(html, "html.parser")
tables = soup.find_all("table", attrs={"class": "wikitable sortable"})

In [83]:
data = {}
for table in tables:
    headings = table.find_previous(["h2", "h3"]).text
    print(headings)
    data[headings] = table

Africa
Asia
Australia
Europe
Canada
United States
Central America
Caribbean
South America


In [84]:
table = data["Australia"]
first_row = table.tr
for td in first_row:
    print(td.text)



Country



Area



Old-growth extent



WWF ecoregion



Old-growth forest type



In [85]:
columns = []
for td in first_row:
    if td.text.strip() != "":
        columns.append(td.text.strip())

columns

['Country',
 'Area',
 'Old-growth extent',
 'WWF ecoregion',
 'Old-growth forest type']

In [86]:
rows = table.tbody.find_all("tr")

In [87]:
def extract_row_data(columns, row):
    row_data = {}
    table_cells = row.find_all("td")
    for i in range(len(table_cells)):
        row_data[columns[i]] = table_cells[i]
    
    return row_data

In [88]:
def clean_row_data(row: dict):

    for k in row.keys():
        val = row[k]
        if re.match("\s", val.text):
            row[k] = "No data"

        links = val.find_all("a")

        for l in links:
            if l.get("title") is not None and "(page does not exist)" in l.get("title"):
                l.replace_with(l.text)

            if "cite" in l.get("href"):
                l.parent.decompose()

        if k == "Old-growth extent" and row[k] != "No data":
            data = row[k].text.strip()

            data = data.replace("\xa0", " ")
            
            # 2,000, 7,800,000
            data = re.search("\d+(?:,\d{3})*(?:\.\d*)? (?:hectares|square kilometres|ha|acres)", data).group()

            parent = row[k].parent
            row[k].decompose()

            new_tag = soup.new_tag("td")
            new_tag.string = data
            parent.append(new_tag)

            row[k] = new_tag

            
    return row

In [89]:
def prepare_table_data(columns, table):
    table_data = []

    rows = table.find_all("tr")
    rows.pop(0)

    for r in rows:
        r = extract_row_data(columns=columns, row=r)
        r = clean_row_data(r)
        table_data.append(r)

    return table_data

In [90]:
def prepare_all_tables(columns, data):
    for k in data.keys():
        data[k] = prepare_table_data(columns, data[k])

    return data

In [91]:
data = prepare_all_tables(columns, data)
data["Canada"]

[{'Country': <td><a href="/wiki/British_Columbia" title="British Columbia">British Columbia</a>
  </td>,
  'Area': <td><a href="/wiki/Carmanah_Walbran_Provincial_Park" title="Carmanah Walbran Provincial Park">Carmanah Walbran Provincial Park</a>
  </td>,
  'Old-growth extent': <td>164 square kilometres</td>,
  'WWF ecoregion': <td><a href="/wiki/Central_Pacific_coastal_forests" title="Central Pacific coastal forests">Central Pacific coastal forests</a>
  </td>,
  'Old-growth forest type': <td><a class="mw-redirect" href="/wiki/Coniferous" title="Coniferous">coniferous</a> <a href="/wiki/Temperate_rainforest" title="Temperate rainforest">temperate rainforest</a>
  </td>},
 {'Country': <td><a href="/wiki/British_Columbia" title="British Columbia">British Columbia</a>
  </td>,
  'Area': <td><a href="/wiki/Clayoquot_Sound" title="Clayoquot Sound">Clayoquot Sound</a>
  </td>,
  'Old-growth extent': <td>265,000 hectares</td>,
  'WWF ecoregion': <td><a href="/wiki/Central_Pacific_coastal_fore

# Data Analytics

In [92]:
data["Australia"][3]["Old-growth extent"].text

'2,000 square kilometres'

In [93]:
# How many of the listed forests are in France?
europe = data["Europe"]
france = [r for r in europe if "France" in r["Country"].text]
len(france)

12

In [94]:
# How many of the listed forests are in Tasmania?
len([r for r in data["Australia"] if "Tasmania" in r["Area"].text])


6

In [95]:
# In tasmania, of those that have data, what is the total area of these?
australia = data["Australia"]
tasmania = [r for r in data["Australia"] if "Tasmania" in r["Area"].text]
tasmania_area_data = [r for r in tasmania if r["Old-growth extent"] != "No data"]

total = 0
for r in tasmania_area_data:
    area = r["Old-growth extent"].text

    area = area.replace(",", "")
    val = re.search("\d*", area).group()
    val = float(val)

    if "square kilometres" in area:
        val = val * 100

    total += val

print("Total area for Tasmania forests: ", total, " ha")

Total area for Tasmania forests:  200100.0  ha


## From the data of bulgaria's forests, what is the proportion of bulgarias total area that is covered by these?

In [96]:
bulgaria_rows = []
for row in data["Europe"]:
    if row["Country"].text.strip() == "Bulgaria":
        bulgaria_rows.append(row)

In [97]:
# get the link of the bulgaria article
bulgaria_link = "https://wikipedia.org" + bulgaria_rows[0]["Country"].a["href"]

In [98]:
get_html(bulgaria_link, path="bulgaria.html")

In [99]:
with open("bulgaria.html", "r", encoding = "utf-8") as f:
    html_bulgaria = f.read()

In [100]:
bulgaria_soup = BeautifulSoup(html_bulgaria, "html.parser")
bulgaria_soup.title

<title>Bulgaria - Wikipedia</title>

In [101]:
def get_bulgaria_area(tag):
    return tag.name == "td" and 'km' in tag.text and 'Total' in tag.parent.text

In [102]:
km_tags = [t.text for t in bulgaria_soup.find_all(get_bulgaria_area)]

In [103]:
area_tag = km_tags[0]
area_tag

'110,993.6[3]\xa0km2 (42,854.9\xa0sq\xa0mi) (103rd)'

In [104]:
b_area = re.search("\d+(?:,\d{3})*(?:\.\d*)?", area_tag).group()
b_area = float(b_area.replace(',', ''))

In [105]:
b_area = b_area * 100

In [106]:
forest_total = 0
for row in bulgaria_rows:
    forest_data = row['Old-growth extent'].text
    forest_data = re.search("\d+(?:,\d{3})*(?:\.\d*)?", forest_data).group()
    forest_data = float(forest_data.replace(',', ''))

    forest_total += forest_data

forest_total

1107122.6

In [107]:
print(f'Percentage of bulgartian land area accounted for old growth: {round((forest_total / b_area)*100, 5)}%')

Percentage of bulgartian land area accounted for old growth: 9.97465%


### Challange: How many US states have forests with some variety of oak tree?

In [108]:
"""
Get us table
Search for oak in 'Old-growth forrest type'
count
"""

"\nGet us table\nSearch for oak in 'Old-growth forrest type'\ncount\n"

In [109]:
us_table = data['United States']
states = set()

In [110]:
us_oak = [r for r in us_table if "oak" in r["Old-growth forest type"].text]
len(us_oak)

AttributeError: 'str' object has no attribute 'text'

In [111]:
for r in us_table:
    f_type = r["Old-growth forest type"]
    if f_type is not None and f_type != 'No data':
        if 'oak' in f_type.text.lower():
            states.add(r["Country"].text.strip())
states

KeyError: 'Old-growth forest type'