## Olympics environnemental impact analysis

In [363]:
# let's scrape some data from the web
# https://stats.oecd.org/Index.aspx?DataSetCode=air_ghg

Scraping data from : https://stats.oecd.org/Index.aspx?DataSetCode=air_ghg

In [364]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm as tqdm
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

In [365]:
driver = webdriver.Firefox()
driver.get("https://stats.oecd.org/Index.aspx?DataSetCode=air_ghg")

Drop-down menu :

In [366]:
menu = '//*[@id="PDim_VAR"]' 
element = driver.find_element(By.XPATH,menu).click()

Choosing the emmission sector :

In [367]:
sleep(2)
npage = 2
xpath_expression = "//select/option[@value ='2~ENER_IND']" # 0~GHG
sector = driver.find_element(By.XPATH, xpath_expression).click()   

BeautifulSoup to parse the html : 

In [368]:
soup = bs(driver.page_source, 'html.parser')
table = soup.find('table', {'class': 'DataTable'})
# we want to find all tr in the table with an id starting with row
rows = table.findAll('tr', {'id': lambda L: L and L.startswith('row')})
# we display the first rows
rows[:5]

[<tr class="row1" id="row1"><td class="RowDimLabel" colspan="2">Australia</td><td class="RowDimLabel"></td><td class="Data">438 056.76</td><td class="Data">438 049.29</td><td class="Data">441 752.18</td><td class="Data">442 282.48</td><td class="Data">442 610.56</td><td class="Data">451 076.46</td><td class="Data">457 574.99</td><td class="Data">469 835.69</td><td class="Data">484 037.12</td><td class="Data">489 971.16</td><td class="Data">501 588.99</td><td class="Data">509 456.08</td><td class="Data">513 171.01</td><td class="Data">511 714.21</td><td class="Data">528 827.43</td><td class="Data">535 589.76</td><td class="Data">540 017.78</td><td class="Data">546 739.52</td><td class="Data">549 128.82</td><td class="Data">552 127.35</td><td class="Data">547 172.76</td><td class="Data">549 007.35</td><td class="Data">552 305.55</td><td class="Data">543 798.68</td><td class="Data">535 451.44</td><td class="Data">544 231.81</td><td class="Data">552 354.83</td><td class="Data">559 581.11</

Creation of the dataframe : 

In [369]:
df = pd.DataFrame(columns=['sector', 'country', 'year', 'GH emissions'])

Test of data extraction for the first sector : Energy Industries

In [370]:
# we extract the data from rows and add them to the dataframe df
for row in tqdm.tqdm(rows):
    cells = row.findAll('td')
    sector = '1A1 - Energy Industries'
    country = cells[0].text
    for i in range(0, 32):
        year = 1990 + i
        GH = cells[i+2].text
        df = pd.concat([df, pd.DataFrame([[sector, country, year, GH]], columns=['sector', 'country', 'year', 'GH emissions'])])

100%|██████████| 64/64 [00:01<00:00, 48.77it/s]


In [371]:
# we print the head of the dataframe
df.head(10)

Unnamed: 0,sector,country,year,GH emissions
0,1A1 - Energy Industries,Australia,1990,438 056.76
0,1A1 - Energy Industries,Australia,1991,438 049.29
0,1A1 - Energy Industries,Australia,1992,441 752.18
0,1A1 - Energy Industries,Australia,1993,442 282.48
0,1A1 - Energy Industries,Australia,1994,442 610.56
0,1A1 - Energy Industries,Australia,1995,451 076.46
0,1A1 - Energy Industries,Australia,1996,457 574.99
0,1A1 - Energy Industries,Australia,1997,469 835.69
0,1A1 - Energy Industries,Australia,1998,484 037.12
0,1A1 - Energy Industries,Australia,1999,489 971.16


In [372]:
# npage += 1
# xpath_expression = f"//select/option[contains(@value, '{npage}~ENER_')]"
# sector = driver.find_element(By.XPATH, xpath_expression).click() 

### Automatisation of the scraping process for the OECD website

Function to scrape the data from the website according to the sector chosen : 

In [373]:
# now let's do a function to extract the data from the website starting from npage 2 to npage 8
def extract_data(npage, sector_name, df):
    menu = '//*[@id="PDim_VAR"]' 
    driver.find_element(By.XPATH,menu).click() # menu

    xpath_expression = f"//select/option[contains(@value, '{npage}~ENER_')]"
    driver.find_element(By.XPATH, xpath_expression).click() # page of the secteur
    sleep(7)

    soup = bs(driver.page_source, 'html.parser')
    table = soup.find('table', {'class': 'DataTable'})
    rows = table.findAll('tr', {'id': lambda L: L and L.startswith('row')})

    for row in tqdm.tqdm(rows):
        cells = row.findAll('td')
        country = cells[0].text
        for i in range(0, 32):
            year = 1990 + i
            GH = cells[i+2].text
            df = pd.concat([df, pd.DataFrame([[sector_name, country, year, GH]], columns=['sector', 'country', 'year', 'GH emissions'])])
            
    return df

Extracting the name of the sectors :

In [374]:
# we extract the names of the sectors :
sleep(7)
sectors = []
for i in range(2, 9):
    xpath_expression = f"//select/option[contains(@value, '{i}~ENER_')]"
    sector = driver.find_element(By.XPATH, xpath_expression)
    sectors.append(sector.text)
sectors

['    1A1 - Energy Industries',
 '    1A2 - Manufacturing industries and construction',
 '    1A3 - Transport',
 '    1A4 - Residential and other sectors',
 '    1A5 - Energy - Other',
 '    1B - Fugitive Emissions from Fuels',
 '    1C - CO2 from Transport and Storage']

In [375]:
# in '    1A1 - Energy Industries', we want to keep only 'Energy Industries'. We do that for all the sectors
sectors = [sector.split(' - ')[1] for sector in sectors]
# the sector 'Energy' is re-changed to 'Energy - Other'
sectors[4] = 'Energy - Other'
sectors

['Energy Industries',
 'Manufacturing industries and construction',
 'Transport',
 'Residential and other sectors',
 'Energy - Other',
 'Fugitive Emissions from Fuels',
 'CO2 from Transport and Storage']

In [376]:
# Cleaning the dataframe ...
df = pd.DataFrame(columns=['sector', 'country', 'year', 'GH emissions'])
df

Unnamed: 0,sector,country,year,GH emissions


In [377]:
# we extract the data from npage 2 to npage 8
for i in tqdm.tqdm(range(0, 7)):
    df = extract_data(i+2, sectors[i], df)

100%|██████████| 64/64 [00:01<00:00, 49.61it/s]
100%|██████████| 64/64 [00:01<00:00, 47.50it/s]
100%|██████████| 64/64 [00:01<00:00, 42.09it/s]
100%|██████████| 57/57 [00:01<00:00, 42.58it/s]
100%|██████████| 51/51 [00:01<00:00, 40.53it/s]
100%|██████████| 57/57 [00:01<00:00, 37.57it/s]
100%|██████████| 8/8 [00:00<00:00, 37.68it/s]
100%|██████████| 7/7 [01:01<00:00,  8.80s/it]


In [378]:
df.head()

Unnamed: 0,sector,country,year,GH emissions
0,Energy Industries,Australia,1990,143 172.76
0,Energy Industries,Australia,1991,146 396.55
0,Energy Industries,Australia,1992,149 719.80
0,Energy Industries,Australia,1993,151 492.72
0,Energy Industries,Australia,1994,152 307.64


In [379]:
# we search for the lines that have both country = 'OECD Asia Oceania' and sector = 'CO2 from Transport and Storage' emission
df[(df['country'] == 'France') & (df['sector'] == 'Transport')].head(10)

Unnamed: 0,sector,country,year,GH emissions
0,Transport,France,1990,123 313.04
0,Transport,France,1991,125 899.78
0,Transport,France,1992,130 437.76
0,Transport,France,1993,130 401.20
0,Transport,France,1994,131 485.71
0,Transport,France,1995,133 465.42
0,Transport,France,1996,135 272.86
0,Transport,France,1997,137 816.44
0,Transport,France,1998,140 145.85
0,Transport,France,1999,142 498.36


In [380]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11680 entries, 0 to 0
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   sector        11680 non-null  object
 1   country       11680 non-null  object
 2   year          11680 non-null  object
 3   GH emissions  11680 non-null  object
dtypes: object(4)
memory usage: 456.2+ KB


In [392]:
# we modify a copy of the dataframe
df2 = df.copy()

In [393]:
# we convert sector and country to string, year to int and GH emissions to float GH emmissions are currently of the form '123 313.04'"Web Scrapping CM1.docx"
df2['sector'] = df2['sector'].astype(str)
df2['country'] = df2['country'].astype(str)
df2['year'] = df2['year'].astype(int)
df2['GH emissions'] = df2['GH emissions'].str.replace('\xa0', '').replace("..", 'NaN').astype(float)

ValueError: could not convert string to float: ''

In [394]:
# we find the element 143\xa0172.76 in the dataframe
df[df['GH emissions'] == '\xa0']

Unnamed: 0,sector,country,year,GH emissions
