In [None]:

# Note: in a notebook environment, run the line below if packages are missing.
# (%pip installs into the environment of the running kernel, unlike !pip.)
# %pip install -q helium selenium beautifulsoup4 pandas requests


In [13]:
# Imports: Helium (Selenium), BeautifulSoup and pandas
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from helium import start_chrome, wait_until, S, get_driver, kill_browser, go_to
from selenium.webdriver.chrome.options import Options

## Energy Data

In [14]:
# Target page: the Our World in Data "electricity-generation" grapher chart,
# requested with its table tab selected (`?tab=table`).
url = 'https://ourworldindata.org/grapher/electricity-generation?tab=table'

In [15]:
# Quick check: can we reach the page with requests? The response may be only a
# minimal HTML shell, since the table is rendered client-side.
# An explicit timeout is passed because requests otherwise waits indefinitely,
# which would hang the kernel on a stalled connection.
resp = requests.get(url, timeout=30)
# If the site returns 200 the page is reachable. The visible table is usually
# rendered dynamically with JS; a browser is used later to render it.
resp.status_code

200

In [16]:
# Report the status code of the response fetched by the reachability check.
# If that cell has not been run in this kernel, `resp` does not exist yet, so a
# hint is printed instead of a status.
if 'resp' in globals():
    print('HTTP status (requests):', resp.status_code)
else:
    print('resp not defined; run the requests cell first')

HTTP status (requests): 200


In [17]:
# Make sure no browser from a previous run is still open, then build the
# Chrome options that the scraping cell passes to start_chrome().
try:
    kill_browser()
except Exception:
    # Best effort only: a failure here (e.g. nothing to close) is ignored.
    pass

opts = Options()
# Flags commonly recommended for headless and CI environments.
for _chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'):
    opts.add_argument(_chrome_flag)
# The browser is not started here; the scraping cell starts it headless and
# extracts the table.

## Extraction du tableau et création d'un DataFrame
Les cellules suivantes vont :
1. Lancer un navigateur Chrome en mode headless avec Helium (Selenium)
2. Attendre que le tableau soit rendu côté client
3. Récupérer le HTML rendu, parser la balise `<table>` avec BeautifulSoup
4. Construire un DataFrame pandas à partir des en-têtes et des lignes du tableau

In [18]:
# Start a headless browser, wait for the table and parse it into a pandas DataFrame.
# This cell may take a few seconds while the page renders its JavaScript.
timeout_secs = 30
# Start Chrome headless and navigate to the page.
driver = start_chrome(url, headless=True, options=opts)
# Wait until a <table> element appears on the page (the table may be added dynamically).
try:
    wait_until(lambda: len(get_driver().find_elements('tag name', 'table')) > 0, timeout_secs=timeout_secs)
except Exception as e:
    # Keep going: the page source is still inspected below, and a missing
    # table is turned into a hard error there.
    print('Table not found within timeout:', e)
# Get the rendered page source and locate the first <table>.
rendered = get_driver().page_source
soup = BeautifulSoup(rendered, 'html.parser')
tbl = soup.find('table')
if tbl is None:
    # Fail loudly instead of only printing: otherwise `df` would stay undefined
    # (or keep a value from an earlier run) and the save cell would either
    # crash with a NameError or silently write stale data.
    raise RuntimeError('No <table> found in page source. The table may be rendered differently or inside an iframe.')

# Extract headers (if present).
thead = tbl.find('thead')
header_row = None
if thead is not None:
    headers = [th.get_text(strip=True) for th in thead.find_all('th')]
else:
    # Fallback: treat the first row as the header and remember it so it is
    # not appended again as a data row below.
    first_row = tbl.find('tr')
    headers = [th.get_text(strip=True) for th in first_row.find_all(['th', 'td'])] if first_row else []
    header_row = first_row

# Without a <tbody>, rows are searched in the whole table.
tbody = tbl.find('tbody') or tbl
rows = []
for tr in tbody.find_all('tr'):
    # Skip rows already consumed as headers: the fallback header row, and any
    # row inside <thead> (reachable here only when there is no <tbody>).
    if tr is header_row or tr.find_parent('thead') is not None:
        continue
    cells = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
    # Skip empty rows.
    if any(c != '' for c in cells):
        rows.append(cells)

# Normalize every row to the header width: pad short rows with None and
# truncate long ones, so the DataFrame constructor gets a consistent shape.
if headers:
    max_cols = len(headers)
    normalized = [r + [None] * (max_cols - len(r)) if len(r) < max_cols else r[:max_cols] for r in rows]
    df = pd.DataFrame(normalized, columns=headers)
else:
    df = pd.DataFrame(rows)
# Quick info.
print('Extracted DataFrame shape:', df.shape)
display(df.head())

Extracted DataFrame shape: (260, 6)


Unnamed: 0,Country or region,1985,2024,1985–2024,Absolute Change,Relative Change
0,ASEAN (Ember),,"1,355.58 TWh",,,
1,Afghanistan,,0.97 TWh,,,
2,Africa,258.99 TWh,919.22 TWh,,+660.23 TWh,+255%
3,Africa (EI),258.99 TWh,963.93 TWh,,+704.95 TWh,+272%
4,Africa (Ember),,972.96 TWh,,,


In [21]:
# Persist the scraped table as CSV; the row index is left out of the file.
out_file = 'electricity.csv'
df.to_csv(path_or_buf=out_file, index=False)
print('Saved', out_file)

Saved electricity.csv


In [20]:
# Shut the browser down so it does not keep holding resources.
# A failure is reported rather than raised, since the browser may already be closed.
try:
    kill_browser()
except Exception as close_error:
    print('kill_browser() failed or browser already closed:', close_error)
# At this point `df` holds the scraped table as a pandas DataFrame (if extraction succeeded).

The scraped table is not the DataFrame we are looking for: scraping does not extract the useful information the dataset should contain, so I download the data directly instead.

In [None]:
# First attempt: download the CSV directly from Our World in Data (grapher endpoint).
# This cell relies on `requests` and `pd` being imported already, so the
# imports cell must have been run in the current kernel.
base = 'https://ourworldindata.org/grapher/electricity-generation'
csv_url = base + '.csv'
print('Attempting to download CSV from', csv_url)
# An explicit timeout is passed because requests otherwise waits indefinitely,
# which would hang the kernel on a stalled connection.
r = requests.get(csv_url, timeout=30)
if r.status_code == 200:
    out_file = 'electricity_generation.csv'
    # Write the response bytes unchanged, then let pandas parse the saved file.
    with open(out_file, 'wb') as f:
        f.write(r.content)
    print('Saved', out_file)
    df = pd.read_csv(out_file)
    display(df.head())
else:
    print('Direct CSV download failed with status', r.status_code)

Attempting to download CSV from https://ourworldindata.org/grapher/electricity-generation.xls


NameError: name 'requests' is not defined