In [2]:
# Install required packages (uncomment to run)
# Note: in a notebook, prefer %pip over !pip so the packages are installed into the running kernel's environment
# %pip install -q helium selenium beautifulsoup4 pandas requests


In [3]:
# Imports: Helium (Selenium), BeautifulSoup and pandas
from helium import start_chrome, wait_until, S, get_driver, kill_browser, go_to
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
from selenium.webdriver.chrome.options import Options

In [4]:
# Our World in Data grapher page for electricity generation, opened on its table tab
url = "https://ourworldindata.org/grapher/electricity-generation?tab=table"

In [5]:
# Quick check: can we reach the page with requests?
# The response may only contain minimal HTML, since the table is rendered client-side.
# An explicit timeout is set because requests otherwise waits indefinitely on a stalled connection.
resp = requests.get(url, timeout=30)
# A 200 status means the page is reachable; a browser is used later to render the table itself.
resp.status_code

200

In [6]:
# Report the HTTP status code of the response fetched by the requests cell.
try:
    # Bare name lookup: raises NameError when the requests cell has not been run yet.
    resp
except NameError:
    print('resp not defined; run the requests cell first')
else:
    print('HTTP status (requests):', resp.status_code)

HTTP status (requests): 200


In [7]:
# Make sure no browser instance is left over from a previous run.
try:
    kill_browser()
except Exception:
    # Best-effort cleanup: any error raised by kill_browser() is deliberately ignored.
    pass

# Build the Chrome options; these flags are the ones recommended for headless and CI environments.
opts = Options()
for chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'):
    opts.add_argument(chrome_flag)
# Note: the browser is not started here; the next cell starts it headless and extracts the table.

## Extraction du tableau et création d'un DataFrame
Les cellules suivantes vont :
1. Lancer un navigateur Chrome en mode headless avec Helium (Selenium)
2. Attendre que le tableau soit rendu côté client
3. Récupérer le HTML rendu, parser la balise `<table>` avec BeautifulSoup
4. Construire un DataFrame pandas à partir des en-têtes et des lignes du tableau

In [8]:
# Bring in the scraping helpers from the scraper_utils module
from scraper_utils import scrape_table_from_url, close_browser

# Scrape the table found at `url`, reusing the Chrome options prepared earlier
df = scrape_table_from_url(
    url,
    headless=True,
    timeout_secs=30,
    chrome_options=opts,
)

# Show a preview, or report that nothing was extracted
if df.empty:
    print("Aucune donnée n'a été extraite.")
else:
    display(df.head())

Extracted DataFrame shape: (260, 6)


Unnamed: 0,Country or region,1985,2024,1985–2024,Absolute Change,Relative Change
0,ASEAN (Ember),,"1,355.58 TWh",,,
1,Afghanistan,,0.97 TWh,,,
2,Africa,258.99 TWh,919.22 TWh,,+660.23 TWh,+255%
3,Africa (EI),258.99 TWh,963.93 TWh,,+704.95 TWh,+272%
4,Africa (Ember),,972.96 TWh,,,


In [None]:
# Optional: uncomment the lines below to save the scraped table to a CSV file
#out_file = 'electricity.csv'
#df.to_csv(out_file, index=False)
#print('Saved', out_file)

Saved electricity.csv


In [9]:
# Close the browser to free its resources
close_browser()

Browser closed successfully.


As the information that I'm looking for is not in this DataFrame, I want to use another method to obtain it: I will download the data file directly instead.

In [10]:
# First attempt: download the CSV directly from Our World in Data (grapher endpoint).
# If that fails, fall back to HTML fragments that earlier cells may have left in the namespace.
import os
from io import StringIO

base = 'https://ourworldindata.org/grapher/electricity-generation'
csv_url = base + '.csv'


def _save_html_table(fragment, csv_path):
    """Parse the first table found in `fragment`, write it to `csv_path`, and return it.

    Parameters:
        fragment: object whose str() is HTML containing at least one table.
        csv_path: destination path of the CSV file (written without the index).

    Returns:
        The DataFrame parsed from the first table.
    """
    # Passing literal HTML to read_html is deprecated in pandas >= 2.1; wrap it in StringIO.
    parsed = pd.read_html(StringIO(str(fragment)))[0]
    parsed.to_csv(csv_path, index=False)
    print('Saved', csv_path)
    return parsed


print('Attempting to download CSV from', csv_url)
# Explicit timeout: requests otherwise waits indefinitely on a stalled connection.
r = requests.get(csv_url, timeout=30)
if r.status_code == 200:
    out_file = 'electricity_generation.csv'
    with open(out_file, 'wb') as f:
        f.write(r.content)
    print('Saved', out_file)
    df = pd.read_csv(out_file)
    display(df.head())
else:
    print('Direct CSV download failed with status', r.status_code)
    # Fallback: try to parse HTML fragments already grabbed via Selenium/BeautifulSoup
    try:
        # Use globals().get(...) to avoid NameError if the name is not defined
        te = globals().get('table_energy')
        if te is not None:
            print('Parsing HTML fragment `table_energy`')
            out_file = 'electricity_generation_from_html.csv'
            df = _save_html_table(te, out_file)
            display(df.head())
        else:
            pv = globals().get('pivot')
            if pv is not None:
                print('Parsing `pivot` element')
                table = pv.find('table')
                if table is not None:
                    out_file = 'electricity_generation_from_pivot.csv'
                    df = _save_html_table(table, out_file)
                    display(df.head())
                else:
                    print('No <table> inside pivot to parse.')
            else:
                print('No HTML fragment available for fallback parsing. Consider checking page_source.html or pivot.html for debugging.')
    except Exception as e:
        # Best-effort fallback: report the failure instead of aborting the notebook run.
        print('Fallback parsing failed:', e)

Attempting to download CSV from https://ourworldindata.org/grapher/electricity-generation.csv
Saved electricity_generation.csv
Saved electricity_generation.csv


Unnamed: 0,Entity,Code,Year,Electricity generation - TWh
0,ASEAN (Ember),,2000,378.61
1,ASEAN (Ember),,2001,404.85
2,ASEAN (Ember),,2002,433.19
3,ASEAN (Ember),,2003,458.24
4,ASEAN (Ember),,2004,496.56
