In [None]:
"""
This script scrapes the Chinese, English and French transcripts of videos from the TED Talks website
using Selenium and writes the results to XML files.
"""

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import time
import xml.etree.ElementTree as ET

In [None]:
def initialize_driver():
    """Start a new Safari WebDriver session and hand it back to the caller."""
    return webdriver.Safari()

In [None]:
def fetch_video_links(driver, url):
    """Open a talk listing page and return the talk URLs found on it.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the listing page to open.

    Returns:
        list[str]: ``href`` values of the matching anchors in page order,
        without duplicates and without empty/missing ``href`` values.

    Raises:
        selenium.common.exceptions.TimeoutException: If no matching anchor
            is present within 10 seconds.
    """
    driver.get(url)
    # Block (up to 10 s) until at least one talk anchor is present in the DOM.
    video_link_elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//*[@id='maincontent']//a[contains(@href, '/talks/')]")
        )
    )
    hrefs = (element.get_attribute("href") for element in video_link_elements)
    # The XPath matches *every* anchor whose href contains '/talks/', so the
    # same talk can be matched more than once. dict.fromkeys keeps the first
    # occurrence of each URL and preserves page order.
    return list(dict.fromkeys(href for href in hrefs if href))

In [None]:
def accept_cookies(driver):
    """Click the cookie-consent button if it becomes clickable within 5 seconds.

    This is best effort: any failure (button absent, click error) is printed
    and swallowed so the caller can carry on.
    """
    try:
        consent_locator = (By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
        waiter = WebDriverWait(driver, 5)
        consent_button = waiter.until(EC.element_to_be_clickable(consent_locator))
        # Click through JavaScript rather than through WebElement.click().
        driver.execute_script("arguments[0].click();", consent_button)
    except Exception as err:
        print("No cookie button found or error clicking it:", err)

In [None]:
def pause_video(driver):
    """Pause the page's ``#video`` element once playback has started.

    Polls for up to 10 seconds until the element reports ``currentTime > 0``
    and then calls ``pause()`` on it. Any failure is printed and swallowed.

    Args:
        driver: Selenium WebDriver positioned on a talk page.
    """
    try:
        # getElementById returns null until the player is in the DOM. An
        # unguarded ``.currentTime`` would throw, and WebDriverWait only
        # retries on NoSuchElementException, so the wait would abort on the
        # first poll. The null guard lets it keep polling.
        WebDriverWait(driver, 10).until(
            lambda drv: drv.execute_script(
                "var video = document.getElementById('video');"
                " return !!video && video.currentTime > 0;"
            )
        )
        driver.execute_script("document.getElementById('video').pause();")
    except Exception as e:
        print("Failed to pause the video:", e)

In [None]:
def open_transcript(driver):
    """Click the transcript control of the current talk page.

    Waits up to 20 seconds for the ``#transcript-control`` element to be
    present, clicks it through JavaScript, then sleeps 5 seconds. Any failure
    is printed and swallowed.

    Args:
        driver: Selenium WebDriver positioned on a talk page.
    """
    try:
        # The wait already returns the located element, so use it directly
        # instead of querying the DOM a second time.
        transcript_button = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="transcript-control"]')
            )
        )
        driver.execute_script("arguments[0].click();", transcript_button)
        time.sleep(5)
    except Exception as e:
        print("Error clicking on transcript button:", e)

In [None]:
def switch_language_js(driver, language_code):
    """Select another transcript language in the page's language dropdown.

    Waits up to 5 seconds for the ``<select>`` to be clickable, selects the
    option whose ``value`` equals ``language_code``, re-dispatches ``change``
    and ``input`` events on it, then sleeps 5 seconds. Any failure is printed
    and swallowed.

    Args:
        driver: Selenium WebDriver positioned on a talk page with the
            transcript panel open.
        language_code: ``value`` of the ``<option>`` to select, e.g. ``"fr"``.
    """
    try:
        language_select = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable(
                (
                    By.XPATH,
                    "/html/body/div[2]/div[1]/div[2]/div/div/div/div[2]/div[1]/div/select",
                )
            )
        )
        Select(language_select).select_by_value(language_code)
        # Events built with ``new Event(name)`` do not bubble by default, so a
        # listener attached to an ancestor (event delegation) would never see
        # them. Dispatch bubbling events so the fallback can actually be
        # observed. ``input`` is sent as well in case ``change`` is not what
        # the page listens for.
        for event_name in ("change", "input"):
            driver.execute_script(
                "arguments[0].dispatchEvent(new Event(arguments[1], {bubbles: true}));",
                language_select,
                event_name,
            )
        time.sleep(5)
    except Exception as e:
        print(f"Failed to set language with code '{language_code}':", e)

In [None]:
def save_transcript(driver, filename):
    """Write the transcript currently shown on the page to an XML file.

    The file contains a ``<transcript>`` root with one ``<section>`` child per
    located transcript element; each child's text content is that element's
    ``.text``. The file is written as UTF-8 with an XML declaration.

    Args:
        driver: Selenium WebDriver positioned on a talk page with the
            transcript panel open.
        filename: Path of the XML file to create or overwrite.

    Raises:
        selenium.common.exceptions.TimeoutException: If no transcript element
            is present within 30 seconds.
    """
    time.sleep(5)  # wait for the transcript to load
    transcript_elements = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "/html/body/div[2]/div[1]/div[2]/div/div/div/div[4]/div")
        )
    )
    root = ET.Element("transcript")
    for element in transcript_elements:
        # Keyword arguments to SubElement become XML *attributes*, so passing
        # text=... would produce <section text="..."/>. Assign the text on
        # the created node instead.
        section = ET.SubElement(root, "section")
        section.text = element.text
    # ElementTree.write defaults to US-ASCII, which turns every Chinese or
    # accented character into a numeric character reference. Write real
    # UTF-8 and declare it in the XML prolog.
    ET.ElementTree(root).write(filename, encoding="utf-8", xml_declaration=True)

In [None]:
"""accept a URL and fetch the video links from the page then accept the cookies on the page, 
pause the video, open the transcript, 
switch the language to Chinese, French 
and save the transcript in xml file.
"""

driver = initialize_driver()
url = "https://www.ted.com/talks?sort=oldest&topics%5B0%5D=3d%20printing&topics%5B1%5D=ai&topics%5B2%5D=blockchain&topics%5B3%5D=computers&topics%5B4%5D=cyber%20security&topics%5B5%5D=data&topics%5B6%5D=driverless%20cars&topics%5B7%5D=drones&topics%5B8%5D=engineering&topics%5B9%5D=future&topics%5B10%5D=internet&topics%5B11%5D=nanotechnology&topics%5B12%5D=robots&topics%5B13%5D=social%20media&topics%5B14%5D=software&topics%5B15%5D=surveillance&topics%5B16%5D=technology&topics%5B17%5D=virtual%20reality"
links = fetch_video_links(driver, url)
accept_cookies(driver)
video_number = 1  # initialize the video number
for link in links:
    driver.get(link)
    pause_video(driver)
    open_transcript(driver)
    save_transcript(driver, f"english-{video_number}.xml")
    switch_language_js(driver, "zh-cn")# switch to Chinese
    save_transcript(driver, f"chinese-{video_number}.xml")
    switch_language_js(driver, "fr")# switch to French
    save_transcript(driver, f"french-{video_number}.xml")
    video_number += 1  # update the video number
driver.quit()

Chaque fois que je lance ce script, tout se passe bien pour les 19 ou 20 premières vidéos, mais ensuite le site plante, et je ne sais absolument pas pourquoi.

![Tederror](./img/error.jpg)