# Get data from ted.com

## Setup

In [1]:
import sys
sys.path.append("../")
sys.path.append("../../") # to import tools
import os
import requests
from bs4 import BeautifulSoup
from tools import progress_bar, make_name, file_exists, name_to_url
import json
from datetime import datetime
import re
import subprocess
import time
import glob
import pandas as pd
import numpy as np

In [2]:
script_query = {"data-spec":"q"}
json_extract = re.compile("\{.*\}")
transcript_service = "https://ted2srt.org/api/talks/{}/transcripts/download/{}?lang=en" # video id, format 
list_video = "https://www.ted.com/talks?page={}&sort=newest&language=en"
pages = 121 # 1 - 121 included

video_file = "videos.txt" # each line: video url
folder = "Data"
json_file = "data_urls.json" # ted url : [title, description, transcript, video url]
base_url = "https://www.ted.com"
fmt = "srt"

## Find video URLs

In [3]:
def extraxt_urls(r):
    """
    Args:
        r: request for a page containing the list of videos
    Returns:
        status, list of video urls
    """
    if r.ok:
        html = BeautifulSoup(r.text)
        links = html.find_all("a", attrs={"class": "ga-link", "data-ga-context":"talks"}) # 2 links for each video
        return True, list(set([f"{base_url}{i.attrs['href']}" for i in links]))
    else:
        return False, []

def load_video_file(file):
    """Read list of video url"""
    urls = []
    with open(file, "r") as f:
        urls = f.readlines()
    return urls

def save_video_file(file, urls):
    urls = set(urls)
    with open(file, "w") as f:
        for url in urls:
            f.write(f"{url}\n") 

In [9]:
videos = []
if file_exists(video_file):
    videos = load_video_file(video_file)
    print("URLs loaded")
else:
    print("URLs not found")

URLs loaded


In [21]:
progress_bar(0, "Page", tot=pages)
to_do = list(range(1, pages + 1))
count = 0
for page in to_do:
    try:
        r = requests.get(list_video.format(page))
        s, l = extraxt_urls(r)
        if s:
            count += 1            
            videos.extend(l)
        else:
            print(f"Fail on page {page}, retry later")
            to_do.append(page)
    except requests.ConnectionError:
        print(f"connection error on page {page}, retry later")
        to_do.append(page)
    progress_bar(count, "Page", tot=pages)
save_video_file(video_file, videos)

Page: [--------------------------------------------------] 0.00% - 0/12136
Page: [--------------------------------------------------] 0.83% - 1/12136
Page: [--------------------------------------------------] 1.65% - 2/12136
Page: [|-------------------------------------------------] 2.48% - 3/12136
Page: [|-------------------------------------------------] 3.31% - 4/12136
Page: [||------------------------------------------------] 4.13% - 5/121

KeyboardInterrupt: 

In [5]:
print("Videos:", len(videos))

Videos: 4346


## Get data

Obtain title, description, video id and video url from the video page

In [8]:
def get_video_info(r):
    """Return list [title, description, video url, video id]"""
    status = False
    ret_data = []
    if r.ok:
        html = BeautifulSoup(r.text)
        query = html.find("script", attrs=script_query)
        if query is not None and len(query.contents) != 0:
            #print("Query found")
            span = json_extract.search(query.contents[0])            
            video_data = json.loads(query.contents[0][span.start(): span.end()])
            try:
                video_data = video_data["__INITIAL_DATA__"]["talks"][0]
                title = video_data["title"]
                descr = video_data["description"]
                id = video_data["id"]
                video_url = video_data["player_talks"][0]["resources"]["h264"]
                if video_url is not None:
                    video_url = video_url[0]["file"]
                else:
                    video_url = "" # on YT, cannot download (against policy)
                status = True
                ret_data = [title, descr, video_url, id]
            except KeyError as err:
                print("KeyError:", err.args[0])
            except TypeError:
                print("Invalid query")                
        else:
            print("Failed to find the query:", r.url)
    #else:
        #print("Request not ok:", r.status_code)
    return status, ret_data

def save_video_data(name, video_data):
    with open(name, "wt") as f:
        json.dump(video_data, f)
        
def load_video_data(name):
    d = {}
    if file_exists(name):
        with open(name, "rt") as f:
            d = json.load(f)
    return d

def check_data(name, data_dict):
    if name in data_dict:
        d = data_dict[name][0] # HACK
        if len(d) >= 4 and d[0] != "" and d[1] != "" and d[3] != "": # title, descr, (video,) id
            return True
    return False

In [10]:
video_data = load_video_data(f"{folder}/{json_file}")
count = 0
progress_bar(0, "Page", tot=len(videos))
try:
    for video in videos:    
        name = make_name(video)
        if not check_data(name, video_data):
            try:
                r = requests.get(video)
                if r.status_code == 429:
                    time.sleep(5)
                s, l = get_video_info(r)
                #print("Got info ")
                if s and len(l) != 0:
                    #count += 1
                    #print("Count", count)
                    video_data[make_name(video)] = [l] # BUG list of list
                    #if len(l) == 0:
                    #    print("No data retrieved")
                    #    videos.append(video)
                else:
                    videos.append(video)
                    #print("Failed to get video data")
            except requests.ConnectionError:
                print("connection error")
                videos.append(video)
        count += 1
        progress_bar(count, "Page", tot=len(videos))
finally:
    pass
#    save_video_data(f"{folder}/{json_file}", video_data)

Page: [--------------------------------------------------] 0.83% - 36/4346Failed to find the query: https://www.ted.com/talks/gaspard_koenig_do_we_really_own_our_bodies?language=en%0A
Page: [||||||||||----------------------------------------] 21.34% - 929/4353Failed to find the query: https://www.ted.com/talks/sarah_montana_why_forgiveness_is_worth_it?language=en%0A
Page: [||||||||||||||||||||------------------------------] 40.45% - 1764/4361Failed to find the query: https://www.ted.com/talks/ahmad_m_hasnah_rethinking_education_and_celebrating_the_arabic_language?language=en%0A
Page: [|||||||||||||||||||||||||||||||||-----------------] 66.28% - 2901/4377Failed to find the query: https://www.ted.com/talks/james_orsulak_why_we_need_to_move_manufacturing_off_planet?language=en%0A
Page: [||||||||||||||||||||||||||||||||||----------------] 68.42% - 2996/4379Failed to find the query: https://www.ted.com/talks/megan_ming_francis_we_need_to_address_the_real_roots_of_racial_violence?language=en

In [11]:
save_video_data(f"{folder}/{json_file}", video_data)

## Get transcrips

In [26]:
def get_trans(id, fmt):
    #print(id, fmt)
    t = ""
    url = transcript_service.format(id, fmt)
    try:
        r = requests.get(url)
        if r.status_code != 429:
            t = r.text
    except requests.ConnectionError:
        pass
    return t

def save_trans(name, content):
    with open(name, "wt") as f:
        f.write(content)

In [28]:
video_data = load_video_data(f"{folder}/{json_file}")
count = 0
progress_bar(0, "Transcript", tot=len(video_data))
to_do = list(video_data.items())
for v_url, ((v_title, v_descr, v, v_id),) in to_do:
    name = f"{make_name(v_url)}.{fmt}"
    if not file_exists(f"{folder}/{name}"):
        t = get_trans(v_id, fmt)
        if t == "": # some error -> retry 
            to_do.append((v_url, [v_title, v_descr, v, v_id]))
        else:
            save_trans(f"{folder}/{name}", t)
    count += 1
    progress_bar(count, "Transcript", tot=len(to_do))

Transcript: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 4346/4346


## Get audio

In [None]:
def save_audio(id_, data_dict, talk_id, folder):
    """
    Args:
        id_: talk id
        data_dict: data_urls.json file
        talks_id: dataframe with talk ids. The index is the id
        folder: where to save the audio
    Returns:
        None
    """
    url = talk_id[id_]["ted"]
    v_url = data_dict[url][0][2].split("?")[0]    
    fmt = v_url.strip()[-3:] # get the format    
    if "youtube" in v_url:
        pass
    elif fmt not in ["mp4"]:
        print("Unknown format:", fmt)
    else:  
        name = make_name(url)+".wav"
        if not file_exists(f"{folder}/{name}"):
            #TODO            
            r = requests.get(video_url, timeout=60)                        
            if r.ok:
                # run ffmpeg, input from pipe, map only audio, on gpu 0, do not overwrite existing files
                cmd = ["ffmpeg", "-i", "pipe:", "-map", "a", "-gpu", "0", "-n", f"{folder}/{file_name}.wav"]
                proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) # output will be ignored
                proc.communicate(r.content)
            else:
                print("Request not ok on:", id_)
            except requests.Timeout:
                print("Timeout:", id_)
            except requests.ConnectionError: # e.g. invalid URL
                print("Connection error on:", id_)
            except requests.exceptions.MissingSchema:
                print("Missing schema:", id_)

## Get IDs

(Already done)

In [4]:
id_file = "talk_id.csv"
ted = "https://www.ted.com{}"

In [5]:
def extract_id(url):
    """Find the talk id given the url from TED.com"""        
    sc = 429
    talk_id = None
    status = True
    while sc == 429:
        r = requests.get(url)
        sc = r.status_code
        if r.ok:
            html = BeautifulSoup(r.text)
            query = html.find("script", attrs=script_query)
            if query is not None and len(query.contents) != 0:
                # print("Query found")
                span = json_extract.search(query.contents[0])            
                video_data = json.loads(query.contents[0][span.start(): span.end()])
                try:
                    talk_id = video_data["__INITIAL_DATA__"]["current_talk"]                    
                except KeyError as err:
                    print("KeyError:", err.args[0])
                    print(r.url)
                    status = False
                except TypeError:
                    print("Invalid query", r.url)                
                    status = False
            else:
                print("Failed to find the query:", r.url)
                status = False
        #else:
        #    print("Request not OK")
        if sc == 429:
            time.sleep(5)    
    return status, talk_id

In [6]:
video_data = {}
with open(f"{folder}/{json_file}") as f:
    video_data = json.load(f)
video_id = pd.DataFrame(columns=["ted", "id"])
if file_exists(id_file):
    video_id = pd.read_csv(id_file)
    video_id.drop('Unnamed: 0', axis=1, inplace=True)

In [14]:
tot = len(video_data)
i = 0
progress_bar(i, "ID", tot)
for url in video_data.keys():
    url = name_to_url(url, False)
    if url not in video_id.ted.values:
        status, id_ = extract_id(url)
        if status:
            video_id.loc[video_id.shape[0]] = [url, id_]
    i += 1
    progress_bar(i, "ID", tot)

ID: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 4346/4346


In [15]:
video_id

Unnamed: 0,ted,id
0,https://www.ted.com/talks/marc_kushner_why_the...,2183
1,https://www.ted.com/talks/kirk_sorensen_thoriu...,1324
2,https://www.ted.com/talks/jean_francois_bastin...,67748
3,https://www.ted.com/talks/christien_meindertsm...,960
4,https://www.ted.com/talks/srdja_popovic_how_to...,1294
...,...,...
4341,https://www.ted.com/talks/winnie_harlow_how_i_...,41333
4342,https://www.ted.com/talks/megan_campisi_and_pe...,24255
4343,https://www.ted.com/talks/alex_gendler_can_you...,46591
4344,https://www.ted.com/talks/rachel_kleinfeld_a_p...,55562


In [16]:
video_id.to_csv(id_file)

In [7]:
#import pickle
#with open("tmp_"+id_file, "rb") as f:
#    tmp = pickle.load(f)
#video_id = pd.DataFrame(tmp, columns=["ted", "id"])