# Get data from amara.org

Approach:
* start from the links with the list of TEDx, TED and TED-ED talks
* get the link of each individual videos
  * explore not only the first page
* get the link of the english subtitles
* get the link of the subtitle file (txt)
* get the description from the html of the page

## Setup

In [1]:
import sys
sys.path.append("../")
sys.path.append("../../") # to import tools
import os
import requests
from bs4 import BeautifulSoup
from tools import progress_bar, make_name, name_to_url, file_exists
import json
from datetime import datetime
import re
import subprocess
import pandas as pd
import time
import numpy as np

## Collect video and transcription URLs and 

In [2]:
amara = [ # list of video locations
    r"https://amara.org/en/teams/ted/videos/?q=&project=tedtalks&lang-mode=%2B&lang=en&sort=-time",  # TED -> last page missing
    r"https://amara.org/en/teams/ted/videos/?q=&project=tedxtalks&lang-mode=%2B&lang=en&sort=-time", # TEDx -> some video failed
    r"https://amara.org/en/teams/ted/videos/?q=&project=ted-ed&lang-mode=%2B&lang=en&sort=-time",    # TED-ED 
    r"https://amara.org/en/teams/ted/videos/?q=&project=otp-resources&lang-mode=%2B&lang=en&sort=-time#",  # TED-Translators -> done, there are duplicates
    r"https://amara.org/en/teams/ted/videos/?q=&project=ted-series&lang-mode=%2B&lang=en&sort=-time" # TED-series -> done
]
n_pages = [ # number of pages to explore
    218,
    932,
    88,
    3,
    4
]
starting_pages = [
    0,
    0,
    0,
    0,
    0
]
folders = [
    "TED",
    "TEDx",
    "TED-ED",
    "TED-Translator",
    "TED-Series"
]
json_file = "data_urls.json"
fail = "fail.json"
fail_dict = {}
site = "https://amara.org{}"
page_var = "&page={}" # append this to the link to get a specific page
# videos_per_page = 16  # how many videos each page -> used to estimate the total number of videos

In [3]:
def init_fail():
    """Json for failed requests"""
    d = {}
    try:
        with open(fail, "rt") as f:
            d = json.load(f)
    except FileNotFoundError:
        pass
    return d

def save_fail(fail_dict):
    """
    Save the dictionary of failed requests
    
    Key: url
    Values: [0 list of videos, 1 single video, folder]
    """
    with open(fail, "wt") as f:
        json.dump(fail_dict, f)

def retry_fail(fail_dict):
    """
    Try again on failed requests
    """
    url = ""
    folder = ""
    type_ = -1
    data_dicts = {} # folder -> json dict
    l = len(fail_dict)
    count = 0
    try:
        progress_bar(count, "Retry", tot=l)
        while len(fail_dict) != 0:
            url, (type_, folder) = fail_dict.popitem()
            if folder not in data_dicts:
                with open(f"{folder}/{json_file}", "rt") as f:
                    data_dicts[folder] = json.load(f)
            if type_ == 0:
                videos = get_videos(requests.get(url), folder)
                for video in videos:
                    try:
                        r = requests.get(video)
                        dict_data[r.url] = get_data(r, folder)
                    except requests.ConnectionError:
                        fail_dict[url] = [type_, folder]
            elif type_ == 1:
                try:
                    r = requests.get(url)
                    data_dicts[folder][r.url] = get_data(r, folder)
                except requests.ConnectionError:
                    l += 1
                    fail_dict[url] = [type_, folder]
            count += 1
            progress_bar(count, "Retry", tot=l)
    finally:
        if url != "":
            fail_dict[url] = [type_, folder]
        save_fail(fail_dict)
    return fail_dict

In [31]:
def get_videos(r, folder = ""):
    """
    Return the list of video URLs (still on amara)
    
    Args:
        r: request from the amara page with the list of videos
    Returns:
        list of URLs to the videos from the page
    """
    videos = []    
    if r.ok:
        # print("--Request OK")
        html = BeautifulSoup(r.text) # tree structure of the html page
        
        # find the element <ul class="videos listing group"> -> it contains the list of videos
        html = html.find_all("ul", attrs={"class":"videos listing group"})    
        
        # find the link to the videos        
        if len(html) != 0: # the result is not empty
            # print("--Got table of videos")
            html = html[0] # assume only one result
            link = html.find_all("a", attrs={"class":""}) # find all the links to the videos, some duplicates        
            # print("--Got", len(link), "links")
            videos = list(set([site.format(i.attrs["href"]) for i in link]))
    else:
        print("--Request NOT OK")
        fail_dict[r.url] = [0, folder]
    return videos

In [32]:
def find_video_link(r):
    """
    Find the link where to download the information
    """
    url = ""
    if r.ok:
        html = BeautifulSoup(r.text)
        langs = html.find_all("ul", attrs={"id": "subtitles-menu"})
        if len(langs) != 0:   # if there is some results
            langs = langs[0]  # take the list of rows
            langs = langs.find_all("a", text= " English [en]", limit=1) # English link
            if len(langs) != 0:
                url = site.format(langs[0].attrs["href"])
    return url

def find_transcript(urls, format = ".srt"):
    trans = ""
    i = 0
    while i < len(urls) and trans == "":
        if urls[i].strip().endswith(format):
            trans = site.format(urls[i])
        i += 1
    return trans

def get_data(r, folder=""):
    """
    Return the title, the description, the URL to the transcript and the URL to the video
    
    Args:
        r: request to the amara page of a single video
    Returns:
        Tuple of strings with (title, description, URL transcript, URL video)
    """
    url = find_video_link(r)
    video_url = r.url
    title = ""
    descr = ""
    video = ""
    trans = ""
    if url != "":
        r = requests.get(url)
        if r.ok: 
            html = BeautifulSoup(r.text)
            meta = html.find_all(attrs={"class":"metadata"}) # 0: title, 1: description, (2: speaker)
            # title
            try:
                title = meta[0].get_text().strip().split("\n", 2)[-1].strip() # Text include "Title: " -> strip, split on space, take last (actual title) and strip
            except IndexError:
                pass
            # description
            try:
                descr = meta[1].get_text().strip().split("\n", 2)[-1].strip() # Text include "Title: " -> strip, split on space, take last (actual title) and strip
            except IndexError:
                pass
            # video
            div = html.find("div", attrs={"class":"modal-body"})
            if div is not None:
                try:
                    div = BeautifulSoup(div.find_all("pre")[-1].get_text().strip()) # ther is a <div> with the URL of the video but it is as text
                    div = div.find("div")
                    if div is not None:
                        video = div.attrs["data-url"]
                except IndexError:
                    pass
            # transcript
            div = html.find("div", attrs={"class":"sort_button action"})
            if div is not None:
                urls = div.find_all("a")
                urls = [i.attrs["href"] for i in urls]
                trans = find_transcript(urls)
                if trans == "":
                    trans = find_transcript(urls, ".vtt")
                    if trans == "":
                        trans = find_transcript(urls, ".txt")                
        else:            
            fail_dict[video_url] = [1, folder]
    else:
        fail_dict[video_url] = [1, folder]
    return (title, descr, video, trans)

In [35]:
fail_dict = init_fail()
for i, (source, folder) in enumerate(zip(amara, folders)):
    dict_data = {}
    print("Source:", source)
    try:
        # check previous data
        try:
            with open(folder + "/" + json_file, "r") as f:
                dict_data = json.load(f)
        except json.JSONDecodeError:
            pass
        except FileNotFoundError:
            pass
        # get new data
        print(f"--starting job at {datetime.now()}")
        
        for page in range(starting_pages[i], n_pages[i] + 1):
            try: 
                r = requests.get(source + page_var.format(page))
                videos = get_videos(r, folder)
                for video in videos:
                    try:
                        r = requests.get(video)
                    except requests.ConnectionError:
                        fail_dict[video] = [1, folder]
                    dict_data[r.url] = get_data(r, folder)
                    print(f"----video at {datetime.now()}")                
                progress_bar(page, f"Source {folder}", tot=n_pages[i])
            except requests.ConnectionError:
                print("CONNECTION ERROR: skip")
        print("--done at {datetime.now()}")
    finally:
        with open(folder + "/" + json_file, "w") as f:
            json.dump(dict_data, f)
save_fail(fail_dict)

In [34]:
# doesn't work properly
while len(fail_dict) != 0:
    retry_fail(fail_dict)

Retry: [--------------------------------------------------] 0.46% - 7/1527

KeyboardInterrupt: 

## Download transcripts

The json file has the following structure: 

{\
    url: \[title, description, video_url, transcript_url\]\
}

The name of the files (audio and transcript) is the url. The \/ is substituted by __ \
The video_url must end with .mp4 or any other format. \
The transcrip_url must end with .srt, .vtt or .txt

In [3]:
def make_name(url):
    """Make the file name from the url"""
    return url.replace("/", "__")

In [14]:
def save_transcript(entry, folder):
    """Save the transcript"""
    url = entry[0] # used for the name
    trans_url = entry[1][3] # transcript
    fmt = trans_url.strip()[-3:] # get the format
    if len(trans_url) == 0:
        #print("Empty url")
        pass
    else:
        if fmt not in ["srt", "vtt", "txt"]:
            print("Unknown format:", fmt)
        try:
            r = requests.get(trans_url)
            if r.ok:
                file_name = url.replace("/", "__")
                try:
                    with open(f"{folder}/{file_name}.{fmt}", "xt") as f:
                        f.write(r.text)
                except FileExistsError:
                    #print("File already exists:", entry[0])
                    pass
            else:
                print("Request not ok on:", entry[0])
        except requests.ConnectionError: # e.g. invalid URL
            print("Connection error on:", entry[0])
        except requests.exceptions.MissingSchema:
            pass

In [15]:
for folder in folders:
    print("Folder:", folder)
    dict_data = {}
    with open(f"{folder}/{json_file}", "r") as f:
        dict_data = json.load(f)
    tot = len(dict_data)
    count = 0
    if tot != 0:
        progress_bar(count, f"Folder {folder}", tot=tot)
        for entry in dict_data.items():
            save_transcript(entry, folder)
            count += 1
            progress_bar(count, f"Folder {folder}", tot=tot)

Folder: TED-Translator
Folder TED-Translator: [||||||--------------------------------------------] 12.50% - 2/16

KeyboardInterrupt: 

## Download audio

Use the same approach used for the transcripts.
To extraxt the audio ffmpeg is used.

This command seems to work: ffmpeg -i pipe: -map a name_audio.wav

However, the subtitles are still in the audio file

In [3]:
def save_audio(entry, folder):
    """Save the audio"""
    url = entry[0] # used for the name
    video_url = entry[1][2] # transcript
    fmt = video_url.strip()[-3:] # get the format
    if len(video_url) == 0:
        #print("Empty url")
        pass
    elif "youtube" in video_url:
        #print("youtube video")
        pass
    elif fmt not in ["mp4"]:
        print("Unknown format:", fmt)
    else:  
        try:
            file_name = make_name(url) # TODO check if exists, if not download, otherwise skip
            if not os.path.isfile(f"{folder}/{file_name}.wav"):                
                r = requests.get(video_url, timeout=60)
                #print("Done")
                if r.ok:
                    # run ffmpeg, input from pipe, map only audio, on gpu 0, do not overwrite existing files
                    cmd = ["ffmpeg", "-i", "pipe:", "-map", "a", "-gpu", "0", "-n", f"{folder}/{file_name}.wav"]
                    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) # output will be ignored
                    proc.communicate(r.content)
                else:
                    print("Request not ok on:", entry[0])
        except requests.Timeout:
            print("Timeout")
        except requests.ConnectionError: # e.g. invalid URL
            print("Connection error on:", entry[0])
        except requests.exceptions.MissingSchema:
            pass        

In [5]:
for folder in folders:
    print("Folder:", folder)
    dict_data = {}
    with open(f"{folder}/{json_file}", "r") as f:
        dict_data = json.load(f)
    tot = len(dict_data)
    count = 0
    if tot != 0:
        progress_bar(count, f"Folder {folder}", tot=tot)
        for entry in dict_data.items():
            save_audio(entry, folder)
            count += 1
            progress_bar(count, f"Folder {folder}", tot=tot)

Folder: TED
Folder TED: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 3480/3480
Folder: TEDx
Folder TEDx: [|-------------------------------------------------] 3.84% - 302/7859Request not ok on: https://amara.org/en/videos/6dRtKYPAV34Z/info/how-to-use-creative-writing-to-bear-witness/
Folder TEDx: [|||||---------------------------------------------] 11.83% - 930/7859Request not ok on: https://amara.org/en/videos/p3luoRcj9HUm/info/how-uncomfortable-conversations-can-save-lives/
Folder TEDx: [||||||||------------------------------------------] 17.56% - 1380/7859Request not ok on: https://amara.org/en/videos/T2u54c2V8MFJ/info/can-a-spade-teach-something-to-an-iphone/
Folder TEDx: [||||||||------------------------------------------] 17.60% - 1383/7859Request not ok on: https://amara.org/en/videos/TDBlayswKYDE/info/discernment-in-the-era-of-fake-news/
Folder TEDx: [|||||||||||||||||||||||||||||||||||||-------------] 74.73% - 5873/7859

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Folder TEDx: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 7859/7859
Folder: TED-ED
Folder TED-ED: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 489/489


## Talk IDs

TED is no longer using Amara.org (since 31/12/2020) and Amara.org does not provide the talk id

So, search for the title on TED.com and find the id.

In [20]:
id_file = "talk_id.csv"
amara_to_ted = "amara_to_ted.csv"
data_folder = "/media/usb1/Backup - Computer/DKE/Thesis_data/Amara/{}"  # might change, add folder to get each set
search_url = "https://www.ted.com/search?q={}"
ted = "https://www.ted.com{}"
script_query = {"data-spec":"q"}
json_extract = re.compile("\{.*\}")

In [21]:
def clean_title(title):
    # ? and ! seems fine
    return title.strip().replace(" ", "+").replace("/", "+")

In [22]:
def search(title):
    """Search the title on TED.com and return the url of the page"""
    sc = 429 # too many requests 
    url = ""
    title = clean_title(title)
    try:
        while sc == 429:
            r = requests.get(search_url.format(title))
            sc = r.status_code
            if r.ok:
                html = BeautifulSoup(r.text)
                results = html.find_all(attrs={"class": "h7 m4"})
                if len(results) != 0:  # some results
                    result = results[0].find("a")
                    if result is not None:  # the result has a link
                        link = result.attrs.get("href")
                        if link is not None:  # the link has actually a url
                            url = ted.format(link)
            if sc == 429:
                time.sleep(5)
    except requests.TooManyRedirects:
        print("Too many redirects:", search_url.format(title))
    return url

def extract_id(url):
    """Find the talk id given the url from TED.com"""    
    #ret_data = []
    sc = 429
    talk_id = None
    status = True
    while sc == 429:
        r = requests.get(url)
        sc = r.status_code
        if r.ok:
            html = BeautifulSoup(r.text)
            query = html.find("script", attrs=script_query)
            if query is not None and len(query.contents) != 0:
                # print("Query found")
                span = json_extract.search(query.contents[0])            
                video_data = json.loads(query.contents[0][span.start(): span.end()])
                try:
                    video_data = video_data["__INITIAL_DATA__"]["talks"][0]
                    talk_id = video_data.get("id")                    
                    #if talk_id is None:
                    #    talk_id = video_data["talks"][0].get("id")
                except KeyError as err:
                    print("KeyError:", err.args[0])
                    print(r.url)
                    status = False
                except TypeError:
                    print("Invalid query", r.url)                
                    status = False
            else:
                print("Failed to find the query:", r.url)
                status = False
        #else:
        #    print("Request not OK")
        if sc == 429:
            time.sleep(5)    
    return status, talk_id

In [56]:
a2t = pd.DataFrame(columns=["amara", "ted"])
if file_exists(amara_to_ted):
    a2t = pd.read_csv(amara_to_ted)
    a2t.drop('Unnamed: 0', axis=1, inplace=True)

# get TED.com urls
for folder in folders:
    print("Folder:", folder)
    data = data_folder.format(folder) + "/" + json_file
    data_dict = {}
    with open(data) as f:
        data_dict = json.load(f)
    tot = len(data_dict)
    i = 0
    progress_bar(i, "Video", tot)
    for amara_url, video_data in data_dict.items():
        if amara_url not in a2t.amara.values:
            title = video_data[0]
            try:
                ted_url = search(title)
            except Exception as err:
                print(f"Error {type(err)}:", title)
                print(err.args[0])
            a2t.loc[a2t.shape[0]] = [amara_url, ted_url]
        i += 1
        progress_bar(i, "Video", tot)
    print()

Folder: TED
Video: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 3480/3480

Folder: TEDx
Video: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 7859/7859

Folder: TED-ED
Video: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 489/489

Folder: TED-Translator
Video: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 16/16

Folder: TED-Series
Video: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 52/52



In [57]:
a2t.to_csv(amara_to_ted)

In [53]:
tid = pd.DataFrame(columns=["amara", "id"])

if file_exists(id_file):
    tid = pd.read_csv(id_file)
    tid.drop('Unnamed: 0', axis=1, inplace=True)

tot = len(a2t)
i = 0
progress_bar(i, "Id", tot)
# TODO
for n, urls in a2t.iterrows():
    if urls.amara not in tid.amara.values:
        id_ = None
        try:
            if urls.ted == urls.ted \
            and not urls.ted.startswith(ted.format("/playlists")) \
            and not urls.ted.startswith(ted.format("/speakers")) \
            and not urls.ted.startswith(ted.format("/participate")) \
            and not urls.ted.startswith("https://www.ted.comhttps") and not urls.ted.startswith("https://www.ted.comtedx") and urls.ted != '':
                status, id_ = extract_id(urls.ted)                
        except Exception as err:
            print(f"Error {type(err)}:", urls.amara)
            print(err.args[0])
        if status:                
            tid.loc[tid.shape[0]] = [urls.amara, id_]            
    i += 1
    progress_bar(i, "Id", tot)    

Id: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 11896/11896


In [55]:
tid.to_csv(id_file)

In [54]:
tid

Unnamed: 0,amara,id
0,https://amara.org/en/videos/3mxy1kOnFhr4/info/...,364.0
1,https://amara.org/en/videos/ElAfn2xgEtKc/info/...,129.0
2,https://amara.org/en/videos/BzItzJ9Tigui/info/...,377.0
3,https://amara.org/en/videos/SRhk9JomSHTA/info/...,157.0
4,https://amara.org/en/videos/DEaTk2z2AmjO/info/...,394.0
...,...,...
10783,https://amara.org/en/videos/0DnYpBzgZ4h9/info/...,
10784,https://amara.org/en/videos/HWylMwsCEh7v/info/...,
10785,https://amara.org/en/videos/SYjsyT7qCU7s/info/...,
10786,https://amara.org/en/videos/WA8q6uJ2RQxC/info/...,


In [39]:
a2t.amara[a2t.amara == "https://amara.org/en/videos/Jnvr6WtxuYg2/info/dare-to-eat-insects-jakob-lewin-rukov-tedxebs/"].index[0]

5739

In [40]:
a2t.loc[5739]

amara    https://amara.org/en/videos/Jnvr6WtxuYg2/info/...
ted                   https://www.ted.comtedx/events/19848
Name: 5739, dtype: object

In [1]:
tid

NameError: name 'tid' is not defined