# Dataset integration

In [1]:
import sys
sys.path.append("../")
sys.path.append("../../") # to import tools

import pandas as pd
import numpy as np
import json
import yaml
from tools import file_exists, name_to_url, make_name, drop_transcript, progress_bar
import pysrt
import glob
import re
from sklearn.model_selection import train_test_split



## Joining the talk IDs

In [2]:
amara_file = "Amara/talk_id.csv"
amara_to_ted = "Amara/amara_to_ted.csv"
ted_file = "TED/talk_id.csv"
must_file = "/media/gianluca/Backups/en-cs/docs/PreservedTalkIds.txt"
must_dev = "/media/gianluca/Backups/en-cs/data/dev"
must_train = "/media/gianluca/Backups/en-cs/data/train"
must_test_1 = "/media/gianluca/Backups/en-cs/data/tst-HE"
must_test_2 = "/media/gianluca/Backups/en-cs/data/tst-COMMON"

ted_data = "TED/Data/data_urls.json"
amara_base = "/media/gianluca/RASPPI/Backup - Computer/DKE/Thesis_data/Amara"
amara_urls = "data_urls.json"
amara_folders = ["TED", "TEDx", "TED-ED", "TED-Series", "TED-Translator"]

dataset = "integrated_data.csv"

train_ratio = 0.85

In [3]:
def find_id_from_folder(folder):
    """
    Find all the files like "ted_1.wav" and return the list of ids
    """
    if folder[-1] != "/":
        folder = folder + "/"
    l = glob.glob(f"{folder}ted_*.wav") # list of paths
    l = [i.split("/")[-1] for i in l]  # list of file names
    r = re.compile("[0-9]+")  # re to find numbers
    s = [r.search(i) for i in l] # results of the re
    return [int(i[j.start():j.end()]) for i,j in zip(l, s)]  # for each name extract the number

def get_amara_ids(file):
    if file_exists(file):
        df = pd.read_csv(file)
        #print("--Entries:", len(set(df.amara.tolist())))
        return df.id.dropna().tolist()
    else:
        print("Amara: cannot find id file")
        return []

def count_amara_no_id(id_file, url_list):
    if file_exists(id_file) and file_exists(url_list):
        all_vid = set(pd.read_csv(url_list).amara.tolist())
        id_vid = set(pd.read_csv(id_file).dropna().amara.tolist())
        return len(all_vid-id_vid)
    else:
        return 0
def get_ted_ids(file):
    if file_exists(file):
        df = pd.read_csv(file)
        #print("--Entries:", len(set(df.ted.tolist())))
        return df.id.tolist()
    else:
        print("TED: cannot find id file")
        return []

def get_must_ids(*args):
    #if file_exists(file):
    #    lines = []
    #    with open(file) as f:
    #        lines = f.readlines()
    #    lines = [int(i) for i in lines]
    #    return lines
    #else:
    #    print(f"MUST-C: cannot find id file ({file})")
    #    return []
    dev = set(find_id_from_folder(must_dev + "/wav/"))
    train = set(find_id_from_folder(must_train + "/wav/"))
    t1 = set(find_id_from_folder(must_test_1 + "/wav/"))
    t2 = set(find_id_from_folder(must_test_2 + "/wav/"))
    names = ["dev", "train", "test1", "test2"]
    tmp = [dev, train, t1, t2]
    for i in range(len(names)):
        j = i+1
        while j < len(names):
            intersection = tmp[i].intersection(tmp[j])
            if len(intersection) != 0:
                print(f"MUST-C: intersection between {tmp[i]} and {tmp[j]}")
                print(intersection)
            j += 1
    return list(set.union(*tmp))

In [4]:
ted = get_ted_ids(ted_file)
print(f"TED: {len(ted)} ids")
ted = set(ted)
print(f"-- {len(ted)} unique ids")


amara = get_amara_ids(amara_file)
print(f"Amara: {len(amara)} ids")
amara = set(amara)
print(f"-- {len(amara)} unique ids")
print(f"-- Missing ids: {count_amara_no_id(amara_file, amara_to_ted)}")


must = get_must_ids(must_file)
print(f"MUST-C: {len(must)} ids")
must = set(must)
print(f"-- {len(must)} unique ids")


all_ = ted.union(amara).union(must)

TED: 4346 ids
-- 4346 unique ids
Amara: 3948 ids
-- 3664 unique ids
-- Missing ids: 7948
MUST-C: 1302 ids
-- 1302 unique ids


In [5]:
# video id - sources - has transcript (should be all) - has title and description - is a ted talk? - is a tedx talk?
df = pd.DataFrame(columns=["id", "ted", "amara", "must_c", "transcript", "title_descr", "audio", "is_ted", "is_tedx", "train", "test", "drop"])
df.id = list(all_)
df.sort_values("id", inplace=True)
df.id = df.id.astype(int)
df.ted = np.isin(df.id, list(ted))
df.amara = np.isin(df.id, list(amara))
df.must_c = np.isin(df.id, list(must))
df.set_index("id", inplace=True)

df.title_descr = df.amara | df.ted
#for i in "transcript", "audio", "is_ted", "is_tedx", "train", "test", "drop":
#    df[i] = False

In [6]:
amara_ted = (df.amara & df.ted).sum() # talks that are in amara and ted
amara_ted_must = ((df.amara | df.ted) & df.must_c).sum() # talks that is scraped datasets and must-c
common = (df.amara & df.ted & df.must_c).sum() # talk in all the datasets

print("Talks in common between Amara and Ted:", amara_ted)
print("Talks in common between the datasets I extracted and MUST-C:", amara_ted_must)
print("Talk in common between all of them:", common)

df

Talks in common between Amara and Ted: 3661
Talks in common between the datasets I extracted and MUST-C: 1298
Talk in common between all of them: 1265


Unnamed: 0_level_0,ted,amara,must_c,transcript,title_descr,audio,is_ted,is_tedx,train,test,drop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,True,True,True,,True,,,,,,
2,True,True,False,,True,,,,,,
3,True,True,True,,True,,,,,,
4,True,True,True,,True,,,,,,
5,True,True,False,,True,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
70327,True,True,False,,True,,,,,,
70362,True,False,False,,True,,,,,,
70364,True,True,False,,True,,,,,,
70428,True,True,False,,True,,,,,,


In [7]:
# Find if it is ted
def get_ted_info(file, data_file):
    """For each talk get if it has a transcript and the audio recording. Returns if the transcript has to be dropped"""
    with open(data_file) as f:
        ted_data = json.load(f)
    
    with open(file) as f:
        ted_id = pd.read_csv(f)
    
    transcripts = []
    drop = []
    audio = []
    progress_bar(0, "TED.com", ted_id.shape[0])
    for i in range(ted_id.shape[0]):
        _, url, id_ = ted_id.iloc[i]
        name = make_name(url)
        
        if file_exists(f"TED/Data/{name}.srt"):
            transcripts.append(True)
            t = pysrt.open(f"TED/Data/{name}.srt")
            drop.append(drop_transcript(t.text))                        
        else:
            transcripts.append(False)
            drop.append(False)
        
        video = ted_data[name][0][3].split("?")[0]
        audio.append(video.endswith(".mp4") and "youtube" not in video)
        progress_bar(i+1, "TED.com", ted_id.shape[0])
    ret = pd.DataFrame(columns=["id", "transcript", "audio", "drop"])
    ret.id = ted_id.id
    ret.transcript = transcripts
    ret.audio = audio
    ret["drop"]= drop
    return ret

def get_amara_info(file, data_file, base_path, folders):
    """For each talk get if it has a transcript, an audio recording, is ted, is tedx and if it has to be discarded"""      
    with open(file) as f:
        amara_id = pd.read_csv(f)
    amara_id = amara_id[amara_id.id.notna()]
    transcripts = [False for i in range(amara_id.shape[0])]
    drop = [False for i in range(amara_id.shape[0])]
    audio = [False for i in range(amara_id.shape[0])]
    ted = [False for i in range(amara_id.shape[0])]
    tedx = [False for i in range(amara_id.shape[0])]
    
    progress_bar(0, "Amara.org", amara_id.shape[0])
    for i in range(amara_id.shape[0]):
        _, url, id_ = amara_id.iloc[i]
        name = make_name(url.replace(":", "_"))
        
        found = False # did we found the folder?
        for f in folders:
            if file_exists(f"{base_path}/{f}/{name}.srt"):
                found = True
                transcripts[i] = True
                t = pysrt.open(f"{base_path}/{f}/{name}.srt")
                drop[i] = drop_transcript(t.text)
            if file_exists(f"{base_path}/{f}/{name}.wav"):
                audio[i] = True
                found = True
                
            if found:
                if f == "TED":
                    ted[i] = True
                elif f == "TEDx":
                    tedx[i] = True
                break
        progress_bar(i+1, "Amara.org", amara_id.shape[0])
    
    ret = pd.DataFrame(columns=["id", "transcript", "audio", "drop", "is_ted", "is_tedx"])
    ret.id = amara_id.id
    ret.transcript = transcripts
    ret.audio = audio
    ret["drop"] = drop
    ret.is_ted = ted
    ret.is_tedx = tedx
    return ret

In [8]:
amara_tmp = get_amara_info(amara_file, amara_urls, amara_base, amara_folders)
ted_tmp = get_ted_info(ted_file, ted_data)
amara_tmp.set_index("id", inplace=True)
ted_tmp.set_index("id", inplace=True)

n = 0 
tot = len(amara_tmp)
ambiguous = 0
progress_bar(n, "Integrating Amara", tot)
for i in amara_tmp.index:
    if len(amara_tmp.loc[i].shape) > 1:
        #print("Ambiguous index:", i)
        ambiguous += 1
        df.loc[i,["transcript", "audio", "drop", "is_ted", "is_tedx"]] = [False, False, True, False, False]
    else:
        for j in ["transcript", "audio", "drop", "is_ted", "is_tedx"]:
            df.loc[i, j] = amara_tmp.loc[i,j]
    n += 1
    progress_bar(n, "Integrating Amara", tot)
print("Ambiguous entries:", ambiguous)
    
n = 0 
tot = len(ted_tmp)
progress_bar(n, "Integrating TEd", tot)
for i in ted_tmp.index:
    for j in ["transcript", "audio"]:
        if np.isnan(df.loc[i, j]):
            df.loc[i, j] = ted_tmp.loc[i, j]
        else:
            df.loc[i, j] = df.loc[i, j] or ted_tmp.loc[i, j]
    if np.isnan(df.loc[i, "drop"]):
        df.loc[i, "drop"] = ted_tmp.loc[i, "drop"]
    else:
        df.loc[i, "drop"] = ted_tmp.loc[i, "drop"] and df.loc[i, "drop"]
    n += 1
    progress_bar(n, "Integrating TEd", tot)

Amara.org: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 3948/3948
TED.com: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 4346/4346
Integrating Amara: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 3948/3948
Ambiguous entries: 495
Integrating TEd: [||||||||||||||||||||||||||||||||||||||||||||||||||] 100.00% - 4346/4346


In [9]:
mask = df["must_c"] & ~df["ted"] & ~df["amara"]
df.loc[mask,["transcript", "audio"]] = True  # all must-c have audio and transcript
#df.loc[mask, "audio"] = True
df.fillna({i:False for i in ["transcript", "title_descr", "audio", "is_ted", "is_tedx", "drop"]}, inplace=True)

In [10]:
df.to_csv(dataset)

## Split Test & Training

In [11]:
df = pd.read_csv(dataset, index_col="id")
df

Unnamed: 0_level_0,ted,amara,must_c,transcript,title_descr,audio,is_ted,is_tedx,train,test,drop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,True,True,True,True,True,True,True,False,,,False
2,True,True,False,True,True,True,True,False,,,False
3,True,True,True,True,True,True,True,False,,,False
4,True,True,True,True,True,True,True,False,,,False
5,True,True,False,True,True,True,True,False,,,False
...,...,...,...,...,...,...,...,...,...,...,...
70327,True,True,False,True,True,False,False,False,,,False
70362,True,False,False,True,True,False,False,False,,,False
70364,True,True,False,True,True,True,False,False,,,False
70428,True,True,False,True,True,True,False,False,,,False


In [12]:
print("TED with transcript:", df[df['is_ted']]['transcript'].sum())
print("TEDx with transcript:", df[df['is_tedx']]['transcript'].sum())

print("TED with audio:", df[df['is_ted']]['audio'].sum())
print("TEDx with audio:", df[df['is_tedx']]['audio'].sum())

print("TED with description:", df[df['is_ted']]['title_descr'].sum())
print("TEDx with description:", df[df['is_tedx']]['title_descr'].sum())

print("Total:", df.shape[0])

TED with transcript: 3146
TEDx with transcript: 31
TED with audio: 3146
TEDx with audio: 1
TED with description: 3146
TEDx with description: 31
Total: 4353


In [13]:
print("Audio and description:", (df["audio"] & df["title_descr"]).sum())
print("Drop:", df["drop"].sum())

Audio and description: 3423
Drop: 11


In [14]:
# Assign MUST-C
train = find_id_from_folder(must_train + "/wav/")
df.loc[train, ["train"]] = True
df.loc[train, ["test"]] = False
test = find_id_from_folder(must_test_1 + "/wav/")
df.loc[test, ["train"]] = False
df.loc[test, ["test"]] = True
test = find_id_from_folder(must_test_2 + "/wav/")
df.loc[test, ["train"]] = False
df.loc[test, ["test"]] = True

In [15]:
df

Unnamed: 0_level_0,ted,amara,must_c,transcript,title_descr,audio,is_ted,is_tedx,train,test,drop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,True,True,True,True,True,True,True,False,True,False,False
2,True,True,False,True,True,True,True,False,,,False
3,True,True,True,True,True,True,True,False,True,False,False
4,True,True,True,True,True,True,True,False,True,False,False
5,True,True,False,True,True,True,True,False,,,False
...,...,...,...,...,...,...,...,...,...,...,...
70327,True,True,False,True,True,False,False,False,,,False
70362,True,False,False,True,True,False,False,False,,,False
70364,True,True,False,True,True,True,False,False,,,False
70428,True,True,False,True,True,True,False,False,,,False


In [16]:
# assign the others
train, test = train_test_split(df[df["train"].isna() & df["test"].isna()].index, train_size=train_ratio, random_state=43)
df.loc[test, ["train"]] = False
df.loc[test, ["test"]] = True
df.loc[train, ["train"]] = True
df.loc[train, ["test"]] = False

In [17]:
# remove dev from test and training -> used for others stuff
dev = find_id_from_folder(must_dev + "/wav/")
df.loc[dev, ["train"]] = np.nan
df.loc[dev, ["test"]] = np.nan

In [18]:
df.to_csv(dataset)

## Inspect

In [19]:
df = pd.read_csv(dataset, index_col="id")
df

Unnamed: 0_level_0,ted,amara,must_c,transcript,title_descr,audio,is_ted,is_tedx,train,test,drop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,True,True,True,True,True,True,True,False,True,False,False
2,True,True,False,True,True,True,True,False,True,False,False
3,True,True,True,True,True,True,True,False,True,False,False
4,True,True,True,True,True,True,True,False,True,False,False
5,True,True,False,True,True,True,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
70327,True,True,False,True,True,False,False,False,True,False,False
70362,True,False,False,True,True,False,False,False,True,False,False
70364,True,True,False,True,True,True,False,False,False,True,False
70428,True,True,False,True,True,True,False,False,True,False,False


In [20]:
print("Train:", (df["train"]==True).sum())
print("Test:", (df["test"]==True).sum())
print("Other:", (df["test"].isna()).sum())

Train: 3848
Test: 495
Other: 10


## Add TEDx and other talks without id

In [75]:
def get_unknown_amara(talk_id_file, amara_base, amara_urls, amara_folders):
    """
    Args:
        talk_id_file: talk_id.csv for amara        
        folder: amara folders (e.g., TED, TED-Series, etc, ...)
    """
    df = pd.read_csv(talk_id_file)
    unk = df.set_index("id")
    unk = unk.loc[np.nan] # only those without id
    unk = unk.copy()
    datas = [json.load(open(f"{amara_base}/{i}/{amara_urls}", "r")) for i in amara_folders]
    columns = ["ted", "amara", "must_c", "transcript", "title_descr", "audio", "is_ted", "is_tedx", "train", "test", "drop"]
    out_df = pd.DataFrame(columns = ["id"] + columns).set_index("id")
    
    progress_bar(0, "Unknown", unk.shape[0])
    for i in range(unk.shape[0]):
        url = unk.iloc[i]["amara"]
        name = make_name(url,True)
        for f,d in zip(amara_folders, datas):            
            if url in d:
                path = f"{amara_base}/{f}/{name}.srt"
                transcript = False
                drop = True                
                if file_exists(path):                    
                    drop = drop_transcript(pysrt.open(path).text)
                    transcript = True
                out_df.loc[-i] = {i:j for i,j in zip(columns, [False, True, False, transcript, d[url][1] != "", False, f=="TED", f=="TEDx", np.nan, np.nan, drop])}                
        progress_bar(i+1, "Unknown", unk.shape[0])
    df.loc[df["id"].isna(), "id"] = [-i for i in range(unk.shape[0])]
    return out_df, df

In [86]:
unk, new_id = get_unknown_amara(amara_file, amara_base, amara_urls, ["TEDx"])

KeyError: nan

In [83]:
new_id[["amara", "id"]].to_csv(amara_file)

In [97]:
train, test = train_test_split(unk.index, train_size=train_ratio, random_state=43)
unk.loc[test, ["train"]] = False
unk.loc[test, ["test"]] = True
unk.loc[train, ["train"]] = True
unk.loc[train, ["test"]] = False
unk

Unnamed: 0_level_0,ted,amara,must_c,transcript,title_descr,audio,is_ted,is_tedx,train,test,drop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-301,False,True,False,True,True,False,False,True,True,False,False
-302,False,True,False,True,True,False,False,True,True,False,False
-303,False,True,False,True,True,False,False,True,True,False,False
-304,False,True,False,False,False,False,False,True,True,False,True
-305,False,True,False,True,True,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
-6835,False,True,False,False,False,False,False,True,True,False,True
-6836,False,True,False,True,True,False,False,True,False,True,False
-6837,False,True,False,True,True,False,False,True,True,False,False
-6838,False,True,False,True,True,False,False,True,True,False,False


In [98]:
df = pd.read_csv(dataset, index_col="id")
df = df.append(unk)

In [99]:
df.to_csv(dataset)