In [23]:
import os
import torchaudio
import pandas as pd
import transformers
from tqdm.notebook import tqdm
from fairseq.examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv

In [2]:
# Load the tokenizer from the local LLaMA-2 HF checkpoint directory.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in
# this notebook -- confirm whether this cell is still needed.
tokenizer_dir = "/data/user_data/siqiouya/runs/pretrained/llama-2-7b/hf"
tokenizer_kwargs = dict(
    padding_side="right",
    truncation=False,
    add_special_tokens=False,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_dir, **tokenizer_kwargs)

In [3]:
# Dataset directory: the per-split `<split>.tsv` manifests are read from here and
# every output of this notebook (TSV, .source/.target lists, wavs) is written under it.
root = '/data/user_data/siqiouya/dataset/must-c-v1.0/en-es'

In [4]:
# Names of the manifest splits to process; each one is loaded from `<root>/<split>.tsv`.
splits = ['tst-COMMON']

In [5]:
# Read every split's TSV manifest into a DataFrame, keyed by split name.
dfs = {
    split: load_df_from_tsv(os.path.join(root, '{}.tsv'.format(split)))
    for split in splits
}

In [10]:
def _chunk_to_record(chunk, src_lang, tgt_lang):
    """Turn the bookkeeping dict of one merged chunk into a manifest row."""
    return {
        "id": "ted_{}_{}".format(chunk["talk_id"], chunk["counter"]),
        "audio": "{}:{}:{}".format(chunk["path"], chunk["offset"], chunk["duration"]),
        "n_frames": chunk["duration"],
        "speaker": chunk["speaker"],
        "src_text": chunk["src_text"],
        "tgt_text": chunk["tgt_text"],
        "src_lang": src_lang,
        "tgt_lang": tgt_lang,
    }


def merge_segments(df, max_duration=30, sample_rate=16000, src_lang="en", tgt_lang="es"):
    """Merge consecutive segments of the same talk into chunks shorter than `max_duration`.

    Rows are ordered by (talk id, start offset), where the talk id is the second
    `_`-separated field of `id` and the offset is taken from the
    `path:offset:n_frames` value in `audio`. Walking the rows in that order, a
    segment is appended to the current chunk while it belongs to the same talk
    and the span from the chunk start to the segment end stays below
    `max_duration` seconds; otherwise the current chunk is emitted and a new one
    starts at this segment.

    Args:
        df: manifest with at least `id`, `audio`, `speaker`, `src_text`, `tgt_text`.
        max_duration: upper bound (exclusive) on a merged chunk, in seconds.
        sample_rate: frames per second used to convert frame counts to seconds.
        src_lang, tgt_lang: values written to the `src_lang` / `tgt_lang` columns.

    Returns:
        A new DataFrame with one row per merged chunk. Columns follow the input
        column order, followed by any generated column the input did not have.
        The input frame is not modified.
    """
    # rsplit keeps the path intact even if it contains ':'.
    sort_keys = [
        (int(seg_id.split('_')[1]), int(audio.rsplit(':', 2)[1]), pos)
        for pos, (seg_id, audio) in enumerate(zip(df['id'], df['audio']))
    ]
    order = [pos for _, _, pos in sorted(sort_keys)]
    # Positional reordering: does not rely on the frame having a default RangeIndex.
    df = df.iloc[order].reset_index(drop=True)

    records = []   # one dict per finished chunk; the frame is built once at the end
    chunk = None   # bookkeeping for the chunk currently being assembled
    rows = zip(df['id'], df['audio'], df['speaker'], df['src_text'], df['tgt_text'])
    for i, (seg_id, audio, speaker, src_text, tgt_text) in enumerate(tqdm(rows, total=len(df))):
        talk_id = seg_id.split('_')[1]
        path, start, n_frames = audio.rsplit(':', 2)
        start, n_frames = int(start), int(n_frames)
        end = start + n_frames

        same_talk = chunk is not None and talk_id == chunk["talk_id"]
        if same_talk and (end - chunk["offset"]) / sample_rate < max_duration:
            # Extend the current chunk. `max` guards against a segment that ends
            # before an earlier (overlapping) one, which must not shrink the chunk.
            chunk["duration"] = max(chunk["duration"], end - chunk["offset"])
            assert chunk["duration"] > 0, (i, chunk["offset"])
            chunk["src_text"] = chunk["src_text"] + ' ' + src_text
            chunk["tgt_text"] = chunk["tgt_text"] + ' ' + tgt_text
            continue

        # Talk changed or the chunk would reach max_duration: emit and start over.
        # Zero-length chunks are dropped, as before.
        if chunk is not None and chunk["duration"] > 0:
            records.append(_chunk_to_record(chunk, src_lang, tgt_lang))
        chunk = {
            "talk_id": talk_id,
            # Chunk numbering restarts at 0 for every talk.
            "counter": chunk["counter"] + 1 if same_talk else 0,
            "path": path,
            "speaker": speaker,
            "offset": start,
            "duration": n_frames,
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

    # Emit the trailing chunk.
    if chunk is not None and chunk["duration"] > 0:
        records.append(_chunk_to_record(chunk, src_lang, tgt_lang))

    generated = ["id", "audio", "n_frames", "speaker", "src_text", "tgt_text", "src_lang", "tgt_lang"]
    out_columns = list(df.columns) + [c for c in generated if c not in df.columns]
    return pd.DataFrame(records, columns=out_columns)


# Build the merged (<30 s) manifest for every split.
new_dfs = {}
for split in splits:
    new_dfs[split] = merge_segments(dfs[split], max_duration=30)

  0%|          | 0/2449 [00:00<?, ?it/s]

In [12]:
# Display the merged manifest for the tst-COMMON split.
new_dfs['tst-COMMON']

Unnamed: 0,id,audio,n_frames,speaker,src_text,tgt_text,src_lang,tgt_lang
0,ted_1096_0,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,398080,spk.1096,"Back in New York, I am the head of development...",En Nueva York soy responsable de desarrollo de...,en,es
1,ted_1096_1,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,404959,spk.1096,But still it was a real footrace against the o...,Pero aún así hubo una carrera a pie contra los...,en,es
2,ted_1096_2,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,462399,spk.1096,The other volunteer who had arrived just befor...,El otro voluntario que acababa de llegar antes...,en,es
3,ted_1096_3,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,264320,spk.1096,"Well, I was next. The captain waved me over. H...","Bueno, yo era el siguiente. El capitán me hizo...",en,es
4,ted_1096_4,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,224000,spk.1096,"So, not exactly what I was hoping for, but off...",No era exactamente lo que esperaba pero fui......,en,es
...,...,...,...,...,...,...,...,...
651,ted_1404_6,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,478560,spk.1404,Add this buzzer for some extra punch and you'v...,Añadimos este timbre para crear más impacto y ...,en,es
652,ted_1404_7,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,463521,spk.1404,We want to make every single interaction in th...,Queremos que todas las interacciones en el mun...,en,es
653,ted_1404_8,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,178719,spk.1404,"So for example, we've had designers with no ex...","Así, por ejemplo, los diseñadores sin experien...",en,es
654,ted_1404_9,/data/user_data/siqiouya/dataset/must-c-v1.0/e...,429921,spk.1404,A few weeks ago we took littleBits to RISD and...,Hace unas semanas llevamos littleBits a los di...,en,es


In [13]:
# Persist each merged manifest under `root` as `<split>_30s.tsv`.
for split in splits:
    merged_tsv_path = os.path.join(root, '{}_30s.tsv'.format(split))
    save_df_to_tsv(new_dfs[split], merged_tsv_path)

In [20]:
# Spot-check the first merged row of the first split.
# NOTE(review): the saved output of this cell shows an id starting with `30s_`,
# while the merge cell builds ids as `ted_<talk>_<n>`. Presumably a cell that is
# no longer in the notebook renamed them -- confirm, since the wav export below
# derives file names from `id`.
new_dfs[splits[0]].iloc[0]

id                                             30s_ted_1096_0
audio       /data/user_data/siqiouya/dataset/must-c-v1.0/e...
n_frames                                               398080
speaker                                              spk.1096
src_text    Back in New York, I am the head of development...
tgt_text    En Nueva York soy responsable de desarrollo de...
src_lang                                                   en
tgt_lang                                                   es
Name: 0, dtype: object

In [21]:
# Echo the dataset directory that the export below writes into.
root

'/data/user_data/siqiouya/dataset/must-c-v1.0/en-es'

In [24]:
# For every split, cut each merged chunk out of its source recording into its own
# wav file and write two line-aligned lists: `<split>_30s.source` (wav paths) and
# `<split>_30s.target` (target-language text).
# NOTE(review): wav file names come from the `id` column as it is in `new_dfs` at
# the time this cell runs; make sure it matches what downstream consumers expect.
for split in splits:
    df = new_dfs[split]

    # torchaudio.save does not create missing directories.
    wav_dir = os.path.join(root, 'data/{}/split'.format(split))
    os.makedirs(wav_dir, exist_ok=True)

    source_list = os.path.join(root, '{}_30s.source'.format(split))
    target_list = os.path.join(root, '{}_30s.target'.format(split))
    # Explicit UTF-8: the target text is Spanish and must not depend on the locale.
    with open(source_list, 'w', encoding='utf-8') as w_source, \
            open(target_list, 'w', encoding='utf-8') as w_target:
        rows = zip(df['id'], df['audio'], df['tgt_text'])
        for seg_id, audio, tgt_text in tqdm(rows, total=len(df)):
            path = os.path.join(wav_dir, '{}.wav'.format(seg_id))

            # `audio` is `path:offset:n_frames`; rsplit tolerates ':' in the path.
            ori_path, offset, duration = audio.rsplit(':', 2)
            wav, sample_rate = torchaudio.load(
                ori_path, frame_offset=int(offset), num_frames=int(duration)
            )
            # Keep the recording's own sample rate instead of assuming 16 kHz.
            torchaudio.save(path, wav, sample_rate)

            # List the chunk only once its wav exists on disk.
            w_source.write(path + '\n')
            w_target.write(tgt_text + '\n')

  0%|          | 0/656 [00:00<?, ?it/s]