In [1]:
import os
import pandas as pd
import transformers
from tqdm.notebook import tqdm
from fairseq.examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv

[2024-04-29 15:46:54,513] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Tokenizer used further down to count source-text tokens when word/token
# alignments are shifted. The directory name suggests a LLaMA-2-7B checkpoint in
# Hugging Face format -- confirm against the actual files.
tokenizer_dir = "/mnt/taurus/data/xixu/llm/llama-2-7b/hf"
tokenizer_options = dict(
    padding_side="right",
    truncation=False,
    add_special_tokens=False,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_dir, **tokenizer_options)

In [3]:
# Dataset directory: every *_mfa*.tsv manifest in this notebook is read from and
# written to this folder.
root = '/mnt/taurus/data/xixu/datasets/must-c-v1.0/en-es'

In [4]:
# Read the per-utterance manifests for both splits, keyed by split name.
manifest_files = {'train': 'train_mfa.tsv', 'dev': 'dev_mfa.tsv'}
dfs = {
    split_name: load_df_from_tsv(os.path.join(root, file_name))
    for split_name, file_name in manifest_files.items()
}
train_df, dev_df = dfs['train'], dfs['dev']

In [5]:
def parse_alignment(cell):
    """Decode one serialized alignment cell into a list of (left, right) pairs.

    An empty string stands for "no alignment" and yields an empty list.
    """
    if cell == '':
        return []
    # NOTE(review): eval() executes whatever expression the TSV cell contains.
    # That is acceptable only while the manifests are produced by trusted tooling;
    # consider ast.literal_eval once the cell format is confirmed to be plain literals.
    return eval(cell)


def segment_to_record(seg):
    """Build the output manifest row for one accumulated segment."""
    return {
        "id": "ted_{}_{}".format(seg['ted_id'], seg['counter']),
        "audio": "{}:{}:{}".format(seg['path'], seg['offset'], seg['duration']),
        "n_frames": seg['duration'],
        "speaker": seg['speaker'],
        "src_text": seg['src_text'],
        "tgt_text": seg['tgt_text'],
        "src_lang": "en",
        "tgt_lang": "es",
        "speech_word": seg['speech_word'],
        "text_word": seg['text_word'],
    }


def merge_utterances(df, tokenizer, max_duration=30, sample_rate=16000):
    """Greedily merge consecutive utterances of the same talk into longer segments.

    Rows are ordered by (numeric talk id, audio offset). Walking through them, an
    utterance is appended to the running segment unless it belongs to another talk
    or appending it would make the segment span ``max_duration`` seconds or more;
    in either case the running segment is emitted and a new one starts.

    Args:
        df: manifest with at least the columns ``id`` (second ``_``-separated field
            is the talk id), ``audio`` (``path:offset:n_frames``), ``speaker``,
            ``src_text``, ``tgt_text``, ``speech_word`` and ``text_word``.
        tokenizer: callable returning an object with ``input_ids``; used to count
            source-text tokens so ``text_word`` spans can be shifted.
        max_duration: upper bound (exclusive) on the merged span, in seconds.
        sample_rate: units of ``offset``/``n_frames`` per second
            (assumes 16 kHz audio by default -- confirm for other corpora).

    Returns:
        A new DataFrame with one row per merged segment, using the input's column
        order (plus any record keys the input did not have).
    """
    # sorted() is stable, so rows with equal (talk id, offset) keep their order.
    talk_keys = [int(utt_id.split('_')[1]) for utt_id in df['id']]
    start_keys = [int(audio.split(':')[1]) for audio in df['audio']]
    order = sorted(range(len(df)), key=lambda k: (talk_keys[k], start_keys[k]))
    df = df.iloc[order].reset_index(drop=True)
    columns = list(df.columns)

    # Plain lists are much cheaper to index row by row than Series.
    ids = df['id'].tolist()
    audios = df['audio'].tolist()
    speakers = df['speaker'].tolist()
    src_texts = df['src_text'].tolist()
    tgt_texts = df['tgt_text'].tolist()
    speech_words = df['speech_word'].tolist()
    text_words = df['text_word'].tolist()

    records = []  # collected rows; the DataFrame is built once at the end
    seg = None    # state of the segment currently being accumulated

    for i in tqdm(range(len(df))):
        cur_ted_id = ids[i].split('_')[1]
        path, cur_offset, cur_n_frames = audios[i].split(':')
        cur_offset = int(cur_offset)
        cur_n_frames = int(cur_n_frames)

        same_talk = seg is not None and cur_ted_id == seg['ted_id']
        too_long = same_talk and (cur_offset + cur_n_frames - seg['offset']) / sample_rate >= max_duration

        if not same_talk or too_long:
            # Emit the finished segment. Its audio path and speaker were captured
            # when it started, so a talk boundary cannot leak the *next* talk's
            # path/speaker into the last segment of the previous talk.
            if seg is not None and seg['duration'] > 0:
                records.append(segment_to_record(seg))

            seg = {
                'ted_id': cur_ted_id,
                # Segment numbering restarts at 0 for every talk.
                'counter': seg['counter'] + 1 if same_talk else 0,
                'path': path,
                'speaker': speakers[i],
                'offset': cur_offset,
                'duration': cur_n_frames,
                'src_text': src_texts[i],
                'tgt_text': tgt_texts[i],
                'speech_word': parse_alignment(speech_words[i]),
                'text_word': parse_alignment(text_words[i]),
            }
        else:
            seg['duration'] = cur_offset + cur_n_frames - seg['offset']
            assert seg['duration'] > 0, (i, seg['offset'])
            seg['src_text'] = seg['src_text'] + ' ' + src_texts[i]
            seg['tgt_text'] = seg['tgt_text'] + ' ' + tgt_texts[i]

            # Speech spans are relative to their utterance start; shift them so
            # they become relative to the start of the merged segment.
            offset_diff = (cur_offset - seg['offset']) / sample_rate
            seg['speech_word'].extend(
                (l + offset_diff, r + offset_diff) for l, r in parse_alignment(speech_words[i])
            )

            # Text spans are shifted by the number of tokens that precede this
            # utterance in the merged source text.
            cur_n_token = len(tokenizer(src_texts[i], add_special_tokens=False).input_ids)
            n_token_diff = len(tokenizer(seg['src_text'], add_special_tokens=False).input_ids) - cur_n_token
            seg['text_word'].extend(
                (l + n_token_diff, r + n_token_diff) for l, r in parse_alignment(text_words[i])
            )

    # The last running segment has not been emitted yet.
    if seg is not None and seg['duration'] > 0:
        records.append(segment_to_record(seg))

    if not records:
        return pd.DataFrame(columns=columns)
    out_columns = columns + [key for key in records[0] if key not in columns]
    return pd.DataFrame(records, columns=out_columns)


new_dfs = {}
for split in ['train', 'dev']:
    new_dfs[split] = merge_utterances(dfs[split], tokenizer, max_duration=30)

  0%|          | 0/260041 [00:00<?, ?it/s]

  0%|          | 0/1312 [00:00<?, ?it/s]

In [7]:
# Persist the merged manifests for both splits inside the dataset directory.
for split_key, tsv_name in (('train', 'train_mfa_30s.tsv'), ('dev', 'dev_mfa_30s.tsv')):
    save_df_to_tsv(new_dfs[split_key], os.path.join(root, tsv_name))

In [9]:
# Tag merged-segment ids with '30s_' (presumably so they stay distinct from the
# original utterance ids once both manifests are concatenated).
# The prefix is only added when missing, so re-running this cell does not
# produce '30s_30s_...' ids.
for split in ['train', 'dev']:
    new_dfs[split]['id'] = [
        seg_id if seg_id.startswith('30s_') else '30s_' + seg_id
        for seg_id in new_dfs[split]['id']
    ]

In [11]:
# Re-read the original per-utterance manifests (the same two files as the first
# load cell) so they can be combined with the merged segments.
original_manifests = {'train': 'train_mfa.tsv', 'dev': 'dev_mfa.tsv'}
dfs = {
    split_name: load_df_from_tsv(os.path.join(root, file_name))
    for split_name, file_name in original_manifests.items()
}
train_df, dev_df = dfs['train'], dfs['dev']

In [12]:
# For each split, stack the original utterances on top of the merged segments
# and renumber the rows.
mix_dfs = {
    split_name: pd.concat([dfs[split_name], new_dfs[split_name]], ignore_index=True)
    for split_name in ('train', 'dev')
}

In [16]:
# Persist the combined (original + merged) manifests for both splits.
for split_key, tsv_name in (('train', 'train_mfa_30s_mix.tsv'), ('dev', 'dev_mfa_30s_mix.tsv')):
    save_df_to_tsv(mix_dfs[split_key], os.path.join(root, tsv_name))

In [29]:
def has_speech_alignment(cell):
    """True when the cell is neither None nor of length zero."""
    return cell is not None and len(cell) > 0


# Drop rows without a speech alignment and save the remainder per split.
# NOTE(review): rows loaded from TSV hold the alignment as text, so a cell
# containing the string "[]" has non-zero length and is kept -- verify whether
# the manifests ever store empty alignments that way.
for split in ['train', 'dev']:
    mix_df = mix_dfs[split]
    mask = list(map(has_speech_alignment, mix_df['speech_word']))
    filtered_path = os.path.join(root, '{}_mfa_30s_mix_filtered.tsv'.format(split))
    save_df_to_tsv(mix_df[mask], filtered_path)