In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import csv
import json

import numpy as np
import matplotlib.pyplot as plt

import sacrebleu
import soundfile as sf

import yaml
from tqdm.notebook import tqdm

def read_logs(path):
    """Load a JSON-lines log file.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped and the rest are parsed with ``json.loads``.

    Args:
        path: Path of the log file to read.

    Returns:
        A list holding one parsed JSON value per non-blank line, in file order.
    """
    with open(path, "r") as handle:
        stripped_lines = [raw.strip() for raw in handle.readlines()]
    return [json.loads(entry) for entry in stripped_lines if entry != ""]

def read_wav(wav_path):
    """Read one audio segment described by a ``path:offset:duration`` spec.

    Args:
        wav_path: String of the form ``<file path>:<offset>:<duration>``.
            ``offset`` and ``duration`` are integers forwarded to ``sf.read``
            as ``start`` and ``frames`` respectively.

    Returns:
        The ``(source, rate)`` pair returned by ``sf.read``.

    Raises:
        ValueError: If the spec has fewer than three ``:``-separated fields,
            or if offset/duration cannot be parsed as integers.
    """
    # Split from the right so that a file path which itself contains ':'
    # still parses; only the last two fields are offset and duration.
    audio_file, offset, duration = wav_path.rsplit(':', 2)
    source, rate = sf.read(audio_file, start=int(offset), frames=int(duration))
    return source, rate

def read_tsv(tsv_path):
    """Read a UTF-8, tab-separated file with a header row.

    Quoting is disabled (``csv.QUOTE_NONE``), so quote characters in fields
    are returned literally.

    Args:
        tsv_path: Path of the TSV file to read.

    Returns:
        A list with one plain ``dict`` per data row, keyed by the header
        columns.
    """
    dialect = dict(
        delimiter="\t",
        quotechar=None,
        doublequote=False,
        lineterminator="\n",
        quoting=csv.QUOTE_NONE,
    )
    with open(tsv_path, encoding='utf-8') as tsv_file:
        return [dict(row) for row in csv.DictReader(tsv_file, **dialect)]

def write_tsv(samples, tsv_path):
    """Write a list of dicts to a UTF-8, tab-separated file with a header row.

    The column order is taken from the keys of the first sample. Quoting is
    disabled (``csv.QUOTE_NONE``).

    Args:
        samples: Sequence of dicts sharing the same keys. If it is empty, an
            empty file is written (there is no first sample to take the
            header from).
        tsv_path: Destination path; an existing file is overwritten.
    """
    # newline="" stops the text layer from translating the "\n" line
    # terminator, as the csv module documentation requires for csv files.
    with open(tsv_path, "w", encoding='utf-8', newline="") as w:
        if not samples:
            return
        writer = csv.DictWriter(
            w,
            list(samples[0].keys()),
            delimiter="\t",
            quotechar=None,
            doublequote=False,
            lineterminator="\n",
            quoting=csv.QUOTE_NONE,
        )
        writer.writeheader()
        writer.writerows(samples)

def play(audio_path):
    """Display an inline audio player for a ``path:offset:duration`` segment.

    Args:
        audio_path: Segment spec in the format accepted by ``read_wav``.
    """
    from IPython.display import display, Audio
    # Use the sample rate reported by the file rather than a fixed 16000 Hz,
    # so audio recorded at another rate is not played at the wrong speed.
    source, rate = read_wav(audio_path)
    display(Audio(source, rate=rate))

In [3]:
import os
import csv
import soundfile as sf
from tqdm.notebook import tqdm

In [5]:
# Choose the split and load its manifest TSV into `samples`: a list with one
# dict per data row, keyed by the header columns. Quoting is disabled, so
# fields are taken literally.
base_split = 'dev'
split = 'dev_st_zh_traj_30_gpt-4o-mini-2024-07-18_fa_traj'
tsv_path = f"/compute/babel-14-5/siqiouya/en-zh/{split}.tsv"
with open(tsv_path, encoding='utf-8') as manifest:
    samples = list(
        map(
            dict,
            csv.DictReader(
                manifest,
                delimiter="\t",
                quotechar=None,
                doublequote=False,
                lineterminator="\n",
                quoting=csv.QUOTE_NONE,
            ),
        )
    )

In [6]:
# Export each sample's audio segment as its own WAV file named after the
# sample id.
wav_dir = f"/compute/babel-14-5/siqiouya/en-zh/data/{base_split}/wav_split_30/"
# exist_ok=True keeps this cell re-runnable; without it a second run raises
# FileExistsError because the directory was created by the first run.
os.makedirs(wav_dir, exist_ok=True)
for x in tqdm(samples):
    # x['audio'] is "path:offset:n_frame"; split from the right so a ':' in
    # the path itself does not break the unpacking.
    path, offset, n_frame = x['audio'].rsplit(':', 2)
    wav, sr = sf.read(path, frames=int(n_frame), start=int(offset))
    sf.write(f"{wav_dir}{x['id']}.wav", wav, sr)

  0%|          | 0/381 [00:00<?, ?it/s]

In [7]:
# Split `samples` into `n_partition` contiguous chunks and, for each chunk,
# write a `.source.<idx>` file (one WAV path per line) and a `.target.<idx>`
# file (one target text per line).
n_partition = 8
# Ceiling division, so every sample lands in some partition; the last
# partitions may be shorter (or empty) when the count is not a multiple.
partition_size = (len(samples) + n_partition - 1) // n_partition
# Iterate over n_partition rather than a literal 8, so changing the constant
# above changes the number of files written as well as the chunk size.
for idx in range(n_partition):
    # The manifest was read as UTF-8; write with the same encoding instead of
    # relying on the platform default for the target text.
    with open(f"/compute/babel-14-5/siqiouya/en-zh/{split}.source.{idx}", "w", encoding="utf-8") as w_src, \
         open(f"/compute/babel-14-5/siqiouya/en-zh/{split}.target.{idx}", "w", encoding="utf-8") as w_tgt:
        for x in samples[idx * partition_size:(idx + 1) * partition_size]:
            w_src.write(f"/compute/babel-14-5/siqiouya/en-zh/data/{base_split}/wav_split_30/{x['id']}.wav" + "\n")
            w_tgt.write(x["tgt_text"] + "\n")