In [None]:
# Mount Google Drive at /content/drive (Colab only) so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Work from the dataset folder: later cells use relative paths
# (batch_outputs*/, speaker*_cleaned.txt, data/, README.md) that resolve against this directory.
os.chdir("/content/drive/MyDrive/dissertation project/dissertation_note/synthesis_dataset")

In [None]:
# pip install datasets soundfile pandas tqdm pyarrow

import pathlib, re, json
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, Audio

# ---------------- Configuration ----------------
# Maps each directory of .wav files to the transcript file whose non-empty
# lines are paired, in order, with that directory's audio files.
# Batches 1 and 2 are commented out, so only batch 3 is processed.
BATCH_MAP = {
    #"batch_outputs1": "speaker1_cleaned.txt",
    #"batch_outputs2": "speaker2_cleaned.txt",
    "batch_outputs3": "speaker3_cleaned.txt",

}
# Directory that receives the generated Parquet file; created if it does not exist.
OUT_DIR = pathlib.Path("data")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# ----------- Helper: extract the numeric filename prefix -----------
num_rx = re.compile(r"^(\d+)_")


def sort_key(path: pathlib.Path):
    """Return the integer prefix of ``path.name`` for use as a sort key.

    The file name is expected to start with digits followed by an underscore
    (e.g. ``012_hello.wav`` -> 12). Names without such a prefix map to 0.
    """
    prefix = num_rx.match(path.name)
    if prefix is None:
        return 0
    return int(prefix.group(1))

# ---------------- Collect rows ----------------
# Pair every audio clip with its transcript line, batch by batch.
# The pairing is purely positional: the i-th audio file (in sorted order)
# is matched with the i-th non-empty transcript line.
rows = []
for batch_dir, txt_file in BATCH_MAP.items():
    wav_dir = pathlib.Path(batch_dir)
    txt_path = pathlib.Path(txt_file)
    # Raise real exceptions rather than `assert`, which is removed under `python -O`.
    if not wav_dir.is_dir():
        raise FileNotFoundError(f"未找到 {wav_dir}")
    if not txt_path.is_file():
        raise FileNotFoundError(f"未找到 {txt_path}")

    # 1) Audio files ordered by numeric prefix. The file name is a tie-breaker so
    #    that equal (or missing) prefixes do not fall back to filesystem glob order.
    wav_files = sorted(wav_dir.glob("*.wav"), key=lambda p: (sort_key(p), p.name))
    # 2) Transcript lines, stripped of surrounding whitespace, blank lines dropped.
    lines = [l.strip() for l in txt_path.read_text(encoding="utf-8").splitlines()
             if l.strip()]

    # A count mismatch means audio and text are out of step; abort instead of
    # letting zip() silently truncate to the shorter sequence.
    if len(wav_files) != len(lines):
        raise ValueError(f"{batch_dir}: 音频 {len(wav_files)} vs 文本 {len(lines)} 行数不符")

    for wav, sent in zip(wav_files, lines):
        rows.append({"audio": str(wav), "sentence": sent})

print("Total utterances merged:", len(rows))   # recorded run: 600 with only batch_outputs3 enabled

# -------------- Build the Dataset --------------
ds = Dataset.from_pandas(pd.DataFrame(rows), preserve_index=False)
ds = ds.cast_column("audio", Audio())          # no target sampling rate passed: keep the original rate

# -------------- Save -----------------------
# NOTE(review): the inspection output further down shows 'bytes': None for the
# audio column, i.e. the Parquet file holds file paths only, not audio data.
# Confirm that is intended before sharing it without the batch_outputs* folders.
parquet_path = OUT_DIR / "train.parquet"
ds.to_parquet(str(parquet_path))
print("✓ Saved to", parquet_path)

# -------------- README --------------------
# Dataset-card metadata: feature names/types and the size of the train split.
yaml = {
    "dataset_info": {
        "features": [
            {"name": "audio",    "dtype": "audio"},
            {"name": "sentence", "dtype": "string"}
        ],
        "splits": [{"name": "train", "num_examples": len(ds)}]
    }
}
# The metadata block must open and close with a line containing only `---`.
# JSON is valid YAML, so the dict is serialised with json.dumps and placed on
# its own line between the two delimiter lines.
pathlib.Path("README.md").write_text(
    f"---\n{json.dumps(yaml)}\n---\n", encoding="utf-8"
)
print("README.md written")


Total utterances merged: 600


Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

✓ Saved to data/train.parquet
README.md written


In [None]:
!pip install -U "datasets>=3.8"  "pyarrow>=8"


[31mERROR: Could not find a version that satisfies the requirement datasets>=3.8 (from versions: 0.0.9, 1.0.0, 1.0.1, 1.0.2, 1.1.0, 1.1.1, 1.1.2, 1.1.3, 1.2.0, 1.2.1, 1.3.0, 1.4.0, 1.4.1, 1.5.0, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.8.0, 1.9.0, 1.10.0, 1.10.1, 1.10.2, 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 1.13.2, 1.13.3, 1.14.0, 1.15.0, 1.15.1, 1.16.0, 1.16.1, 1.17.0, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 2.0.0, 2.1.0, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.3.2, 2.4.0, 2.5.0, 2.5.1, 2.5.2, 2.6.0, 2.6.1, 2.6.2, 2.7.0, 2.7.1, 2.8.0, 2.9.0, 2.10.0, 2.10.1, 2.11.0, 2.12.0, 2.13.0, 2.13.1, 2.13.2, 2.14.0, 2.14.1, 2.14.2, 2.14.3, 2.14.4, 2.14.5, 2.14.6, 2.14.7, 2.15.0, 2.16.0, 2.16.1, 2.17.0, 2.17.1, 2.18.0, 2.19.0, 2.19.1, 2.19.2, 2.20.0, 2.21.0, 3.0.0, 3.0.1, 3.0.2, 3.1.0, 3.2.0, 3.3.0, 3.3.1, 3.3.2, 3.4.0, 3.4.1, 3.5.0, 3.5.1, 3.6.0)[0m[31m
[0m[31mERROR: No matching distribution found for datasets>=3.8[0m[31m
[0m

In [None]:
import pandas as pd
import soundfile as sf

# Load the Parquet file into a DataFrame
df = pd.read_parquet("data/train.parquet")
print("共有样本数:", len(df))
print(df.columns)
# NOTE(review): iloc[1] is the row at position 1 (the second sample); use iloc[0] if the first one is wanted.
print(df.iloc[1])   # show that row's audio entry and sentence


共有样本数: 600
Index(['audio', 'sentence'], dtype='object')
audio       {'bytes': None, 'path': 'batch_outputs3/001_Th...
sentence                 the cat played the game after dinner
Name: 1, dtype: object


In [None]:
import soundfile as sf
from IPython.display import Audio

# Pick one row by position (here 299) and show its audio path and transcript.
row = df.iloc[299]
print("路径:", row["audio"]["path"])
print("句子:", row["sentence"])

# Read the waveform. soundfile returns shape (frames,) for mono files and
# (frames, channels) for multi-channel files; len() is the frame count in both cases.
waveform, sr = sf.read(row["audio"]["path"])
print("采样率:", sr, "| 长度:", len(waveform) / sr, "秒")

# Play the clip (Jupyter / Colab only). IPython's Audio expects multi-channel
# data as (channels, samples), so transpose; .T is a no-op on 1-D mono data.
Audio(waveform.T, rate=sr)


路径: batch_outputs3/299_The_cat_wrote_a_card_at_noon..wav
句子: the cat wrote a card at noon
采样率: 22050 | 长度: 2.426485260770975 秒


In [None]:
# Print the current working directory; the relative paths in this notebook resolve against it.
!pwd

/content/drive/MyDrive/dissertation project/dissertation_note/synthesis_dataset


In [1]:
import re

# Raw transcript to read (relative to the current working directory)
in_path = "Copy of speaker3.txt"
# Destination for the cleaned transcript
out_path = "speaker3_cleaned.txt"

# Normalise one transcript line: lowercase, keep only letters, digits and single spaces.
def clean(text):
    """Return ``text`` lowercased with punctuation removed and whitespace collapsed.

    Args:
        text: One raw transcript line (may include a trailing newline).

    Returns:
        The normalised string; empty if nothing but punctuation/whitespace was given.
    """
    text = text.lower()
    # `\w` also matches "_", so the underscore is listed explicitly to remove it
    # together with the rest of the punctuation.
    text = re.sub(r"[^\w\s]|_", "", text)
    text = re.sub(r"\s+", " ", text)      # collapse runs of whitespace (incl. newlines)
    return text.strip()

# Read, clean and write the transcript line by line.
with open(in_path, "r", encoding="utf-8") as fin, \
     open(out_path, "w", encoding="utf-8") as fout:
    for line in fin:
        cleaned = clean(line)
        # Skip blank lines, including lines that become empty once punctuation
        # is removed, so the output never contains empty lines.
        if cleaned:
            fout.write(cleaned + "\n")

print(f"已保存 cleaned 文件到: {out_path}")


FileNotFoundError: [Errno 2] No such file or directory: 'Copy of speaker3.txt'

In [None]:
# List the working directory to check which input/output files are present.
! ls

 batch_outputs1		'Copy of speaker2.txt'	 README.md
 batch_outputs2		'Copy of speaker3.txt'	 speaker1_cleaned.txt
 batch_outputs3		 data			 speaker2_cleaned.txt
'Copy of speaker1.txt'	 make_parquet.ipynb	 speaker3_cleaned.txt
