In [1]:
import os
from datasets import load_dataset
import soundfile as sf

# Config language và split
src_lang = "en_us"
tgt_lang = "vi_vn"   # English (US)
split = "validation"  # Đổi thành "train" khi muốn full

# Load dataset
src_dataset = load_dataset("google/fleurs", src_lang, split=split)
tgt_dataset = load_dataset("google/fleurs", tgt_lang, split=split)

# Thư mục lưu
data_dir = "./fleurs_data"
src_dir = os.path.join(data_dir, "src")
tgt_dir = os.path.join(data_dir, "tgt")
os.makedirs(src_dir, exist_ok=True)
os.makedirs(tgt_dir, exist_ok=True)

# Lưu audio (16kHz sẵn)
for i in range(min(len(src_dataset), len(tgt_dataset))):
    # Source (Punjabi)
    audio_src = src_dataset[i]["audio"]["array"]
    sr = src_dataset[i]["audio"]["sampling_rate"]  # 16000
    src_path = os.path.join(src_dir, f"{i:06d}.wav")
    sf.write(src_path, audio_src, sr)

    # Target (English)
    audio_tgt = tgt_dataset[i]["audio"]["array"]
    tgt_path = os.path.join(tgt_dir, f"{i:06d}.wav")
    sf.write(tgt_path, audio_tgt, sr)

print(f"Done! Đã lưu {min(len(src_dataset), len(tgt_dataset))} speech pairs vào {data_dir}")

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Done! Đã lưu 361 speech pairs vào ./fleurs_data


In [2]:
src_manifest = os.path.join(data_dir, f"{split}.src.tsv")
tgt_manifest = os.path.join(data_dir, f"{split}.tgt.tsv")

with open(src_manifest, "w") as f_src, open(tgt_manifest, "w") as f_tgt:
    f_src.write(src_dir + "\n")
    f_tgt.write(tgt_dir + "\n")

    for i in range(min(len(src_dataset), len(tgt_dataset))):
        filename = f"{i:06d}.wav"

        # Source
        n_samples_src = len(src_dataset[i]["audio"]["array"])
        f_src.write(f"{filename}\t{n_samples_src}\n")

        # Target
        n_samples_tgt = len(tgt_dataset[i]["audio"]["array"])
        f_tgt.write(f"{filename}\t{n_samples_tgt}\n")

print(f"mnaifest tạo xong: {src_manifest} và {tgt_manifest}")

mnaifest tạo xong: ./fleurs_data/validation.src.tsv và ./fleurs_data/validation.tgt.tsv


In [3]:
import os
import torch
import numpy as np
import joblib
from tqdm import tqdm
from transformers import HubertModel, Wav2Vec2FeatureExtractor
import torchaudio
from sklearn.cluster import MiniBatchKMeans

# Config
data_dir = "./fleurs_data"
tgt_dir = os.path.join(data_dir, "tgt")
unit_dir = os.path.join(data_dir, "unit")
os.makedirs(unit_dir, exist_ok=True)
split = "train"          # Dùng train để train kmeans
k_clusters = 1000        # Standard cho English

# Load HuBERT
model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def extract_features(audio_array):
    inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # Layer 9 (standard cho discrete units)
    hidden = outputs.hidden_states[9].squeeze(0).cpu().numpy()  # (time, dim)
    return hidden

# === 1. Collect features từ target train để train kmeans ===
print("Collecting features để train kmeans...")
all_features = []
for filename in tqdm(sorted(os.listdir(tgt_dir))):
    if filename.endswith(".wav"):
        waveform, sr = torchaudio.load(os.path.join(tgt_dir, filename))
        audio = waveform.squeeze(0).numpy()
        feats = extract_features(audio)
        all_features.append(feats)

all_features = np.concatenate(all_features, axis=0)
print(f"Total frames: {all_features.shape[0]}")

# === 2. Train kmeans ===
print("Training kmeans...")
kmeans = MiniBatchKMeans(n_clusters=k_clusters, batch_size=10000, random_state=0)
kmeans.fit(all_features)
joblib.dump(kmeans, os.path.join(data_dir, f"kmeans_{k_clusters}.joblib"))
print("kmeans saved!")

# === 3. Infer units + dedup cho tất cả file target ===
def get_dedup_units(audio_array):
    feats = extract_features(audio_array)
    units = kmeans.predict(feats)
    # Deduplicate consecutive same units (như paper)
    dedup = [units[0]]
    for u in units[1:]:
        if u != dedup[-1]:
            dedup.append(u)
    return dedup

print("Inferring units cho từng file...")
for filename in tqdm(sorted(os.listdir(tgt_dir))):
    if filename.endswith(".wav"):
        waveform, sr = torchaudio.load(os.path.join(tgt_dir, filename))
        audio = waveform.squeeze(0).numpy()
        units = get_dedup_units(audio)
        unit_path = os.path.join(unit_dir, filename.replace(".wav", ".unit"))
        with open(unit_path, "w") as f:
            f.write(" ".join(map(str, units)))

# === 4. Tạo unit manifest ===
unit_manifest = os.path.join(data_dir, f"{split}.tgt.unit.tsv")
with open(unit_manifest, "w") as f:
    f.write(os.path.abspath(unit_dir) + "\n")
    for filename in sorted(os.listdir(unit_dir)):
        if filename.endswith(".unit"):
            unit_path = os.path.join(unit_dir, filename)
            with open(unit_path) as uf:
                units = uf.read().strip().split()
            n_units = len(units)
            wav_name = filename.replace(".unit", ".wav")
            f.write(f"{wav_name}\t{n_units}\n")

print(f"Unit extraction xong! Manifest: {unit_manifest}")

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_v', 'encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Collecting features để train kmeans...


100%|██████████| 361/361 [00:56<00:00,  6.41it/s]


Total frames: 214616
Training kmeans...
kmeans saved!
Inferring units cho từng file...


100%|██████████| 361/361 [00:59<00:00,  6.06it/s]

Unit extraction xong! Manifest: ./fleurs_data/train.tgt.unit.tsv





In [5]:
import os
from datasets import load_dataset
import soundfile as sf
import torchaudio
import torch
import numpy as np
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from sklearn.cluster import MiniBatchKMeans
import joblib
from tqdm import tqdm

# Load kmeans từ train
kmeans = joblib.load("./fleurs_data/kmeans_1000.joblib")
# Config validation
split = "validation"
src_lang = "vi_vn"
tgt_lang = "en_us"

src_dataset = load_dataset("google/fleurs", src_lang, split=split)
tgt_dataset = load_dataset("google/fleurs", tgt_lang, split=split)

num_pairs = min(len(src_dataset), len(tgt_dataset))
print(f"Validation pairs: {num_pairs}")

# Thư mục validation
val_dir = "./fleurs_data/validation"
src_val_dir = os.path.join(val_dir, "src")
tgt_val_dir = os.path.join(val_dir, "tgt")
unit_val_dir = os.path.join(val_dir, "unit")  # Tạm, sau rename
os.makedirs(src_val_dir, exist_ok=True)
os.makedirs(tgt_val_dir, exist_ok=True)
os.makedirs(unit_val_dir, exist_ok=True)

# Lưu wav
for i in range(num_pairs):
    audio_src = src_dataset[i]["audio"]["array"]
    sr = src_dataset[i]["audio"]["sampling_rate"]
    src_path = os.path.join(src_val_dir, f"{i:06d}.wav")
    sf.write(src_path, audio_src, sr)

    audio_tgt = tgt_dataset[i]["audio"]["array"]
    tgt_path = os.path.join(tgt_val_dir, f"{i:06d}.wav")
    sf.write(tgt_path, audio_tgt, sr)

# Load HuBERT để extract features (giống trước)
model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def extract_features(audio_array):
    inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden = outputs.hidden_states[9].squeeze(0).cpu().numpy()
    return hidden

def get_dedup_units(audio_array):
    feats = extract_features(audio_array)
    units = kmeans.predict(feats)
    dedup = [units[0]]
    for u in units[1:]:
        if u != dedup[-1]:
            dedup.append(u)
    return dedup

# Infer units cho validation target
print("Inferring units cho validation...")
for filename in tqdm(sorted(os.listdir(tgt_val_dir))):
    if filename.endswith(".wav"):
        waveform, sr = torchaudio.load(os.path.join(tgt_val_dir, filename))
        audio = waveform.squeeze(0).numpy()
        units = get_dedup_units(audio)
        unit_path = os.path.join(unit_val_dir, filename.replace(".wav", ".unit"))
        with open(unit_path, "w") as f:
            f.write(" ".join(map(str, units)))

print("Validation data + units xong!")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Validation pairs: 361


Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_v', 'encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Inferring units cho validation...


100%|██████████| 361/361 [00:45<00:00,  7.86it/s]

Validation data + units xong!





In [6]:
import os
import shutil

# Cho train
shutil.move("./fleurs_data/unit", "./fleurs_data/du")
shutil.move("./fleurs_data/train.tgt.unit.tsv", "./fleurs_data/train.du.tsv")
for f in os.listdir("./fleurs_data/du"):
    if f.endswith(".unit"):
        os.rename(os.path.join("./fleurs_data/du", f), os.path.join("./fleurs_data/du", f.replace(".unit", ".du")))

# Cho validation (nếu có)
val_du_dir = "./fleurs_data/validation/du"
val_unit_dir = "./fleurs_data/validation/unit"
if os.path.exists(val_unit_dir):
    shutil.move(val_unit_dir, val_du_dir)
    # Tạo validation manifest .du.tsv tương tự bước 4 cũ
    val_manifest = "./fleurs_data/validation/valid.du.tsv"
    with open(val_manifest, "w") as f:
        f.write(os.path.abspath(val_du_dir) + "\n")
        for filename in sorted(os.listdir(val_du_dir)):
            if filename.endswith(".du"):
                with open(os.path.join(val_du_dir, filename)) as uf:
                    n_units = len(uf.read().strip().split())
                wav_name = filename.replace(".du", ".wav")
                f.write(f"{wav_name}\t{n_units}\n")


In [7]:
k_clusters = 500
dict_path = "./fleurs_data/dict.du.txt"

with open(dict_path, "w") as f:
    for i in range(k_clusters):
        f.write(f"{i} 100000\n")  # Fake count cao
    f.write("<unk> 0\n")

print("Dict tạo xong!")

Dict tạo xong!


In [14]:
!which python
!python --version
!python -c "import fairseq; print(fairseq.__version__)"


/home/khanh/miniconda3/envs/hubert/bin/python
Python 3.10.19
2026-01-16 18:13:05 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
0.12.2


In [36]:
ls


[0m[01;34mdata-bin[0m/    [01;34mdu[0m/                 [01;34msrc[0m/  train.du.tsv  validation.src.tsv
dict.du.txt  kmeans_1000.joblib  [01;34mtgt[0m/  [01;34mvalidation[0m/   validation.tgt.tsv


In [32]:
cd /home/khanh/Projects/KhoaLuan/src/fleurs_data


/home/khanh/Projects/KhoaLuan/src/fleurs_data


In [37]:
!cp validation.src.tsv train.src.tsv
!cp validation.tgt.tsv train.tgt.tsv


In [44]:
cd /home/khanh/Projects/KhoaLuan/fairseq

/home/khanh/Projects/KhoaLuan/fairseq


In [62]:
%cd ../src/fleurs_data

/home/khanh/Projects/KhoaLuan/src/fleurs_data


In [70]:
# Di chuyển vào thư mục
%cd /home/khanh/Projects/KhoaLuan/src/fleurs_data

# Tạo symlink (quan trọng nhất!)
!ln -sf train.src.tsv train.src
!ln -sf validation.src.tsv validation.src

# Làm thêm cho target (nên làm luôn)
!ln -sf train.du.tsv train.du
!ln -sf train.tgt.tsv train.tgt
!ln -sf validation.tgt.tsv validation.tgt

# Kiểm tra kết quả (phải thấy các link -> .tsv)
!ls -l | grep -E "train|validation"

/home/khanh/Projects/KhoaLuan/src/fleurs_data
lrwxrwxrwx 1 khanh khanh      12 Jan 16 19:11 train.du -> train.du.tsv
-rw-rw-r-- 1 khanh khanh    5466 Jan 16 17:58 train.du.tsv
lrwxrwxrwx 1 khanh khanh      13 Jan 16 19:11 train.src -> train.src.tsv
-rw-rw-r-- 1 khanh khanh    6461 Jan 16 18:32 train.src.tsv
lrwxrwxrwx 1 khanh khanh      13 Jan 16 19:11 train.tgt -> train.tgt.tsv
-rw-rw-r-- 1 khanh khanh    6494 Jan 16 18:32 train.tgt.tsv
drwxrwxr-x 5 khanh khanh    4096 Jan 16 18:03 validation
lrwxrwxrwx 1 khanh khanh      18 Jan 16 19:11 validation.src -> validation.src.tsv
-rw-rw-r-- 1 khanh khanh    6461 Jan 16 17:53 validation.src.tsv
lrwxrwxrwx 1 khanh khanh      18 Jan 16 19:11 validation.tgt -> validation.tgt.tsv
-rw-rw-r-- 1 khanh khanh    6494 Jan 16 17:53 validation.tgt.tsv


In [74]:
pwd


'/home/khanh/Projects/KhoaLuan/src/fleurs_data'

In [82]:
# Thử bỏ 2 argument không hỗ trợ
!cd ~/Projects/KhoaLuan/
!fairseq-preprocess \
  --user-dir ../fairseq/examples/speech_to_speech \
  --task speech_to_speech \
  --trainpref fleurs_data/train \
  --validpref fleurs_data/validation/validtion.src.tsv \
  --destdir fleurs_data/data-bin \
  --tgtdict fleurs_data/dict.du.txt \
  --workers 4

2026-01-16 19:46:09 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2026-01-16 19:46:09 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir='../fairseq/examples/speech_to_speech', empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_

In [86]:
cd ..

/home/khanh/Projects/KhoaLuan/src


In [87]:

!pwd
# 4. Chạy lệnh với đường dẫn đúng
!fairseq-preprocess \
  --user-dir ../fairseq/examples/speech_to_speech \
  --task speech_to_speech \
  --trainpref fleurs_data/train \
  --validpref fleurs_data/valid \
  --destdir fleurs_data/data-bin \
  --tgtdict fleurs_data/dict.du.txt \
  --workers 4

/home/khanh/Projects/KhoaLuan/src
2026-01-16 19:47:24 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2026-01-16 19:47:24 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir='../fairseq/examples/speech_to_speech', empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_pat

In [88]:
# 1. Trích xuất cột Units từ file TSV (Giả sử cột 2 là units)
!cut -f2 fleurs_data/train.du.tsv > fleurs_data/train.txt
!cut -f2 fleurs_data/validation.tgt.tsv > fleurs_data/valid.txt

# Kiểm tra xem file mới tạo có nội dung không
!head -n 3 fleurs_data/train.txt

/home/khanh/Projects/KhoaLuan/src/fleurs_data/unit
436
371


In [89]:
# Xóa thư mục data-bin cũ nếu bị lỗi
!rm -rf fleurs_data/data-bin

# Chạy lệnh preprocess chuẩn cho Units
!fairseq-preprocess \
  --user-dir ../fairseq/examples/speech_to_speech \
  --source-lang txt \
  --trainpref fleurs_data/train \
  --validpref fleurs_data/valid \
  --destdir fleurs_data/data-bin \
  --srcdict fleurs_data/dict.du.txt \
  --workers 4

2026-01-16 19:48:57 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2026-01-16 19:48:57 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir='../fairseq/examples/speech_to_speech', empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_

In [90]:
cp fleurs_data/dict.du.txt fleurs_data/dict.du.txt.bak

In [92]:
# Xóa dòng bắt đầu bằng <unk>
!sed -i '/^<unk>/d' fleurs_data/dict.du.txt

# Xóa dòng bắt đầu bằng <s>
!sed -i '/^<s>/d' fleurs_data/dict.du.txt

# Xóa dòng bắt đầu bằng </s>
!sed -i '/^<\/s>/d' fleurs_data/dict.du.txt

# Xóa dòng bắt đầu bằng <pad>
!sed -i '/^<pad>/d' fleurs_data/dict.du.txt

In [94]:
!head -n 5 fleurs_data/dict.du.txt

0 100000
1 100000
2 100000
3 100000
4 100000


In [None]:
import os

# 1. Lấy đường dẫn tuyệt đối
cwd = os.getcwd()
train_pref = os.path.join(cwd, "fleurs_data/train")
valid_pref = os.path.join(cwd, "fleurs_data/valid")
dest_dir = os.path.join(cwd, "fleurs_data/data-bin")
dict_path = os.path.join(cwd, "fleurs_data/dict.du.txt")

# 2. Kiểm tra file input có tồn tại không trước khi chạy
if not os.path.exists(train_pref + ".txt"):
    print(f"LỖI: Không tìm thấy file {train_pref}.txt")
    print("Đang tạo lại file từ train.du.tsv...")
    !cut -f2 fleurs_data/train.du.tsv > fleurs_data/train.txt
    !cut -f2 fleurs_data/validation.tgt.tsv > fleurs_data/valid.txt

# 3. lệnh Preprocess 
!rm -rf {dest_dir}  
!fairseq-preprocess \
  --source-lang txt \
  --trainpref {train_pref} \
  --validpref {valid_pref} \
  --destdir {dest_dir} \
  --srcdict {dict_path} \
  --only-source \
  --workers 4

print("Kiểm tra kết quả:")
!ls -l {dest_dir}

2026-01-16 20:41:27 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2026-01-16 20:41:28 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None

In [101]:
# Xóa symlink hỏng
!rm fleurs_data/train fleurs_data/valid

# Xóa thư mục data-bin cũ nếu có
!rm -rf fleurs_data/data-bin

rm: cannot remove 'fleurs_data/train': No such file or directory
rm: cannot remove 'fleurs_data/valid': No such file or directory
