In [2]:
import os
from google.colab import userdata

GITHUB_USERNAME = "MUKAMAFrancois"
REPO_NAME = "tecGrwTechnical"
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
project_dir = f"/content/{REPO_NAME}"

if os.path.exists(project_dir):
    print("Updating project repo...")
    %cd {project_dir}
    !git pull
else:
    print("Cloning project repo...")
    repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
    !git clone {repo_url} {project_dir}


Cloning project repo...
Cloning into '/content/tecGrwTechnical'...
remote: Enumerating objects: 182, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 182 (delta 97), reused 130 (delta 45), pack-reused 0 (from 0)[K
Receiving objects: 100% (182/182), 356.63 KiB | 7.92 MiB/s, done.
Resolving deltas: 100% (97/97), done.


In [3]:
import os
# Work from the repository checkout for the rest of the notebook; later cells
# import from the `src` package and print paths relative to this directory.
repo_root = f"/content/{REPO_NAME}"
os.chdir(repo_root)
print("Working directory:", os.getcwd())

Working directory: /content/tecGrwTechnical


In [4]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.56,<5" "tokenizers>=0.22,<0.24" accelerate tqdm torchaudio datasets pyyaml pandas soundfile speechbrain


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
from pathlib import Path
from src.loader import load_config, get_hf_token
from src.analytics import run_speaker_analysis
from src.preprocess import run_preprocessing_pipeline

# Project configuration and Hugging Face token, both obtained via src.loader.
config = load_config()
token = get_hf_token()

# Run the speaker analysis first so its recommendation can be compared with the
# speaker chosen in the config.
print("Running speaker analytics before preprocessing...")
speaker_stats, best_speaker = run_speaker_analysis(config, token=token)
print("Recommended speaker from analytics:", best_speaker)
print(speaker_stats.head(10))

# The configured speaker wins; the analytics recommendation is only the fallback
# when SELECTED_SPEAKER_ID is absent. Both sides are coerced to int so that a
# float-valued recommendation compares equal to an integer id.
selected = int(config.get("SELECTED_SPEAKER_ID", best_speaker))
recommended = int(best_speaker)
if recommended != selected:
    print(f"WARNING: config SELECTED_SPEAKER_ID={selected}, analytics recommends {best_speaker}.")

def resolve_path(cfg, key):
    """Resolve the path stored under ``key`` in ``cfg``.

    The configured value is returned as a ``Path`` when it exists on disk.
    Otherwise its file name is re-rooted under ``cfg["PROCESSED_DIR"]`` (or the
    current directory when that entry is missing or empty). The fallback path is
    returned without checking whether it exists.

    Args:
        cfg: Config object supporting ``.get(key[, default])``.
        key: Name of the config entry that holds the path.

    Returns:
        A ``pathlib.Path``, or ``None`` when the entry is missing, ``None`` or a
        blank string.
    """
    raw = cfg.get(key)
    # A blank string would turn into Path("") == Path("."), which always exists
    # and would be handed back as if it were a real metadata path.
    if raw is None or (isinstance(raw, str) and not raw.strip()):
        return None

    candidate = Path(raw)
    if candidate.exists():
        return candidate

    # `or ""` also covers a PROCESSED_DIR key that is present but set to None,
    # which Path() would reject with a TypeError.
    processed_dir = cfg.get("PROCESSED_DIR") or ""
    return Path(processed_dir) / candidate.name


def metadata_non_empty(path_obj):
    """Tell whether ``path_obj`` points at an existing, non-empty regular file.

    Args:
        path_obj: A ``pathlib.Path`` or ``None``.

    Returns:
        ``True`` only when ``path_obj`` is a regular file whose size is greater
        than zero bytes; ``False`` for ``None``, missing paths, directories and
        empty files.
    """
    if path_obj is None:
        return False
    # is_file() instead of exists(): a directory also "exists" and can report a
    # non-zero st_size, which would wrongly count as usable metadata.
    return path_obj.is_file() and path_obj.stat().st_size > 0


# Locate the train/validation metadata files named in the config.
train_meta, val_meta = (
    resolve_path(config, "TRAIN_METADATA"),
    resolve_path(config, "VAL_METADATA"),
)

# Preprocessing only runs when at least one metadata file is absent or empty.
if not (metadata_non_empty(train_meta) and metadata_non_empty(val_meta)):
    print("Processed metadata missing or empty. Running preprocessing pipeline...")
    stats = run_preprocessing_pipeline(config, token)
    print("Preprocessing stats:", stats)
else:
    print(f"Processed metadata found: {train_meta} and {val_meta}")

# Resolve once more: resolve_path() checks the disk, so its answer can differ
# after the branch above has run.
train_meta = resolve_path(config, "TRAIN_METADATA")
val_meta = resolve_path(config, "VAL_METADATA")
print("TRAIN_METADATA:", train_meta)
print("VAL_METADATA:", val_meta)


Using HF token from Colab secrets
Running speaker analytics before preprocessing...


README.md:   0%|          | 0.00/714 [00:00<?, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/346M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/341M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/128M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4149 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/519 [00:00<?, ? examples/s]

Loaded train split: 4149 samples
  [1] iyunw ikinini kigoye kumira gyubanz unwe amazi mak
  [2] nibyo rwose gatera
  [3] ejo nibagiye kwishyura umuriro niukuza gukubi tugi
Loaded validation split: 519 samples
  [1] ese iz ingamba nshya zizaba zihageje mu guhangana 
  [2] barwiye mizamirimo bagomba kwandikisha ibigo byabo
  [3] inyungu ifatizo y'a bank nkuru igena ko mabanki y'
Loaded test split: 519 samples
  [1] urukiko rwa sanzwe burega afite imenyetso simusiga
  [2] nyuma y'iby'umweru bitatu amashuri yose y'inchuke 
  [3] abacuruzi b'ikawa bishimiye ko igiciro ku isoko mp


Analyzing speakers: 100%|██████████| 5187/5187 [00:57<00:00, 90.86it/s]


Recommended speaker from analytics: 1.0
   speaker_id  total_duration_sec  mean_duration_sec  median_duration_sec  \
0           1        15079.416000           8.231122                6.912   
1           2        13324.936063           7.815212                7.520   
2           3        11900.621000           7.212498                6.688   

   clip_count  total_duration_hr  
0        1832           4.188727  
1        1705           3.701371  
2        1650           3.305728  
Processed metadata missing or empty. Running preprocessing pipeline...
Using device: cuda
Loaded train split: 4149 samples
  [1] iyunw ikinini kigoye kumira gyubanz unwe amazi mak
  [2] nibyo rwose gatera
  [3] ejo nibagiye kwishyura umuriro niukuza gukubi tugi
Loaded validation split: 519 samples
  [1] ese iz ingamba nshya zizaba zihageje mu guhangana 
  [2] barwiye mizamirimo bagomba kwandikisha ibigo byabo
  [3] inyungu ifatizo y'a bank nkuru igena ko mabanki y'
Loaded test split: 519 samples
  [1] uruk

Processing train: 100%|██████████| 4149/4149 [01:21<00:00, 51.04it/s]


Preprocessing train: 1323 samples, 141 skipped


Processing validation: 100%|██████████| 519/519 [00:09<00:00, 53.56it/s]


Preprocessing validation: 152 samples, 14 skipped


Processing test: 100%|██████████| 519/519 [00:10<00:00, 48.74it/s]


Preprocessing test: 175 samples, 27 skipped

Preprocessing complete.
train: 1323 samples
validation: 152 samples
test: 175 samples
Preprocessing stats: {'train': 1323, 'validation': 152, 'test': 175}
TRAIN_METADATA: data/processed/metadata_train.csv
VAL_METADATA: data/processed/metadata_validation.csv


In [6]:
import torch
from src.training.speecht5_pipeline import (
    TTSDataCollatorWithPadding,
    build_processed_datasets,
    get_speaker_embedding,
    load_speecht5_components,
    load_train_val_datasets,
    print_preprocessed_batch_debug,
)

# Re-read the config (load_config is imported from src.loader in an earlier
# cell) and pick the GPU when one is available, otherwise fall back to the CPU.
config = load_config()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the train/validation splits described by the config and report their sizes.
train_ds, val_ds = load_train_val_datasets(config)
print("Train samples:", len(train_ds))
print("Val samples:", len(val_ds))

# Processor, model and vocoder come from the project helper; the speaker
# embedding is derived from the training split.
# NOTE(review): which checkpoints are loaded and how the embedding is computed
# is decided inside src.training.speecht5_pipeline, not in this notebook.
processor, model, vocoder = load_speecht5_components(device)
speaker_embedding = get_speaker_embedding(train_ds, device)

# Convert both splits into model-ready examples using the shared processor,
# model and speaker embedding.
train_proc, val_proc = build_processed_datasets(
    train_ds=train_ds,
    val_ds=val_ds,
    processor=processor,
    model=model,
    speaker_embedding=speaker_embedding,
)

# Build the batch collator and print a debug summary of one processed batch.
data_collator = TTSDataCollatorWithPadding(processor, model)
print_preprocessed_batch_debug(train_proc, data_collator)


Device: cuda
Train samples: 1323
Val samples: 152


preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/hyperparams.yaml' -> '/content/tecGrwTechnical/pretrained_spkrec/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


model.safetensors:   0%|          | 0.00/50.6M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_spkrec.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/embedding_model.ckpt' -> '/content/tecGrwTechnical/pretrained_spkrec/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tecGrwTechnical/pretrained_spkrec/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/mean_var_norm_emb.ckpt' -> '/content/tecGrwTechnical/pretrained_spkrec/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tecGrwTechnical/pretrained_spkrec/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/classifier.ckpt' -> '/content/tecGrwTechnical/pretrained_spkrec/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /content/tecGrwTechnical/pretrained_spkrec/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-xvect-voxceleb/snapshots/56895a2df401be4150a159f3a1c653f00051d477/label_encoder.txt' -> '/content/tecGrwTechnical/pretrained_spkrec/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /content/tecGrwTechnical/pretrained_spkrec/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /content/tecGrwTechnical/pretrained_spkrec/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /content/tecGrwTechnical/pretrained_spkrec/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -

Using SpeechBrain speaker embedding: (512,)


Map:   0%|          | 0/1323 [00:00<?, ? examples/s]

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

First processed sample checks:
  input_ids len: 20
  labels shape: (82, 80)
Batch sanity check:
  input_ids: (2, 181)
  attention_mask: (2, 181)
  labels: (2, 824, 80)
  speaker_embeddings: (2, 512)


In [7]:
from src.training.speecht5_pipeline import build_trainer_bundle, run_stagewise_training

# Assemble the trainer and everything it needs. `bundle` exposes at least
# `.output_dir` and `.trainer`, both of which are read below.
bundle = build_trainer_bundle(
    model=model,
    processor=processor,
    train_proc=train_proc,
    val_proc=val_proc,
    data_collator=data_collator,
    config=config,
    output_dir="speecht5_finetuned",
)
output_dir = bundle.output_dir
trainer = bundle.trainer

# Echo the settings that control best-checkpoint selection.
print("TrainingArguments metric_for_best_model:", trainer.args.metric_for_best_model)
print("TrainingArguments load_best_model_at_end:", trainer.args.load_best_model_at_end)
print("TrainingArguments save_strategy:", trainer.args.save_strategy)
# The attribute is named `eval_strategy` or `evaluation_strategy` depending on
# the installed transformers version, so probe for whichever one exists.
eval_attr = "eval_strategy" if hasattr(trainer.args, "eval_strategy") else "evaluation_strategy"
print(f"TrainingArguments {eval_attr}:", getattr(trainer.args, eval_attr))

# Evaluate once before training to learn which metric keys evaluate() reports.
print("Running a pre-train evaluate() sanity check...")
eval_metrics = trainer.evaluate()
print("Eval metrics keys:", sorted(eval_metrics.keys()))

# Best-checkpoint selection needs its metric in the evaluation output (with the
# "eval_" prefix). metric_for_best_model may be unset (None), in which case
# there is nothing to validate and calling .startswith() on it would raise
# AttributeError, so the check is skipped.
metric_name = trainer.args.metric_for_best_model
metric_key = None
if metric_name is not None:
    metric_key = metric_name if metric_name.startswith("eval_") else f"eval_{metric_name}"

if metric_key is not None and metric_key not in eval_metrics:
    print(f"WARNING: '{metric_key}' missing from eval metrics. Disabling load_best_model_at_end for this run.")
    trainer.args.load_best_model_at_end = False
    trainer.args.metric_for_best_model = None

# Run the staged training schedule defined by the project helper.
train_result = run_stagewise_training(bundle)


TrainingArguments metric_for_best_model: loss
TrainingArguments load_best_model_at_end: True
TrainingArguments save_strategy: SaveStrategy.EPOCH
TrainingArguments eval_strategy: IntervalStrategy.EPOCH
Running a pre-train evaluate() sanity check...


Eval metrics keys: ['eval_loss', 'eval_model_preparation_time', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second']

[Stage 1] training for 5.0 epochs at lr=0.00010000


Epoch,Training Loss,Validation Loss,Model Preparation Time
0,No log,0.703153,0.0049
1,0.534700,0.48265,0.0049
2,0.497100,0.439204,0.0049
3,0.477200,0.43142,0.0049
4,0.444300,0.403459,0.0049
5,0.428200,0.386565,0.0049





[Stage 2] training for 4.0 epochs at lr=0.00003000


Epoch,Training Loss,Validation Loss,Model Preparation Time
0,No log,0.387138,0.0049
1,0.412500,0.385786,0.0049
2,0.411900,0.384392,0.0049
3,0.418600,0.383633,0.0049
4,0.405700,0.378475,0.0049



[Stage 3] training for 3.0 epochs at lr=0.00001000


Epoch,Training Loss,Validation Loss,Model Preparation Time
0,No log,0.378136,0.0049
1,0.407500,0.379425,0.0049
2,0.403200,0.37749,0.0049
3,0.400300,0.378338,0.0049


In [8]:
from pathlib import Path
from IPython.display import Audio, display
from src.training.speecht5_inference import (
    configure_generation_for_latency,
    export_int8_deployment_package,
    export_final_model_package,
    get_directory_size_mb,
    load_finetuned_model,
    save_generation_config,
    synthesize_test_sentences,
)

# Sentences to synthesize with the fine-tuned model. The first five are the
# required evaluation sentences; the last three are additional sentences added
# by the notebook author.
required_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha.",
    # Author-supplied extra test sentences:
    "Dukora guheramu gitondo kugeza nimugoroba. Kuwa mbere kugeza kuwa gatanu, saa moya za mu gitondo kugeza saa kumi n'ebyiri z'umugoroba.",
    "Tubakirana urugwiro kandi twiteguye kugufasha mu buryo bwose dushoboye.",
    "Murakoze kandi turagushimira kuba waratuganye.",
]


# Prefer the best checkpoint recorded by the trainer; fall back to the training
# output directory when no best checkpoint was recorded.
best_ckpt = trainer.state.best_model_checkpoint or output_dir
print("Using checkpoint:", best_ckpt)
finetuned_model = load_finetuned_model(best_ckpt, device)
# Apply the generation settings (max_length=600) and save them next to the
# checkpoint that was just loaded.
configure_generation_for_latency(finetuned_model, max_length=600)
save_generation_config(finetuned_model, best_ckpt)

# Synthesize every sentence to evaluation/final_required_sentences.
# NOTE(review): the meaning of fast_maxlenratio / safe_maxlenratio /
# retry_for_completeness is defined in src.training.speecht5_inference; the
# values here are passed through unchanged.
audio_paths = synthesize_test_sentences(
    model=finetuned_model,
    processor=processor,
    vocoder=vocoder,
    speaker_embedding=speaker_embedding,
    sentences=required_sentences,
    output_dir="evaluation/final_required_sentences",
    device=device,
    sample_rate=16000,
    fast_maxlenratio=9.0,
    safe_maxlenratio=14.0,
    retry_for_completeness=True,
)

# Show each sentence with an inline audio player. zip() stops at the shorter
# sequence, so a length mismatch between sentences and audio files is not reported.
for text, audio_path in zip(required_sentences, audio_paths):
    print("Saved:", audio_path)
    print("Text:", text)
    display(Audio(filename=str(audio_path), autoplay=False))

# Export the INT8 deployment package and report its size on disk.
int8_dir = export_int8_deployment_package(
    finetuned_model,
    processor,
    "speecht5_int8_deployment",
)
int8_size_mb = get_directory_size_mb(int8_dir)

print(f"\nINT8 deployment package size: {int8_size_mb:.2f} MB")

Using checkpoint: speecht5_finetuned/checkpoint-332
Saved: evaluation/final_required_sentences/sentence_01.wav
Text: Muraho, nagufasha gute uyu munsi?


Saved: evaluation/final_required_sentences/sentence_02.wav
Text: Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.


Saved: evaluation/final_required_sentences/sentence_03.wav
Text: Ni ngombwa ko ubonana umuganga vuba.


Saved: evaluation/final_required_sentences/sentence_04.wav
Text: Twabanye nawe kandi tuzakomeza kukwitaho.


Saved: evaluation/final_required_sentences/sentence_05.wav
Text: Ushobora kuduhamagara igihe cyose ukeneye ubufasha.


Saved: evaluation/final_required_sentences/sentence_06.wav
Text: Dukora guheramu gitondo kugeza nimugoroba. Kuwa mbere kugeza kuwa gatanu, saa moya za mu gitondo kugeza saa kumi n'ebyiri z'umugoroba.


Saved: evaluation/final_required_sentences/sentence_07.wav
Text: Tubakirana urugwiro kandi twiteguye kugufasha mu buryo bwose dushoboye.


Saved: evaluation/final_required_sentences/sentence_08.wav
Text: Murakoze kandi turagushimira kuba waratuganye.



INT8 deployment package size: 142.50 MB


In [11]:
from src.training.speecht5_inference import measure_latency

def to_ten_words(text):
    """Return at most the first ten whitespace-separated words of ``text``.

    ``text`` is converted with ``str()`` first; the kept words are joined with
    single spaces, so leading, trailing and repeated whitespace is dropped.
    """
    kept = []
    for word in str(text).split():
        if len(kept) == 10:
            break
        kept.append(word)
    return " ".join(kept)

# Trim every required sentence to its first ten words and drop any that end up
# empty after trimming.
latency_sentences = [
    trimmed for trimmed in map(to_ten_words, required_sentences) if trimmed
]

print("Inference device:", device)
# Anything that is not a CUDA device (including objects without a `.type`) is
# treated as CPU for the purpose of this warning.
if getattr(device, "type", "cpu") != "cuda":
    print("WARNING: running on CPU will usually exceed the 800 ms latency target.")

print("Benchmarking with <=10-word sentences:")
for s in latency_sentences:
    print(f"- {s} ({len(s.split())} words)")

# Time synthesis for each trimmed sentence. The helper returns the per-sentence
# latencies and their mean, both in milliseconds as printed below.
latencies_ms, mean_ms = measure_latency(
    model=finetuned_model,
    processor=processor,
    vocoder=vocoder,
    speaker_embedding=speaker_embedding,
    sentences=latency_sentences,
    device=device,
    warmup_runs=2,
    add_leading_prompt=False,
    cache_inputs=True,
)

for ms, text in zip(latencies_ms, latency_sentences):
    print(f"Latency: {ms:.2f} ms | {text}")

print(f"Mean latency: {mean_ms:.2f} ms")

Inference device: cuda
Benchmarking with <=10-word sentences:
- Muraho, nagufasha gute uyu munsi? (5 words)
- Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha. (7 words)
- Ni ngombwa ko ubonana umuganga vuba. (6 words)
- Twabanye nawe kandi tuzakomeza kukwitaho. (5 words)
- Ushobora kuduhamagara igihe cyose ukeneye ubufasha. (6 words)
- Dukora guheramu gitondo kugeza nimugoroba. Kuwa mbere kugeza kuwa gatanu, (10 words)
- Tubakirana urugwiro kandi twiteguye kugufasha mu buryo bwose dushoboye. (9 words)
- Murakoze kandi turagushimira kuba waratuganye. (5 words)
Latency: 524.99 ms | Muraho, nagufasha gute uyu munsi?
Latency: 810.52 ms | Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.
Latency: 581.31 ms | Ni ngombwa ko ubonana umuganga vuba.
Latency: 1050.61 ms | Twabanye nawe kandi tuzakomeza kukwitaho.
Latency: 982.65 ms | Ushobora kuduhamagara igihe cyose ukeneye ubufasha.
Latency: 1415.40 ms | Dukora guheramu gitondo kugeza nimugoroba. Kuwa mbere kugeza kuwa gatanu,
La

In [10]:
from google.colab import files
import shutil
import torch
from pathlib import Path

def directory_size_mb(path):
    """Return the combined size of all regular files under ``path`` in MiB.

    The tree is walked recursively; only entries for which ``is_file()`` is true
    contribute. The result is a float (bytes divided by 1024 * 1024).
    """
    bytes_per_mb = 1024.0 * 1024.0
    total_bytes = 0
    for entry in Path(path).rglob("*"):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes / bytes_per_mb

# Export clean FP32 inference package
# Any previous export directory is removed first so the package only contains
# files written by this run. The directory is then zipped and offered as a
# browser download.
fp32_dir = Path("speecht5_fp32_infer")
if fp32_dir.exists():
    shutil.rmtree(fp32_dir)
export_final_model_package(finetuned_model, processor, fp32_dir)
fp32_zip = shutil.make_archive("speecht5_fp32_infer", "zip", root_dir=str(fp32_dir))
fp32_dir_size_mb = directory_size_mb(fp32_dir)
# NOTE(review): the zip size is computed here but never printed or used below.
fp32_zip_size_mb = Path(fp32_zip).stat().st_size / (1024.0 * 1024.0)
print(f"FP32 directory size: {fp32_dir_size_mb:.2f} MB")
files.download(fp32_zip)

# Export clean INT8 inference package
# Same sequence as above. An earlier cell already exported to this directory
# name; that export is deleted and regenerated here.
int8_dir = Path("speecht5_int8_deployment")
if int8_dir.exists():
    shutil.rmtree(int8_dir)
export_int8_deployment_package(finetuned_model, processor, int8_dir)
int8_zip = shutil.make_archive("speecht5_int8_deployment", "zip", root_dir=str(int8_dir))
int8_dir_size_mb = directory_size_mb(int8_dir)
# NOTE(review): as with the FP32 package, the zip size is computed but unused.
int8_zip_size_mb = Path(int8_zip).stat().st_size / (1024.0 * 1024.0)
print(f"INT8 directory size: {int8_dir_size_mb:.2f} MB")
files.download(int8_zip)
# Save the speaker embedding as a float32 tensor and download it as well.
# NOTE(review): the type of `speaker_embedding` is not visible here. If it is
# already a torch tensor, torch.tensor() copy-constructs it and keeps its
# device; confirm the saved file loads on the deployment host.
torch.save(torch.tensor(speaker_embedding, dtype=torch.float32), "speaker_embedding.pt")
files.download("speaker_embedding.pt")


FP32 directory size: 551.25 MB


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INT8 directory size: 142.50 MB


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>