<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/speach-privacy/libritts-FL-meanWeights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `ml818` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate

In [3]:
################################################################################
# 1. IMPORTS
################################################################################
import torch
import torchaudio
import numpy as np
import random
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    TrainingArguments,
    Trainer
)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from collections import defaultdict

################################################################################
# 2. LOAD THE ORIGINAL DATASET
#    We'll just load the "train" split of the LibriTTS dev dataset (which is small).
################################################################################
dataset = load_dataset("HamdanXI/libritts_dev_dataset_gender")["train"]
# dataset columns: ["audio", "gender", "speaker_id", "text_original", ...]

print("Full dataset size:", len(dataset))
print("Columns:", dataset.column_names)
print("Unique speaker_ids:", len(set(dataset["speaker_id"])))

################################################################################
# 3. PICK 40 SPEAKERS: 20 MALE + 20 FEMALE
################################################################################
male_speakers_all = list({ex["speaker_id"] for ex in dataset if ex["gender"] == "M"})
female_speakers_all = list({ex["speaker_id"] for ex in dataset if ex["gender"] == "F"})

random.seed(42)
chosen_male = random.sample(male_speakers_all, 20)
chosen_female = random.sample(female_speakers_all, 20)

chosen_speakers = chosen_male + chosen_female
print(f"Chosen {len(chosen_male)} male speakers:   {chosen_male}")
print(f"Chosen {len(chosen_female)} female speakers: {chosen_female}")

################################################################################
# 4. FILTER DATASET TO ONLY THOSE 40 SPEAKERS
################################################################################
filtered_ds = dataset.filter(lambda x: x["speaker_id"] in chosen_speakers)

################################################################################
# 5. FOR EACH SPEAKER, PICK THE SINGLE LONGEST AUDIO SAMPLE
################################################################################
# We'll group rows by speaker_id, then pick the sample with the largest audio["array"] length.

samples_by_speaker = defaultdict(list)
for ex in filtered_ds:
    spk = ex["speaker_id"]
    samples_by_speaker[spk].append(ex)

speaker_to_longest_example = {}
for spk, rows in samples_by_speaker.items():
    # Sort by audio length descending, pick the top row
    rows_sorted = sorted(rows, key=lambda x: len(x["audio"]["array"]), reverse=True)
    speaker_to_longest_example[spk] = rows_sorted[0]

# We'll have exactly 40 entries here, one per speaker
print("Number of chosen speakers:", len(speaker_to_longest_example))

################################################################################
# 6. LOAD PROCESSOR
################################################################################
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

################################################################################
# 7. PREPARE FUNCTION (resample + tokenize)
################################################################################
def prepare_batch(batch):
    """
    1) Resample audio to 16k if needed.
    2) Extract raw (unpadded) input_values via processor.
    3) Tokenize text.
    """
    sr = batch["audio"]["sampling_rate"]
    audio_array = batch["audio"]["array"]
    wave_tensor = torch.tensor(audio_array, dtype=torch.float32)
    if sr != 16000:
        wave_tensor = torchaudio.functional.resample(wave_tensor, sr, 16000)
    wave_16k = wave_tensor.numpy()

    # Audio feature extraction
    audio_features = processor.feature_extractor(wave_16k, sampling_rate=16000)
    # Text tokenization
    text_tokens = processor.tokenizer(batch["text_original"] if batch["text_original"] else "")

    return {
        "speaker_id":   batch["speaker_id"],
        "gender":       batch["gender"],
        "text_original": batch["text_original"],
        "input_values": audio_features["input_values"][0],
        "labels":       text_tokens["input_ids"]
    }

################################################################################
# 8. MAP OUR 40 SAMPLES INTO A SMALL DATASET
################################################################################
longest_samples_list = []
for spk_id in speaker_to_longest_example:
    longest_samples_list.append(speaker_to_longest_example[spk_id])

# Create a small Dataset from these 40 examples
small_ds = Dataset.from_dict({
    "audio":         [ex["audio"] for ex in longest_samples_list],
    "speaker_id":    [ex["speaker_id"] for ex in longest_samples_list],
    "gender":        [ex["gender"] for ex in longest_samples_list],
    "text_original": [ex["text_original"] for ex in longest_samples_list],
})

# Now map: remove columns we don't need except "audio", "speaker_id", "gender", "text_original"
# but we do want to *add* input_values, labels
keep_cols = ["audio","speaker_id","gender","text_original","input_values","labels"]
small_ds = small_ds.map(
    prepare_batch,
    remove_columns=[col for col in small_ds.column_names if col not in keep_cols]
)

################################################################################
# 9. DATA COLLATOR
################################################################################
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        input_values = [f["input_values"] for f in features]
        labels       = [f["labels"]       for f in features]

        audio_batch = self.processor.feature_extractor.pad(
            {"input_values": input_values},
            padding=self.padding,
            return_tensors="pt",
        )

        labels_batch = self.processor.tokenizer.pad(
            {"input_ids": labels},
            padding=self.padding,
            return_tensors="pt",
        )

        labels_tensor = labels_batch["input_ids"].masked_fill(
            labels_batch["input_ids"] == self.processor.tokenizer.pad_token_id, -100
        )

        return {
            "input_values": audio_batch["input_values"],
            "labels":       labels_tensor
        }

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

################################################################################
# 10. FUNCTION TO TRAIN ON SINGLE SAMPLE
################################################################################
def train_on_single_sample(
    sample,
    base_model_name="facebook/wav2vec2-base-960h",
    num_train_epochs=1,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """
    Fine-tune on a single sample (dictionary with "input_values", "labels").
    Returns the fine-tuned model on CPU.
    """
    from datasets import Dataset

    single_ds = Dataset.from_dict({
        "input_values": [sample["input_values"]],
        "labels":       [sample["labels"]]
    })

    model = Wav2Vec2ForCTC.from_pretrained(base_model_name)
    model.to(device)
    # freeze feature encoder
    model.freeze_feature_encoder()

    training_args = TrainingArguments(
        output_dir="temp_model",
        per_device_train_batch_size=1,
        num_train_epochs=num_train_epochs,
        evaluation_strategy="no",
        logging_steps=5,
        learning_rate=1e-5,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=single_ds,
        tokenizer=processor.feature_extractor
    )

    trainer.train()
    model.to("cpu")
    return model

################################################################################
# 11. COMPUTE LAYERWISE DIFFERENCES
################################################################################
def compute_layerwise_diff(new_model, base_model, num_layers=12):
    """
    Compare new_model vs. base_model for each Wav2Vec2 encoder layer.
    Return a list of length `num_layers` with the mean absolute difference
    for each layer.
    """
    from collections import defaultdict
    layerwise_params = defaultdict(list)

    base_dict = dict(base_model.named_parameters())
    new_dict  = dict(new_model.named_parameters())

    for name, base_param in base_dict.items():
        # Look for encoder layers in the name
        if ("wav2vec2.encoder.layers" in name) and (name in new_dict):
            new_param = new_dict[name]
            parts = name.split(".")
            # e.g. name might be "wav2vec2.encoder.layers.0.attention..." -> parts[3] = '0'
            try:
                layer_idx = int(parts[3])
            except:
                continue

            if layer_idx < num_layers:
                diff = (new_param - base_param).detach().cpu().view(-1)
                layerwise_params[layer_idx].append(diff)

    layer_diffs = []
    for i in range(num_layers):
        if i in layerwise_params:
            diffs_cat = torch.cat(layerwise_params[i], dim=0)
            # We'll store the mean absolute difference
            mad = diffs_cat.abs().mean().item()
            layer_diffs.append(mad)
        else:
            layer_diffs.append(0.0)

    return layer_diffs

################################################################################
# 12. MAIN LOOP:
#     For each of the 40 samples -> Fine-tune -> Compare -> Collect differences
################################################################################
base_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").eval()
base_model.to("cpu")

all_rows = []
for i, sample in enumerate(small_ds):
    spk_id  = sample["speaker_id"]
    gender  = sample["gender"]
    text    = sample["text_original"]
    audio   = sample["audio"]  # dict with keys like "array", "sampling_rate"

    print(f"[{i+1}/40] Fine-tuning on speaker {spk_id} (gender={gender})")

    # Fine-tune
    new_model = train_on_single_sample(sample)

    # Compute layerwise difference (12 layers)
    layer_diffs = compute_layerwise_diff(new_model, base_model, num_layers=12)

    # Build a row dict: text_original, audio, speaker_id, gender, plus 12 columns
    row_dict = {
        "text_original": text,
        "audio": audio,   # you can keep the full audio dict
        "speaker_id": spk_id,
        "gender": gender,
    }
    # Add 12 columns for layer diffs
    for layer_idx in range(12):
        row_dict[f"layer_{layer_idx}_diff"] = layer_diffs[layer_idx]

    all_rows.append(row_dict)

    del new_model
    torch.cuda.empty_cache()

# Now we have 40 rows with all the columns we need.

################################################################################
# 13. CREATE A NEW DATASET AND PUSH TO HUB
################################################################################
final_dataset = Dataset.from_list(all_rows)

# Optional: Remove any columns you do not want. Here we keep them all:
# final_dataset = final_dataset.remove_columns([...])  # if needed

print("Final dataset format:")
print(final_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/590 [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/356M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5736 [00:00<?, ? examples/s]

Full dataset size: 5736
Columns: ['audio', 'text_normalized', 'text_original', 'speaker_id', 'path', 'chapter_id', 'id', 'gender']
Unique speaker_ids: 40
Chosen 20 male speakers:   ['422', '7976', '652', '6295', '5694', '2902', '2086', '6241', '2803', '3752', '1272', '777', '2078', '2428', '3170', '251', '3000', '5536', '174', '8297']
Chosen 20 female speakers: ['6345', '8842', '5895', '2277', '1993', '2035', '3853', '3081', '6319', '1988', '3576', '3536', '6313', '2412', '7850', '1673', '1462', '1919', '84', '5338']


Filter:   0%|          | 0/5736 [00:00<?, ? examples/s]

Number of chosen speakers: 40


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1/40] Fine-tuning on speaker 2902 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhamdan-alali[0m ([33mhamdan-alali-mbzuai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss


[2/40] Fine-tuning on speaker 3536 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[3/40] Fine-tuning on speaker 1462 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[4/40] Fine-tuning on speaker 3081 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[5/40] Fine-tuning on speaker 251 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[6/40] Fine-tuning on speaker 2086 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[7/40] Fine-tuning on speaker 652 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[8/40] Fine-tuning on speaker 6241 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[9/40] Fine-tuning on speaker 6313 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[10/40] Fine-tuning on speaker 3000 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[11/40] Fine-tuning on speaker 3853 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[12/40] Fine-tuning on speaker 5895 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[13/40] Fine-tuning on speaker 6295 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[14/40] Fine-tuning on speaker 84 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[15/40] Fine-tuning on speaker 5338 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[16/40] Fine-tuning on speaker 3170 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[17/40] Fine-tuning on speaker 7976 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[18/40] Fine-tuning on speaker 1673 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[19/40] Fine-tuning on speaker 1919 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[20/40] Fine-tuning on speaker 7850 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[21/40] Fine-tuning on speaker 174 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[22/40] Fine-tuning on speaker 8842 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[23/40] Fine-tuning on speaker 2277 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[24/40] Fine-tuning on speaker 6345 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[25/40] Fine-tuning on speaker 1993 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[26/40] Fine-tuning on speaker 2803 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[27/40] Fine-tuning on speaker 2078 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[28/40] Fine-tuning on speaker 6319 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[29/40] Fine-tuning on speaker 1272 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[30/40] Fine-tuning on speaker 1988 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[31/40] Fine-tuning on speaker 3752 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[32/40] Fine-tuning on speaker 422 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[33/40] Fine-tuning on speaker 5536 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[34/40] Fine-tuning on speaker 2412 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[35/40] Fine-tuning on speaker 8297 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[36/40] Fine-tuning on speaker 3576 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[37/40] Fine-tuning on speaker 2428 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[38/40] Fine-tuning on speaker 777 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[39/40] Fine-tuning on speaker 5694 (gender=M)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


[40/40] Fine-tuning on speaker 2035 (gender=F)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


Final dataset format:
Dataset({
    features: ['text_original', 'audio', 'speaker_id', 'gender', 'layer_0_diff', 'layer_1_diff', 'layer_2_diff', 'layer_3_diff', 'layer_4_diff', 'layer_5_diff', 'layer_6_diff', 'layer_7_diff', 'layer_8_diff', 'layer_9_diff', 'layer_10_diff', 'layer_11_diff'],
    num_rows: 40
})


In [4]:
final_dataset.push_to_hub("HamdanXI/libritts_FL")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/HamdanXI/libritts_FL/commit/997124b0b6210ca9d81562dbdea8d65e5f604935', commit_message='Upload dataset', commit_description='', oid='997124b0b6210ca9d81562dbdea8d65e5f604935', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/HamdanXI/libritts_FL', endpoint='https://huggingface.co', repo_type='dataset', repo_id='HamdanXI/libritts_FL'), pr_revision=None, pr_num=None)