<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/ASR-Privacy/1-second-speech-predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'text_normalized', 'text_original', 'speaker_id', 'path', 'chapter_id', 'id'],
        num_rows: 5736
    })
})

In [7]:
ds['train']['audio'][0]

{'path': '2902_9008_000002_000000.wav',
 'array': array([-3.05175781e-05, -3.05175781e-04, -3.96728516e-04, ...,
         8.54492188e-04,  8.54492188e-04,  1.40380859e-03]),
 'sampling_rate': 24000}

In [None]:
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
ds = load_dataset("HamdanXI/libritts_dev_dataset")

# Parameters
sample_rate = 24000  # Assuming the sampling rate from your screenshot
segment_duration = 1  # Duration of audio segment in seconds
segment_length = sample_rate * segment_duration  # Number of samples in the segment

# Preprocess the dataset
def process_audio(batch):
    waveform = batch["audio"]["array"]
    # Ensure we take only the first `segment_duration` seconds
    if len(waveform) >= segment_length:
        waveform = waveform[:segment_length]
    else:
        # Pad if shorter than required length
        padding = segment_length - len(waveform)
        waveform = np.pad(waveform, (0, padding), mode="constant")
    # Flatten waveform to use as features
    batch["features"] = waveform
    return batch

# Apply processing
ds = ds.map(process_audio, remove_columns=["audio"])

# Extract features and labels
X = np.array([item["features"] for item in ds["train"]])
y = np.array([item["speaker_id"] for item in ds["train"]])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
from datasets import load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
ds = load_dataset("HamdanXI/libritts_dev_dataset")

# Parameters
sample_rate = 16000  # Wav2Vec2 expects audio at 16kHz
segment_duration = 1  # 1-second audio segments
segment_length = sample_rate * segment_duration

# Create a mapping from string speaker IDs to integers
unique_labels = sorted(set(ds["train"]["speaker_id"]))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Load Wav2Vec2 processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=len(unique_labels)
)
model.config.label2id = label2id
model.config.id2label = id2label

# Preprocess the dataset
def preprocess_audio(batch):
    waveform = torch.tensor(batch["audio"]["array"], dtype=torch.float32)  # Ensure Float32 type
    # Ensure 1-second segments
    if len(waveform) >= segment_length:
        waveform = waveform[:segment_length]
    else:
        padding = segment_length - len(waveform)
        waveform = torch.nn.functional.pad(waveform, (0, padding))
    # Resample to 16kHz
    if batch["audio"]["sampling_rate"] != sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=batch["audio"]["sampling_rate"], new_freq=sample_rate
        )
        waveform = resampler(waveform)
    # Normalize the waveform
    waveform = (waveform - waveform.mean()) / waveform.std()
    return {
        "input_values": processor(waveform.numpy(), sampling_rate=sample_rate).input_values[0],
        "label": label2id[batch["speaker_id"]],  # Convert label to integer
    }

# Apply preprocessing
ds = ds.map(preprocess_audio, remove_columns=["audio"])

# Split dataset into train/test
train_test_split = ds["train"].train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Define data collator
def data_collator(features):
    input_values = torch.tensor([f["input_values"] for f in features], dtype=torch.float32)
    labels = torch.tensor([f["label"] for f in features], dtype=torch.long)  # Use long for classification labels
    return {"input_values": input_values, "labels": labels}

# Training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-speaker-id",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=True,
    push_to_hub=False,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print(f"Metrics: {metrics}")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5736 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,7.0482,3.515486


Epoch,Training Loss,Validation Loss
1,7.0482,3.515486
