In [59]:
pip install transformers==4.44.2 datasets==3.1.0 torch==2.5.0

Note: you may need to restart the kernel to use updated packages.


In [60]:
import torch

# Pick the compute device: CUDA when torch reports it as available, otherwise CPU
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available. Using:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available. Using CPU.")

GPU is not available. Using CPU.


In [61]:
# Load the original (non-streaming) VoxPopuli Slovenian train split and show its size.
# load_dataset is imported here so that this cell also runs on a fresh kernel.
from datasets import load_dataset

datasetO = load_dataset("facebook/voxpopuli", "sl", split="train")
len(datasetO)

2099

In [62]:
from datasets import load_dataset
import random

# Set random seed
torch.manual_seed(42)

# Open the Slovenian VoxPopuli train split in streaming mode
dataset = load_dataset("facebook/voxpopuli", "sl", split="train", streaming=True)

# Materialize every streamed example into a list, then shuffle that list in place
# with Python's seeded global RNG
random.seed(42)
shuffled_dataset = [*dataset]
random.shuffle(shuffled_dataset)

# Display 2 samples for inspection
print("Sample Data:", shuffled_dataset[:2])


Sample Data: [{'audio_id': '20190918-0900-PLENARY-sl_20190918-19:18:48_3', 'language': 13, 'audio': {'path': 'train_part_0/20190918-0900-PLENARY-sl_20190918-19:18:48_3.wav', 'array': array([ 0.00042725, -0.00488281, -0.00680542, ...,  0.00708008,
        0.00717163,  0.00378418]), 'sampling_rate': 16000}, 'raw_text': 'Nobena od te in podobnih resolucij, ki jih je Evropski parlament sprejel že zelo veliko, ne more biti zares koristna, če sočasno dopuščamo dejanja, ki jih ta resolucija obsoja, in to so zatiranje demokracije, nespoštovanje pravne države, odrekanje pravic manjšinam in drugačnim, avtokracija, nacionalizem.', 'normalized_text': 'nobena od te in podobnih resolucij ki jih je evropski parlament sprejel že zelo veliko ne more biti zares koristna če sočasno dopuščamo dejanja ki jih ta resolucija obsoja in to so zatiranje demokracije nespoštovanje pravne države odrekanje pravic manjšinam in drugačnim avtokracija nacionalizem.', 'gender': 'female', 'speaker_id': '96911', 'is_gold_t

In [63]:
# Display the streaming dataset's summary (its feature names and shard count).
dataset

IterableDataset({
    features: ['audio_id', 'language', 'audio', 'raw_text', 'normalized_text', 'gender', 'speaker_id', 'is_gold_transcript', 'accent'],
    num_shards: 1
})

In [64]:
# # Count the length of the original streaming dataset
# dataset_length = sum(1 for _ in dataset)
# print(f"The length of the original dataset is: {dataset_length}")

In [65]:
# Shuffle dataset
# Re-seed torch's global RNG, then rebind `dataset` to the result of shuffling it with a fixed seed.
# NOTE(review): re-running this cell applies shuffle() to the already-shuffled dataset again.
torch.manual_seed(42)
dataset = dataset.shuffle(seed=42)

In [66]:
# Step 4: Feature Extraction Using Wav2Vec2
from transformers import Wav2Vec2Model, Wav2Vec2Processor

# The processor and the encoder are both loaded from the same pretrained checkpoint
checkpoint = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
wav2vec2_model = Wav2Vec2Model.from_pretrained(checkpoint)
# Move the encoder to the device selected earlier in the notebook
wav2vec2_model = wav2vec2_model.to(device)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
# Function for feature extraction
def extract_features(batch):
    """Run one example's audio through Wav2Vec2 and return its last hidden states.

    Args:
        batch: Mapping with an ``"audio"`` entry that holds ``"array"`` and
            ``"sampling_rate"``.

    Returns:
        Dict with a single key ``"features"``: the model's ``last_hidden_state``
        moved to CPU and converted to a NumPy array.
    """
    audio = batch["audio"]
    encoded = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding=True)
    # Every tensor produced by the processor must live on the same device as the model
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    # Inference only: no gradients are needed
    with torch.no_grad():
        hidden_states = wav2vec2_model(**model_inputs).last_hidden_state
    return {"features": hidden_states.cpu().numpy()}

In [68]:
# Apply feature extraction
# Adds the "features" entry returned by extract_features to each example.
# NOTE(review): `dataset` appears to be a streaming IterableDataset at this point, so the map is
# presumably lazy and the model runs again on every pass over `dataset` — confirm.
dataset = dataset.map(extract_features)

In [None]:
# Shortest feature sequence among at most `max_samples` examples
# (stays at +inf when the dataset yields nothing)
max_samples = 2101
min_feature_length = min(
    (example['features'].shape[1] for _, example in zip(range(max_samples), dataset)),
    default=float('inf'),
)

print(f"Minimum feature length (limited to {max_samples} samples): {min_feature_length}")


In [None]:
# Longest feature sequence over the whole streaming dataset
# (stays at -inf when the dataset yields nothing)
max_feature_length = max(
    (example['features'].shape[1] for example in dataset),
    default=float('-inf'),
)

print(f"Maximum feature length: {max_feature_length}")


In [None]:
from datasets import Dataset

# Keep only the examples whose feature sequence has at least 200 frames
filtered_examples = [example for example in dataset if example["features"].shape[1] >= 200]

# Build a regular (non-streaming) dataset from the kept examples
filtered_dataset = Dataset.from_list(filtered_examples)

print(f"Length of filtered dataset: {len(filtered_dataset)}")


In [None]:
# From here on `dataset` refers to the filtered, non-streaming dataset.
dataset = filtered_dataset

In [None]:
# Print the feature shape of the first 5 examples (fewer if the dataset is smaller).
# Rows read back from the dataset hold "features" as nested Python lists, which have no
# .shape attribute, so each one is converted to a tensor before its shape is read.
for i in range(min(5, len(dataset))):
    print(tuple(torch.tensor(dataset[i]['features']).shape))


In [None]:
# Number of examples in the current `dataset`.
len(dataset)

In [None]:
# Map speakers to integer labels: sorted unique speaker ids are numbered 0..N-1
speakers = sorted(set(dataset["speaker_id"]))
speaker_to_label = dict(zip(speakers, range(len(speakers))))
# Attach the integer class as a new "label" column
dataset = dataset.map(lambda row: {"label": speaker_to_label[row["speaker_id"]]})


In [None]:
# Access the speaker ID of the first example in the filtered dataset
# (sanity check that the "speaker_id" column is present and readable)
first_example_speaker_id = dataset[0]["speaker_id"]
print(f"Speaker ID of the first example: {first_example_speaker_id}")

In [None]:
# Get the number of unique speakers in the filtered dataset.
# Only the "speaker_id" column is read; iterating over whole rows would also load every
# row's features and audio just to count ids.
unique_speakers = len(set(dataset["speaker_id"]))
print(f"Number of unique speakers: {unique_speakers}")

In [None]:
# Step 6: Model Definition
import torch.nn as nn

In [None]:
class CNNModel(nn.Module):
    """1-D convolutional classifier over inputs with 768 channels.

    Three Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(2) stages reduce the
    channels 768 -> 256 -> 128 -> 32. The result is averaged over the remaining
    length and passed through two linear layers to produce ``num_classes`` logits.

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Convolutional stages (kernel size 3, no padding), each with its own batch norm
        self.conv1 = nn.Conv1d(768, 256, kernel_size=3)
        self.bn1 = nn.BatchNorm1d(256)
        self.conv2 = nn.Conv1d(256, 128, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(128, 32, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(32)
        # Classification head
        self.fc1 = nn.Linear(32, 128)
        self.fc2 = nn.Linear(128, num_classes)
        # Parameter-free layers shared by all stages
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(2)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Return class logits for ``x``; the last axis of the pooled output is squeezed away."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in stages:
            x = self.pool(self.relu(norm(conv(x))))
        pooled = self.global_avg_pool(x).squeeze(-1)
        hidden = self.relu(self.fc1(pooled))
        return self.fc2(hidden)

In [None]:
# Step 7: Data Splitting and DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

In [None]:
# Split the dataset: hold out 20% for testing, then 10% of the remainder for validation
all_examples = list(dataset)
train_val_data, test_data = train_test_split(all_examples, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.1, random_state=42)


In [None]:
# Calculate the length of the training dataset
# (number of examples left in `train_data` after both splits)
train_length = len(train_data)
print(f"Length of the train dataset: {train_length}")

In [None]:
# Batch size used for training (as per the instructions)
batch_size = 100

# Number of training batches, rounding up: negating before and after floor division
# gives the ceiling for positive integers
num_batches_train = -(-len(train_data) // batch_size)
print(f"Number of batches in train data: {num_batches_train}")


In [None]:
# Custom PyTorch Dataset
class CustomDataset(Dataset):
    """Map-style dataset yielding ``(features, label)`` pairs truncated to a fixed length.

    Each element of ``data`` must be a mapping with:
      * ``"features"``: array-like laid out as ``(frames, dim)`` or, as produced
        by ``extract_features``, ``(1, frames, dim)``;
      * ``"label"``: integer class index.

    Args:
        data: Indexable collection of example mappings.
        max_frames: Number of leading frames kept per example (default 200).
    """

    def __init__(self, data, max_frames=200):
        self.data = data
        self.max_frames = max_frames

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` with features shaped ``(dim, <= max_frames)``.

        Raises:
            ValueError: If the features are not 2-D after dropping a leading
                singleton batch axis.
        """
        item = self.data[idx]
        features = torch.tensor(item["features"], dtype=torch.float32)
        # The extractor keeps the model's batch axis, so drop it before slicing;
        # otherwise [:max_frames] would slice the batch axis instead of time.
        if features.dim() == 3 and features.shape[0] == 1:
            features = features.squeeze(0)
        if features.dim() != 2:
            raise ValueError(
                f"Expected features shaped (frames, dim) or (1, frames, dim), got {tuple(features.shape)}"
            )
        # Truncate along time, then put channels first as Conv1d expects: (dim, frames)
        features = features[: self.max_frames].transpose(0, 1)
        label = torch.tensor(item["label"], dtype=torch.long)
        return features, label

In [None]:
# Create Datasets and DataLoaders
batch_sizes = {"train": 100, "val": 10, "test": 1}

# Wrap each split in the custom dataset first
train_dataset = CustomDataset(train_data)
val_dataset = CustomDataset(val_data)
test_dataset = CustomDataset(test_data)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, batch_size=batch_sizes["train"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_sizes["val"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_sizes["test"], shuffle=False)

In [None]:
# Step 8: Training the Model
# One output class per entry in `speakers`; the model is moved to the selected device.
model = CNNModel(num_classes=len(speakers)).to(device)
# Cross-entropy loss computed from the model outputs and the integer labels
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with learning rate 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop: 100 epochs, each a full training pass followed by a validation pass
for epoch in range(100):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()
        train_loss += batch_loss.item()
    # Mean of the per-batch losses
    train_loss = train_loss / len(train_loader)

    # ---- validation pass (no gradient tracking, eval-mode layers) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            val_loss += criterion(model(batch_x), batch_y).item()
    val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")


In [None]:
# Step 9: Evaluating on the Test Set
from sklearn.metrics import accuracy_score

# Collect the predicted and the true class for every test example
all_preds, all_labels = [], []
model.eval()
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        logits = model(batch_x.to(device))
        # Predicted class = index of the largest logit
        batch_preds = logits.argmax(dim=1)
        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

test_accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")