<a href="https://colab.research.google.com/github/LeonardFreris/fleonardos_ece_uth_projects/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install speechbrain

In [2]:
import os
import random

# Define the path to the folder containing the audio files
data_folder = '/content/drive/MyDrive/spoken_digits'

# Define the proportions of the train, validation, and test sets
train_proportion = 0.7
val_proportion = 0.2
test_proportion = 0.1

# Get a list of the audio files in the folder
audio_files = os.listdir(data_folder)
audio_files = [os.path.join(data_folder, f) for f in audio_files if f.endswith('.wav')]

# Shuffle the list of audio files
random.shuffle(audio_files)

# Calculate the number of files in each set based on the proportions
num_train_files = int(train_proportion * len(audio_files))
num_val_files = int(val_proportion * len(audio_files))
num_test_files = len(audio_files) - num_train_files - num_val_files

# Split the list of audio files into train, validation, and test sets
train_files = audio_files[:num_train_files]
val_files = audio_files[num_train_files:num_train_files+num_val_files]
test_files = audio_files[num_train_files+num_val_files:]

# Print the number of files in each set
print(f'Number of train files: {len(train_files)}')
print(f'Number of val files: {len(val_files)}')
print(f'Number of test files: {len(test_files)}')

Number of train files: 2100
Number of val files: 600
Number of test files: 300


In [3]:
import os

# Define the ranges for each label
label_ranges = [(1, 300), (301, 600), (601, 900), (901, 1200), (1201, 1500),
                (1501, 1800), (1801, 2100), (2101, 2400), (2401, 2700), (2701, 3000)]

# Loop over the files in the folder and write the labels to the output file
with open('/content/drive/MyDrive/transcription1.txt', 'w') as f:
    for filename in train_files:
        if filename.endswith('.wav'):
            file_number = int(filename.split('_')[-1].split('.')[0])
            for i, label_range in enumerate(label_ranges):
                if label_range[0] <= file_number <= label_range[1]:
                    label = str(i)
                    break
            else:
                raise ValueError(f"File number {file_number} does not fit in any label range")
            # Write the label to the output file
            f.write(f"{label}\n")

In [4]:
import torch.nn as nn

class DigitRecognizer(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(128 * 6 * 4, 512)
        self.relu4 = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.pool3(x)
        x = self.dropout(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu4(x)
        x = self.fc2(x)
        return x

In [5]:
import torch
from speechbrain.dataio.dataio import read_audio
from speechbrain.processing.features import STFT, spectral_magnitude, Filterbank, DCT, Deltas
from speechbrain.processing.features import ContextWindow, InputNormalization

class MyAudioFeatures:
    
    def __init__(self, audio_path, sample_rate=16000, win_length=25, hop_length=10, n_fft=400, 
                 n_mels=40, n_mfccs=20, left_frames=5, right_frames=5):
        self.audio_path = audio_path
        self.sample_rate = sample_rate
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.n_mfccs = n_mfccs
        self.left_frames = left_frames
        self.right_frames = right_frames
        self.compute_stft = STFT(sample_rate=self.sample_rate, win_length=self.win_length, 
                                 hop_length=self.hop_length, n_fft=self.n_fft)
        self.compute_fbanks = Filterbank(n_mels=self.n_mels)
        self.compute_mfccs = DCT(input_size=self.n_mels, n_out=self.n_mfccs)
        self.compute_deltas = Deltas(input_size=self.n_mfccs)
        self.compute_cw = ContextWindow(left_frames=self.left_frames, right_frames=self.right_frames)
        self.norm = InputNormalization()
        
    def extract_features(self):
        signal = read_audio(self.audio_path)
        signal = signal.unsqueeze(0)
        features = self.compute_stft(signal)
        features = spectral_magnitude(features)
        features = self.compute_fbanks(features)
        features = self.compute_mfccs(features)
        delta1 = self.compute_deltas(features)
        delta2 = self.compute_deltas(delta1)
        features = torch.cat([features, delta1, delta2], dim=2)
        features = self.compute_cw(features)
        features = self.norm(features, torch.tensor([1]).float())
        return features

In [6]:
audio_path = '/content/drive/MyDrive/spoken_digits/audio_1002.wav'
audio_features = MyAudioFeatures(audio_path)
features = audio_features.extract_features()
print(features.shape)

torch.Size([1, 31, 660])


In [7]:
class LabelEncoder:
    def __init__(self):
        label_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
        self.vocab = {label: index for index, label in enumerate(label_list)}
        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        self.blank_index = -1

    def encode_label(self, label):
        return torch.tensor([self.vocab[label]])

    def decode_label(self, label):
        return self.inverse_vocab[label.item()]

In [8]:
import soundfile as sf
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class SpeechDataset(Dataset):
    def __init__(self, files, labels_file, feature_extractor):
        self.files = files
        self.feature_extractor = feature_extractor
        self.label_encoder = LabelEncoder()

        # Load label data from file if labels_file is not None
        self.labels = []
        if labels_file is not None:
            with open(labels_file, 'r') as file:
                file_contents = file.readlines()
            for label in file_contents:
                # Remove any leading or trailing whitespace characters (including the '\n' character)
                label = label.strip()
                self.labels.append(label)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, index):
        # Load the audio file from the path specified in the preprocessed data
        audio_path = self.files[index]
        audio, sample_rate = sf.read(audio_path)

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)  # Add a dimension for the batch
        audio = pad_sequence([audio], batch_first=True, padding_value=0)
        audio = audio.squeeze(0)  # Remove the extra batch dimension

        # Extract the label from the preprocessed data and encode it as an integer
        label_str = self.labels[index]
        label_int = self.label_encoder.encode_label(label_str)

        # Return the audio tensor and label as a tuple
        return audio, label_int

In [9]:
from torch.utils.data import DataLoader

labels_file = '/content/drive/MyDrive/transcription1.txt'

train_data = SpeechDataset(train_files, labels_file, audio_features.extract_features())
valid_data = SpeechDataset(val_files, None, audio_features.extract_features())  # Note: no labels file for validation data
test_data = SpeechDataset(test_files, None, audio_features.extract_features())  # Note: no labels file for test data

# Set your batch size and create your data loaders
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [10]:
inputs, targets = next(iter(train_loader))


RuntimeError: ignored

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import librosa

# Define the training parameters
num_epochs = 10
learning_rate = 1e-4

# Initialize the model and optimizer
model = DigitRecognizer(10)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Define the loss function
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

    # Compute accuracy on validation set
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
        val_acc = correct / total

    print(f"Epoch {epoch+1}: Validation Accuracy = {val_acc}")

TypeError: ignored

In [None]:
from speechbrain.utils.distributed import run_on_main

@run_on_main
def main():
# Load the best checkpoint from training
brain.checkpointer.load_best_model()

# Evaluate the model on the test set
test_loss, test_metric = brain.evaluate()

# Print the test results
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_metric:.4f}")

In [None]:
from speechbrain.utils.distributed import run_on_main

@run_on_main
def main():
# Load the best checkpoint from training
brain.checkpointer.load_best_model()

# Preprocess the audio file
audio_file = 'path/to/audio/file'
features = batch_prep.preprocess_utterance(audio_file)

# Run inference on the preprocessed features
output = model(features)

# Decode the output to obtain the predicted digit
prediction = label_encoder.decode_label(torch.argmax(output))
print(f"Predicted Digit: {prediction}")

IndentationError: ignored