## This is a rough training process for sound localization.

In this simplified model, we use GCC-PHAT as input and employ only fully connected layers to predict the sound's direction (localization).

### 1. Load the training and testing data from the dataset first

In [1]:
import h5py
import numpy as np

train_h5_file_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\data\training_more.h5"

with h5py.File(train_h5_file_path, 'r') as h5file:
    # Iterate through each dataset in the H5 file
    for dataset_name in h5file:
        # Get the dataset

        train_dataset = h5file[dataset_name]

        if dataset_name == "label":
            angel_labels_train = np.array(train_dataset)
        elif dataset_name == "gcc_vectors":
            gcc_vectors_train = np.array(train_dataset)
        
        # Print the dataset name and its shape
        print(f"Dataset: {dataset_name}, Shape: {train_dataset.shape}")

test_h5_file_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\data\test.h5"

with h5py.File(test_h5_file_path, 'r') as h5file:
    # Iterate through each dataset in the H5 file
    for dataset_name in h5file:
        # Get the dataset

        test_dataset = h5file[dataset_name]

        if dataset_name == "label":
            angel_labels_test = np.array(test_dataset)
        elif dataset_name == "gcc_vectors":
            gcc_vectors_test = np.array(test_dataset)
        
        # Print the dataset name and its shape
        print(f"Dataset: {dataset_name}, Shape: {test_dataset.shape}")

Dataset: audio, Shape: (98396, 2400, 4)
Dataset: gcc_vectors, Shape: (98396, 51, 6)
Dataset: label, Shape: (98396, 5)
Dataset: audio, Shape: (19151, 2400, 4)
Dataset: gcc_vectors, Shape: (19151, 51, 6)
Dataset: label, Shape: (19151, 5)


Here, we need to encode the angle information into a label format that the model can effectively learn. There are two possible encoding methods:

1. One-Hot Encoding: In this approach, the angle is represented as a one-hot vector, where only the element corresponding to the specific angle is 1, and all other 359 elements are 0. However, this method has a major drawback: it treats all angles as equally distant in the label space, which ignores the natural angular relationship. For instance, angles 0° and 180° are far apart, yet their label distance is treated the same as angles 0° and 1°, which are actually very close.

2. Gaussian Distribution Encoding: Instead of a strict one-hot representation, we can assign a Gaussian distribution around the ground truth angle. For example, if the true angle is 200°, we label the range from 195° to 205° with a Gaussian distribution, giving higher weights to angles closer to the ground truth. This method creates a soft connection between adjacent labels, allowing the model to better capture the inherent continuity of the angle space in a classification task.

First we generate a guassian distribution for 10 elements around the ground truth

In [3]:
def generate_gaussian(center, start, end, peak_value=1, sigma=1):

    x = np.arange(start, end + 1)  # Generate the range from start to end
    gaussian = np.exp(-0.5 * ((x - center) ** 2) / (sigma ** 2))  # Gaussian formula
    gaussian = gaussian / gaussian.max() * peak_value  # Normalize to make the peak value equal to `peak_value`
    return gaussian

# Generate Gaussian distribution from 1 to 11 with peak at 6
start, end, center = 1, 61, 30
sigma = 1  # Standard deviation
gaussian_distribution = generate_gaussian(center, start, end, peak_value=1, sigma=sigma)

gaussian_distribution


array([2.39425476e-183, 5.70904011e-171, 5.00796571e-159, 1.61608841e-147,
       1.91855567e-136, 8.37894253e-126, 1.34619985e-115, 7.95674389e-106,
       1.73008221e-096, 1.38389653e-087, 4.07235863e-079, 4.40853133e-071,
       1.75568810e-063, 2.57220937e-056, 1.38634329e-049, 2.74878501e-043,
       2.00500878e-037, 5.38018616e-032, 5.31109225e-027, 1.92874985e-022,
       2.57675711e-018, 1.26641655e-014, 2.28973485e-011, 1.52299797e-008,
       3.72665317e-006, 3.35462628e-004, 1.11089965e-002, 1.35335283e-001,
       6.06530660e-001, 1.00000000e+000, 6.06530660e-001, 1.35335283e-001,
       1.11089965e-002, 3.35462628e-004, 3.72665317e-006, 1.52299797e-008,
       2.28973485e-011, 1.26641655e-014, 2.57675711e-018, 1.92874985e-022,
       5.31109225e-027, 5.38018616e-032, 2.00500878e-037, 2.74878501e-043,
       1.38634329e-049, 2.57220937e-056, 1.75568810e-063, 4.40853133e-071,
       4.07235863e-079, 1.38389653e-087, 1.73008221e-096, 7.95674389e-106,
       1.34619985e-115, 8

In [4]:
zero_matrix_train = np.zeros((len(angel_labels_train),360))
zero_matrix_test = np.zeros((len(angel_labels_test),360))

# Define the Gaussian distribution function
def gaussian_labeling(label, num_classes=360):

    zero_row = np.zeros((num_classes))
    for i in range(61):
        center = label + i - 30
        if center < 0:
            center = 360 + center
        elif center > 359:
            center = center - 360
        zero_row[center] = gaussian_distribution[i]

    return zero_row

    

# set the method for encoding the labels
encode_method = "gaussian"

# encode the labels as an one-hot format
if encode_method == "one_hot":
    for i in range(len(angel_labels_train)):
        zero_matrix_train[i][int(angel_labels_train[i,3])] = 1
    for i in range(len(angel_labels_test)):
        zero_matrix_test[i][int(angel_labels_test[i,3])] = 1

    possibility_matrix_train = zero_matrix_train
    possibility_matrix_test = zero_matrix_test

# encode the labels as a gaussian distribution
else:

    # Generate the Gaussian distribution encoding
    possibility_matrix_train = zero_matrix_train
    possibility_matrix_test = zero_matrix_test

    for i in range(len(angel_labels_train)):
        ground_truth_angle = int(angel_labels_train[i, 3])  # Ground truth angle
        possibility_matrix_train[i,:] = gaussian_labeling(ground_truth_angle, 360)

    for i in range(len(angel_labels_test)):
        ground_truth_angle = int(angel_labels_test[i, 3])  # Ground truth angle
        possibility_matrix_test[i,:] = gaussian_labeling(ground_truth_angle, 360)


# possibility_matrix now contains the Gaussian-encoded labels
print(possibility_matrix_train.shape)
print(possibility_matrix_test.shape)


(98396, 360)
(19151, 360)


### 2. Start the training process

First we constrcut a fully connected neural network

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
print(torch.__version__)

# Define the dataset class
class GCCDataset(Dataset):
    def __init__(self, data, labels):
        """
        Args:
            data (np.ndarray): Input data of shape (num_samples, 51, 6).
            labels (np.ndarray): Ground truth labels (DOA indices, 0-359).
        """
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Define the model architecture
class GCCPhatModel(nn.Module):
    def __init__(self):
        super(GCCPhatModel, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(51 * 6, 1000)
        self.bn1 = nn.BatchNorm1d(1000)
        self.fc2 = nn.Linear(1000, 1000)
        self.bn2 = nn.BatchNorm1d(1000)
        self.fc3 = nn.Linear(1000, 1000)
        self.bn3 = nn.BatchNorm1d(1000)
        self.fc4 = nn.Linear(1000, 360)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.bn1(nn.ReLU()(self.fc1(x)))
        x = self.bn2(nn.ReLU()(self.fc2(x)))
        x = self.bn3(nn.ReLU()(self.fc3(x)))
        x = self.fc4(x)
        return self.softmax(x)





2.1.1+cu121


Before the training phase, we want to verify whether our network can effectively learn features from the data.

Training Phase

In [23]:
# Generate synthetic data for demonstration (replace this with your actual data)
num_samples = len(possibility_matrix_train)
input_dim = (51, 6)
num_classes = 360

# # Shuffle gcc_vectors_train and its labels
# train_indices = np.random.permutation(len(gcc_vectors_train))
# gcc_vectors_train = gcc_vectors_train[train_indices]
# possibility_matrix_train = possibility_matrix_train[train_indices]

# # Shuffle gcc_vectors_test and its labels
# test_indices = np.random.permutation(len(gcc_vectors_test))
# gcc_vectors_test = gcc_vectors_test[test_indices]
# possibility_matrix_test = possibility_matrix_test[test_indices]

# Create datasets
train_data, train_labels = gcc_vectors_train.astype(np.float32), possibility_matrix_train.astype(np.float32)
test_data, test_labels = gcc_vectors_test.astype(np.float32), possibility_matrix_test.astype(np.float32)

train_dataset = GCCDataset(train_data, train_labels)
test_dataset = GCCDataset(test_data, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

# Initialize the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = GCCPhatModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 40
iter = 0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # print(f"Iter {iter}, Loss: {loss.item()}")
        iter += 1
        
        running_loss += loss.item()


    # Print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

cuda
Epoch 1/40, Loss: 14.4480
Epoch 2/40, Loss: 14.3197
Epoch 3/40, Loss: 14.2832
Epoch 4/40, Loss: 14.2622
Epoch 5/40, Loss: 14.2469
Epoch 6/40, Loss: 14.2333
Epoch 7/40, Loss: 14.2209
Epoch 8/40, Loss: 14.2156
Epoch 9/40, Loss: 14.2002
Epoch 10/40, Loss: 14.1942
Epoch 11/40, Loss: 14.1852
Epoch 12/40, Loss: 14.1806
Epoch 13/40, Loss: 14.1727
Epoch 14/40, Loss: 14.1645
Epoch 15/40, Loss: 14.1598
Epoch 16/40, Loss: 14.1569
Epoch 17/40, Loss: 14.1501
Epoch 18/40, Loss: 14.1436
Epoch 19/40, Loss: 14.1379
Epoch 20/40, Loss: 14.1353
Epoch 21/40, Loss: 14.1344
Epoch 22/40, Loss: 14.1302
Epoch 23/40, Loss: 14.1221
Epoch 24/40, Loss: 14.1198
Epoch 25/40, Loss: 14.1167
Epoch 26/40, Loss: 14.1122
Epoch 27/40, Loss: 14.1103
Epoch 28/40, Loss: 14.1088
Epoch 29/40, Loss: 14.1014
Epoch 30/40, Loss: 14.1023
Epoch 31/40, Loss: 14.0993
Epoch 32/40, Loss: 14.0955
Epoch 33/40, Loss: 14.0963
Epoch 34/40, Loss: 14.0973
Epoch 35/40, Loss: 14.0919
Epoch 36/40, Loss: 14.0892
Epoch 37/40, Loss: 14.0877
Epoch

In [27]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(torch.argmax(targets,dim=1).cpu().numpy())

# Calculate accuracy
# Calculate the difference (error) between predictions and labels
errors = np.array(all_preds) - np.array(all_labels)
errors = np.abs(errors)
for i in range(len(errors)):
    if errors[i] > 180:
        errors[i] = 360 - errors[i]
print(errors)

print(all_preds)
print(all_labels)
mean_error = np.mean(np.abs(errors))

correct = 0
for i in range (len(all_preds)):
    if np.abs(errors[i]) <= 5:
        correct += 1
accuracy = correct / len(all_preds)
# accuracy = accuracy_score(all_labels, all_preds)

print(f"Mean Error: {mean_error}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[103   1   1 ...   1 171 175]
[128, 24, 20, 12, 12, 12, 12, 9, 347, 5, 5, 147, 3, 356, 355, 355, 5, 1, 147, 13, 20, 24, 24, 24, 24, 29, 22, 29, 34, 24, 52, 30, 37, 37, 37, 38, 29, 83, 83, 98, 96, 98, 96, 101, 101, 147, 147, 61, 269, 312, 309, 309, 317, 309, 309, 304, 312, 312, 309, 309, 312, 312, 309, 309, 312, 309, 309, 309, 314, 304, 128, 128, 133, 128, 128, 128, 128, 128, 128, 128, 133, 128, 128, 128, 128, 124, 286, 323, 328, 328, 328, 328, 317, 226, 328, 323, 328, 328, 328, 323, 226, 323, 317, 328, 133, 148, 285, 323, 323, 328, 328, 328, 269, 226, 328, 317, 124, 328, 226, 241, 323, 317, 328, 328, 323, 323, 323, 328, 328, 328, 328, 328, 323, 323, 323, 323, 328, 328, 128, 226, 323, 328, 34, 34, 323, 328, 111, 323, 328, 328, 328, 328, 226, 323, 328, 328, 328, 118, 323, 323, 323, 328, 328, 328, 323, 323, 323, 323, 323, 328, 259, 323, 323, 328, 328, 328, 226, 226, 226, 328, 328, 323, 328, 328, 226, 323, 328, 328, 29, 22, 128, 226, 323, 323, 323, 328, 328, 328, 287, 328, 328, 328, 328, 3

Here we start a markdown block to record the structure and the outcome of the neural network

| Index | Iterations | Linear Layers | Label | Batch Size | Mean Error | Accuracy |  Gaussian Range | Training Size |
|-------|------------|---------------|-------|------------|----------|----------|----------|----------|
|   1   |     20     |       4       |   one-hot   |     2048      |    23.42     |    76.73%     |    -    |    74876    |
|   2   |     100      |       4       |   one-hot   |    2048      |    25.39     |    75.89%     |    -     |    74876    |
|  3    |    1000      |      4       |  one-hot   |    2048     |    25.06     |    75.82%     |    -     |    74876    |
|  4   |    100      |      4       |  one-hot   |    1024      |    23.54     |    76.48%     |    -     |    74876    |
|  5   |    100      |      4       |  gaussian   |    2048      |    21.49     |   80.66%     |    90     |    74876    |
|  5   |    100      |      4       |  gaussian   |    2048      |    20.73     |   80.46%     |    60     |    74876    |
|  6   |    100      |      4       |  gaussian   |    2048      |    21.31     |   79.95%     |    30    |    74876    |
|  7   |    100      |      4       |  gaussian   |    1024      |    21.12     |   80.96%     |    60     |    74876    |
|  8   |    100      |      4       |  gaussian   |    1024      |    19.41     |   81.34%     |    60    |    93476    |
