## This is a rough training process for sound localization.

In this simplified model, we use GCC-PHAT as input and employ only fully connected layers to predict the sound's direction (localization).

### 1. Get the data from the dataset first

In [7]:
import h5py
import numpy as np

h5_file_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\data\training_more.h5"

with h5py.File(h5_file_path, 'r') as h5file:
    # Iterate through each dataset in the H5 file
    for dataset_name in h5file:
        # Get the dataset

        dataset = h5file[dataset_name]

        if dataset_name == "label":
            angel_labels = np.array(dataset)
        elif dataset_name == "gcc_vectors":
            gcc_vectors = np.array(dataset)
        
        # Print the dataset name and its shape
        print(f"Dataset: {dataset_name}, Shape: {dataset.shape}")

Dataset: audio, Shape: (98396, 2400, 4)
Dataset: gcc_vectors, Shape: (98396, 51, 6)
Dataset: label, Shape: (98396, 5)


In [8]:
print(gcc_vectors.shape)
print(angel_labels.shape)

(98396, 51, 6)
(98396, 5)


Here, we need to encode the angle information into a label format that the model can effectively learn. There are two possible encoding methods:

1. One-Hot Encoding: In this approach, the angle is represented as a one-hot vector, where only the element corresponding to the specific angle is 1, and all other 359 elements are 0. However, this method has a major drawback: it treats all angles as equally distant in the label space, which ignores the natural angular relationship. For instance, angles 0° and 180° are far apart, yet their label distance is treated the same as angles 0° and 1°, which are actually very close.

2. Gaussian Distribution Encoding: Instead of a strict one-hot representation, we can assign a Gaussian distribution around the ground truth angle. For example, if the true angle is 200°, we label the range from 195° to 205° with a Gaussian distribution, giving higher weights to angles closer to the ground truth. This method creates a soft connection between adjacent labels, allowing the model to better capture the inherent continuity of the angle space in a classification task.

First we generate a guassian distribution for 10 elements around the ground truth

In [9]:
def generate_gaussian(center, start, end, peak_value=1, sigma=1):

    x = np.arange(start, end + 1)  # Generate the range from start to end
    gaussian = np.exp(-0.5 * ((x - center) ** 2) / (sigma ** 2))  # Gaussian formula
    gaussian = gaussian / gaussian.max() * peak_value  # Normalize to make the peak value equal to `peak_value`
    return gaussian

# Generate Gaussian distribution from 1 to 11 with peak at 6
start, end, center = 1, 61, 31
sigma = 1  # Standard deviation
gaussian_distribution = generate_gaussian(center, start, end, peak_value=1, sigma=sigma)

gaussian_distribution


array([3.69388307e-196, 2.39425476e-183, 5.70904011e-171, 5.00796571e-159,
       1.61608841e-147, 1.91855567e-136, 8.37894253e-126, 1.34619985e-115,
       7.95674389e-106, 1.73008221e-096, 1.38389653e-087, 4.07235863e-079,
       4.40853133e-071, 1.75568810e-063, 2.57220937e-056, 1.38634329e-049,
       2.74878501e-043, 2.00500878e-037, 5.38018616e-032, 5.31109225e-027,
       1.92874985e-022, 2.57675711e-018, 1.26641655e-014, 2.28973485e-011,
       1.52299797e-008, 3.72665317e-006, 3.35462628e-004, 1.11089965e-002,
       1.35335283e-001, 6.06530660e-001, 1.00000000e+000, 6.06530660e-001,
       1.35335283e-001, 1.11089965e-002, 3.35462628e-004, 3.72665317e-006,
       1.52299797e-008, 2.28973485e-011, 1.26641655e-014, 2.57675711e-018,
       1.92874985e-022, 5.31109225e-027, 5.38018616e-032, 2.00500878e-037,
       2.74878501e-043, 1.38634329e-049, 2.57220937e-056, 1.75568810e-063,
       4.40853133e-071, 4.07235863e-079, 1.38389653e-087, 1.73008221e-096,
       7.95674389e-106, 1

In [10]:
zero_matrix = np.zeros((len(angel_labels),360))

# Define the Gaussian distribution function
def gaussian_labeling(label, num_classes=360):

    zero_row = np.zeros((num_classes))
    for i in range(61):
        center = label + i - 30
        if center < 0:
            center = 360 + center
        elif center > 359:
            center = center - 360
        zero_row[center] = gaussian_distribution[i]

    return zero_row

    

# set the method for encoding the labels
encode_method = "gaussian"

# encode the labels as an one-hot format
if encode_method == "one_hot":
    for i in range(len(angel_labels)):
        zero_matrix[i][int(angel_labels[i,3])] = 1

    possibility_matrix = zero_matrix

# encode the labels as a gaussian distribution
else:

    # Generate the Gaussian distribution encoding
    possibility_matrix = zero_matrix
    for i in range(len(angel_labels)):
        ground_truth_angle = int(angel_labels[i, 3])  # Ground truth angle
        possibility_matrix[i,:] = gaussian_labeling(ground_truth_angle, 360)


# possibility_matrix now contains the Gaussian-encoded labels
print(angel_labels[0,3])
print(possibility_matrix[0,180:200])


-165
[1.38634329e-49 2.74878501e-43 2.00500878e-37 5.38018616e-32
 5.31109225e-27 1.92874985e-22 2.57675711e-18 1.26641655e-14
 2.28973485e-11 1.52299797e-08 3.72665317e-06 3.35462628e-04
 1.11089965e-02 1.35335283e-01 6.06530660e-01 1.00000000e+00
 6.06530660e-01 1.35335283e-01 1.11089965e-02 3.35462628e-04]


### 2. Start the training process

First we constrcut a fully connected neural network

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
print(torch.__version__)

# Define the dataset class
class GCCDataset(Dataset):
    def __init__(self, data, labels):
        """
        Args:
            data (np.ndarray): Input data of shape (num_samples, 51, 6).
            labels (np.ndarray): Ground truth labels (DOA indices, 0-359).
        """
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Define the model architecture
class GCCPhatModel(nn.Module):
    def __init__(self):
        super(GCCPhatModel, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(51 * 6, 2000)
        self.bn1 = nn.BatchNorm1d(2000)
        self.fc2 = nn.Linear(2000, 1000)
        self.bn2 = nn.BatchNorm1d(1000)
        self.fc3 = nn.Linear(1000, 1000)
        self.bn3 = nn.BatchNorm1d(1000)
        self.fc4 = nn.Linear(1000, 360)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.bn1(nn.ReLU()(self.fc1(x)))
        x = self.bn2(nn.ReLU()(self.fc2(x)))
        x = self.bn3(nn.ReLU()(self.fc3(x)))
        x = self.fc4(x)
        return self.softmax(x)

# Generate synthetic data for demonstration (replace this with your actual data)
num_samples = len(possibility_matrix)
input_dim = (51, 6)
num_classes = 360

# Random GCC-PHAT data and labels
data = gcc_vectors.astype(np.float32)
labels = possibility_matrix.astype(np.float32)

# Split the data into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.05, random_state=42)
print(train_data.shape, test_data.shape)

# Create DataLoaders
train_dataset = GCCDataset(train_data, train_labels)
test_dataset = GCCDataset(test_data, test_labels)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

# Initialize the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = GCCPhatModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    # Print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")



2.1.1+cu121
(93476, 51, 6) (4920, 51, 6)
cuda
Epoch 1/100, Loss: 14.4546
Epoch 2/100, Loss: 14.3319
Epoch 3/100, Loss: 14.2968
Epoch 4/100, Loss: 14.2780
Epoch 5/100, Loss: 14.2622
Epoch 6/100, Loss: 14.2466
Epoch 7/100, Loss: 14.2372
Epoch 8/100, Loss: 14.2242
Epoch 9/100, Loss: 14.2147
Epoch 10/100, Loss: 14.2065
Epoch 11/100, Loss: 14.1979
Epoch 12/100, Loss: 14.1922
Epoch 13/100, Loss: 14.1864
Epoch 14/100, Loss: 14.1802
Epoch 15/100, Loss: 14.1759
Epoch 16/100, Loss: 14.1705
Epoch 17/100, Loss: 14.1657
Epoch 18/100, Loss: 14.1608


KeyboardInterrupt: 

In [12]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(torch.argmax(targets,dim=1).cpu().numpy())

# Calculate accuracy
# Calculate the difference (error) between predictions and labels
errors = np.array(all_preds) - np.array(all_labels)
print(all_preds)
print(all_labels)
mean_error = np.mean(np.abs(errors))
print(errors)
correct = 0
for i in range (len(all_preds)):
    if np.abs(errors[i]) <= 5:
        correct += 1
accuracy = correct / len(all_preds)
# accuracy = accuracy_score(all_labels, all_preds)

print(f"Mean Error: {mean_error}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[174, 138, 181, 52, 220, 332, 329, 52, 223, 92, 134, 0, 2, 344, 168, 329, 261, 23, 265, 138, 259, 6, 125, 145, 163, 134, 176, 348, 17, 245, 17, 115, 339, 52, 27, 49, 0, 115, 149, 178, 198, 129, 82, 174, 208, 261, 186, 356, 201, 286, 112, 298, 52, 157, 141, 161, 13, 163, 356, 67, 356, 52, 125, 290, 178, 183, 78, 142, 88, 318, 337, 97, 265, 304, 339, 214, 25, 29, 201, 121, 2, 58, 97, 243, 227, 88, 23, 332, 227, 346, 186, 323, 290, 227, 183, 174, 247, 356, 205, 200, 58, 62, 5, 318, 181, 125, 125, 17, 235, 156, 183, 74, 188, 197, 84, 76, 62, 186, 58, 58, 259, 11, 88, 186, 0, 0, 339, 129, 58, 227, 74, 298, 74, 145, 20, 77, 205, 227, 356, 52, 278, 186, 316, 35, 2, 172, 201, 2, 45, 329, 157, 186, 288, 106, 329, 277, 121, 58, 84, 290, 135, 135, 23, 265, 357, 227, 60, 15, 343, 6, 62, 78, 78, 288, 173, 182, 98, 223, 2, 195, 84, 173, 29, 214, 52, 255, 304, 145, 15, 188, 83, 210, 5, 2, 74, 174, 190, 342, 188, 290, 5, 106, 15, 2, 67, 195, 52, 310, 261, 14, 265, 329, 181, 201, 308, 315, 261, 157, 28

Here we start a markdown block to record the structure and the outcome of the neural network

| Index | Iterations | Linear Layers | Label | Batch Size | Mean Error | Accuracy |  Gaussian Range | Training Size |
|-------|------------|---------------|-------|------------|----------|----------|----------|----------|
|   1   |     20     |       4       |   one-hot   |     2048      |    23.42     |    76.73%     |    -    |    74876    |
|   2   |     100      |       4       |   one-hot   |    2048      |    25.39     |    75.89%     |    -     |    74876    |
|  3    |    1000      |      4       |  one-hot   |    2048     |    25.06     |    75.82%     |    -     |    74876    |
|  4   |    100      |      4       |  one-hot   |    1024      |    23.54     |    76.48%     |    -     |    74876    |
|  5   |    100      |      4       |  gaussian   |    2048      |    21.49     |   80.66%     |    90     |    74876    |
|  5   |    100      |      4       |  gaussian   |    2048      |    20.73     |   80.46%     |    60     |    74876    |
|  6   |    100      |      4       |  gaussian   |    2048      |    21.31     |   79.95%     |    30    |    74876    |
|  7   |    100      |      4       |  gaussian   |    1024      |    21.12     |   80.96%     |    60     |    74876    |
|  8   |    100      |      4       |  gaussian   |    1024      |    19.41     |   81.34%     |    60    |    93476    |
