## This is a rough training process for sound localization.

In this simplified model, we use GCC-PHAT as input and employ only fully connected layers to predict the sound's direction (localization).

### 1. Load the training and testing data from the dataset first

In [1]:
import h5py
import numpy as np

train_h5_file_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\data\training_more.h5"

with h5py.File(train_h5_file_path, 'r') as h5file:
    # Iterate through each dataset in the H5 file
    for dataset_name in h5file:
        # Get the dataset

        train_dataset = h5file[dataset_name]

        if dataset_name == "label":
            angel_labels_train = np.array(train_dataset)
        elif dataset_name == "gcc_vectors":
            gcc_vectors_train = np.array(train_dataset)
        
        # Print the dataset name and its shape
        print(f"Dataset: {dataset_name}, Shape: {train_dataset.shape}")

test_h5_file_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\data\test.h5"

with h5py.File(test_h5_file_path, 'r') as h5file:
    # Iterate through each dataset in the H5 file
    for dataset_name in h5file:
        # Get the dataset

        test_dataset = h5file[dataset_name]

        if dataset_name == "label":
            angel_labels_test = np.array(test_dataset)
        elif dataset_name == "gcc_vectors":
            gcc_vectors_test = np.array(test_dataset)
        
        # Print the dataset name and its shape
        print(f"Dataset: {dataset_name}, Shape: {test_dataset.shape}")

print(gcc_vectors_test.shape)

Dataset: audio, Shape: (98396, 2400, 4)
Dataset: gcc_vectors, Shape: (98396, 51, 6)
Dataset: label, Shape: (98396, 5)
Dataset: audio, Shape: (19151, 2400, 4)
Dataset: gcc_vectors, Shape: (19151, 51, 6)
Dataset: label, Shape: (19151, 5)
(19151, 51, 6)


Here, we need to encode the angle information into a label format that the model can effectively learn. There are two possible encoding methods:

1. One-Hot Encoding: In this approach, the angle is represented as a one-hot vector, where only the element corresponding to the specific angle is 1, and all other 359 elements are 0. However, this method has a major drawback: it treats all angles as equally distant in the label space, which ignores the natural angular relationship. For instance, angles 0° and 180° are far apart, yet their label distance is treated the same as angles 0° and 1°, which are actually very close.

2. Gaussian Distribution Encoding: Instead of a strict one-hot representation, we can assign a Gaussian distribution around the ground truth angle. For example, if the true angle is 200°, we label the range from 195° to 205° with a Gaussian distribution, giving higher weights to angles closer to the ground truth. This method creates a soft connection between adjacent labels, allowing the model to better capture the inherent continuity of the angle space in a classification task.

First we generate a guassian distribution for 10 elements around the ground truth

In [2]:
def generate_gaussian(center, start, end, peak_value=1, sigma=1):

    x = np.arange(start, end + 1)  # Generate the range from start to end
    gaussian = np.exp(-0.5 * ((x - center) ** 2) / (sigma ** 2))  # Gaussian formula
    gaussian = gaussian / gaussian.max() * peak_value  # Normalize to make the peak value equal to `peak_value`
    return gaussian

# Generate Gaussian distribution from 1 to 11 with peak at 6
start, end, center = 1, 21, 10
sigma = 10  # Standard deviation
gaussian_distribution = generate_gaussian(center, start, end, peak_value=1, sigma=sigma)

gaussian_distribution


array([0.66697681, 0.72614904, 0.78270454, 0.83527021, 0.8824969 ,
       0.92311635, 0.95599748, 0.98019867, 0.99501248, 1.        ,
       0.99501248, 0.98019867, 0.95599748, 0.92311635, 0.8824969 ,
       0.83527021, 0.78270454, 0.72614904, 0.66697681, 0.60653066,
       0.54607443])

In [4]:
zero_matrix_train = np.zeros((len(angel_labels_train),360))
zero_matrix_test = np.zeros((len(angel_labels_test),360))

# Define the Gaussian distribution function
def gaussian_labeling(label, num_classes=360):

    zero_row = np.zeros((num_classes))
    for i in range(21):
        center = label + i - 10
        if center < 0:
            center = 360 + center
        elif center > 359:
            center = center - 360
        zero_row[center] = gaussian_distribution[i]

    return zero_row

    

# set the method for encoding the labels
encode_method = "gaussian"

# encode the labels as an one-hot format
if encode_method == "one_hot":
    for i in range(len(angel_labels_train)):
        zero_matrix_train[i][int(angel_labels_train[i,3])] = 1
    for i in range(len(angel_labels_test)):
        zero_matrix_test[i][int(angel_labels_test[i,3])] = 1

    possibility_matrix_train = zero_matrix_train
    possibility_matrix_test = zero_matrix_test

# encode the labels as a gaussian distribution
else:

    # Generate the Gaussian distribution encoding
    possibility_matrix_train = zero_matrix_train
    possibility_matrix_test = zero_matrix_test

    for i in range(len(angel_labels_train)):
        ground_truth_angle = int(angel_labels_train[i, 3])  # Ground truth angle
        possibility_matrix_train[i,:] = gaussian_labeling(ground_truth_angle, 360)

    for i in range(len(angel_labels_test)):
        ground_truth_angle = int(angel_labels_test[i, 3])  # Ground truth angle
        possibility_matrix_test[i,:] = gaussian_labeling(ground_truth_angle, 360)


# possibility_matrix now contains the Gaussian-encoded labels
print(possibility_matrix_train.shape)
print(possibility_matrix_test.shape)


(98396, 360)
(19151, 360)


Gaussian label with lower resolution

In [5]:
possibility_matrix_train_180 = np.zeros((len(angel_labels_train),180))
possibility_matrix_test_180 = np.zeros((len(angel_labels_test),180))

# Define the Gaussian distribution function
for i in range(180):
    possibility_matrix_train_180[:,i] = possibility_matrix_train[:,2*i]
    possibility_matrix_test_180[:,i] = possibility_matrix_test[:,2*i]

print(possibility_matrix_train_180.shape)
print(possibility_matrix_test_180.shape)

(98396, 180)
(19151, 180)


### 2. Training Phase 

First we constrcut a fully connected neural network

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
print(torch.__version__)

# Define the dataset class
class GCCDataset(Dataset):
    def __init__(self, data, labels):
        """
        Args:
            data (np.ndarray): Input data of shape (num_samples, 51, 6).
            labels (np.ndarray): Ground truth labels (DOA indices, 0-359).
        """
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Define the model architecture
class GCCPhatModel(nn.Module):
    def __init__(self):
        super(GCCPhatModel, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(51 * 6, 1000)
        self.bn1 = nn.BatchNorm1d(1000)
        self.fc2 = nn.Linear(1000, 1000)
        self.bn2 = nn.BatchNorm1d(1000)
        self.fc3 = nn.Linear(1000, 1000)
        self.bn3 = nn.BatchNorm1d(1000)
        self.fc4 = nn.Linear(1000, 360)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.bn1(nn.ReLU()(self.fc1(x)))
        x = self.bn2(nn.ReLU()(self.fc2(x)))
        x = self.bn3(nn.ReLU()(self.fc3(x)))
        x = self.fc4(x)
        return self.softmax(x)
    
class GCCPhatModel_180(nn.Module):
    def __init__(self):
        super(GCCPhatModel_180, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(51 * 6, 1000)
        self.bn1 = nn.BatchNorm1d(1000)
        self.fc2 = nn.Linear(1000, 1000)
        self.bn2 = nn.BatchNorm1d(1000)
        self.fc3 = nn.Linear(1000, 500)
        self.bn3 = nn.BatchNorm1d(500)
        self.fc4 = nn.Linear(500, 180)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.bn1(nn.ReLU()(self.fc1(x)))
        x = self.bn2(nn.ReLU()(self.fc2(x)))
        x = self.bn3(nn.ReLU()(self.fc3(x)))
        x = self.fc4(x)
        return self.softmax(x)




2.1.1+cu121


Before the training phase, we want to verify whether our network can effectively learn features from the data.

Training Phase for 360 degree labels

In [None]:
# Generate synthetic data for demonstration (replace this with your actual data)
num_samples = len(possibility_matrix_train)
input_dim = (51, 6)
num_classes = 360

# Create datasets
train_data, train_labels = gcc_vectors_train.astype(np.float32), possibility_matrix_train.astype(np.float32)
test_data, test_labels = gcc_vectors_test.astype(np.float32), possibility_matrix_test.astype(np.float32)

train_dataset = GCCDataset(train_data, train_labels)
test_dataset = GCCDataset(test_data, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

# Initialize the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = GCCPhatModel().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-3)

# Training loop
num_epochs = 30
iter = 0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # print(f"Iter {iter}, Loss: {loss.item()}")
        iter += 1
        
        running_loss += loss.item()


    # Print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

cuda
Epoch 1/30, Loss: 0.0392
Epoch 2/30, Loss: 0.0386
Epoch 3/30, Loss: 0.0384
Epoch 4/30, Loss: 0.0384
Epoch 5/30, Loss: 0.0383
Epoch 6/30, Loss: 0.0383
Epoch 7/30, Loss: 0.0382
Epoch 8/30, Loss: 0.0382
Epoch 9/30, Loss: 0.0381
Epoch 10/30, Loss: 0.0381
Epoch 11/30, Loss: 0.0380
Epoch 12/30, Loss: 0.0381
Epoch 13/30, Loss: 0.0380
Epoch 14/30, Loss: 0.0380
Epoch 15/30, Loss: 0.0380
Epoch 16/30, Loss: 0.0379
Epoch 17/30, Loss: 0.0379
Epoch 18/30, Loss: 0.0379
Epoch 19/30, Loss: 0.0379
Epoch 20/30, Loss: 0.0379
Epoch 21/30, Loss: 0.0379
Epoch 22/30, Loss: 0.0379
Epoch 23/30, Loss: 0.0379
Epoch 24/30, Loss: 0.0378
Epoch 25/30, Loss: 0.0378
Epoch 26/30, Loss: 0.0378
Epoch 27/30, Loss: 0.0378
Epoch 28/30, Loss: 0.0378
Epoch 29/30, Loss: 0.0378
Epoch 30/30, Loss: 0.0378


Training Phase for 180 degree labels

In [13]:
# Generate synthetic data for demonstration (replace this with your actual data)
num_samples = len(possibility_matrix_train)
input_dim = (51, 6)
num_classes = 180

# Create datasets
train_data, train_labels = gcc_vectors_train.astype(np.float32), possibility_matrix_train_180.astype(np.float32)
test_data, test_labels = gcc_vectors_test.astype(np.float32), possibility_matrix_test_180.astype(np.float32)

train_dataset = GCCDataset(train_data, train_labels)
test_dataset = GCCDataset(test_data, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

# Initialize the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = GCCPhatModel_180().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-3)

# Training loop
num_epochs = 30
iter = 0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # print(f"Iter {iter}, Loss: {loss.item()}")
        iter += 1
        
        running_loss += loss.item()


    # Print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

cuda
Epoch 1/30, Loss: 45.2150
Epoch 2/30, Loss: 45.1350
Epoch 3/30, Loss: 45.1163
Epoch 4/30, Loss: 45.1151
Epoch 5/30, Loss: 45.1115
Epoch 6/30, Loss: 45.1103
Epoch 7/30, Loss: 45.0980
Epoch 8/30, Loss: 45.0963
Epoch 9/30, Loss: 45.0880
Epoch 10/30, Loss: 45.0919
Epoch 11/30, Loss: 45.0950
Epoch 12/30, Loss: 45.0844
Epoch 13/30, Loss: 45.0828
Epoch 14/30, Loss: 45.0864
Epoch 15/30, Loss: 45.0787
Epoch 16/30, Loss: 45.0778
Epoch 17/30, Loss: 45.0615
Epoch 18/30, Loss: 45.0684
Epoch 19/30, Loss: 45.0566
Epoch 20/30, Loss: 45.0640
Epoch 21/30, Loss: 45.0584
Epoch 22/30, Loss: 45.0592
Epoch 23/30, Loss: 45.0519
Epoch 24/30, Loss: 45.0520
Epoch 25/30, Loss: 45.0501
Epoch 26/30, Loss: 45.0445
Epoch 27/30, Loss: 45.0509
Epoch 28/30, Loss: 45.0490
Epoch 29/30, Loss: 45.0609
Epoch 30/30, Loss: 45.0638


### 3. Evaluation

Evaluation for 360 degree

In [85]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(torch.argmax(targets,dim=1).cpu().numpy())

# Calculate accuracy
# Calculate the difference (error) between predictions and labels
errors = np.array(all_preds) - np.array(all_labels)
errors = np.abs(errors)
for i in range(len(errors)):
    if errors[i] > 180:
        errors[i] = 360 - errors[i]
print(errors)

print(all_preds)
print(all_labels)
mean_error = np.mean(np.abs(errors))

correct = 0
for i in range (len(all_preds)):
    if np.abs(errors[i]) <= 10:
        correct += 1
accuracy = correct / len(all_preds)
# accuracy = accuracy_score(all_labels, all_preds)

print(f"Mean Error: {mean_error}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[  8   1   1 ...   5  42 114]
[17, 22, 22, 15, 15, 10, 13, 7, 1, 7, 170, 173, 2, 138, 356, 357, 7, 328, 2, 15, 19, 19, 22, 22, 28, 28, 19, 22, 39, 31, 28, 36, 41, 41, 38, 22, 36, 82, 89, 89, 89, 87, 89, 97, 105, 134, 134, 62, 333, 311, 308, 308, 318, 311, 308, 311, 311, 315, 311, 308, 314, 314, 308, 308, 316, 308, 308, 311, 315, 311, 126, 134, 126, 126, 130, 126, 126, 130, 130, 134, 130, 126, 126, 126, 126, 126, 24, 322, 322, 324, 321, 330, 251, 328, 322, 326, 326, 326, 331, 330, 329, 322, 316, 316, 126, 126, 131, 320, 322, 38, 324, 322, 330, 328, 320, 320, 320, 40, 330, 332, 320, 320, 43, 324, 326, 330, 327, 322, 324, 322, 329, 329, 327, 320, 322, 324, 324, 322, 311, 332, 320, 322, 33, 322, 326, 330, 329, 320, 43, 322, 328, 327, 329, 322, 322, 318, 320, 330, 327, 322, 322, 326, 324, 330, 330, 330, 322, 326, 322, 321, 321, 329, 322, 327, 324, 322, 321, 316, 328, 322, 325, 322, 324, 326, 327, 326, 322, 322, 316, 211, 126, 321, 320, 322, 324, 289, 326, 326, 327, 322, 320, 326, 329, 330, 

Evaluation for lower resolution

In [14]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(torch.argmax(targets,dim=1).cpu().numpy())

# Calculate accuracy
# Calculate the difference (error) between predictions and labels
errors = np.array(all_preds) - np.array(all_labels)
errors = np.abs(errors)
for i in range(len(errors)):
    if errors[i] > 90:
        errors[i] = 180 - errors[i]
print(errors)

print(all_preds)
print(all_labels)
mean_error = np.mean(np.abs(errors))

correct = 0
for i in range (len(all_preds)):
    if np.abs(errors[i]) <= 10:
        correct += 1
accuracy = correct / len(all_preds)
# accuracy = accuracy_score(all_labels, all_preds)

print(f"Mean Error: {mean_error}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[ 2  1  0 ...  0 62 27]
[10, 12, 10, 8, 5, 5, 5, 5, 178, 179, 85, 85, 179, 176, 178, 179, 2, 79, 91, 8, 8, 10, 10, 12, 12, 14, 10, 12, 19, 16, 16, 19, 20, 20, 20, 10, 16, 39, 41, 49, 49, 44, 49, 49, 53, 60, 67, 31, 178, 156, 155, 155, 156, 155, 155, 153, 155, 156, 155, 155, 156, 156, 155, 155, 155, 155, 155, 155, 156, 155, 61, 67, 67, 67, 59, 61, 61, 67, 67, 67, 61, 61, 60, 67, 61, 61, 2, 163, 163, 163, 163, 164, 2, 11, 163, 163, 163, 163, 163, 163, 11, 163, 159, 159, 61, 67, 70, 163, 163, 16, 115, 159, 81, 94, 159, 159, 112, 89, 11, 11, 159, 163, 163, 163, 163, 163, 163, 163, 159, 163, 163, 164, 163, 163, 163, 163, 163, 163, 67, 67, 163, 163, 2, 16, 163, 165, 163, 163, 19, 163, 163, 163, 117, 163, 163, 159, 164, 10, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 20, 112, 115, 159, 163, 163, 163, 159, 115, 91, 91, 163, 163, 159, 163, 163, 164, 163, 163, 159, 16, 168, 2, 118, 163, 163, 163, 146, 163, 163, 11, 163, 159, 16, 163, 164, 163, 159, 155, 20, 163, 178, 90, 159, 163, 163, 163

Here we start a markdown block to record the structure and the outcome of the neural network

| Index | Iterations | Linear Layers | Label | Batch Size | Mean Error | Accuracy |  Gaussian Range | Training Size |
|-------|------------|---------------|-------|------------|----------|----------|----------|----------|
|   1   |     20     |       4       |   one-hot   |     2048      |    23.42     |    76.73%     |    -    |    74876    |
|   2   |     100      |       4       |   one-hot   |    2048      |    25.39     |    75.89%     |    -     |    74876    |
|  3    |    1000      |      4       |  one-hot   |    2048     |    25.06     |    75.82%     |    -     |    74876    |
|  4   |    100      |      4       |  one-hot   |    1024      |    23.54     |    76.48%     |    -     |    74876    |
|  5   |    100      |      4       |  gaussian   |    2048      |    21.49     |   80.66%     |    90     |    74876    |
|  5   |    100      |      4       |  gaussian   |    2048      |    20.73     |   80.46%     |    60     |    74876    |
|  6   |    100      |      4       |  gaussian   |    2048      |    21.31     |   79.95%     |    30    |    74876    |
|  7   |    100      |      4       |  gaussian   |    1024      |    21.12     |   80.96%     |    60     |    74876    |
|  8   |    100      |      4       |  gaussian   |    1024      |    19.41     |   81.34%     |    60    |    93476    |
