## This is a rough training process for sound localization.

In this simplified model, we use GCC-PHAT as input and employ only fully connected layers to predict the sound's direction (localization).

### 1. Get the data from the dataset first

In [6]:
import h5py
import numpy as np

h5_file_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\data\training.h5"

with h5py.File(h5_file_path, 'r') as h5file:
    # Iterate through each dataset in the H5 file
    for dataset_name in h5file:
        # Get the dataset

        dataset = h5file[dataset_name]

        if dataset_name == "label":
            labels = np.array(dataset)
        elif dataset_name == "gcc_vectors":
            gcc_vectors = np.array(dataset)
        
        # Print the dataset name and its shape
        print(f"Dataset: {dataset_name}, Shape: {dataset.shape}")

Dataset: audio, Shape: (78817, 2400, 4)
Dataset: gcc_vectors, Shape: (78817, 51, 6)
Dataset: label, Shape: (78817, 5)


In [7]:
print(gcc_vectors.shape)
print(labels.shape)

(78817, 51, 6)
(78817, 5)


Now we transfer the angle information to one-code format

In [8]:
zero_matrix = np.zeros((len(labels),360))
for i in range(len(labels)):
    zero_matrix[i][int(labels[i,3])] = 1

possibility_matrix = zero_matrix


### 2. Start the training process

First we constrcut a fully connected neural network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Define the dataset class
class GCCDataset(Dataset):
    def __init__(self, data, labels):
        """
        Args:
            data (np.ndarray): Input data of shape (num_samples, 51, 6).
            labels (np.ndarray): Ground truth labels (DOA indices, 0-359).
        """
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Define the model architecture
class GCCPhatModel(nn.Module):
    def __init__(self):
        super(GCCPhatModel, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(51 * 6, 2000)
        self.bn1 = nn.BatchNorm1d(2000)
        self.fc2 = nn.Linear(2000, 1000)
        self.bn2 = nn.BatchNorm1d(1000)
        self.fc3 = nn.Linear(1000, 500)
        self.bn3 = nn.BatchNorm1d(500)
        self.fc4 = nn.Linear(500, 360)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.bn1(nn.ReLU()(self.fc1(x)))
        x = self.bn2(nn.ReLU()(self.fc2(x)))
        x = self.bn3(nn.ReLU()(self.fc3(x)))
        x = self.fc4(x)
        return self.softmax(x)

# Generate synthetic data for demonstration (replace this with your actual data)
num_samples = len(possibility_matrix)
input_dim = (51, 6)
num_classes = 360

# Random GCC-PHAT data and labels
data = gcc_vectors.astype(np.float32)
labels = possibility_matrix.astype(np.float32)

# Split the data into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.05, random_state=42)
print(train_data.shape, test_data.shape)

# Create DataLoaders
train_dataset = GCCDataset(train_data, train_labels)
test_dataset = GCCDataset(test_data, test_labels)

train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

# Initialize the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GCCPhatModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    # Print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")



(74876, 51, 6) (3941, 51, 6)
Epoch 1/200, Loss: 5.7081
Epoch 2/200, Loss: 5.5220
Epoch 3/200, Loss: 5.4621
Epoch 4/200, Loss: 5.4309
Epoch 5/200, Loss: 5.4082
Epoch 6/200, Loss: 5.3907
Epoch 7/200, Loss: 5.3763
Epoch 8/200, Loss: 5.3642
Epoch 9/200, Loss: 5.3541
Epoch 10/200, Loss: 5.3433
Epoch 11/200, Loss: 5.3352
Epoch 12/200, Loss: 5.3263
Epoch 13/200, Loss: 5.3184
Epoch 14/200, Loss: 5.3130
Epoch 15/200, Loss: 5.3067
Epoch 16/200, Loss: 5.3000
Epoch 17/200, Loss: 5.2946
Epoch 18/200, Loss: 5.2905
Epoch 19/200, Loss: 5.2855
Epoch 20/200, Loss: 5.2803
Epoch 21/200, Loss: 5.2758
Epoch 22/200, Loss: 5.2720
Epoch 23/200, Loss: 5.2674
Epoch 24/200, Loss: 5.2648
Epoch 25/200, Loss: 5.2598
Epoch 26/200, Loss: 5.2579
Epoch 27/200, Loss: 5.2541
Epoch 28/200, Loss: 5.2505
Epoch 29/200, Loss: 5.2477
Epoch 30/200, Loss: 5.2439
Epoch 31/200, Loss: 5.2405
Epoch 32/200, Loss: 5.2377
Epoch 33/200, Loss: 5.2346
Epoch 34/200, Loss: 5.2319
Epoch 35/200, Loss: 5.2292
Epoch 36/200, Loss: 5.2260
Epoch 37

In [51]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(torch.argmax(targets,dim=1).cpu().numpy())

# Calculate accuracy
# Calculate the difference (error) between predictions and labels
errors = np.array(all_preds) - np.array(all_labels)
print(all_preds)
print(all_labels)
mean_error = np.mean(np.abs(errors))
print(errors)
accuracy = accuracy_score(all_labels, all_preds)

print(f"Mean Error: {mean_error}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[161, 59, 141, 154, 332, 61, 158, 103, 258, 171, 203, 97, 166, 343, 86, 42, 227, 92, 15, 287, 59, 315, 29, 97, 205, 110, 209, 242, 168, 245, 27, 29, 232, 313, 52, 252, 201, 357, 59, 197, 28, 259, 346, 348, 153, 329, 23, 355, 232, 166, 255, 183, 226, 27, 22, 174, 6, 52, 109, 44, 207, 47, 62, 52, 81, 196, 207, 213, 337, 60, 154, 314, 283, 52, 241, 25, 242, 21, 203, 264, 168, 119, 109, 101, 265, 29, 198, 213, 101, 27, 111, 197, 156, 201, 356, 242, 216, 232, 226, 357, 157, 122, 0, 197, 281, 344, 43, 153, 160, 305, 227, 155, 231, 50, 29, 10, 142, 63, 97, 172, 351, 0, 161, 328, 333, 199, 188, 269, 273, 39, 209, 25, 339, 20, 25, 305, 14, 157, 102, 11, 258, 305, 7, 0, 110, 219, 262, 329, 313, 211, 240, 254, 58, 311, 333, 29, 21, 111, 309, 157, 215, 304, 99, 136, 95, 200, 196, 182, 110, 2, 14, 356, 207, 52, 78, 313, 151, 331, 332, 97, 339, 207, 187, 148, 295, 313, 173, 221, 38, 114, 354, 356, 340, 27, 135, 345, 354, 194, 103, 248, 231, 188, 0, 145, 177, 303, 355, 171, 154, 264, 227, 200, 125, 0