## Load Data

The h5 file contains three component: 

1. raw audio data

2. label (direction of arrival)

3. stft (short time fourier transform for 100ms split)

In [1]:
import h5py
import numpy as np

# Path to the H5 file
h5_file_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\data\stft.h5"

# Open the H5 file
with h5py.File(h5_file_path, 'r') as h5file:
    # Iterate through each dataset in the H5 file
    for dataset_name in h5file:
        # Get the dataset
        dataset = h5file[dataset_name]
        if dataset_name == 'audio':
            audio_data = np.array(dataset)
        elif dataset_name == 'label':
            label_data = np.array(dataset)
        elif dataset_name == 'stft':
            stft_data = np.array(dataset)
        
        # Print the dataset name and its shape
        print(f"Dataset: {dataset_name}, Shape: {dataset.shape}")

Dataset: audio, Shape: (98396, 2400, 4)
Dataset: label, Shape: (98396, 5)
Dataset: stft, Shape: (98396, 8, 168, 7)


## Label Encoding 

Encode the label to a Gaussian Distribution 

In [2]:
def generate_gaussian(center, start, end, peak_value=1, sigma=1):

    x = np.arange(start, end + 1)  # Generate the range from start to end
    gaussian = np.exp(-0.5 * ((x - center) ** 2) / (sigma ** 2))  # Gaussian formula
    gaussian = gaussian / gaussian.max() * peak_value  # Normalize to make the peak value equal to `peak_value`
    return gaussian

# Generate Gaussian distribution from 1 to 11 with peak at 6
start, end, center = 1, 31, 15
sigma = 10  # Standard deviation
gaussian_distribution = generate_gaussian(center, start, end, peak_value=1, sigma=sigma)

gaussian_distribution

array([0.3753111 , 0.42955736, 0.48675226, 0.54607443, 0.60653066,
       0.66697681, 0.72614904, 0.78270454, 0.83527021, 0.8824969 ,
       0.92311635, 0.95599748, 0.98019867, 0.99501248, 1.        ,
       0.99501248, 0.98019867, 0.95599748, 0.92311635, 0.8824969 ,
       0.83527021, 0.78270454, 0.72614904, 0.66697681, 0.60653066,
       0.54607443, 0.48675226, 0.42955736, 0.3753111 , 0.32465247,
       0.2780373 ])

In [3]:
angle_label =  label_data[:,3]
print(angle_label.shape)
zero_matrix_train = np.zeros((len(angle_label),360))

# Define the Gaussian distribution function
def gaussian_labeling(label, num_classes=360):

    zero_row = np.zeros((num_classes))
    for i in range(31):
        center = label + i - 15
        if center < 0:
            center = 360 + center
        elif center > 359:
            center = center - 360
        zero_row[center] = gaussian_distribution[i]*20

    return zero_row

    

# set the method for encoding the labels
encode_method = "gaussian"

# encode the labels as a gaussian distribution
possibility_matrix_angle = zero_matrix_train

for i in range(len(angle_label)):
    ground_truth_angle = int(angle_label[i])  # Ground truth angle
    possibility_matrix_angle[i,:] = gaussian_labeling(ground_truth_angle, 360)


# possibility_matrix now contains the Gaussian-encoded labels
print(possibility_matrix_angle.shape)
print(np.argmax(possibility_matrix_angle[0]))


(98396,)
(98396, 360)
194


## Model Construction

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=(3, 3), padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=(3, 3), padding=1)
        self.bn2 = nn.BatchNorm2d(channels)
    
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = F.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out += residual  # Shortcut connection
        out = F.relu(out)
        return out

class DOA_Network(nn.Module):
    def __init__(self, input_channels=8, time_steps=7, doa_bins=360):
        super(DOA_Network, self).__init__()
        
        # 1x7 Convolution, stride (1,3), output channels 32
        self.conv1 = nn.Conv2d(in_channels=input_channels, out_channels=32, kernel_size=(1, 7), stride=(1, 3))
        self.bn1 = nn.BatchNorm2d(32)
        
        # 1x5 Convolution, stride (1,2), output channels 128
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=128, kernel_size=(1, 5), stride=(1, 2))
        self.bn2 = nn.BatchNorm2d(128)
        
        # 5 Residual Blocks
        self.res_blocks = nn.Sequential(*[ResidualBlock(128) for _ in range(5)])
        
        # 1x1 Convolution, output channels 360 (DOA bins)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=doa_bins, kernel_size=(1, 1))
        self.bn3 = nn.BatchNorm2d(doa_bins)
        
        # 1x1 Convolution, output channels 500
        self.conv4 = nn.Conv2d(in_channels=25, out_channels=500, kernel_size=(1, 1))
        self.bn4 = nn.BatchNorm2d(500)
        
        # 7x5 Convolution, output channels 1 (Final Spatial Spectrum Output)
        self.conv5 = nn.Conv2d(in_channels=500, out_channels=1, kernel_size=(7, 5), padding=(0, 2))
        
    def forward(self, x):
        # Initial convolutions
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        
        # Residual blocks
        x = self.res_blocks(x)
        
        # 1x1 Convolution to DOA bins
        x = F.relu(self.bn3(self.conv3(x)))
        
        # Swap axes (B, T, DOA, F) -> (B, T, F, DOA)
        stage1_x = x
        x = x.permute(0, 3, 2, 1) 
        
        # 1x1 Convolution
        x = F.relu(self.bn4(self.conv4(x)))
        
        # Final 7x5 Convolution with Sigmoid activation
        x = torch.sigmoid(self.conv5(x))
        
        # Remove channel dimension and return (B, T, DOA)
        x = x.squeeze(1).squeeze(1)
        return x, stage1_x

# Example usage
if __name__ == "__main__":
    # Define input dimensions (Batch, Channels, Time, Frequency)
    batch_size = 4
    input_tensor = torch.randn(batch_size, 8, 7, 168)  # (B, C, T, F)
    
    model = DOA_Network()
    output,xx = model(input_tensor)
    print("Output shape:", output.shape)  # Expected: (B, T, 360)


Output shape: torch.Size([4, 360])


## Loss Function Definition

First we define a loss function for the first training stage, then we have to ensure that the loss function has a gradient that could be used in the backward session.

In [18]:
import torch 
class Stage1_loss(nn.Module):
    def __init__(self):
        super(Stage1_loss, self).__init__()
        self.mse = nn.MSELoss(reduction='none')  # Calculate MSE loss without reduction

    def forward(self, y_pred, y_true):
        """
        Compute Stage1 Loss:
        1. Calculate MSE loss for each element in the (7, 25) region.
        2. Average the loss over the (7, 25) region.
        3. Average the loss over all samples and all feature dimensions to get a scalar loss.

        Parameters:
        - y_pred: (batch_size, 360, 7, 25) - Predicted tensor
        - y_true: (batch_size, 360) - Ground truth tensor

        Returns:
        - final_loss: Scalar loss value
        """
        
        # Expand y_true to match y_pred shape (batch_size, 360, 7, 25)
        y_true_expanded = y_true.unsqueeze(-1).unsqueeze(-1)  # (4, 360, 1, 1)

        # Calculate MSE loss element-wise: result shape -> (batch_size, 360, 7, 25)
        loss = self.mse(y_pred, y_true_expanded)

        # Compute the mean loss over the last two dimensions (7, 25) -> (batch_size, 360)
        loss_mean_per_feature = loss.mean(dim=(-1, -2))

        # Compute the mean loss over all samples and features -> scalar
        final_loss = loss_mean_per_feature.mean()

        return final_loss


# Instantiate the loss function
loss_fn = Stage1_loss()
y_pred = torch.randn((4, 360, 7, 25),requires_grad=True)  # Random prediction
y_true = torch.randn(4, 360)  # Random ground truth

# Compute the loss
loss = loss_fn(y_pred, y_true)

print("Loss:", loss.item())

# Attempt backpropagation
loss.backward()  # Should work correctly
print("Backward successful!")

Loss: 2.016645908355713
Backward successful!


## Training 

### Training Stage 1

In this part, our goal is to train the intermediate representation of shape (360, 7, 25) to approximate a (1, 360) vector that encodes directional information. From my understanding, training on a larger, more detailed intermediate representation first can simplify the learning process, making it easier for the neural network to converge compared to directly training the final (1, 360) output.

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
    
# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        index = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs,stage1 = model(inputs)
            # print(stage1[0,:,0,0])
            loss = criterion(stage1, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            index += 1
            print("Epoch : ", epoch, ' set : ', {index+1}/{len(train_loader)}, ' loss : ',loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")

# Prediction function
def predict(model, input_tensor, device):
    model.eval()
    with torch.no_grad():
        input_tensor = input_tensor.to(device)
        output = model(input_tensor)
    return output.cpu()

# Example usage
if __name__ == "__main__":
    # Define input dimensions (Batch, Channels, Time, Frequency)
    batch_size = 256
    input_tensor = torch.from_numpy(stft_data).float()
    input_tensor = input_tensor.permute(0, 1, 3, 2)  # Swap axes (B, F, T, C)
    target_tensor = torch.from_numpy(possibility_matrix_angle).float()
    
    # Create dataset and dataloader
    dataset = CustomDataset(input_tensor, target_tensor)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # Initialize model, loss function, and optimizer

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # torch.cuda.set_per_process_memory_fraction(0.6, device=device)

    model = DOA_Network().to(device)
    criterion = Stage1_loss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Train the model
    train_model(model, train_loader, criterion, optimizer, num_epochs=1)

# save the weight of the model
model_save_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\model\DOA_Network_stage1.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model weights saved to {model_save_path}")




tensor([1.2773, 5.3297, 0.0000, 0.0000, 4.3890, 2.0182, 4.2633, 0.0000, 0.0000,
        2.2295, 2.5801, 0.0000, 2.1293, 0.0000, 3.7427, 1.5188, 0.0000, 0.0000,
        0.0000, 0.0000, 1.6859, 0.0000, 1.0552, 3.8791, 0.0000, 0.0000, 0.0000,
        2.3445, 4.9974, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 5.4648, 5.1829,
        0.0000, 0.3053, 1.5380, 0.0000, 1.2829, 5.1175, 0.0000, 0.0000, 0.0000,
        3.3493, 1.5113, 0.0000, 0.0000, 4.4868, 0.0000, 0.6901, 0.0000, 1.6784,
        1.4369, 0.0000, 0.0000, 1.3604, 0.0000, 0.5442, 0.0000, 0.7306, 0.4645,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.6278,
        3.9944, 0.8156, 4.1579, 0.0000, 0.0000, 1.2331, 0.0000, 5.7358, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 3.5243, 0.0000, 0.0000, 0.0000, 2.0047,
        6.8019, 0.0000, 0.0000, 3.8727, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        1.4054, 0.0000, 0.0000, 0.0000, 0.0000, 0.2576, 0.0000, 0.6519, 4.4978,
        0.0000, 3.2014, 2.1434, 2.6714, 

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch :  0  set :  1  loss :  tensor(19.1189, device='cuda:0', grad_fn=<MeanBackward0>)
tensor([1.1844e+00, 0.0000e+00, 0.0000e+00, 7.2099e-02, 2.0226e+00, 0.0000e+00,
        6.0158e-01, 0.0000e+00, 0.0000e+00, 7.0255e-01, 6.2791e-01, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.1403e+00, 0.0000e+00,
        1.1624e+00, 1.9019e-02, 0.0000e+00, 0.0000e+00, 3.2085e-01, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 7.8887e-02,
        0.0000e+00, 3.5850e-01, 3.1175e-01, 5.3338e-01, 6.8320e-01, 1.8556e+00,
        0.0000e+00, 0.0000e+00, 1.8468e-01, 4.4237e-01, 7.3231e-01, 5.4574e-02,
        0.0000e+00, 7.6831e-01, 2.4595e-01, 2.4080e-01, 0.0000e+00, 2.4369e+00,
        1.0114e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.6963e-01,
        9.0712e-01, 0.0000e+00, 2.1606e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        1.7959e+00, 0.0000e+00, 

  return F.mse_loss(input, target, reduction=self.reduction)


### Training Stage 2

In the second training phase, based on the previously trained model, we use the MSE loss to measure the difference between the model's output (1, 360) and the corresponding label (1, 360).

In [None]:
# Training function for continued training
def stage2_training(model, train_loader, criterion, optimizer, device, num_epochs=1):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            # Forward pass with the model
            outputs, stage1 = model(inputs)

            # Compute loss using Stage 1 outputs
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Print progress
            print(f"Epoch: {epoch}, Batch: {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

        # Print average loss per epoch
        print(f"Epoch [{epoch}/{num_epochs}], Average Loss: {total_loss / len(train_loader):.4f}")

    print("stage2 training completed.")

# set the MSEloss as the criterion for the stage 2 training
criterion = nn.MSELoss()

# set the epoch for the second stage
num_epochs = 1

# Continue training the model
stage2_training(model, train_loader, criterion, optimizer, device, num_epochs)

# save the weight of the model
model_save_path = r"C:\Users\grizi\Desktop\TUD\year2\thesis\neural_network\DoA_Net\model\DOA_Network_stage2.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model weights saved to {model_save_path}")




Epoch: 0, Batch: 1/385, Loss: 18.5625
Epoch: 0, Batch: 2/385, Loss: 17.1524
Epoch: 0, Batch: 3/385, Loss: 17.1528
Epoch: 0, Batch: 4/385, Loss: 17.1760
Epoch: 0, Batch: 5/385, Loss: 17.2106
Epoch: 0, Batch: 6/385, Loss: 17.2451
Epoch: 0, Batch: 7/385, Loss: 17.1840
Epoch: 0, Batch: 8/385, Loss: 17.1828
Epoch: 0, Batch: 9/385, Loss: 17.1506
Epoch: 0, Batch: 10/385, Loss: 17.1762
Epoch: 0, Batch: 11/385, Loss: 17.1623
Epoch: 0, Batch: 12/385, Loss: 17.1397
Epoch: 0, Batch: 13/385, Loss: 17.1287
Epoch: 0, Batch: 14/385, Loss: 17.1839
Epoch: 0, Batch: 15/385, Loss: 17.1702
Epoch: 0, Batch: 16/385, Loss: 17.1208
Epoch: 0, Batch: 17/385, Loss: 17.1849
Epoch: 0, Batch: 18/385, Loss: 17.1661
Epoch: 0, Batch: 19/385, Loss: 17.1394
Epoch: 0, Batch: 20/385, Loss: 17.1140
Epoch: 0, Batch: 21/385, Loss: 17.1193
Epoch: 0, Batch: 22/385, Loss: 17.1487
Epoch: 0, Batch: 23/385, Loss: 17.1075
Epoch: 0, Batch: 24/385, Loss: 17.1119
Epoch: 0, Batch: 25/385, Loss: 17.1580
Epoch: 0, Batch: 26/385, Loss: 17.

NameError: name 'start_epoch' is not defined

### Testing 

First we need to define a new prediction function for our model which has two output (intermediate output and final output)

In [21]:
def predict(model, input_tensor, device):
    model.eval()
    with torch.no_grad():
        input_tensor = input_tensor.to(device)
        output, stage1_output = model(input_tensor)  # Unpack the tuple
    return output.cpu(), stage1_output.cpu()  # Apply .cpu() to each tensor


In [None]:
# Prediction example
test_input = input_tensor[:2,:,:,:]
prediction, stage1_output = predict(model, test_input, device)
print("Prediction shape:", prediction.shape)
print("Stage1 output shape:", stage1_output.shape)
print(prediction.shape)

max_index = torch.argmax(prediction[1,:]).item()
print(f"Maximum value index: {max_index}")
# print(target_tensor[0,:])

max_index = torch.argmax(target_tensor[0,:]).item()
print(f"Maximum value index: {max_index}")

Prediction shape: torch.Size([2, 360])
Stage1 output shape: torch.Size([2, 360, 7, 25])
torch.Size([2, 360])
Maximum value index: 176
Maximum value index: 194
