Use your main Jupyter notebook file and train all three autoencoders using
the 40 training signals with the parameters given in Table 1.
Hint: Test different learning rates first, but use the same learning rate for all
three models in the end. Use the shuffle mode for the data loader.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader
# Autoencoder Models
class AE1(nn.Module):
    """Shallow autoencoder: 512 -> 16 -> 512, tanh after each linear layer."""

    def __init__(self):
        super().__init__()
        # Creation order is kept (encoder first) so seeded initialisation
        # yields the same weights.
        self.encoder = nn.Linear(512, 16)
        self.decoder = nn.Linear(16, 512)

    def forward(self, x):
        """Encode ``x`` to the 16-dimensional code and decode it back.

        The last dimension of ``x`` must be 512; the output has the same
        shape as the input and lies in [-1, 1] because of the final tanh.
        """
        code = torch.tanh(self.encoder(x))
        return torch.tanh(self.decoder(code))
class AE2(nn.Module):
    """Deep autoencoder: 512 -> 128 -> 64 -> 16 -> 64 -> 128 -> 512.

    Every linear layer, including the output layer, is followed by tanh.
    """

    def __init__(self):
        super().__init__()
        # Encoder: compress a 512-sample frame down to a 16-dimensional code
        self.enc1 = nn.Linear(512, 128)
        self.enc2 = nn.Linear(128, 64)
        self.enc3 = nn.Linear(64, 16)

        # Decoder: mirror of the encoder
        self.dec1 = nn.Linear(16, 64)
        self.dec2 = nn.Linear(64, 128)
        self.dec3 = nn.Linear(128, 512)

    def forward(self, x):
        """Run ``x`` (last dimension 512) through encoder and decoder."""
        pipeline = (
            self.enc1, self.enc2, self.enc3,
            self.dec1, self.dec2, self.dec3,
        )
        for layer in pipeline:
            x = torch.tanh(layer(x))
        return x
class AE3(nn.Module):
    """Wide autoencoder: 512 -> 384 -> 256 -> 128 -> 256 -> 384 -> 512.

    The bottleneck has 128 units; every linear layer, including the output
    layer, is followed by tanh.
    """

    def __init__(self):
        super().__init__()
        # Encoder: 512-sample frame down to a 128-dimensional code
        self.enc1 = nn.Linear(512, 384)
        self.enc2 = nn.Linear(384, 256)
        self.enc3 = nn.Linear(256, 128)

        # Decoder: mirror of the encoder
        self.dec1 = nn.Linear(128, 256)
        self.dec2 = nn.Linear(256, 384)
        self.dec3 = nn.Linear(384, 512)

    def forward(self, x):
        """Run ``x`` (last dimension 512) through encoder and decoder."""
        encoder = (self.enc1, self.enc2, self.enc3)
        decoder = (self.dec1, self.dec2, self.dec3)
        for layer in encoder + decoder:
            x = torch.tanh(layer(x))
        return x
# Custom Audio Dataset
class AudioDataset(Dataset):
    """Dataset that turns each audio file into a matrix of fixed-length frames.

    Each item is loaded with ``torchaudio.load``, truncated or zero-padded to
    ``signal_length`` samples, peak-normalised and then cut into frames of
    ``frame_length`` samples whose starts are ``frame_length - overlap``
    samples apart.

    Args:
        audio_files: Sequence of paths to the audio files.
        signal_length: Number of samples every signal is forced to.
        frame_length: Number of samples per frame.
        overlap: Number of samples shared by two consecutive frames.

    Raises:
        ValueError: If ``frame_length`` is not positive or ``overlap`` is not
            smaller than ``frame_length`` (the hop size would be <= 0).
    """

    def __init__(self, audio_files, signal_length, frame_length, overlap):
        # Fail early with a clear message instead of a ZeroDivisionError or a
        # negative tensor size when the first item is requested.
        if frame_length <= 0:
            raise ValueError(f"frame_length must be positive, got {frame_length}")
        if overlap >= frame_length:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than "
                f"frame_length ({frame_length})"
            )
        self.audio_files = audio_files
        self.signal_length = signal_length
        self.frame_length = frame_length
        self.overlap = overlap

    def __len__(self):
        """Return the number of audio files."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return its frames.

        Returns:
            Tensor of shape ``(num_frames, frame_length)``.
        """
        audio_path = self.audio_files[idx]
        # The sample rate reported by the file is ignored.
        waveform, _ = torchaudio.load(audio_path)
        # Truncate or zero-pad the signal
        waveform = self._adjust_length(waveform)
        # Normalize the signal
        waveform = self._normalize(waveform)
        # Segment into frames
        return self._segment_into_frames(waveform)

    def _adjust_length(self, waveform):
        """Truncate or right-zero-pad ``waveform`` along axis 1 to ``signal_length``."""
        current = waveform.shape[1]
        if current > self.signal_length:
            return waveform[:, :self.signal_length]
        if current < self.signal_length:
            return F.pad(waveform, (0, self.signal_length - current))
        return waveform

    def _normalize(self, waveform):
        """Scale ``waveform`` so that its largest absolute value is 1.

        The peak is taken over the whole tensor (all channels). An all-zero
        signal is returned unchanged to avoid dividing by zero.
        """
        max_val = torch.max(torch.abs(waveform))
        if max_val > 0:
            return waveform / max_val
        return waveform

    def _segment_into_frames(self, waveform):
        """Cut the first channel of ``waveform`` into frames.

        Only channel 0 is used. Trailing samples that do not fill a complete
        frame are dropped.

        Returns:
            Tensor of shape ``(num_frames, frame_length)`` in the default
            dtype; empty (0 frames) if the signal is shorter than one frame.
        """
        if waveform.shape[1] < self.frame_length:
            return torch.zeros((0, self.frame_length))
        step = self.frame_length - self.overlap
        # unfold yields a strided view of all sliding windows; copying it into
        # a fresh buffer gives an independent tensor.
        windows = waveform[0].unfold(0, self.frame_length, step)
        frames = torch.empty(windows.shape)
        frames.copy_(windows)
        return frames
# Training Function
def train(model, train_loader, epochs, device, lr=1e-3):
    """Train an autoencoder to reconstruct its input with MSE loss and Adam.

    Prints the loss of every batch and the mean batch loss of every epoch.

    Args:
        model: Module whose output has the same shape as its input.
        train_loader: Iterable yielding input tensors (one batch per item).
        epochs: Number of passes over ``train_loader``.
        device: Device the model and the batches are moved to.
        lr: Learning rate for Adam. The default (1e-3) is Adam's own default,
            so calls that omit it behave as before.

    Returns:
        None. The model is updated in place.
    """
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        # Counted in the loop so loaders without __len__ work and an empty
        # loader does not cause a division by zero.
        num_batches = 0
        for batch_idx, data in enumerate(train_loader):
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)
            # Autoencoder objective: the target is the input itself.
            loss = criterion(output, data)
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {batch_loss}")
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch}, Average Loss: {avg_loss}")
# Testing Function
def test(model, test_loader, device, overlap=None):
    """Evaluate an autoencoder by the SNR of its overlap-added reconstructions.

    For every signal, the original frames and the reconstructed frames are
    both turned back into a time signal with ``overlap_add`` and compared with
    ``calculate_snr``. Prints the SNR of each signal and the average.

    Args:
        model: Trained autoencoder.
        test_loader: Iterable yielding frame tensors. A 3-D batch is treated
            as ``(signals, frames, frame_length)`` and a 2-D tensor as a
            single signal ``(frames, frame_length)``.
        device: Device the model is evaluated on.
        overlap: Frame overlap in samples used for reconstruction. When
            omitted, ``test_loader.dataset.overlap`` is used if present,
            otherwise 0.

    Returns:
        None.
    """
    if overlap is None:
        # Previously this was read from an undefined global; the dataset that
        # produced the frames is the authoritative source for it.
        overlap = getattr(getattr(test_loader, "dataset", None), "overlap", 0)
    model = model.to(device)
    model.eval()
    total_snr = 0.0
    count = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            reconstructed = model(data)
            # overlap_add is called on CPU tensors here, one signal at a time.
            originals = data.cpu()
            estimates = reconstructed.cpu()
            if originals.dim() == 2:
                originals = originals.unsqueeze(0)
                estimates = estimates.unsqueeze(0)
            for frames, frames_hat in zip(originals, estimates):
                original_signal = overlap_add(frames, overlap)
                reconstructed_signal = overlap_add(frames_hat, overlap)
                snr = calculate_snr(original_signal, reconstructed_signal)
                print(f"Signal {count}, SNR: {snr}")
                total_snr += snr
                count += 1
    # An empty loader yields NaN instead of a ZeroDivisionError.
    avg_snr = total_snr / count if count else float("nan")
    print(f"Average SNR: {avg_snr}")
# Helper Functions
def calculate_snr(original, reconstructed):
    """Return the signal-to-noise ratio in dB as a Python float.

    The noise is the difference ``original - reconstructed``; the SNR is
    ``10 * log10(mean(original**2) / mean(noise**2))``.
    """
    error = original - reconstructed
    power_ratio = original.pow(2).mean() / error.pow(2).mean()
    return (10 * torch.log10(power_ratio)).item()
def overlap_add(frames, overlap, window_fn=torch.hann_window):
    """Rebuild a 1-D signal from frames by windowed overlap-add.

    Each frame is multiplied by ``window_fn(frame_length)`` and summed into
    the output at offset ``i * (frame_length - overlap)``. No normalisation by
    the summed window is applied.

    Args:
        frames: Tensor of shape ``(num_frames, frame_length)``.
        overlap: Number of samples shared by two consecutive frames.
        window_fn: Callable mapping a length to a 1-D window tensor.

    Returns:
        1-D tensor of length ``step * (num_frames - 1) + frame_length`` on the
        same device as ``frames``. Its dtype is that of ``frames`` when that is
        floating point, otherwise the default dtype.
    """
    frame_length = frames.shape[1]
    step = frame_length - overlap
    signal_length = step * (frames.shape[0] - 1) + frame_length
    # Keep buffer and window on the frames' device and precision so float64
    # input is not silently downcast and non-CPU input does not fail.
    dtype = frames.dtype if frames.is_floating_point() else torch.get_default_dtype()
    signal = torch.zeros(signal_length, dtype=dtype, device=frames.device)
    window = window_fn(frame_length).to(device=frames.device, dtype=dtype)
    for i, frame in enumerate(frames):
        start = i * step
        signal[start:start + frame_length] += frame * window
    return signal

import os
# The recordings are assumed to be sampled at 16 kHz
sampling_rate = 16000
# Target duration of every signal: 6 s
signal_length_seconds = 6
# Target duration expressed in samples
signal_length_samples = signal_length_seconds * sampling_rate

def get_audio_files(directory_path):
    """Return the sorted paths of all WAV files directly inside a directory.

    The extension is matched case-insensitively (``.wav``, ``.WAV``, ...),
    sub-directories are skipped, and the result is sorted so the file order,
    and therefore dataset indices, is reproducible across runs and platforms.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        List of paths, each ``directory_path`` joined with a file name.
    """
    wav_paths = []
    for file in os.listdir(directory_path):
        path = os.path.join(directory_path, file)
        if file.lower().endswith('.wav') and os.path.isfile(path):
            wav_paths.append(path)
    return sorted(wav_paths)
# Folder on the local machine that holds the training signals
directory_path = r'C:\Kursmaterial\Dl der Sprachsignalverarbeitung\Computerübung 2\signals\Train_40'
audio_files = get_audio_files(directory_path)

# Framing: 512 samples per frame (the input width of all three models),
# no overlap between consecutive frames
frame_length = 512

# Each dataset item is one whole signal cut into frames; the loader shuffles
# the signals and groups up to 32 of them per batch
train_dataset = AudioDataset(
    audio_files,
    signal_length_samples,
    frame_length,
    overlap=0,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# Settings shared by all three training runs
device = torch.device("cpu")
epochs = 20

# Each model is created right before it is trained so that the order in which
# random numbers are consumed stays the same.

# Training AE1
model1 = AE1()
train(model1, train_loader, epochs, device)

# Training AE2
model2 = AE2()
train(model2, train_loader, epochs, device)

# Training AE3
model3 = AE3()
train(model3, train_loader, epochs, device)


Epoch 0, Batch 0, Loss: 0.03363988921046257
Epoch 0, Batch 1, Loss: 0.032993778586387634
Epoch 0, Average Loss: 0.0333168338984251
Epoch 1, Batch 0, Loss: 0.03238619863986969
Epoch 1, Batch 1, Loss: 0.03243592754006386
Epoch 1, Average Loss: 0.032411063089966774
Epoch 2, Batch 0, Loss: 0.031663138419389725
Epoch 2, Batch 1, Loss: 0.03051569126546383
Epoch 2, Average Loss: 0.031089414842426777
Epoch 3, Batch 0, Loss: 0.03043435886502266
Epoch 3, Batch 1, Loss: 0.03110460750758648
Epoch 3, Average Loss: 0.03076948318630457
Epoch 4, Batch 0, Loss: 0.029657600447535515
Epoch 4, Batch 1, Loss: 0.03032148815691471
Epoch 4, Average Loss: 0.029989544302225113
Epoch 5, Batch 0, Loss: 0.029263639822602272
Epoch 5, Batch 1, Loss: 0.0282729621976614
Epoch 5, Average Loss: 0.028768301010131836
Epoch 6, Batch 0, Loss: 0.028167439624667168
Epoch 6, Batch 1, Loss: 0.029198693111538887
Epoch 6, Average Loss: 0.028683066368103027
Epoch 7, Batch 0, Loss: 0.027789399027824402
Epoch 7, Batch 1, Loss: 0.027