In [2]:
workdir = "/Users/leon/workdir/ml-learning-projects/hw3"

# Preprocess Lofi music into Numpy Array format

In [9]:
import os
import os.path
import math
import threading
import torch
import torch.utils.data
import numpy as np
import librosa as lr
import bisect


class WavenetDataset(torch.utils.data.Dataset):
    def __init__(self,
                 dataset_file,
                 item_length,
                 target_length,
                 file_location=None,
                 classes=256,
                 sampling_rate=16000,
                 mono=True,
                 normalize=False,
                 dtype=np.uint8,
                 train=True,
                 test_stride=100):

        #           |----receptive_field----|
        #                                 |--output_length--|
        # example:  | | | | | | | | | | | | | | | | | | | | |
        # target:                           | | | | | | | | | |

        self.dataset_file = dataset_file
        self._item_length = item_length
        self._test_stride = test_stride
        self.target_length = target_length
        self.classes = classes

        if not os.path.isfile(dataset_file):
            assert file_location is not None, "no location for dataset files specified"
            self.mono = mono
            self.normalize = normalize

            self.sampling_rate = sampling_rate
            self.dtype = dtype
            self.create_dataset(file_location, dataset_file)
        else:
            # Unknown parameters of the stored dataset
            # TODO Can these parameters be stored, too?
            self.mono = None
            self.normalize = None

            self.sampling_rate = None
            self.dtype = None

        self.data = np.load(self.dataset_file, mmap_mode='r')
        self.start_samples = [0]
        self._length = 0
        self.calculate_length()
        self.train = train
        print("one hot input")
        # assign every *test_stride*th item to the test set

    def create_dataset(self, location, out_file):
        print("create dataset from audio files at", location)
        self.dataset_file = out_file
        files = list_all_audio_files(location)
        processed_files = []
        for i, file in enumerate(files):
            print("  processed " + str(i) + " of " + str(len(files)) + " files")
            file_data, _ = lr.load(path=file,
                                   sr=self.sampling_rate,
                                   mono=self.mono)
            if self.normalize:
                file_data = lr.util.normalize(file_data)
            quantized_data = quantize_data(file_data, self.classes).astype(self.dtype)
            processed_files.append(quantized_data)

        np.savez(self.dataset_file, *processed_files)

    def calculate_length(self):
        start_samples = [0]
        for i in range(len(self.data.keys())):
            start_samples.append(start_samples[-1] + len(self.data['arr_' + str(i)]))
        available_length = start_samples[-1] - (self._item_length - (self.target_length - 1)) - 1
        self._length = math.floor(available_length / self.target_length)
        self.start_samples = start_samples

    def set_item_length(self, l):
        self._item_length = l
        self.calculate_length()

    def __getitem__(self, idx):
        if self._test_stride < 2:
            sample_index = idx * self.target_length
        elif self.train:
            sample_index = idx * self.target_length + math.floor(idx / (self._test_stride-1))
        else:
            sample_index = self._test_stride * (idx+1) - 1

        file_index = bisect.bisect_left(self.start_samples, sample_index) - 1
        if file_index < 0:
            file_index = 0
        if file_index + 1 >= len(self.start_samples):
            print("error: sample index " + str(sample_index) + " is to high. Results in file_index " + str(file_index))
        position_in_file = sample_index - self.start_samples[file_index]
        end_position_in_next_file = sample_index + self._item_length + 1 - self.start_samples[file_index + 1]

        if end_position_in_next_file < 0:
            file_name = 'arr_' + str(file_index)
            this_file = np.load(self.dataset_file, mmap_mode='r')[file_name]
            sample = this_file[position_in_file:position_in_file + self._item_length + 1]
        else:
            # load from two files
            file1 = np.load(self.dataset_file, mmap_mode='r')['arr_' + str(file_index)]
            file2 = np.load(self.dataset_file, mmap_mode='r')['arr_' + str(file_index + 1)]
            sample1 = file1[position_in_file:]
            sample2 = file2[:end_position_in_next_file]
            sample = np.concatenate((sample1, sample2))

        example = torch.from_numpy(sample).type(torch.LongTensor)
        one_hot = torch.FloatTensor(self.classes, self._item_length).zero_()
        one_hot.scatter_(0, example[:self._item_length].unsqueeze(0), 1.)
        target = example[-self.target_length:].unsqueeze(0)
        return one_hot, target

    def __len__(self):
        test_length = math.floor(self._length / self._test_stride)
        if self.train:
            return self._length - test_length
        else:
            return test_length


def quantize_data(data, classes):
    mu_x = mu_law_encoding(data, classes)
    bins = np.linspace(-1, 1, classes)
    quantized = np.digitize(mu_x, bins) - 1
    return quantized


def list_all_audio_files(location):
    audio_files = []
    for dirpath, dirnames, filenames in os.walk(location):
        for filename in [f for f in filenames if f.endswith((".mp3", ".wav", ".aif", "aiff"))]:
            audio_files.append(os.path.join(dirpath, filename))

    if len(audio_files) == 0:
        print("found no audio files in " + location)
    return audio_files


def mu_law_encoding(data, mu):
    mu_x = np.sign(data) * np.log(1 + mu * np.abs(data)) / np.log(mu + 1)
    return mu_x


def mu_law_expansion(data, mu):
    s = np.sign(data) * (np.exp(np.abs(data) * np.log(mu + 1)) - 1) / mu
    return s

# WaveNet Model

In [4]:
from torch import nn

In [5]:
import torch
import numpy as np

class DilatedCausalConv1d(torch.nn.Module):
    """Dilated Causal Convolution for WaveNet"""
    def __init__(self, channels, dilation=1):
        super(DilatedCausalConv1d, self).__init__()

        self.conv1d = torch.nn.Conv1d(channels, channels,
                                    kernel_size=2, stride=1,  # Fixed for WaveNet
                                    dilation=dilation,
                                    padding=0,  # Fixed for WaveNet dilation
                                    bias=False)  # Fixed for WaveNet but not sure

    def forward(self, x):
        output = self.conv1d(x)
        return output


class CausalConv1d(torch.nn.Module):
    """Causal Convolution for WaveNet"""
    def __init__(self, in_channels, out_channels):
        super(CausalConv1d, self).__init__()
        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=2, stride=1, padding=1,
                                    bias=False)

    def forward(self, x):
        output = self.conv(x)

        # remove last value for causal convolution
        return output[:, :, :-1]


class ResidualBlock(torch.nn.Module):
    def __init__(self, res_channels, skip_channels, dilation):
        """
        Residual block
        :param res_channels: number of residual channel for input, output
        :param skip_channels: number of skip channel for output
        :param dilation:
        """
        super(ResidualBlock, self).__init__()

        self.dilated = DilatedCausalConv1d(res_channels, dilation=dilation)
        self.conv_res = torch.nn.Conv1d(res_channels, res_channels, 1)
        self.conv_skip = torch.nn.Conv1d(res_channels, skip_channels, 1)

        self.gate_tanh = torch.nn.Tanh()
        self.gate_sigmoid = torch.nn.Sigmoid()

    def forward(self, x, skip_size):
        """
        :param x:
        :param skip_size: The last output size for loss and prediction
        :return:
        """
        output = self.dilated(x)

        # PixelCNN gate
        gated_tanh = self.gate_tanh(output)
        gated_sigmoid = self.gate_sigmoid(output)
        gated = gated_tanh * gated_sigmoid

        # Residual network
        output = self.conv_res(gated)
        input_cut = x[:, :, -output.size(2):]
        output += input_cut

        # Skip connection
        skip = self.conv_skip(gated)
        skip = skip[:, :, -skip_size:]

        return output, skip


class ResidualStack(torch.nn.Module):
    def __init__(self, layer_size, stack_size, res_channels, skip_channels):
        """
        Stack residual blocks by layer and stack size
        :param layer_size: integer, 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        :param stack_size: integer, 5 = stack[layer1, layer2, layer3, layer4, layer5]
        :param res_channels: number of residual channel for input, output
        :param skip_channels: number of skip channel for output
        :return:
        """
        super(ResidualStack, self).__init__()

        self.layer_size = layer_size
        self.stack_size = stack_size
        # 5 = stack[layer1, layer2, layer3, layer4, layer5]
        # 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        self.res_blocks = [self._residual_block(res_channels, skip_channels, 2**l) for s in range(0, self.stack_size) for l in range(0, self.layer_size)]

    @staticmethod
    def _residual_block(res_channels, skip_channels, dilation):
        block = ResidualBlock(res_channels, skip_channels, dilation)

        if torch.cuda.is_available():
            block.cuda()

        return block

    def forward(self, x, skip_size):
        """
        :param x:
        :param skip_size: The last output size for loss and prediction
        :return:
        """
        output = x
        skip_connections = []

        for res_block in self.res_blocks:
            # output is the next input
            output, skip = res_block(output, skip_size)
            skip_connections.append(skip)

        return torch.stack(skip_connections)


class DenseNet(torch.nn.Module):
    def __init__(self, channels):
        """
        The last network of WaveNet
        :param channels: number of channels for input and output
        :return:
        """
        super(DenseNet, self).__init__()

        self.conv1 = torch.nn.Conv1d(channels, channels, 1)
        self.conv2 = torch.nn.Conv1d(channels, channels, 1)

        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        output = self.relu(x)
        output = self.conv1(output)
        output = self.relu(output)
        output = self.conv2(output)

        output = self.softmax(output)

        return output

In [6]:
class WaveNet(torch.nn.Module):
    def __init__(self, layer_size, block_size, in_channels, res_channels):
        """
        Stack residual blocks by layer and stack size
        :param layer_size: integer, 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        :param stack_size: integer, 5 = stack[layer1, layer2, layer3, layer4, layer5]
        :param in_channels: number of channels for input data. skip channel is same as input channel
        :param res_channels: number of residual channel for input, output
        :return:
        """
        super(WaveNet, self).__init__()

        self.receptive_fields = self.calc_receptive_fields(layer_size, block_size)

        self.causal = CausalConv1d(in_channels, res_channels)

        self.res_block = ResidualStack(layer_size, block_size, res_channels, in_channels)

        self.densenet = DenseNet(in_channels)

    @staticmethod
    def calc_receptive_fields(layer_size, block_size):
        layers = [2 ** i for i in range(0, layer_size)] * block_size
        num_receptive_fields = np.sum(layers)

        return int(num_receptive_fields)

    def calc_output_size(self, x):
        # B, L, C
        output_size = int(x.size(2)) - self.receptive_fields

        #self.check_input_size(x, output_size)

        return output_size

    # def check_input_size(self, x, output_size):
    #     if output_size < 1:
    #         raise InputSizeError(int(x.size(2)), self.receptive_fields, output_size)

    def forward(self, x):
        """
        The size of timestep(3rd dimention) has to be bigger than receptive fields
        :param x: Tensor[batch, timestep, channels]
        :return: Tensor[batch, timestep, channels]
        """
        output = x.transpose(1, 2)

        output_size = self.calc_output_size(output)

        output = self.causal(output)

        skip_connections = self.res_block(output, output_size)

        output = torch.sum(skip_connections, dim=0)

        output = self.densenet(output)

        return output.transpose(1, 2).contiguous()

# Training Loop

In [30]:
import os
from types import SimpleNamespace

class Trainer:
    def __init__(self, config):
        self.config = config
        self.net = WaveNet(config.layer_size, config.stack_size, config.in_channels, config.res_channels)

        self.in_channels = config.in_channels
        self.receptive_fields = self.net.receptive_fields
        self.output_length = 16
        self.lr = config.lr
        self.loss = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr)
        print(config.sample_size)
        # self.data_loader = DataLoader(config.data_dir, self.receptive_fields,
        #                               sample_size = config.sample_size,
        #                               sample_rate = config.sample_rate,
        #                               in_channels= config.in_channels,
        #                               batch_size = 1)

        dataset = WavenetDataset(dataset_file=workdir+"/training_data.npz",
                      item_length=self.receptive_fields + self.output_length,
                      target_length=self.output_length,
                      file_location='train_samples/bach_chaconne',
                      test_stride=500)
        self.data_loader = torch.utils.data.DataLoader(dataset,
                                                      batch_size=1,
                                                      shuffle=True,
                                                      num_workers=0,
                                                      pin_memory=False)
    def run(self, epochs = 10):
        total_steps = 0
        for current_epoch in range(epochs):
            for (inputs, targets) in iter(self.data_loader):
                # Tensor[batch, timestep, channels]
                inputs = inputs.transpose(1, 2).contiguous()
                outputs = self.net(inputs)
                print(inputs.shape, outputs.shape, targets.shape)
                loss = self.loss(outputs.view(-1, self.in_channels),
                                targets.long().view(-1))
    
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                total_steps += 1
    
                print('[{0}/{1}] loss: {2}'.format(total_steps, self.config.num_steps, loss.item()))
    
                if total_steps > self.config.num_steps:
                    break
            if total_steps > self.config.num_steps:
                    break
        model_path = os.path.join(self.config.model_dir, 'wavenet_{step}.pkl'.format(step = self.config.num_steps))
        torch.save(self.net.state_dict(), model_path)

config_defaults = SimpleNamespace(
    layer_size = 10,
    stack_size = 5,
    in_channels=256,
    res_channels = 512,
    sample_rate = 16000,
    sample_size=100000,
    lr = 0.0002,
    data_dir = workdir +"/audio_data",
    num_steps=100000,
    model_dir = workdir+"/model",
)
trainer = Trainer(config_defaults)

100000
one hot input


In [31]:
trainer.run()

torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[1/100000] loss: 5.544831275939941
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[2/100000] loss: 5.544738292694092
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[3/100000] loss: 5.544644832611084
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[4/100000] loss: 5.544552803039551
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[5/100000] loss: 5.544456958770752
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[6/100000] loss: 5.544362545013428
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[7/100000] loss: 5.5442633628845215
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[8/100000] loss: 5.544159412384033
torch.Size([1, 5131, 256]) torch.Size([1, 16, 256]) torch.Size([1, 1, 16])
[9/100000] loss: 5.5440545082092285
torch.Si

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105914f50>>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/hw3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 