# Creating 2-Dimensional Audio Feature Vectors for Clustering

This notebook borrows heavily from the following three sources:

- [Audio Signal Processing for Machine Learning](https://www.youtube.com/playlist?list=PL-wATfeyAMNqIee7cH3q1bh4QJFAaeNv0)
- [librosa](https://librosa.org/doc/main/index.html)
- [Audio Feature Extraction](https://devopedia.org/audio-feature-extraction)

In [1]:
from IPython.display import display, HTML, Latex
# Inject a CSS rule that sets the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Show matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

class StopExecution(Exception):
    """Exception for stopping a cell early with a short notice.

    NOTE(review): `_render_traceback_` is presumably picked up by IPython as a
    custom traceback renderer - confirm against the IPython version in use.
    """

    def _render_traceback_(self):
        """Print a one-line termination message and return None."""
        print("Process Terminated")

### Env Setup

In [1]:
import os
import math
import numpy as np
import torch
import torchvision
import librosa
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import librosa.display
from tempfile import mktemp

In [3]:
# --- Dataset locations -------------------------------------------------------
ROOT = "/home/ec2-user/tugboat_interval_ds"
TUGBOAT_PATH, NOISE_PATH = (
    os.path.join(ROOT, subdir) for subdir in ("tugboat", "no_tugboat")
)

# --- Audio analysis parameters -----------------------------------------------
# Each name is assigned exactly once, with the value that is in effect after
# this cell has run.
SAMPLING_RATE = 16000  # passed to librosa.load as `sr`
FRAME_LENGTH = 1024  # passed to librosa.stft as `n_fft`
HOP_LENGTH = 256  # passed to librosa.stft as `hop_length`
INTERVAL_LENGTH = 1  # multiplied by the sampling rate to get samples per window

# NOTE(review): WINDOW_LENGTH is 2048 while FRAME_LENGTH is 1024, so the two no
# longer match - confirm which value is intended before relying on it.
WINDOW_LENGTH = 2048

# --- Image shape compared against in pad_and_drop_alpha ----------------------
RESNET_INPUT_SHAPE = (3, 224, 224)

### Data Loading

In [9]:
def _list_wav_files(directory):
    """Return the sorted full paths of the ``.wav`` files directly inside ``directory``.

    ``os.listdir`` returns entries in arbitrary order. Sorting makes the file
    order, and therefore the numbering of any outputs derived from it, the same
    on every run.
    """
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".wav")
    )


tugboat_files_ls = _list_wav_files(TUGBOAT_PATH)
noise_files_ls = _list_wav_files(NOISE_PATH)

In [10]:
# Number of .wav files found in the tugboat and no-tugboat directories.
len(tugboat_files_ls), len(noise_files_ls)

(4030, 2918)

### Window Features Driver Function

In [34]:
def create_and_save(
    file_ls,
    out_dir,
    feature_func,
    feature_interval=INTERVAL_LENGTH,
    sampling_rate=SAMPLING_RATE,
    hop_length=HOP_LENGTH,
    frame_length=FRAME_LENGTH,
):
    """Cut each audio file into fixed-length windows and save one feature per window.

    Every file is loaded with ``librosa.load`` and split into consecutive,
    non-overlapping windows of ``sampling_rate * feature_interval`` samples.
    A trailing window shorter than that is dropped. ``feature_func`` is called
    on each window, and its result is written with ``np.save`` to
    ``<out_dir>/<counter>.npy``, where the counter runs over all files.

    Args:
        file_ls: Iterable of audio file paths.
        out_dir: Directory that receives the saved arrays. It is created if it
            does not exist. The counter starts at 0 on every call, so calling
            this twice with the same directory overwrites earlier outputs.
        feature_func: Callable invoked as ``feature_func(window,
            interval_length=..., sr=..., frame_length=..., hop_length=...,
            y_axis="log")``. Its return value is passed to ``np.save``.
        feature_interval: Window duration, multiplied by the sampling rate to
            get the window size in samples.
        sampling_rate: Target sampling rate passed to ``librosa.load``.
        hop_length: Forwarded to ``feature_func``.
        frame_length: Forwarded to ``feature_func``.

    Returns:
        A summary string of the form ``"Saved <n> features"``.

    Raises:
        ValueError: If the window size works out to zero or fewer samples.
    """
    os.makedirs(out_dir, exist_ok=True)

    target_ctr = 0
    for file in file_ls:
        # Keep the rate librosa reports in its own variable so the
        # `sampling_rate` argument is not overwritten for the next file.
        signal, file_sr = librosa.load(file, sr=sampling_rate)

        window_size = int(file_sr * feature_interval)
        if window_size <= 0:
            raise ValueError(
                f"window size must be positive, got {window_size} samples"
            )

        # `len(signal) + 1` makes the stop inclusive, so the final full window
        # is kept when the signal length is an exact multiple of the window.
        for window_end in range(window_size, len(signal) + 1, window_size):
            window = signal[window_end - window_size : window_end]
            # Forward the caller's arguments rather than the module constants.
            save_feature = feature_func(
                window,
                interval_length=feature_interval,
                sr=file_sr,
                frame_length=frame_length,
                hop_length=hop_length,
                y_axis="log",
            )
            np.save(os.path.join(out_dir, str(target_ctr)), save_feature)
            target_ctr += 1

    return f"Saved {target_ctr} features"

### Spectrogram

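Each one-second audio window is rendered as a log-frequency power spectrogram image (in dB), padded to the ResNet input size, and saved as a NumPy array.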

In [31]:
# Output directories for the spectrogram features, one per class.
SG_TUG_OUT = "/home/ec2-user/clustering/spectrogram/tugboat"
SG_NOISE_OUT = "/home/ec2-user/clustering/spectrogram/no_tugboat"

# Create both directories (and any missing parents); no error if they already exist.
os.makedirs(SG_TUG_OUT, exist_ok=True)
os.makedirs(SG_NOISE_OUT, exist_ok=True)

In [36]:
def pad_and_drop_alpha(filepath):
    """Load an image and fit it towards ``RESNET_INPUT_SHAPE`` by padding and slicing.

    The image tensor is treated as ``(channels, height, width)``, matching the
    layout of ``RESNET_INPUT_SHAPE``:

    * if the height is smaller than the target, zero rows are added at the top;
    * if the width is smaller than the target, zero columns are added on the left;
    * if there are more channels than the target, the extra trailing channels
      are dropped (presumably the alpha channel of an RGBA PNG - verify
      against the images produced upstream).

    Dimensions that are already larger than the target are left unchanged, as
    are images with fewer channels than the target.

    Args:
        filepath: Path of the image file, read with ``torchvision.io.read_image``.

    Returns:
        The padded and channel-sliced image tensor.
    """
    target_channels, target_height, target_width = RESNET_INPUT_SHAPE
    tensor = torchvision.io.read_image(filepath)

    pad_top = max(target_height - tensor.shape[1], 0)
    pad_left = max(target_width - tensor.shape[2], 0)
    if pad_top or pad_left:
        # F.pad takes (left, right) for the last dim, then (top, bottom) for
        # the one before it, so this pads both in a single call.
        tensor = torch.nn.functional.pad(
            tensor, (pad_left, 0, pad_top, 0), "constant", 0
        )

    if tensor.shape[0] > target_channels:
        # Slice with the configured channel count instead of a hard-coded 3 so
        # the check and the slice cannot drift apart.
        tensor = tensor[:target_channels, :, :]
    return tensor


def create_spectrogram(signal, sr, frame_size, hop_length, y_axis="linear"):
    """Render the dB-scaled power spectrogram of ``signal`` to a temporary PNG.

    The short-time Fourier transform is computed with ``librosa.stft``, turned
    into power (squared magnitude) and converted to decibels with
    ``librosa.power_to_db``. The result is drawn with
    ``librosa.display.specshow`` using the ``"turbo"`` colormap, with grid and
    axes hidden, and saved at 45 dpi with a tight bounding box and no padding.

    Args:
        signal: Audio samples passed to ``librosa.stft``.
        sr: Sampling rate, forwarded to ``specshow``.
        frame_size: FFT size (``n_fft``) for the STFT.
        hop_length: Hop length for the STFT and for ``specshow``.
        y_axis: Frequency-axis scaling forwarded to ``specshow``.

    Returns:
        Path of the PNG file that was written. The caller is responsible for
        deleting it.
    """
    # Local import: the file-level import only brings in `mktemp`.
    from tempfile import mkstemp

    stft = librosa.stft(signal, n_fft=frame_size, hop_length=hop_length)
    power_db = librosa.power_to_db(np.abs(stft) ** 2)

    # mkstemp creates the file atomically. mktemp only returns a name, which is
    # race-prone and deprecated. Only the path is needed, so close the
    # descriptor straight away.
    fd, fname = mkstemp(suffix=".png")
    os.close(fd)

    fig = plt.gcf()
    fig.dpi = 45
    try:
        librosa.display.specshow(
            power_db, sr=sr, hop_length=hop_length, x_axis="time", y_axis=y_axis
        )
        plt.grid(False)
        plt.axis("off")
        plt.set_cmap("turbo")
        plt.savefig(fname, dpi=45, bbox_inches="tight", pad_inches=0)
    except Exception:
        # Do not leave an empty or partial PNG behind when rendering fails.
        os.remove(fname)
        raise
    finally:
        # Always release the figure. This function is called once per window,
        # so leaked figures would pile up quickly.
        plt.close(fig)
    return fname


def create_spectrogram_driver(
    signal,
    interval_length=INTERVAL_LENGTH,
    sr=SAMPLING_RATE,
    frame_length=FRAME_LENGTH,
    hop_length=HOP_LENGTH,
    y_axis="log",
):
    """Turn an audio window into a spectrogram image array.

    The spectrogram is rendered to a temporary PNG by ``create_spectrogram``,
    read back and padded/sliced by ``pad_and_drop_alpha``, and returned as a
    NumPy array. The temporary PNG is deleted before returning.

    Args:
        signal: Audio samples for one window.
        interval_length: Unused here. Accepted so the signature matches the
            keyword arguments that ``create_and_save`` passes to its
            ``feature_func``.
        sr: Sampling rate forwarded to ``create_spectrogram``.
        frame_length: FFT size forwarded to ``create_spectrogram``.
        hop_length: Hop length forwarded to ``create_spectrogram``.
        y_axis: Frequency-axis scaling forwarded to ``create_spectrogram``.

    Returns:
        The image as a NumPy array (result of ``tensor.numpy()``).
    """
    fname = create_spectrogram(signal, sr, frame_length, hop_length, y_axis=y_axis)
    try:
        tensor = pad_and_drop_alpha(filepath=fname)
    finally:
        # The PNG is only an intermediate. Without this, one temp file would be
        # left on disk for every processed window.
        if os.path.exists(fname):
            os.remove(fname)
    return tensor.numpy()

In [None]:
# Generate and save spectrogram features for every tugboat recording.
create_and_save(
    tugboat_files_ls,
    SG_TUG_OUT,
    create_spectrogram_driver,
    feature_interval=INTERVAL_LENGTH,
    sampling_rate=SAMPLING_RATE,
    hop_length=HOP_LENGTH,
    frame_length=FRAME_LENGTH,
)

In [None]:
# Generate and save spectrogram features for every no-tugboat recording.
create_and_save(
    noise_files_ls,
    SG_NOISE_OUT,
    create_spectrogram_driver,
    feature_interval=INTERVAL_LENGTH,
    sampling_rate=SAMPLING_RATE,
    hop_length=HOP_LENGTH,
    frame_length=FRAME_LENGTH,
)