# Exploratory Data Analysis (EDA)

## Load and visualize audio waveforms at different durations

In [1]:
import os
import torchaudio
import matplotlib.pyplot as plt

# ffmpeg path
os.add_dll_directory(r"C:\ffmpeg\bin")

# path for songs
BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
sound_sample_file = os.path.join(BASE_DIR, '..', 'data', '01. Talwiinder, NDS, Rippy Grewal - Haseen.mp3')

# load audio
waveform, sr = torchaudio.load(sound_sample_file)

# durations in seconds
durations = {
    "30 seconds": int(30.0 * sr),
    "10 seconds": int(10.0 * sr),
    "1 second": int(1.0 * sr),
    "0.1 second": int(0.1 * sr),
    "0.01 second": int(0.01 * sr),
}

# create plots
fig, ax = plt.subplots(nrows=len(durations), figsize=(12, 12))

# plot waveforms
for i, (label, samples) in enumerate(durations.items()):
    ax[i].plot(waveform[0][:samples].numpy())
    ax[i].set_title(f"Waveform ({label})")
    ax[i].set_xlabel("Samples")
    ax[i].set_ylabel("Amplitude")

plt.tight_layout()
plt.show()

RuntimeError: Failed to create AudioDecoder for C:\Users\kaust\DEPLOYMENT PROJECTS\Music Recommendor with Audio Recognition\notebooks\..\data\01. Talwiinder, NDS, Rippy Grewal - Haseen.mp3: Could not open input file: C:\Users\kaust\DEPLOYMENT PROJECTS\Music Recommendor with Audio Recognition\notebooks\..\data\01. Talwiinder, NDS, Rippy Grewal - Haseen.mp3 No such file or directory

## Visualize Mel Spectrogram for first 30 seconds

In [None]:
import numpy as np
import torchaudio.transforms as T
from skimage.transform import resize

# load audio
waveform, sample_rate = torchaudio.load(sound_sample_file)

# convert to mono if stereo
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# limit to first 30 seconds
max_seconds = 30
max_samples = min(waveform.shape[1], max_seconds * sample_rate)
waveform_30s = waveform[:, :max_samples]
signal_30s = waveform_30s[0].numpy()

# mel spectrogram
mel_transform = T.MelSpectrogram(sample_rate=sample_rate, n_fft=2048, n_mels=128, f_max=8000)
mel_spec = mel_transform(waveform_30s)

# convert power spectrogram to dB
mel_db = T.AmplitudeToDB(stype="power")(mel_spec)

# remove channel dimension for plotting
mel_db = mel_db[0].numpy()

# for cnn
# match original scaling and orientation
mel_image = (((80 + mel_db) / 80) * 255)
mel_image = np.flip(mel_image, axis=0)
mel_image = resize(mel_image, (128, 512)).astype(np.uint8)

# create plots
fig, ax = plt.subplots(2, 1, figsize=(10, 6))

# plot waveform
ax[0].plot(signal_30s)
ax[0].set_title(f"Audio Signal ({max_seconds} sec)")
ax[0].set_ylabel("Amplitude")
ax[0].set_xlabel("Samples")

# plot spectrogram
ax[1].imshow(mel_db, origin="lower", aspect="auto")
ax[1].set_title(f"Mel Spectrogram ({max_seconds} sec)")
ax[1].set_ylabel("Mel Frequency")
ax[1].set_xlabel("Time")

plt.tight_layout()
plt.show()

## Visualize Mel Spectrograms from Multiple Songs

In [None]:
from src.preprocessing import df

# plot mel spectrogram of the first song
fig, ax = plt.subplots(figsize=(15, 3))
plt.title('Mel Spectrogram')
plt.imshow(df.iloc[0]['audio_data'])

In [None]:
import math

# plot mel spectrogram from different songs
cols = 3
num_songs = 6
rows = math.ceil(num_songs / cols)
fig, ax = plt.subplots(ncols=cols, nrows=rows, figsize=(15, rows*3))

ax_flat = ax.flatten()
plt.suptitle('Mel Spectrogram from Different Songs', fontsize=20)

i = -1
for i in range(num_songs):
    data = df.iloc[i]['audio_data']
    ax_flat[i].imshow(data)
    ax_flat[i].axis('off')
    ax_flat[i].set_title(df.iloc[i]['file_name'])

for j in range(i + 1, len(ax_flat)):
    ax_flat[j].axis('off')

plt.tight_layout()
plt.show()

## Plot training and validation losses of Autoencoder

In [None]:
import torch
import matplotlib.pyplot as plt

# load from checkpoint
checkpoint = torch.load("../models/audio_autoencoder.pth")
train_losses = checkpoint['train_losses']
val_losses = checkpoint['val_losses']

# plot the training and validation losses
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss', linewidth=2)
plt.plot(val_losses, label='Validation Loss', linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()
plt.grid(True, alpha=0.1)
plt.show()

# check for overfitting
print(f"Final Train Loss: {train_losses[-1]:.4f}")
print(f"Final Val Loss: {val_losses[-1]:.4f}")
print(f"Difference: {abs(train_losses[-1] - val_losses[-1]):.4f}")

## Visualising the original spectrogram with the reconstructed spectrogram from the CNN

In [None]:
import torch.nn as nn
from src.audiopipeline import test_loader

# redefine the model class exactly as it is in autoencoder.py
class AudioAutoencoder(nn.Module):
    def __init__(self):
        super(AudioAutoencoder, self).__init__()
        # encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.2), # Must match training code
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2),
            nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2),
            nn.Flatten(),
            nn.Linear(16384, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256)
        )
        # decoder
        self.decoder = nn.Sequential(
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 16384),
            nn.LeakyReLU(0.2),
            nn.Unflatten(1, (256, 8, 8)),
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2),
            nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2),
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.2),
            nn.ConvTranspose2d(32, 1, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# setup device and load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioAutoencoder().to(device)
checkpoint_path = "../models/audio_autoencoder.pth"

try:
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Model loaded successfully from epoch {checkpoint.get('epoch', '?')}")
    model.eval()
except FileNotFoundError:
    print("Error: Model file not found. Please train the model first.")

# get a random batch from test loader
data_iter = iter(test_loader)
images, labels = next(data_iter)

# pick a image from the batch
original_db = images[0].to(device)

# apply normalisation
input_norm = (original_db + 80) / 80
input_norm = torch.clamp(input_norm, 0.0, 1.0)

# add batch dimension [1, 1, 128, 128] for the model
input_batch = input_norm.unsqueeze(0)

with torch.no_grad():
    reconstructed = model(input_batch)

# visualization
orig_viz = input_norm.squeeze().cpu().numpy()
recon_viz = reconstructed.squeeze().cpu().numpy()

plt.figure(figsize=(12, 6))

# plot original spectrogram
plt.subplot(1, 2, 1)
plt.title(f"Original Input\n{labels[0]}")
plt.imshow(orig_viz, cmap='inferno', origin='lower', vmin=0, vmax=1)
plt.colorbar(label='Signal Strength (0-1)')

# plot reconstructed spectrogram
plt.subplot(1, 2, 2)
plt.title("Reconstructed Output")
plt.imshow(recon_viz, cmap='inferno', origin='lower', vmin=0, vmax=1)
plt.colorbar(label='Signal Strength (0-1)')

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns

# load data
df = pd.read_pickle("../data/songs_with_features.pkl")

# convert into 2D np array (num_Songs, 256)
X = np.stack(df['latent_vector'].values)

# run t-SNE (reduce 256 dim -> 2 dim)
tsne = TSNE(n_components=2, perplexity=5, random_state=42, init='pca', learning_rate=200)
X_embedded = tsne.fit_transform(X)

# plot
plt.figure(figsize=(10, 8))
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1])

plt.title("Song Latent Space (t-SNE Visualization, 233 Songs)")
plt.show()