In [2]:
pip install tqdm

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import wandb
from wandb.integration.keras import WandbCallback
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import time
from tqdm import tqdm
import wandb

import matplotlib.pyplot as plt
import librosa

In [8]:
import os

# ====== set these ======
INPUT_ROOT  = "data/train"
OUTPUT_ROOT = "data/train_image"
# =======================

SR = 22050
DURATION = 3.0
N_FFT = 2048
HOP = 512
N_MELS = 128

def audio_to_logmel_array(audio_path):
    y, _ = librosa.load(audio_path, sr=SR, mono=True)

    target_len = int(SR * DURATION)
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]

    S = librosa.feature.melspectrogram(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS)
    S_db = librosa.power_to_db(S, ref=np.max)

    # normalize to 0..1 for imsave
    S_db -= S_db.min()
    S_db /= (S_db.max() + 1e-9)
    return S_db  # 2D array

def convert_dataset(input_root, output_root):
    os.makedirs(output_root, exist_ok=True)

    classes = [d for d in os.listdir(input_root) if os.path.isdir(os.path.join(input_root, d))]
    for cls in classes:
        in_dir = os.path.join(input_root, cls)
        out_dir = os.path.join(output_root, cls)
        os.makedirs(out_dir, exist_ok=True)

        for fn in os.listdir(in_dir):
            if not fn.lower().endswith((".au", ".wav", ".mp3", ".flac", ".ogg", ".m4a")):
                continue

            in_path = os.path.join(in_dir, fn)
            out_path = os.path.join(out_dir, os.path.splitext(fn)[0] + ".png")

            try:
                S_norm = audio_to_logmel_array(in_path)
                plt.imsave(out_path, S_norm, cmap="magma")  # saves RGB
            except Exception as e:
                print("Failed:", in_path, "->", e)

    print("Done! Spectrograms saved to:", output_root)

convert_dataset(INPUT_ROOT, OUTPUT_ROOT)


Done! Spectrograms saved to: data/train_image


In [10]:
INPUT_TEST  = "data/test"
OUTPUT_TEST = "data/test_image"   # one folder (no labels)

SR = 22050
DURATION = 3.0
N_FFT = 2048
HOP = 512
N_MELS = 128

def audio_to_logmel_norm(audio_path):
    y, _ = librosa.load(audio_path, sr=SR, mono=True)

    target_len = int(SR * DURATION)
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]

    S = librosa.feature.melspectrogram(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS)
    S_db = librosa.power_to_db(S, ref=np.max)

    # normalize to 0..1 so imsave can colorize nicely
    S_db -= S_db.min()
    S_db /= (S_db.max() + 1e-9)
    return S_db

os.makedirs(OUTPUT_TEST, exist_ok=True)

for fn in sorted(os.listdir(INPUT_TEST)):
    if not fn.lower().endswith((".au", ".wav", ".mp3", ".flac", ".ogg", ".m4a")):
        continue

    in_path = os.path.join(INPUT_TEST, fn)
    out_path = os.path.join(OUTPUT_TEST, os.path.splitext(fn)[0] + ".png")

    try:
        S_norm = audio_to_logmel_norm(in_path)
        plt.imsave(out_path, S_norm, cmap="magma")  # saves RGB image
    except Exception as e:
        print("Failed:", fn, "->", e)

print("Done! Test spectrograms in:", OUTPUT_TEST)


Done! Test spectrograms in: data/test_image


In [23]:
wandb.init(project='resnet50_project', name='resnet_pytorch')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [24]:
transform = transforms.Compose([
	transforms.Resize((224)),
	transforms.ToTensor(),
	transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


train_dataset = datasets.CIFAR10(root='data/train_image', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,  pin_memory=False, num_workers=0)
test_dataset = datasets.CIFAR10(root='data/test_image', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False,  pin_memory=False, num_workers=0)


In [25]:
class CustomResNet(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()

        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

        # resnet50 final layer input features = 2048
        in_features = self.resnet.fc.in_features

        # remove the original classifier head
        self.resnet.fc = nn.Identity()

        # your custom head
        self.classifier = nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.resnet(x)        # (B, 2048)
        x = self.classifier(x)    # (B, num_classes)
        return x

model = CustomResNet(num_classes=10).to(device)


In [26]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

EPOCHS = 3

for epoch in range(EPOCHS):
    running_loss = 0.0
    data_bar = tqdm(train_loader)
    i = 0
    correct = 0
    total = 0
    start_time = time.time()

    for data in data_bar:
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        data_bar.set_description(
            "Processing epoch {:d} minibatch {:d} train loss {:.3f}".format(
                epoch, i + 1, running_loss / (i + 1)
            )
        )
        i += 1

    # Log metrics to wandb
    wandb.log({'epoch': epoch + 1, 'train_loss': running_loss / len(train_loader)})

print('Finished Training')


Processing epoch 0 minibatch 782 train loss 1.576: 100%|█| 782/782 [2:16:03<00:0
Processing epoch 1 minibatch 782 train loss 0.268: 100%|█| 782/782 [2:14:30<00:0
Processing epoch 2 minibatch 782 train loss 0.139: 100%|█| 782/782 [2:17:43<00:0

Finished Training





In [28]:
correct = 0
total = 0
i = 0
with torch.no_grad():
	pbar = tqdm(test_loader)
	for data in data_bar:
		images, labels = data
		images = images.to(device)
		labels = labels.to(device)
		outputs = model(images)
		_, predicted = torch.max(outputs.data, 1)
		total += labels.size(0)
		correct += (predicted == labels).sum().item()
		pbar.set_description("minibatch {:d} test accuracy {:4.2f}%".format(i+1,100.0*correct/total))
		i += 1


print('Accuracy of the network on the 10000 test images: %4.2f %%' % (100.0 * correct / total))



minibatch 782 test accuracy 96.80%:   0%|             | 0/157 [1:23:13<?, ?it/s]

Accuracy of the network on the 10000 test images: 96.80 %


In [32]:
pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m2.8 MB/s[0m  [33m0:00:03[0mm0:00:01[0m0:01[0m
[?25hUsing cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Installing collected packages: pytz, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pandas]2m1/2[0m [pandas]
[1A[2KSuccessfully installed pandas-2.3.3 pytz-2025.2
Note: you may need to restart the kernel to use updated packages.


In [33]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# ---- paths ----
LIST_PATH = "data/test/list_test.txt"          
IMG_DIR   = "data/test_image"          
OUT_CSV   = "submission.csv"
# ----------------------------------------------

# Must match the folder order
class_names = ["blues","classical","country","disco","hiphop","jazz","metal","pop","reggae","rock"]

# 1) read filenames
with open(LIST_PATH, "r") as f:
    filenames = [ln.strip() for ln in f if ln.strip()]

# 2) dataset that loads the corresponding spectrogram png for each id
class TestSpecDataset(Dataset):
    def __init__(self, filenames, img_dir, transform):
        self.filenames = filenames
        self.img_paths = [
            os.path.join(img_dir, os.path.splitext(fn)[0] + ".png")
            for fn in filenames
        ]
        self.transform = transform

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        img = Image.open(self.img_paths[idx]).convert("RGB")
        img = self.transform(img)
        return img, self.filenames[idx]

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

test_ds = TestSpecDataset(filenames, IMG_DIR, transform)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

# 3) predict -> preds_label
model.eval()
preds_label = []
ids_out = []

with torch.no_grad():
    for x, batch_ids in test_loader:
        x = x.to(device)
        logits = model(x)                 # (B, 10)
        pred_idx = logits.argmax(dim=1).cpu().tolist()
        preds_label.extend([class_names[i] for i in pred_idx])
        ids_out.extend(list(batch_ids))

# 4) write submission.csv
submission = pd.DataFrame({"id": ids_out, "class": preds_label})
submission.to_csv(OUT_CSV, index=False)

print("Saved:", OUT_CSV)
submission.head()

Saved: submission.csv


Unnamed: 0,id,class
0,test.00596.au,reggae
1,test.02436.au,reggae
2,test.02930.au,reggae
3,test.03364.au,disco
4,test.03550.au,country
