In [51]:
# Block 1: Importing Libraries
import random
import tarfile
import resampy
import pandas as pd
import librosa
import numpy as np
import os
import shutil
import concurrent.futures
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from concurrent.futures import ThreadPoolExecutor
from torch.optim.lr_scheduler import CosineAnnealingLR
import albumentations as A
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Resolve the compute device once, right after the imports. Code further down
# moves the model and batches with `.to(device)`, so this name has to exist
# after a fresh "Restart Kernel -> Run All".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

GPU: NVIDIA GeForce RTX 4080 Laptop GPU is available.


In [52]:
# Block 2: Extracting Data
def extract_tar(tar_file, target_dir, skip_existing=None):
    """Extract ``tar_file`` into ``target_dir`` and remove "._" residue files.

    Args:
        tar_file: Path of the tar archive to unpack.
        target_dir: Directory the archive is extracted into.
        skip_existing: What to do when ``target_dir`` already exists.
            ``None`` (default) asks on stdin, exactly as before;
            ``True`` skips extraction without asking; ``False`` deletes the
            directory and extracts again without asking. Passing an explicit
            value lets the notebook run unattended.

    Returns:
        None. Extraction is skipped when the directory exists and the caller
        (or the user) chose to keep it.
    """
    if os.path.exists(target_dir):
        if skip_existing is None:
            user_input = input(f"The directory '{target_dir}' already exists. Do you want to skip extraction? (y/n): ")
            skip_existing = user_input.strip().lower() == 'y'
        if skip_existing:
            print(f"Skipping extraction of {tar_file}.")
            return
        print(f"Overwriting the existing directory '{target_dir}'.")
        shutil.rmtree(target_dir)

    with tarfile.open(tar_file, 'r') as tar:
        # Prefer the "data" extraction filter where this Python provides it:
        # it refuses members that would land outside target_dir.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(target_dir, filter='data')
        else:
            tar.extractall(target_dir)

    # Remove residue "._" hidden files from the inner folder, i.e. the
    # sub-directory named after the archive (without its extension).
    archive_stem = os.path.splitext(os.path.basename(tar_file))[0]
    inner_folder = os.path.join(target_dir, archive_stem)
    for root, _dirs, files in os.walk(inner_folder):
        for file_name in files:
            if file_name.startswith("._"):
                os.remove(os.path.join(root, file_name))

# Unpack the training archive first, then the test archive, each into a
# directory named after the archive.
for tar_name, out_dir in (('train_mp3s.tar', 'train_mp3s'), ('test_mp3s.tar', 'test_mp3s')):
    extract_tar(tar_name, out_dir)

# Raw training labels, parsed as integers by NumPy.
train_labels = np.loadtxt('train_label.txt', dtype=int)

Skipping extraction of train_mp3s.tar.
Skipping extraction of test_mp3s.tar.


In [53]:
# Block 3: Preprocessing Functions

import pickle

def save_preprocessed_data(train_features, train_labels, test_features, folder_path):
    """Pickle the three datasets into ``folder_path``.

    The folder is created when it does not exist yet. Each object is written
    to its own file: ``train_features.pkl``, ``train_labels.pkl`` and
    ``test_features.pkl``.
    """
    folder_missing = not os.path.exists(folder_path)
    if folder_missing:
        os.makedirs(folder_path)

    # (file name, object) pairs, written in this order.
    payloads = (
        ('train_features.pkl', train_features),
        ('train_labels.pkl', train_labels),
        ('test_features.pkl', test_features),
    )
    for file_name, payload in payloads:
        with open(os.path.join(folder_path, file_name), 'wb') as handle:
            pickle.dump(payload, handle)

def load_preprocessed_data(folder_path):
    """Read back the three pickles written by ``save_preprocessed_data``.

    Returns:
        A ``(train_features, train_labels, test_features)`` tuple.

    Raises:
        FileNotFoundError: propagated from ``open`` when a pickle is missing.
    """
    def _unpickle(file_name):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # cache folders this notebook produced itself.
        with open(os.path.join(folder_path, file_name), 'rb') as handle:
            return pickle.load(handle)

    # Files are read in the same order they are returned.
    train_features = _unpickle('train_features.pkl')
    train_labels = _unpickle('train_labels.pkl')
    test_features = _unpickle('test_features.pkl')
    return train_features, train_labels, test_features
def extract_mfcc(audio, sample_rate):
    """Return librosa's MFCC features for ``audio``, requesting 40 coefficients."""
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    return mfcc

def extract_mel_spec(audio, sample_rate):
    """Return a time-averaged mel spectrogram (in dB) for ``audio``.

    A 128-band mel spectrogram is computed, converted to dB relative to its
    own maximum, then averaged so that one value remains per spectrogram row
    (presumably one per mel band, given librosa's (n_mels, frames) layout).
    """
    power_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128)
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    # Averaging the transposed matrix over axis 0 collapses the original
    # second axis.
    return np.mean(db_spec.T, axis=0)

def extract_tonnetz(audio, sample_rate):
    """Return librosa's tonnetz (tonal centroid) features for ``audio``."""
    tonnetz = librosa.feature.tonnetz(y=audio, sr=sample_rate)
    return tonnetz

def extract_chroma_stft(audio, sample_rate):
    """Return librosa's STFT-based chroma features for ``audio``."""
    chroma = librosa.feature.chroma_stft(y=audio, sr=sample_rate)
    return chroma

def extract_chroma_cqt(audio, sample_rate):
    """Return librosa's constant-Q chroma features for ``audio``."""
    chroma = librosa.feature.chroma_cqt(y=audio, sr=sample_rate)
    return chroma

def extract_chroma_cens(audio, sample_rate):
    """Return librosa's CENS chroma features for ``audio``."""
    chroma = librosa.feature.chroma_cens(y=audio, sr=sample_rate)
    return chroma
def apply_audio_augmentation(mel_spectrogram):
    """Randomly zero out frequency bands and time spans of a 2-D spectrogram.

    Between 0 and 3 frequency masks and between 0 and 3 time masks are drawn
    from the global ``random`` module. Each mask starts at a random index and
    covers a random fraction (0..1) of the axis length, clipped at the end of
    the axis. Masked cells are set to 0; all other cells are returned
    unchanged.

    Args:
        mel_spectrogram: 2-D array-like of shape (F, T).

    Returns:
        A new array of the same shape and dtype as the input (the input is
        not modified). A float32 spectrogram stays float32 instead of being
        upcast by an integer mask multiplication.

    NOTE(review): preprocess_audio passes a dB spectrogram computed with
    ``ref=np.max``, where 0 dB is the maximum level; masking with 0 therefore
    writes the peak value rather than silence — confirm this is intended.
    """
    spec = np.asarray(mel_spectrogram)
    n_freq, n_time = spec.shape

    # Nothing can be masked on an empty axis, and random.randint(0, -1)
    # would raise, so return an untouched copy.
    if n_freq == 0 or n_time == 0:
        return spec.copy()

    # True = keep the cell, False = masked out.
    keep = np.ones((n_freq, n_time), dtype=bool)

    # Frequency masking (rows). The order of random draws matches the
    # original implementation so seeded runs select the same regions.
    for _ in range(random.randint(0, 3)):
        start = random.randint(0, n_freq - 1)
        fraction = random.uniform(0, 1)
        stop = int(start + fraction * n_freq)
        keep[start:stop, :] = False

    # Time masking (columns).
    for _ in range(random.randint(0, 3)):
        start = random.randint(0, n_time - 1)
        fraction = random.uniform(0, 1)
        stop = int(start + fraction * n_time)
        keep[:, start:stop] = False

    # Apply the mask without changing the dtype of the data.
    return np.where(keep, spec, spec.dtype.type(0))
def preprocess_audio(file_path, augment=False):
    """Load one audio file and reduce it to a time-averaged dB mel vector.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        augment: When true, the dB spectrogram is passed through
            ``apply_audio_augmentation`` before averaging.

    Returns:
        The averaged feature vector, or ``None`` when any step raises (the
        error is printed, not re-raised).
    """
    try:
        waveform, sr = librosa.load(file_path, res_type='kaiser_fast')
        print(f"Loaded audio file: {file_path}")

        power_spec = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128)
        db_spec = librosa.power_to_db(power_spec, ref=np.max)

        # Optionally mask the spectrogram, then average the transposed
        # matrix over axis 0.
        pooled_source = apply_audio_augmentation(db_spec) if augment else db_spec
        features_scaled = np.mean(pooled_source.T, axis=0)

        print(f"Extracted features: {features_scaled.shape}")
        return features_scaled
    except Exception as e:
        # Best effort: report the failure and let the caller decide what to
        # do with the missing features.
        print(f"Error processing file: {file_path}")
        print(f"Error message: {str(e)}")
        return None



def process_file(file_path, augment=False):
    """Log the path being handled, then return ``preprocess_audio``'s result for it."""
    print(f"Processing file: {file_path}")
    return preprocess_audio(file_path, augment=augment)

def prepare_data(directory, augment=False, extractor=None, max_workers=32):
    """Extract one feature vector per audio file in ``directory``.

    Files are addressed by index: ``0.mp3`` .. ``<n-1>.mp3`` where ``n`` is
    the number of entries in ``directory``, so rows come back in index order.

    Args:
        directory: Folder holding the numbered ``.mp3`` files.
        augment: Forwarded to the extractor as its second positional argument.
        extractor: Callable ``(file_path, augment) -> features or None``.
            Defaults to ``process_file``.
        max_workers: Size of the thread pool used to run the extractor.

    Returns:
        ``np.ndarray`` with one row per file. A file whose extractor returned
        ``None`` gets an all-zero row shaped like the first successful result,
        so row indices stay aligned with file indices. An empty directory
        yields an empty array.

    Raises:
        ValueError: if the directory has files but none could be processed
            (there is no successful result to size the placeholders from).
    """
    if extractor is None:
        extractor = process_file

    n_files = len(os.listdir(directory))
    file_paths = [os.path.join(directory, f"{i}.mp3") for i in range(n_files)]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(lambda path: extractor(path, augment), file_paths))

    # Use the first successful result as the shape/dtype template for
    # placeholders. Looking it up across all results (rather than only the
    # ones seen so far) keeps a failure on the very first file from crashing.
    template = next((feat for feat in results if feat is not None), None)
    if results and template is None:
        raise ValueError(f"None of the {len(results)} files in '{directory}' could be processed.")

    n_failed = sum(feat is None for feat in results)
    if n_failed:
        print(f"Warning: {n_failed} of {len(results)} files failed and were replaced by zero placeholders.")

    features = np.array([np.zeros_like(template) if feat is None else feat for feat in results])
    print(f"Processed {len(features)} audio files")
    return features

In [54]:
# Block 4: Preparing Data (modified)

# Folder that holds the pickled feature/label cache.
folder_path = 'V6'

# Fast path: reuse the cached arrays. Slow path (cache files missing):
# extract features from the audio files, build the label vector, and write
# the cache for next time.
try:
    train_features, train_labels, test_features = load_preprocessed_data(folder_path)
    # NOTE(review): the message hard-codes 'V6' instead of using folder_path.
    print("Loaded preprocessed data from the 'V6' folder.")

except FileNotFoundError:
    # Plain (un-augmented) features for every training file.
    train_features_original = prepare_data('train_mp3s/train_mp3s')
    print(f"Original train features shape: {train_features_original.shape}")

    # A second pass over the same files with random masking applied.
    train_features_augmented = prepare_data('train_mp3s/train_mp3s', augment=True)
    print(f"Augmented train features shape: {train_features_augmented.shape}")

    # Stack originals first, augmented copies second.
    train_features = np.concatenate((train_features_original, train_features_augmented), axis=0)
    print(f"Combined train features shape: {train_features.shape}")

    test_features = prepare_data('test_mp3s/test_mp3s')
    print(f"Test features shape: {test_features.shape}")

    # At this point train_labels still holds the raw labels read with
    # np.loadtxt in the extraction cell; it is overwritten below with the
    # encoded, duplicated vector.
    # NOTE(review): label_encoder is only created on this cache-miss path, so
    # it does not exist when the cache was loaded instead.
    label_encoder = LabelEncoder()
    train_labels_original = label_encoder.fit_transform(train_labels)
    # Both passes visit the same files in the same index order, so the
    # augmented half reuses the same labels.
    train_labels_augmented = train_labels_original.copy()
    train_labels = np.concatenate((train_labels_original, train_labels_augmented), axis=0)
    print(f"Train labels shape: {train_labels.shape}")

    print(f"Number of training features: {len(train_features)}")
    print(f"Number of training labels: {len(train_labels)}")
    print(f"Number of test features: {len(test_features)}")

    save_preprocessed_data(train_features, train_labels, test_features, folder_path)
    print(f"Saved preprocessed data to the {folder_path} folder.")

# Only reports the problem; execution continues with the empty array.
if len(train_features) == 0:
    print("No training features available. Please check the data.")

Loaded preprocessed data from the 'V6' folder.


In [55]:
# Block 5: Model Definition, Training, and Prediction
class AudioClassifier(nn.Module):
    """Fully connected classifier: two hidden blocks followed by a linear head.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout(0.5). The
    ReLU and Dropout modules are shared by both blocks. ``forward`` returns
    raw class scores (no softmax is applied).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names and creation order are kept as-is so state_dict
        # keys and seeded initialisation stay the same.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def _hidden_block(self, x, linear, norm):
        """Apply linear -> norm -> ReLU -> dropout to ``x``."""
        return self.dropout(self.relu(norm(linear(x))))

    def forward(self, x):
        hidden = self._hidden_block(x, self.fc1, self.bn1)
        hidden = self._hidden_block(hidden, self.fc2, self.bn2)
        return self.fc3(hidden)

class AudioDataset(Dataset):
    """Dataset over a feature matrix with optional per-row labels.

    Args:
        features: Array-like with one row per sample; stored as a float32
            copy (lists are accepted as well as ndarrays).
        labels: Optional array-like with one label per row. Integer labels
            are stored as int64 so that default collation produces Long
            tensors, which is what ``nn.CrossEntropyLoss`` expects for class
            indices. Non-integer labels are stored without a dtype change.

    Raises:
        ValueError: if ``labels`` is given and its length differs from the
            number of feature rows.

    Indexing returns ``(features[idx], labels[idx])`` when labels were given,
    otherwise just ``features[idx]``.
    """

    def __init__(self, features, labels=None):
        self.features = np.array(features, dtype=np.float32)  # float32 copy

        if labels is not None:
            labels = np.asarray(labels)
            if np.issubdtype(labels.dtype, np.integer):
                labels = labels.astype(np.int64)
            if len(labels) != len(self.features):
                raise ValueError(
                    f"features and labels differ in length: {len(self.features)} vs {len(labels)}"
                )
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]

def train_model(model, train_loader, criterion, optimizer, scheduler, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable. The
    scheduler is stepped once per epoch and the mean batch loss of each epoch
    is printed.

    Args:
        model: ``nn.Module`` to optimise; left in training mode.
        train_loader: Iterable yielding ``(data, target)`` tensor pairs.
        criterion: Loss callable ``(output, target) -> scalar tensor``.
        optimizer: Optimizer over the model's parameters.
        scheduler: LR scheduler stepped after every epoch.
        num_epochs: Number of epochs to run.

    Raises:
        ValueError: if an epoch yields no batches (there would be nothing to
            average and nothing to learn from).
    """
    # Follow the model's own device; fall back to CPU for a parameterless model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader produced no batches; nothing to train on.")

        scheduler.step()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / num_batches:.4f}")

def predict(model, test_loader):
    """Return the predicted class index for every sample in ``test_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Trained ``nn.Module`` producing one score per class.
        test_loader: Iterable yielding feature batches (no labels).

    Returns:
        A flat list with one predicted class index per sample, in loader order.
    """
    # Follow the model's own device; fall back to CPU for a parameterless model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    with torch.no_grad():
        for data in test_loader:
            output = model(data.to(device))
            # Index of the highest score along the class dimension.
            _, predicted = torch.max(output, 1)
            predictions.extend(predicted.cpu().numpy())
    return predictions

# Seed every RNG involved (weight init, DataLoader shuffling, dropout) so a
# re-run of this cell reproduces the same training trajectory.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Fail fast on unusable inputs instead of hitting an obscure error mid-epoch.
if len(train_features) == 0:
    raise ValueError("No training features available. Please check the data.")
if len(train_features) != len(train_labels):
    raise ValueError(
        f"Feature/label count mismatch: {len(train_features)} features vs {len(train_labels)} labels"
    )

# Set hyperparameters
input_size = train_features.shape[1]  # width of one feature vector
hidden_size = 256
num_classes = len(np.unique(train_labels))  # one output per distinct label
batch_size = 64
learning_rate = 0.001
num_epochs = 150

# Create DataLoader
train_dataset = AudioDataset(train_features, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test batches are not shuffled so predictions stay in file order.
test_dataset = AudioDataset(test_features)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize the model, loss function, optimizer, and scheduler.
# `device` is expected to exist as a notebook-level variable.
model = AudioClassifier(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# One cosine cycle spanning the whole run (the scheduler is stepped per epoch).
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

# Train the model
train_model(model, train_loader, criterion, optimizer, scheduler, num_epochs)

# Make predictions on the test set
predictions = predict(model, test_loader)

# Save predictions to a CSV file.
# NOTE(review): `category` holds the class indices the model was trained on
# (the encoded labels). If the raw labels are not already 0..k-1 they need to
# be mapped back before submission — confirm.
output_df = pd.DataFrame({'id': range(len(predictions)), 'category': predictions})
output_df.to_csv('submission.csv', index=False)

Epoch [1/50], Loss: 1.1768
Epoch [2/50], Loss: 1.0859
Epoch [3/50], Loss: 1.0610
Epoch [4/50], Loss: 1.0418
Epoch [5/50], Loss: 1.0277
Epoch [6/50], Loss: 1.0109
Epoch [7/50], Loss: 1.0048
Epoch [8/50], Loss: 0.9911
Epoch [9/50], Loss: 0.9934
Epoch [10/50], Loss: 0.9860
Epoch [11/50], Loss: 0.9770
Epoch [12/50], Loss: 0.9663
Epoch [13/50], Loss: 0.9606
Epoch [14/50], Loss: 0.9524
Epoch [15/50], Loss: 0.9543
Epoch [16/50], Loss: 0.9469
Epoch [17/50], Loss: 0.9428
Epoch [18/50], Loss: 0.9368
Epoch [19/50], Loss: 0.9254
Epoch [20/50], Loss: 0.9250
Epoch [21/50], Loss: 0.9148
Epoch [22/50], Loss: 0.9164
Epoch [23/50], Loss: 0.9120
Epoch [24/50], Loss: 0.9071
Epoch [25/50], Loss: 0.9060
Epoch [26/50], Loss: 0.8949
Epoch [27/50], Loss: 0.8935
Epoch [28/50], Loss: 0.8871
Epoch [29/50], Loss: 0.8846
Epoch [30/50], Loss: 0.8858
Epoch [31/50], Loss: 0.8831
Epoch [32/50], Loss: 0.8781
Epoch [33/50], Loss: 0.8699
Epoch [34/50], Loss: 0.8706
Epoch [35/50], Loss: 0.8702
Epoch [36/50], Loss: 0.8632
E