In [6]:
# Block 1: Importing Libraries
import random
import tarfile
import resampy
import pandas as pd
import librosa
import numpy as np
import os
import shutil
import concurrent.futures
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from concurrent.futures import ThreadPoolExecutor
from torch.optim.lr_scheduler import CosineAnnealingLR
import albumentations as A
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Report which compute device PyTorch can see before any heavy work starts.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("No GPU available. Training will run on CPU.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")

GPU: NVIDIA GeForce RTX 4080 Laptop GPU is available.


In [7]:
# Block 2: Extracting Data
def extract_tar(tar_file, target_dir):
    """Unpack ``tar_file`` into ``target_dir`` and delete "._" residue files.

    If ``target_dir`` already exists the user is prompted on stdin: answering
    "y" (any case) skips the extraction entirely; any other answer deletes the
    directory and extracts again.

    After extraction, every file whose name starts with "._" is removed from
    the sub-folder of ``target_dir`` named after the archive (its base name
    minus the last extension).
    """
    if os.path.exists(target_dir):
        answer = input(f"The directory '{target_dir}' already exists. Do you want to skip extraction? (y/n): ")
        if answer.lower() == 'y':
            print(f"Skipping extraction of {tar_file}.")
            return
        print(f"Overwriting the existing directory '{target_dir}'.")
        shutil.rmtree(target_dir)

    with tarfile.open(tar_file, 'r') as archive:
        archive.extractall(target_dir)

    # Remove residue "._" hidden files from the inner folder
    archive_stem, _ = os.path.splitext(os.path.basename(tar_file))
    inner_folder = os.path.join(target_dir, archive_stem)
    for current_dir, _subdirs, filenames in os.walk(inner_folder):
        residue = [name for name in filenames if name.startswith("._")]
        for name in residue:
            os.remove(os.path.join(current_dir, name))

# Unpack both archives (extract_tar prompts if a target folder already exists),
# then read the integer training labels from the text file.
for archive_name, output_dir in (('train_mp3s.tar', 'train_mp3s'), ('test_mp3s.tar', 'test_mp3s')):
    extract_tar(archive_name, output_dir)
train_labels = np.loadtxt('train_label.txt', dtype=int)


Skipping extraction of train_mp3s.tar.
Skipping extraction of test_mp3s.tar.


In [8]:
# Block 3: Preprocessing Functions
def save_preprocessed_data(train_features, train_labels, test_features, folder_path):
    """Pickle the three objects into ``folder_path``, creating the folder if it is missing.

    Files written (in this order): ``train_features.pkl``, ``train_labels.pkl``
    and ``test_features.pkl``.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    payloads = (
        ('train_features.pkl', train_features),
        ('train_labels.pkl', train_labels),
        ('test_features.pkl', test_features),
    )
    for file_name, payload in payloads:
        with open(os.path.join(folder_path, file_name), 'wb') as handle:
            pickle.dump(payload, handle)

def load_preprocessed_data(folder_path):
    """Read back the three pickles written by ``save_preprocessed_data``.

    Returns a tuple ``(train_features, train_labels, test_features)``.
    A missing file surfaces as the ``FileNotFoundError`` raised by ``open``.

    NOTE: ``pickle.load`` executes code embedded in the file; only point this
    at cache folders you created yourself.
    """
    def _read(file_name):
        with open(os.path.join(folder_path, file_name), 'rb') as handle:
            return pickle.load(handle)

    return (
        _read('train_features.pkl'),
        _read('train_labels.pkl'),
        _read('test_features.pkl'),
    )

def extract_mfcc(audio, sample_rate, n_mfcc=40):
    """Compute MFCCs for ``audio`` via ``librosa.feature.mfcc``.

    Args:
        audio: Audio signal, forwarded as ``y``.
        sample_rate: Sampling rate, forwarded as ``sr``.
        n_mfcc: Number of coefficients to request (default 40, the value that
            was previously hard-coded).

    Returns:
        Whatever ``librosa.feature.mfcc`` returns for these arguments.
    """
    return librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

def extract_mel_spec(audio, sample_rate, n_mels=128):
    """Compute a mel spectrogram for ``audio`` via ``librosa.feature.melspectrogram``.

    Args:
        audio: Audio signal, forwarded as ``y``.
        sample_rate: Sampling rate, forwarded as ``sr``.
        n_mels: Number of mel bands to request (default 128, the value that
            was previously hard-coded).

    Returns:
        Whatever ``librosa.feature.melspectrogram`` returns for these arguments.
    """
    return librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)

def extract_tonnetz(audio, sample_rate):
    """Return the result of ``librosa.feature.tonnetz`` for ``audio`` at ``sample_rate``."""
    tonnetz = librosa.feature.tonnetz(y=audio, sr=sample_rate)
    return tonnetz

def extract_chroma_stft(audio, sample_rate):
    """Return the result of ``librosa.feature.chroma_stft`` for ``audio`` at ``sample_rate``."""
    chroma = librosa.feature.chroma_stft(y=audio, sr=sample_rate)
    return chroma

def extract_chroma_cqt(audio, sample_rate):
    """Return the result of ``librosa.feature.chroma_cqt`` for ``audio`` at ``sample_rate``."""
    chroma = librosa.feature.chroma_cqt(y=audio, sr=sample_rate)
    return chroma

def extract_chroma_cens(audio, sample_rate):
    """Return the result of ``librosa.feature.chroma_cens`` for ``audio`` at ``sample_rate``."""
    chroma = librosa.feature.chroma_cens(y=audio, sr=sample_rate)
    return chroma

def apply_specmix(mel_spec, label, train_features, train_labels, num_mixes=2, alpha=0.2):
    """Blend stored training features that share ``label`` into an array shaped like ``mel_spec``.

    Up to ``num_mixes`` entries of ``train_features`` whose label equals
    ``label`` are drawn without replacement, given Beta(alpha, alpha) weights
    normalised to sum to 1, resized to ``mel_spec``'s shape and summed.

    How each stored entry is resized:
      * 1-D entry whose length equals ``mel_spec.shape[0]``: treated as one
        value per mel band and repeated across the time axis. (Previously such
        vectors were laid along the *time* axis and broadcast over all bands,
        so a later time-average collapsed every band to the same number.)
      * any other 1-D entry: reshaped to a single row (legacy behaviour).
      * 2-D entry: truncated or zero-padded along axis 1 to the target length.

    Args:
        mel_spec: 2-D array; supplies the output shape and dtype.
        label: Label value used to select mixing candidates.
        train_features: Array indexable by an integer index array; candidates
            are taken as ``train_features[indices]``.
        train_labels: Labels aligned with ``train_features``.
        num_mixes: Maximum number of candidates to blend.
        alpha: Shape parameter of the Beta distribution for the weights.

    Returns:
        Array with ``mel_spec``'s shape. If no candidate with ``label`` exists
        (or ``num_mixes`` selects none), a copy of ``mel_spec`` is returned
        instead of an all-zero array.

    NOTE(review): ``mel_spec`` itself is not part of the blend; only the stored
    candidates are mixed. Confirm this is the intended augmentation.
    """
    # Get the indices of samples with the same label
    same_label_indices = np.where(np.asarray(train_labels) == label)[0]

    if len(same_label_indices) < num_mixes:
        # If there are not enough samples with the same label, use all available samples
        mix_indices = same_label_indices
    else:
        # Randomly select num_mixes samples with the same label
        mix_indices = np.random.choice(same_label_indices, size=num_mixes, replace=False)

    if len(mix_indices) == 0:
        # Nothing to blend: hand back the input rather than silently emitting zeros.
        return np.array(mel_spec, copy=True)

    # Get the stored features of the selected samples
    mix_mel_specs = train_features[mix_indices]

    # Generate mixing weights using the Beta distribution
    weights = np.random.beta(alpha, alpha, size=len(mix_indices))
    weight_total = np.sum(weights)
    if np.isfinite(weight_total) and weight_total > 0:
        weights_norm = weights / weight_total
    else:
        # Small alpha can underflow every weight to 0; fall back to a uniform mix.
        weights_norm = np.full(len(mix_indices), 1.0 / len(mix_indices))

    n_bands = mel_spec.shape[0]
    target_length = mel_spec.shape[1]
    mix_mel_specs_resized = []
    for spec in mix_mel_specs:
        spec = np.asarray(spec)
        if spec.ndim == 1:
            if spec.shape[0] == n_bands:
                # One value per mel band: hold it constant over time.
                spec = np.repeat(spec[:, np.newaxis], target_length, axis=1)
            else:
                # Length does not match the band axis: keep the legacy single-row layout.
                spec = spec.reshape(1, -1)
        if spec.shape[1] > target_length:
            # Truncate the spectrogram if it is longer than the target length
            spec = spec[:, :target_length]
        elif spec.shape[1] < target_length:
            # Pad the spectrogram with zeros if it is shorter than the target length
            pad_width = target_length - spec.shape[1]
            spec = np.pad(spec, ((0, 0), (0, pad_width)), mode='constant')
        mix_mel_specs_resized.append(spec)

    # Accumulate in the input's dtype when it is floating/complex; integer inputs
    # cannot hold fractional weights, so use float64 for them.
    out_dtype = mel_spec.dtype if np.issubdtype(mel_spec.dtype, np.inexact) else np.float64
    mixed_mel_spec = np.zeros(mel_spec.shape, dtype=out_dtype)
    for weight, spec in zip(weights_norm, mix_mel_specs_resized):
        mixed_mel_spec += weight * spec

    return mixed_mel_spec

def apply_audio_augmentation(mel_spec, label, train_features, train_labels):
    """Augment ``mel_spec``; this is a single ``apply_specmix`` call with its default mixing settings."""
    return apply_specmix(mel_spec, label, train_features, train_labels)

def preprocess_audio(file_path, label, train_features, train_labels, augment=False):
    """Load one audio file and reduce its 128-band mel spectrogram to a time-averaged vector.

    When ``augment`` is truthy and ``label``, ``train_features`` and
    ``train_labels`` are all provided, the spectrogram is first passed through
    ``apply_audio_augmentation``; otherwise it is averaged as-is.

    Returns:
        The averaged feature vector, or ``None`` if anything in the pipeline
        raised (the error is printed, not re-raised).
    """
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
        print(f"Loaded audio file: {file_path}")

        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128)

        # Augmentation needs every one of these inputs to be present.
        augmentation_inputs = (label, train_features, train_labels)
        can_augment = augment and all(item is not None for item in augmentation_inputs)
        if can_augment:
            mel_spec = apply_audio_augmentation(mel_spec, label, train_features, train_labels)

        # Average over the time axis: one value per mel band.
        features_scaled = np.mean(mel_spec.T, axis=0)

        print(f"Extracted features: {features_scaled.shape}")
        return features_scaled
    except Exception as e:
        # Best effort: report the failing file and let the caller decide what to do with None.
        print(f"Error processing file: {file_path}")
        print(f"Error message: {str(e)}")
        return None

def process_file(file_path, label, train_features, train_labels, augment=False):
    """Log the file being handled and delegate to ``preprocess_audio``.

    Without a label there is nothing to mix against, so the training
    features/labels are withheld (passed as ``None``) in that case.
    """
    print(f"Processing file: {file_path}")
    if label is None:
        train_features = train_labels = None
    return preprocess_audio(file_path, label, train_features, train_labels, augment=augment)

def prepare_data(directory, train_features, train_labels, augment=False, max_workers=32):
    """Extract features for the clips ``0.mp3 .. N-1.mp3`` in ``directory`` using a thread pool.

    Args:
        directory: Folder holding the clips, named by consecutive integer index.
        train_features: Forwarded to ``process_file`` for every clip.
        train_labels: One label per clip (array-like), or ``None`` for
            unlabeled data. Also forwarded to ``process_file``.
        augment: Forwarded to ``process_file``.
        max_workers: Thread-pool size (default 32, the previously hard-coded value).

    Returns:
        ``np.array`` of the per-clip feature vectors, in index order, with
        clips that failed to process left out.

    Raises:
        ValueError: If labels are given but their count differs from the number
            of clips, or if no clip could be processed at all.
    """
    # N is the number of real clips; stray entries (e.g. "._" residue files or
    # other non-mp3 files) must not inflate it and create paths that do not exist.
    clip_count = sum(
        1 for name in os.listdir(directory)
        if name.endswith(".mp3") and not name.startswith("._")
    )
    file_paths = [os.path.join(directory, f"{i}.mp3") for i in range(clip_count)]

    if train_labels is None:
        labels = [None] * len(file_paths)
    else:
        labels = np.asarray(train_labels).tolist()
        if len(labels) != len(file_paths):
            # zip() would silently truncate and pair clips with the wrong labels.
            raise ValueError(
                f"Found {len(file_paths)} audio files in '{directory}' but {len(labels)} labels."
            )

    def _run(item):
        path, label = item
        return process_file(path, label, train_features, train_labels, augment)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(_run, zip(file_paths, labels)))

    failed_paths = [path for path, feat in zip(file_paths, results) if feat is None]
    features = [feat for feat in results if feat is not None]

    if not features:
        raise ValueError("No audio files were successfully processed.")

    if failed_paths:
        # Dropping rows shifts every later row, so row i no longer belongs to
        # "{i}.mp3" / label i. Make that visible instead of failing silently.
        print(
            f"WARNING: {len(failed_paths)} file(s) could not be processed and were dropped; "
            f"returned rows no longer line up with file indices or labels. "
            f"First failures: {failed_paths[:5]}"
        )

    features = np.array(features)
    print(f"Processed {len(features)} audio files")
    return features



In [9]:
# Block 4: Preparing Data (modified)
folder_path = 'V6'

# Fit the encoder on the raw labels read in Block 2 in BOTH branches, so that
# label_encoder always maps encoded ids back to the original category values.
# The raw `train_labels` variable is left untouched so this cell can be re-run.
label_encoder = LabelEncoder()
raw_labels_encoded = label_encoder.fit_transform(train_labels)

try:
    # The cached labels are the already-encoded labels, repeated once for the
    # augmented copy of the training set (see the save call below).
    train_features, cached_labels, test_features = load_preprocessed_data(folder_path)
    print(f"Loaded preprocessed data from the '{folder_path}' folder.")
    train_labels_encoded = np.asarray(cached_labels)

except FileNotFoundError:
    train_labels_encoded = raw_labels_encoded

    train_features_original = prepare_data('train_mp3s/train_mp3s', np.empty((0, 128)), train_labels_encoded)
    print(f"Original train features shape: {train_features_original.shape}")

    # prepare_data leaves out clips it could not process; if that happened the
    # rows no longer correspond to the labels and nothing below is valid.
    if len(train_features_original) != len(train_labels_encoded):
        raise ValueError(
            f"Extracted {len(train_features_original)} training feature rows for "
            f"{len(train_labels_encoded)} labels; fix the failing audio files first."
        )

    train_features_augmented = prepare_data('train_mp3s/train_mp3s', train_features_original, train_labels_encoded, augment=True)
    print(f"Augmented train features shape: {train_features_augmented.shape}")

    train_features = np.concatenate((train_features_original, train_features_augmented), axis=0)
    print(f"Combined train features shape: {train_features.shape}")

    test_features = prepare_data('test_mp3s/test_mp3s', np.empty((0, 128)), None)
    print(f"Test features shape: {test_features.shape}")

    # The augmented rows carry the same labels as the originals, in the same order.
    train_labels_augmented = train_labels_encoded.copy()
    train_labels_encoded = np.concatenate((train_labels_encoded, train_labels_augmented), axis=0)
    print(f"Train labels shape: {train_labels_encoded.shape}")

    print(f"Number of training features: {len(train_features)}")
    print(f"Number of training labels: {len(train_labels_encoded)}")
    print(f"Number of test features: {len(test_features)}")

    save_preprocessed_data(train_features, train_labels_encoded, test_features, folder_path)
    print(f"Saved preprocessed data to the {folder_path} folder.")

if len(train_features) == 0:
    # Continuing would only fail later (empty DataLoader), so stop here.
    raise ValueError("No training features available. Please check the data.")

if len(train_features) != len(train_labels_encoded):
    raise ValueError(
        f"Feature/label count mismatch: {len(train_features)} features vs "
        f"{len(train_labels_encoded)} labels."
    )

# Reshape the features to have a channel dimension of 1
train_features = train_features.reshape(-1, 1, train_features.shape[1])
test_features = test_features.reshape(-1, 1, test_features.shape[1])
print(test_features.shape)
print(train_features.shape)

Loaded preprocessed data from the 'V6' folder.
(2447, 1, 128)
(23772, 1, 128)


In [10]:
# Block 5: Model Definition, Training, and Prediction
class AudioDataset(Dataset):
    """Pairs each feature entry with the label at the same index.

    Args:
        features: Indexable, sized collection of samples.
        labels: Indexable, sized collection of targets, one per sample.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length. Without
            this check a mismatch only shows up as an ``IndexError`` part-way
            through an epoch, or as trailing labels being silently ignored.
    """

    def __init__(self, features, labels):
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Number of samples (length of ``features``)."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(feature, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

class AudioClassifier(nn.Module):
    """Two 1-D convolutions followed by a two-layer fully connected head.

    Layer sequence: Conv1d(1->128) -> ReLU -> Dropout(0.3) -> Conv1d(128->256)
    -> ReLU -> Dropout(0.3) -> Flatten -> Linear(256*input_size -> hidden_size)
    -> ReLU -> Dropout(0.5) -> Linear(hidden_size -> num_classes).

    Both convolutions use kernel_size=3 with padding=1, so the last dimension
    of the input keeps its length ``input_size`` up to the flatten step.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv1d(1, 128, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        # Classifier head on the flattened (256 channels x input_size) activations.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(256 * input_size, hidden_size)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Run ``x`` through every layer in order and return the class logits."""
        pipeline = (
            self.conv1, self.relu1, self.dropout1,
            self.conv2, self.relu2, self.dropout2,
            self.flatten,
            self.fc1, self.relu3, self.dropout3,
            self.fc2,
        )
        for layer in pipeline:
            x = layer(x)
        return x

def train_model(model, train_loader, criterion, optimizer, scheduler, device, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs and print one metrics line per epoch.

    For every batch: move it to ``device``, run forward/backward and an
    optimizer step. ``scheduler.step()`` is called once per epoch.

    The reported accuracy / weighted F1 / precision / recall are computed once
    per epoch over all samples seen in that epoch (using the predictions made
    while training), rather than as a mean of per-batch scores. That mean
    over-weights a smaller final batch and cost several device-to-host copies
    per batch. The loss is still the mean of the per-batch losses.

    Args:
        model: Network to optimise; put into training mode here.
        train_loader: Iterable of ``(features, labels)`` tensor batches.
        criterion: Loss called as ``criterion(outputs, labels.long())``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped once per epoch.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with one dict per epoch holding ``loss``, ``accuracy``, ``f1``,
        ``precision`` and ``recall``.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    history = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        epoch_targets = []
        epoch_predictions = []

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

            # Keep targets/predictions on the CPU; metrics are computed once per epoch.
            epoch_targets.append(labels.detach().cpu())
            epoch_predictions.append(outputs.detach().argmax(dim=1).cpu())

        if num_batches == 0:
            raise ValueError("train_loader produced no batches; cannot train on an empty dataset.")

        scheduler.step()

        y_true = torch.cat(epoch_targets).numpy()
        y_pred = torch.cat(epoch_predictions).numpy()

        epoch_loss = running_loss / num_batches
        epoch_accuracy = accuracy_score(y_true, y_pred)
        epoch_f1 = f1_score(y_true, y_pred, average='weighted', zero_division=1)
        epoch_precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
        epoch_recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, F1 Score: {epoch_f1:.4f}, Precision: {epoch_precision:.4f}, Recall: {epoch_recall:.4f}")

        history.append({
            'loss': epoch_loss,
            'accuracy': epoch_accuracy,
            'f1': epoch_f1,
            'precision': epoch_precision,
            'recall': epoch_recall,
        })

    return history

# Set device
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create data loaders
train_dataset = AudioDataset(train_features, train_labels_encoded)
test_dataset = AudioDataset(test_features, np.zeros(len(test_features)))  # Dummy labels for test data
train_loader = DataLoader(train_dataset, batch_size=8192, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8192)  # no shuffle: row i stays test clip i

# Model parameters
input_size = train_features.shape[2]  # Assuming the input features have shape (batch_size, 1, feature_size)
hidden_size = 2048
num_classes = len(np.unique(train_labels_encoded))
num_epochs = 10000

# Create the model
model = AudioClassifier(input_size, hidden_size, num_classes).to(device)

# Define the loss function, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# One scheduler step per epoch, so the cosine period spans the whole run.
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

# Train the model
train_model(model, train_loader, criterion, optimizer, scheduler, device, num_epochs)

# Generate submission.csv
model.eval()
with torch.no_grad():
    predictions = []
    for features, _ in test_loader:
        features = features.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())

# The network predicts encoded class indices; map them back to the original
# category values before writing the file. (When the label file already uses
# 0..k-1 this is the identity, so the output is unchanged in that case.)
predicted_categories = label_encoder.inverse_transform(np.asarray(predictions, dtype=int))

submission = pd.DataFrame({'id': range(len(predictions)), 'category': predicted_categories})
submission.to_csv('submission.csv', index=False)

Epoch [1/10000], Loss: 45.4357, Accuracy: 0.3243, F1 Score: 0.2299, Precision: 0.5297, Recall: 0.3243
Epoch [2/10000], Loss: 28.7235, Accuracy: 0.2946, F1 Score: 0.1721, Precision: 0.6092, Recall: 0.2946
Epoch [3/10000], Loss: 2.4438, Accuracy: 0.3208, F1 Score: 0.2113, Precision: 0.4654, Recall: 0.3208
Epoch [4/10000], Loss: 1.3211, Accuracy: 0.3733, F1 Score: 0.2901, Precision: 0.5476, Recall: 0.3733
Epoch [5/10000], Loss: 1.3503, Accuracy: 0.3859, F1 Score: 0.2700, Precision: 0.5348, Recall: 0.3859
Epoch [6/10000], Loss: 1.3671, Accuracy: 0.3801, F1 Score: 0.2476, Precision: 0.5193, Recall: 0.3801
Epoch [7/10000], Loss: 1.3744, Accuracy: 0.3767, F1 Score: 0.2325, Precision: 0.5304, Recall: 0.3767
Epoch [8/10000], Loss: 1.3763, Accuracy: 0.3721, F1 Score: 0.2211, Precision: 0.4668, Recall: 0.3721
Epoch [9/10000], Loss: 1.3765, Accuracy: 0.3706, F1 Score: 0.2152, Precision: 0.5163, Recall: 0.3706
Epoch [10/10000], Loss: 1.3761, Accuracy: 0.3695, F1 Score: 0.2114, Precision: 0.5211, Re