In [1]:
# Block 1: Importing Libraries
import tarfile
import resampy
import pandas as pd
import librosa
import numpy as np
import os
import shutil
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from concurrent.futures import ThreadPoolExecutor

# Tell the user up front which device training is going to run on.
# The GPU name is only queried when CUDA is actually available.
print(
    f"GPU: {torch.cuda.get_device_name(0)} is available."
    if torch.cuda.is_available()
    else "No GPU available. Training will run on CPU."
)


GPU: NVIDIA GeForce RTX 4080 Laptop GPU is available.


In [2]:
# Block 2: Extracting Data
def extract_tar(tar_file, target_dir, skip_if_exists=None):
    """Extract ``tar_file`` into ``target_dir`` and delete ``._*`` residue files.

    Args:
        tar_file: Path of the tar archive to unpack.
        target_dir: Directory the archive is extracted into.
        skip_if_exists: What to do when ``target_dir`` already exists.
            ``None`` (default) asks interactively on stdin, ``True`` skips the
            extraction, ``False`` deletes the directory and extracts again.
            Passing an explicit value lets the notebook run unattended.

    Raises:
        FileNotFoundError / tarfile.TarError: if the archive cannot be opened.
            In that case an existing ``target_dir`` is left untouched.
    """
    overwrite = False
    if os.path.exists(target_dir):
        if skip_if_exists is None:
            user_input = input(f"The directory '{target_dir}' already exists. Do you want to skip extraction? (y/n): ")
            skip_if_exists = user_input.strip().lower() == 'y'
        if skip_if_exists:
            print(f"Skipping extraction of {tar_file}.")
            return
        overwrite = True

    # Open the archive BEFORE deleting anything: a missing or corrupt archive
    # must not cost us the previously extracted data.
    with tarfile.open(tar_file, 'r') as tar:
        if overwrite:
            print(f"Overwriting the existing directory '{target_dir}'.")
            shutil.rmtree(target_dir)

        # The 'data' filter rejects absolute paths, '..' traversal and links that
        # point outside the target. Older Python builds lack the feature.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(target_dir, filter='data')
        else:
            tar.extractall(target_dir)

    # Remove residue "._" hidden files. Walk the whole target so that files next
    # to the inner folder (and archives not named "<folder>.tar") are covered too.
    for root, _dirs, files in os.walk(target_dir):
        for file in files:
            if file.startswith("._"):
                os.remove(os.path.join(root, file))

# Unpack both archives (each call is skipped on request if its folder already exists).
extract_tar('train_mp3s.tar', 'train_mp3s')
extract_tar('test_mp3s.tar', 'test_mp3s')

# dtype=int already produces an integer ndarray, so no per-element int() pass is
# needed. ndmin=1 keeps the result 1-D even when the file holds a single label
# (np.loadtxt would otherwise return a 0-d array that cannot be iterated or len()'d).
train_labels = np.loadtxt('train_label.txt', dtype=int, ndmin=1)

Skipping extraction of train_mp3s.tar.
Skipping extraction of test_mp3s.tar.


In [3]:
# Block 3: Preprocessing Functions
def preprocess_audio(file_path):
    """Load one audio file and reduce it to a single vector of averaged MFCCs.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The mean of the MFCC matrix taken over axis 0 of its transpose (40
        coefficients are requested), or ``None`` if any step raised. Failures
        are printed rather than propagated.
    """
    try:
        waveform, sr = librosa.load(file_path, res_type='kaiser_fast')
        print(f"Loaded audio file: {file_path}")
        coefficients = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=40)
        # Collapse the transposed matrix along its first axis so one value per
        # coefficient remains (assumes librosa's (n_mfcc, frames) layout).
        averaged = np.mean(coefficients.T, axis=0)
        print(f"Extracted MFCCs: {averaged.shape}")
    except Exception as err:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"Error processing file: {file_path}")
        print(f"Error message: {str(err)}")
        return None
    else:
        return averaged

def process_file(file_path):
    """Log which file is being handled and return its MFCC features.

    Returns whatever ``preprocess_audio`` returns for ``file_path`` (``None``
    when that file could not be processed).
    """
    print(f"Processing file: {file_path}")
    return preprocess_audio(file_path)

def prepare_data(directory):
    """Extract features for every numerically named ``.mp3`` file in ``directory``.

    Files are processed in ascending numeric order (``0.mp3``, ``1.mp3``, ...).
    Only entries that really are ``<number>.mp3`` are considered, so stray files
    in the folder (hidden residue, caches) no longer inflate the file count and
    gaps in the numbering no longer hide the highest-numbered files.

    Args:
        directory: Folder containing the numbered ``.mp3`` files.

    Returns:
        ``np.ndarray`` with one row per successfully processed file, in numeric
        file order. Files that failed are dropped (and reported), which shifts
        the rows after them relative to the file indices.
    """
    suffix = ".mp3"
    mp3_names = [
        name for name in os.listdir(directory)
        if name.endswith(suffix) and name[:-len(suffix)].isdecimal()
    ]
    mp3_names.sort(key=lambda name: int(name[:-len(suffix)]))
    file_paths = [os.path.join(directory, name) for name in mp3_names]

    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so results[i] belongs to file_paths[i].
        results = list(executor.map(process_file, file_paths))

    features = [mfccs for mfccs in results if mfccs is not None]
    failed = [path for path, mfccs in zip(file_paths, results) if mfccs is None]
    if failed:
        # Dropping rows silently would misalign features with labels / submission ids.
        print(f"WARNING: {len(failed)} file(s) failed and were dropped; "
              f"feature rows no longer line up with file indices. First failures: {failed[:10]}")
    print(f"Processed {len(features)} audio files")
    return np.array(features)

In [4]:
# Block 4: Preparing Data
train_features_file = 'train_features.pkl'
test_features_file = 'test_features.pkl'


def _load_or_compute_features(cache_file, directory, split):
    """Return features from the pickle cache, computing and caching them if absent.

    Args:
        cache_file: Path of the pickle file used as cache.
        directory: Folder handed to ``prepare_data`` when the cache is missing.
        split: Human-readable split name ("train" / "test") used in log messages.

    Returns:
        The feature array loaded from ``cache_file`` or freshly computed.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that this notebook wrote itself.
        with open(cache_file, 'rb') as f:
            features = pickle.load(f)
        print(f"Loaded {split} features from file.")
    except FileNotFoundError:
        features = prepare_data(directory)
        with open(cache_file, 'wb') as f:
            pickle.dump(features, f)
        print(f"Saved {split} features to file.")
    return features


train_features = _load_or_compute_features(train_features_file, 'train_mp3s/train_mp3s', 'train')
test_features = _load_or_compute_features(test_features_file, 'test_mp3s/test_mp3s', 'test')

print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

# Make sure the labels are an integer ndarray regardless of how they were loaded.
train_labels = np.asarray(train_labels, dtype=int)
print(f"Train labels shape: {train_labels.shape}")
print(f"Number of training features: {len(train_features)}")
print(f"Number of training labels: {len(train_labels)}")
print(f"Number of test features: {len(test_features)}")

if len(train_features) == 0:
    print("No training features available. Please check the data.")

Loaded train features from file.
Loaded test features from file.
Train features shape: (11886, 40)
Test features shape: (2447, 40)
Train labels shape: (11886,)
Number of training features: 11886
Number of training labels: 11886
Number of test features: 2447


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
import numpy as np

# Release cached GPU memory left over from earlier runs of this cell.
torch.cuda.empty_cache()

# Block 5: Model Training and Prediction
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

if len(train_features) > 0:
    # Every feature row needs exactly one label; refuse to train otherwise.
    if len(train_labels) != len(train_features):
        raise ValueError("Number of train features and labels do not match.")

    # Rebind features and labels as tensors: float32 inputs for the network,
    # int64 class indices for the cross-entropy loss.
    train_features = torch.tensor(train_features, dtype=torch.float32)
    train_labels = torch.tensor(train_labels, dtype=torch.long)
    # Define the model architecture
    class AudioClassifier(nn.Module):
        """Small 1-D CNN that classifies one feature vector per sample.

        The input is treated as a single-channel sequence, passed through two
        conv -> ReLU -> max-pool stages, flattened, and classified by two
        fully connected layers with dropout in between.

        Args:
            num_classes: Number of output logits.
            input_len: Number of features per sample. Defaults to 40, which
                yields the 640-wide flattened layer (64 channels * 10 steps).

        Raises:
            ValueError: if ``input_len`` is too short to survive both poolings.
        """

        def __init__(self, num_classes, input_len=40):
            super().__init__()
            # Each stride-2 pooling floor-halves the sequence length.
            pooled_len = (input_len // 2) // 2
            if pooled_len < 1:
                raise ValueError(f"input_len must be at least 4, got {input_len}")

            self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)
            self.relu1 = nn.ReLU()
            self.maxpool1 = nn.MaxPool1d(kernel_size=2, stride=2)
            self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
            self.relu2 = nn.ReLU()
            self.maxpool2 = nn.MaxPool1d(kernel_size=2, stride=2)
            self.flatten = nn.Flatten()
            # conv2 emits 64 channels, each pooled_len steps long.
            self.fc1 = nn.Linear(64 * pooled_len, 128)
            self.relu3 = nn.ReLU()
            self.dropout1 = nn.Dropout(0.5)
            self.fc2 = nn.Linear(128, num_classes)

        def forward(self, x):
            """Return class logits of shape (batch, num_classes) for ``x``."""
            # Collapse everything after the batch dimension into one channel.
            # reshape (unlike view) also accepts non-contiguous inputs.
            x = x.reshape(x.size(0), 1, -1)
            x = self.maxpool1(self.relu1(self.conv1(x)))
            x = self.maxpool2(self.relu2(self.conv2(x)))
            x = self.flatten(x)
            x = self.dropout1(self.relu3(self.fc1(x)))
            return self.fc2(x)

    # Create an instance of the model
    num_classes = 4  # Assuming 4 classes in the audio classification task
    model = AudioClassifier(num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)

    num_epochs = 400
    batch_size = 128

    # Use k-fold cross-validation
    num_folds = 5
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(kfold.split(train_features)):
        print(f"Fold {fold + 1}/{num_folds}")

        train_data = train_features[train_idx].to(device)
        train_labels_fold = train_labels[train_idx].to(device)
        val_data = train_features[val_idx].to(device)
        val_labels = train_labels[val_idx].to(device)

        train_dataset = torch.utils.data.TensorDataset(train_data, train_labels_fold)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        best_val_loss = np.inf
        patience = 20
        counter = 0

        for epoch in range(num_epochs):
            model.train()
            for batch_data, batch_labels in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            model.eval()
            with torch.no_grad():
                outputs = model(val_data)
                loss = criterion(outputs, val_labels)
                _, predicted = torch.max(outputs, 1)
                accuracy = (predicted == val_labels).float().mean()
                print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {loss.item():.4f}, Validation Accuracy: {accuracy.item():.4f}")

                if loss.item() < best_val_loss:
                    best_val_loss = loss.item()
                    counter = 0
                else:
                    counter += 1

                if counter >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

                scheduler.step(loss)

    test_features = torch.tensor(test_features, dtype=torch.float32, device=device)
    model.eval()
    with torch.no_grad():
        outputs = model(test_features)
        _, predicted_labels = torch.max(outputs, 1)
        predicted_labels = predicted_labels.cpu().tolist()

    submission = pd.DataFrame({'id': range(len(predicted_labels)), 'category': predicted_labels})
    submission.to_csv('submission.csv', index=False)

else:
    print("No training features available. Please check the data.")

torch.cuda.empty_cache()

Using device: cuda




Epoch [1/10000], Validation Loss: 1.4152, Validation Accuracy: 0.0945
Epoch [2/10000], Validation Loss: 1.2493, Validation Accuracy: 0.4675
Epoch [3/10000], Validation Loss: 1.0845, Validation Accuracy: 0.5308
Epoch [4/10000], Validation Loss: 0.9954, Validation Accuracy: 0.5757
Epoch [5/10000], Validation Loss: 0.9127, Validation Accuracy: 0.6268
Epoch [6/10000], Validation Loss: 0.8411, Validation Accuracy: 0.6469
Epoch [7/10000], Validation Loss: 0.7946, Validation Accuracy: 0.6626
Epoch [8/10000], Validation Loss: 0.7593, Validation Accuracy: 0.6803
Epoch [9/10000], Validation Loss: 0.7387, Validation Accuracy: 0.6884
Epoch [10/10000], Validation Loss: 0.7047, Validation Accuracy: 0.7013
Epoch [11/10000], Validation Loss: 0.6923, Validation Accuracy: 0.7061
Epoch [12/10000], Validation Loss: 0.6843, Validation Accuracy: 0.7114
Epoch [13/10000], Validation Loss: 0.6829, Validation Accuracy: 0.7142
Epoch [14/10000], Validation Loss: 0.6675, Validation Accuracy: 0.7238
Epoch [15/10000

KeyboardInterrupt: 