In [None]:
import os
import librosa
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Root folder of the dataset; the cells below treat each sub-directory as one genre.
dataset_path = "Data/genres_original"

# List all genres.
# os.listdir() returns entries in arbitrary order and also yields stray files
# (e.g. .DS_Store), so keep directories only and sort them: this gives a stable
# genre -> index mapping across runs and machines.
genres = sorted(
    entry for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)
print("Genres:", genres)

In [None]:
def extract_features(file_path, n_mfcc=13, n_mels=128, duration=30):
    """Load an audio clip and compute its MFCCs and dB-scaled Mel spectrogram.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        n_mels: Number of Mel bands in the spectrogram.
        duration: Maximum number of seconds read from the start of the file.
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        ``(mfccs, mel_spec_db)``: the result of ``librosa.feature.mfcc`` and
        the Mel power spectrogram converted to dB relative to its own maximum.
        Neither array is transposed here.
    """
    # Load (at most) `duration` seconds of audio
    y, sr = librosa.load(file_path, duration=duration)
    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Extract Mel spectrogram and express it in dB relative to its peak
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mfccs, mel_spec_db

# Example: Extract features from one file (smoke test for extract_features).
# The path is built relative to dataset_path, so the notebook has to be run
# from the directory that contains that folder.
file_path = os.path.join(dataset_path, "blues/blues.00000.wav")
mfccs, mel_spec = extract_features(file_path)
# Print the array shapes returned by extract_features for this one clip.
print("MFCCs shape:", mfccs.shape)
print("Mel Spectrogram shape:", mel_spec.shape)

In [None]:
class GTZANDataset(Dataset):
    """Map-style dataset over a folder tree with one sub-directory per genre.

    Each item is ``(features, label)``: the dB-scaled Mel spectrogram returned
    by ``extract_features`` as a float32 tensor, and the genre index (position
    of the genre in ``self.genres``) as a long tensor.

    NOTE(review): the spectrogram is passed through exactly as
    ``extract_features`` returns it (not transposed). A ``batch_first`` LSTM
    that expects the Mel bands as its feature dimension needs the transpose,
    e.g. supplied through ``transform``.
    """

    def __init__(self, dataset_path, transform=None):
        """Index every ``.wav`` file found under ``dataset_path``.

        Args:
            dataset_path: Root directory containing one folder per genre.
            transform: Optional callable applied to the feature tensor in
                ``__getitem__``. ``None`` leaves the features untouched.
        """
        self.dataset_path = dataset_path
        # Directories only, sorted: os.listdir() order is arbitrary and stray
        # files at the root (e.g. .DS_Store) are not genres.
        self.genres = sorted(
            name for name in os.listdir(dataset_path)
            if os.path.isdir(os.path.join(dataset_path, name))
        )
        self.file_paths = []
        self.labels = []
        for genre in self.genres:
            genre_path = os.path.join(dataset_path, genre)
            # Sorted so that sample indices (and seeded splits built on them)
            # are reproducible.
            for file_name in sorted(os.listdir(genre_path)):
                if not file_name.lower().endswith(".wav"):
                    continue  # skip anything that is not a WAV file
                self.file_paths.append(os.path.join(genre_path, file_name))
                self.labels.append(genre)
        self.transform = transform

    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the file at position ``idx``."""
        file_path = self.file_paths[idx]
        genre = self.labels[idx]
        # Only the Mel spectrogram is used; the MFCCs are discarded.
        _mfccs, mel_spec = extract_features(file_path)
        features = torch.tensor(mel_spec, dtype=torch.float32)
        if self.transform is not None:
            features = self.transform(features)
        # Encode the genre name as its position in the sorted genre list.
        label = torch.tensor(self.genres.index(genre), dtype=torch.long)
        return features, label

# Create dataset
# With the GTZANDataset defined in the cell above, construction only lists the
# files under dataset_path; audio is decoded when an item is requested.
dataset = GTZANDataset(dataset_path)
print("Number of samples:", len(dataset))

In [None]:
from sklearn.model_selection import train_test_split

# Split indices: hold out 20% for testing, then 20% of the remainder for
# validation; the fixed seed keeps both splits repeatable.
train_idx, test_idx = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.2, random_state=42)

# Create subsets (views over `dataset` restricted to each index list)
train_dataset, val_dataset, test_dataset = (
    torch.utils.data.Subset(dataset, indices)
    for indices in (train_idx, val_idx, test_idx)
)

# Create DataLoaders: only the training loader shuffles
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(subset, batch_size=batch_size, shuffle=False)
    for subset in (val_dataset, test_dataset)
)

In [None]:
import torch.nn as nn

class AudioRNN(nn.Module):
    """Sequence classifier: stacked LSTM followed by a linear output layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for ``x`` of shape (batch, time, input_size)."""
        # Zero initial states created directly on x's device and in x's dtype.
        # torch.zeros(...).to(x.device) allocated on the CPU first and was
        # always float32, which breaks as soon as the model is not float32.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        # Forward pass through LSTM
        out, _ = self.lstm(x, (h0, c0))
        # Classify from the output of the last time step
        return self.fc(out[:, -1, :])

# Define model hyper-parameters
input_size = 128  # Number of Mel bands
hidden_size, num_layers = 256, 2
num_classes = len(genres)  # one output logit per listed genre

# Build the network and print its layer summary
model = AudioRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
print(model)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import librosa
import numpy as np
import os

# Define the dataset class
class GTZANDataset(Dataset):
    """Map-style dataset yielding (time, 128) dB-Mel tensors and genre indices.

    Files that cannot be loaded do not raise: ``__getitem__`` returns an
    all-zero (128, 128) tensor with label ``-1`` instead, so consumers must
    filter out label ``-1`` before computing a loss.
    """

    def __init__(self, dataset_path):
        """Index every ``.wav`` file under ``dataset_path`` (one folder per genre)."""
        self.dataset_path = dataset_path
        # Directories only, sorted: os.listdir() order is arbitrary, and a
        # stray file at the root would make the inner listdir() fail.
        self.genres = sorted(
            name for name in os.listdir(dataset_path)
            if os.path.isdir(os.path.join(dataset_path, name))
        )
        self.file_paths = []
        self.labels = []
        for genre in self.genres:
            genre_path = os.path.join(dataset_path, genre)
            # Sorted so sample indices, and the seeded splits built on them,
            # are the same on every machine.
            for file_name in sorted(os.listdir(genre_path)):
                if file_name.endswith('.wav'):  # Ensure only WAV files
                    self.file_paths.append(os.path.join(genre_path, file_name))
                    self.labels.append(genre)

    def __len__(self):
        """Return the number of indexed WAV files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(features, label)``; a zero tensor and ``-1`` on load failure."""
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        try:
            y, sr = librosa.load(file_path, duration=30)
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
            # Transposed so that time is the first axis and the 128 Mel bands
            # form the feature axis.
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).T
            features = torch.tensor(mel_spec_db, dtype=torch.float32)
            label_idx = self.genres.index(label)
            return features, torch.tensor(label_idx, dtype=torch.long)
        except Exception as e:
            print(f"Skipping {file_path} due to error: {e}")
            # Dummy sample; the -1 label is the "invalid" sentinel.
            return torch.zeros((128, 128)), torch.tensor(-1, dtype=torch.long)

# Define the RNN model
class AudioRNN(nn.Module):
    """LSTM-based classifier over (batch, time, features) sequences.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes).

        NOTE(review): the logits come from the last time step. When batches
        are zero-padded to a common length, that step is padding for the
        shorter sequences — confirm this is acceptable.
        """
        # Initial hidden/cell states: allocate them on x's device and in x's
        # dtype. The previous torch.zeros(...).to(x.device) built them on the
        # CPU as float32, which fails when the model runs in another dtype.
        batch = x.size(0)
        h0 = x.new_zeros((self.num_layers, batch, self.hidden_size))
        c0 = x.new_zeros((self.num_layers, batch, self.hidden_size))
        # Forward pass through LSTM
        out, _ = self.lstm(x, (h0, c0))
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# Create dataset
# Spelled "Data/..." like every other cell in this notebook: the lowercase
# "data/..." used here before does not resolve on case-sensitive filesystems.
dataset_path = "Data/genres_original"
dataset = GTZANDataset(dataset_path)

# Split dataset into train, validation, and test sets
from sklearn.model_selection import train_test_split

# 20% of the samples are held out for testing, then 20% of the remainder for
# validation; the fixed random_state keeps both splits repeatable.
train_idx, test_idx = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.2, random_state=42)

# Subsets are index views over the same underlying dataset object.
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)
test_dataset = torch.utils.data.Subset(dataset, test_idx)

# Define collate_fn to pad sequences
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Pad variable-length samples into one batch, dropping invalid ones.

    Args:
        batch: List of ``(features, label)`` pairs, ``features`` being a
            (time, n_features) tensor and ``label`` a scalar tensor.

    Returns:
        ``(inputs_padded, labels)`` where ``inputs_padded`` is zero-padded to
        the longest sequence, shape (batch, max_time, n_features). Samples
        labelled ``-1`` (the dataset's marker for unreadable files) are
        removed first, because CrossEntropyLoss does not accept -1 as a
        target. If nothing valid remains, both tensors are empty (batch
        dimension 0) and the caller should skip the batch.
    """
    valid = [(features, label) for features, label in batch if int(label) != -1]
    if not valid:
        feature_dim = batch[0][0].shape[-1] if batch else 0
        return torch.zeros(0, 0, feature_dim), torch.zeros(0, dtype=torch.long)
    inputs = [features for features, _ in valid]
    labels = [label for _, label in valid]
    # Pad sequences to the same length
    inputs_padded = pad_sequence(inputs, batch_first=True)
    labels = torch.stack(labels)
    return inputs_padded, labels

# Create DataLoaders (collate_fn pads each batch to its longest sequence)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Define model
input_size = 128  # Number of Mel bands
hidden_size = 256
num_layers = 2
num_classes = len(dataset.genres)
model = AudioRNN(input_size, hidden_size, num_layers, num_classes)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        # The dataset marks unreadable files with label -1. CrossEntropyLoss
        # rejects that target (on CUDA it surfaces as a device-side assert),
        # so drop those rows before the forward pass.
        valid = labels != -1
        if not valid.any():
            continue  # nothing usable in this batch
        inputs = inputs[valid].to(device)
        labels = labels[valid].to(device)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Print loss
        if (i+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

In [None]:
model.eval()  # Set the model to evaluation mode
val_loss = 0
correct = 0
total = 0
num_batches = 0  # batches that actually contributed to val_loss

with torch.no_grad():  # Disable gradient computation
    for inputs, labels in val_loader:
        # Drop dummy samples (label -1): they are not valid targets for the
        # loss and must not count towards accuracy.
        valid = labels != -1
        if not valid.any():
            continue
        inputs = inputs[valid].to(device)
        labels = labels[valid].to(device)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        val_loss += loss.item()
        num_batches += 1
        # Calculate accuracy (argmax avoids rebinding the notebook's `_`)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# max(..., 1) keeps an empty loader from raising ZeroDivisionError.
val_loss /= max(num_batches, 1)
val_accuracy = 100 * correct / max(total, 1)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

In [None]:
# Persist only the learned parameters (state_dict), not the model object itself.
torch.save(model.state_dict(), "audio_rnn_model.pth")

In [None]:
model.eval()  # Set the model to evaluation mode
test_loss = 0
correct = 0
total = 0
num_batches = 0  # batches that actually contributed to test_loss

with torch.no_grad():  # Disable gradient computation
    for inputs, labels in test_loader:
        # Skip invalid samples individually. Checking only labels[0] missed a
        # dummy sample anywhere else in the batch and discarded whole batches
        # of good samples when the first one happened to be invalid.
        valid = labels != -1
        if not valid.any():
            continue
        inputs = inputs[valid].to(device)
        labels = labels[valid].to(device)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        num_batches += 1
        # Calculate accuracy
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Average over the batches that were evaluated (skipped batches add no loss);
# max(..., 1) avoids ZeroDivisionError when nothing was evaluated.
test_loss /= max(num_batches, 1)
test_accuracy = 100 * correct / max(total, 1)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

In [None]:
# Save the model's state dictionary.
# Writes to the same file name as the earlier save cell, overwriting it.
torch.save(model.state_dict(), "audio_rnn_model.pth")

In [None]:
# Define the model architecture (must match the one the checkpoint was saved from)
model = AudioRNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Load the saved state dictionary.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load("audio_rnn_model.pth", map_location=device))

# Set the model to evaluation mode
model.eval()

In [None]:
def preprocess_audio(file_path):
    """Convert an audio file into a (time_steps, n_mels) float32 tensor.

    Reads up to 30 seconds of audio, computes a 128-band Mel spectrogram,
    converts it to dB relative to its maximum and transposes it so that time
    is the leading axis.
    """
    waveform, sample_rate = librosa.load(file_path, duration=30)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128),
        ref=np.max,
    )
    # .T puts time first: (n_mels, time_steps) -> (time_steps, n_mels)
    return torch.tensor(mel_db.T, dtype=torch.float32)

# Example: Preprocess a new audio file
# NOTE: the path below is a placeholder; point it at a real audio file before
# running this cell.
new_audio_path = "path_to_new_audio_file.wav"
new_features = preprocess_audio(new_audio_path)

In [None]:
# Add batch dimension and move to device.
# Stored under a separate name so that re-running this cell does not stack
# another batch dimension onto new_features.
new_batch = new_features.unsqueeze(0).to(device)

# Forward pass
with torch.no_grad():
    output = model(new_batch)
    predicted = output.argmax(dim=1)

# Map predicted index to genre
predicted_genre = dataset.genres[predicted.item()]
print(f"Predicted Genre: {predicted_genre}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Collect predictions

In [None]:
import soundfile as sf
import audioread

# One sample clip is used to probe both audio decoding backends.
sample_wav = "Data/genres_original/jazz/jazz.00000.wav"

# Test soundfile: a successful read means the backend can decode the clip
try:
    sf.read(sample_wav)
    print("Soundfile backend is working.")
except Exception as soundfile_error:
    print("Soundfile backend error:", soundfile_error)

# Test audioread: opening the file is enough; the handle closes on exit
try:
    with audioread.audio_open(sample_wav) as f:
        print("Audioread backend is working.")
except Exception as audioread_error:
    print("Audioread backend error:", audioread_error)

In [19]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# 1. Device configuration - same as your CNN
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Enhanced Dataset Class with validation like your CNN
class GTZANDataset(Dataset):
    """GTZAN clips as fixed-size (128, 128) dB-Mel tensors with genre indices.

    Every file is validated once at construction time. Files that pass are
    stored in ``valid_samples`` as ``(file_path, genre_idx)``; the others are
    recorded in ``corrupted_files``.
    """

    SAMPLE_RATE = 22050      # sampling rate requested from librosa.load
    CLIP_SECONDS = 30        # seconds read from the start of each file
    MIN_SAMPLES = 660000     # shortest waveform accepted by validation
    MIN_FILE_BYTES = 1024    # smaller files are rejected without decoding
    N_MELS = 128             # Mel bands (feature dimension)
    SEQ_LEN = 128            # time frames kept per clip

    def __init__(self, dataset_path):
        """Scan ``dataset_path`` (one folder per genre) and validate each file."""
        self.dataset_path = dataset_path
        # Directories only: a stray file at the root is not a genre and would
        # make the inner listdir() fail.
        self.genres = sorted(
            name for name in os.listdir(dataset_path)
            if os.path.isdir(os.path.join(dataset_path, name))
        )
        self.valid_samples = []
        self.corrupted_files = []

        # Validate all files upfront like ImageFolder does
        for genre_idx, genre in enumerate(self.genres):
            genre_path = os.path.join(dataset_path, genre)
            # Sorted so sample indices, and the seeded splits built on them,
            # do not depend on filesystem listing order.
            for file_name in sorted(os.listdir(genre_path)):
                file_path = os.path.join(genre_path, file_name)
                if self._validate_audio(file_path):
                    self.valid_samples.append((file_path, genre_idx))
                else:
                    self.corrupted_files.append(file_path)

        print(f"Loaded {len(self.valid_samples)} valid samples")
        print(f"Skipped {len(self.corrupted_files)} corrupted files")

    def _validate_audio(self, file_path):
        """Return True if the file loads and yields a finite Mel spectrogram."""
        try:
            # Basic file checks
            if not file_path.endswith('.wav') or os.path.getsize(file_path) < self.MIN_FILE_BYTES:
                return False

            # Test loading
            y, sr = librosa.load(file_path, duration=self.CLIP_SECONDS, sr=self.SAMPLE_RATE)
            if len(y) < self.MIN_SAMPLES:  # clip is too short
                return False

            # Test feature extraction
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=self.N_MELS)
            if np.isnan(mel_spec).any() or np.isinf(mel_spec).any():
                return False

            return True
        except Exception:
            # `except Exception` instead of a bare except, so KeyboardInterrupt
            # and SystemExit still stop a long validation pass.
            return False

    def __len__(self):
        """Return the number of validated samples."""
        return len(self.valid_samples)

    def __getitem__(self, idx):
        """Return ``(features, label)``; zeros and label ``-1`` if loading fails."""
        file_path, label_idx = self.valid_samples[idx]
        try:
            # Load and process audio - ensure consistent shape
            y, sr = librosa.load(file_path, duration=self.CLIP_SECONDS, sr=self.SAMPLE_RATE)
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=self.N_MELS)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).T  # Transpose to (time, n_mels)

            # Standardize to fixed sequence length (like CNN's resize)
            if mel_spec_db.shape[0] < self.SEQ_LEN:
                mel_spec_db = np.pad(mel_spec_db, ((0, self.SEQ_LEN - mel_spec_db.shape[0]), (0, 0)))
            mel_spec_db = mel_spec_db[:self.SEQ_LEN]  # Truncate if too long

            return (
                torch.tensor(mel_spec_db, dtype=torch.float32),
                torch.tensor(label_idx, dtype=torch.long)
            )
        except Exception as e:
            # Report the failure instead of swallowing it silently, then mark
            # the sample as invalid so the collate function can drop it.
            print(f"Skipping {file_path} due to error: {e}")
            return torch.zeros((self.SEQ_LEN, self.N_MELS)), torch.tensor(-1, dtype=torch.long)

# 3. Collate function with filtering like your CNN's batch handling
def collate_fn(batch):
    """Stack a list of ``(features, label)`` samples, skipping invalid ones.

    Samples whose label equals ``-1`` are discarded. If none remain, an empty
    ``(0, 128, 128)`` input tensor and an empty long label tensor are returned.
    """
    kept_features = []
    kept_labels = []
    for sample in batch:
        # label -1 marks a sample the dataset could not load
        if sample[1] != -1:
            kept_features.append(sample[0])
            kept_labels.append(sample[1])

    if len(kept_features) == 0:
        return torch.zeros(0, 128, 128), torch.zeros(0, dtype=torch.long)

    # inputs: (batch, time, n_mels); labels: (batch,)
    return torch.stack(kept_features), torch.stack(kept_labels)

# 4. Enhanced RNN Model with better initialization like your CNN
class AudioRNN(nn.Module):
    """LSTM classifier with a two-layer head and explicit weight initialisation.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer and of the hidden linear layer.
        num_layers: Number of stacked LSTM layers (dropout 0.2 between layers
            when there is more than one).
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2 if num_layers > 1 else 0
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes)
        )

        # Initialize weights properly like your CNN:
        # Xavier for LSTM input weights, orthogonal for LSTM recurrent
        # weights, Kaiming for the linear head, zeros for every bias.
        for name, param in self.named_parameters():
            if 'weight' in name:
                if 'lstm' in name:
                    if 'weight_ih' in name:
                        nn.init.xavier_uniform_(param)
                    elif 'weight_hh' in name:
                        nn.init.orthogonal_(param)
                else:
                    nn.init.kaiming_normal_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for (batch, seq, features) input."""
        # Move the input to wherever the model's own parameters live. The
        # previous check compared against the notebook-level `device` global,
        # which fails when that name is undefined or the model sits elsewhere.
        model_device = self.lstm.weight_ih_l0.device
        if x.device != model_device:
            x = x.to(model_device)

        out, _ = self.lstm(x)  # (batch, seq, hidden)
        return self.fc(out[:, -1, :])  # Last timestep

# 5. Training loop with same structure as your CNN
def train_model(
    dataset_path="Data/genres_original",
    num_epochs=10,
    batch_size=32,
    learning_rate=0.001,
    checkpoint_path="audio_rnn_model.pth",
):
    """Train an ``AudioRNN`` on ``GTZANDataset`` and save its weights.

    All parameters default to the values that were previously hard-coded, so
    ``train_model()`` behaves as before.

    Args:
        dataset_path: Root directory of the dataset (one folder per genre).
        num_epochs: Number of passes over the training split.
        batch_size: Samples per batch for both loaders.
        learning_rate: Adam learning rate.
        checkpoint_path: File the final ``state_dict`` is written to.
    """
    # Create dataset
    dataset = GTZANDataset(dataset_path)

    # Split dataset: 20% held out as a test split (not evaluated here), then
    # 20% of the remainder used for validation.
    train_idx, test_idx = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.2, random_state=42)

    # Create DataLoaders. Pinned memory only pays off for host-to-GPU copies,
    # so it is enabled only when training on CUDA.
    pin_memory = device.type == "cuda"
    train_loader = DataLoader(
        torch.utils.data.Subset(dataset, train_idx),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        pin_memory=pin_memory
    )
    val_loader = DataLoader(
        torch.utils.data.Subset(dataset, val_idx),
        batch_size=batch_size,
        collate_fn=collate_fn,
        pin_memory=pin_memory
    )

    # Initialize model
    model = AudioRNN(
        input_size=128,
        hidden_size=256,
        num_layers=2,
        num_classes=len(dataset.genres)
    ).to(device)

    # Loss and optimizer - same as your CNN
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        model.train()

        for i, (inputs, labels) in enumerate(train_loader):
            # Skip empty batches (collate_fn dropped every sample)
            if inputs.size(0) == 0:
                continue

            # Move to device
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()
            optimizer.step()

            if (i+1) % 10 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

        # Validation
        model.eval()
        val_loss, correct, total = 0, 0, 0
        val_batches = 0  # batches that actually contributed to val_loss
        with torch.no_grad():
            for inputs, labels in val_loader:
                if inputs.size(0) == 0:
                    continue

                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                val_batches += 1

                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Average over evaluated batches only; max(..., 1) avoids a
        # ZeroDivisionError when the validation split yields no samples.
        avg_val_loss = val_loss / max(val_batches, 1)
        val_acc = 100 * correct / max(total, 1)
        print(f"Epoch {epoch+1}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    print("Training complete!")
    torch.save(model.state_dict(), checkpoint_path)

# Entry-point guard: training starts only when __name__ is "__main__"; the
# captured output below shows this cell did start training when it was run.
if __name__ == "__main__":
    train_model()

Using device: cuda


  y, sr = librosa.load(file_path, duration=30, sr=22050)


Loaded 999 valid samples
Skipped 1 corrupted files


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import torch
# Environment report: PyTorch build, CUDA availability and toolkit version.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
# The device name is only queried when CUDA is available; otherwise 'None' is printed.
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

In [None]:
import torch
# Extended environment report for debugging CUDA problems.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")
print(f"Device count: {torch.cuda.device_count()}")
# current_device() and get_device_name() raise when no CUDA device is usable,
# so query them only when CUDA is available.
if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
else:
    print("Current device: None")
    print("Device name: None")