In [1]:
import os
import pandas as pd
import numpy as np
import librosa
import kagglehub
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import gradio as gr

# Step 1: Download dataset using kagglehub
# The returned value is used below as the local directory to search.
dataset_path = kagglehub.dataset_download("mmoreaux/environmental-sound-classification-50")

# Step 2: Locate CSV and audio files
# The walk intentionally does not stop at the first hit: the last match wins,
# so when an "audio" directory contains another "audio" directory the
# innermost one is selected (os.walk visits parents before children).
csv_file = None
audio_dir = None
for root, dirs, files in os.walk(dataset_path):
    if "esc50.csv" in files:
        csv_file = os.path.join(root, "esc50.csv")
    if "audio" in dirs:
        audio_dir = os.path.join(root, "audio")

# Explicit raises instead of `assert`: assertions are removed under
# `python -O`, which would let a None path reach later steps unnoticed.
if csv_file is None:
    raise FileNotFoundError("esc50.csv not found.")
if audio_dir is None:
    raise FileNotFoundError("Audio directory not found.")

# Step 3: Dataset loader
class AudioDataset(Dataset):
    """Dataset of (mean-MFCC feature vector, class index) pairs.

    The metadata CSV must provide a ``filename`` column (path of each clip
    relative to ``audio_dir``) and a ``category`` column (class name).
    Class indices are assigned by sorting the distinct category names.

    Args:
        csv_file: Path (or file-like object) of the metadata CSV.
        audio_dir: Directory that contains the audio clips.
        transform: Optional callable applied to the feature tensor before it
            is returned. ``None`` (the default) returns the features as-is.
        sample_rate: Rate the audio is resampled to when loaded.
        n_mfcc: Number of MFCC coefficients, i.e. the feature vector length.
    """

    def __init__(self, csv_file, audio_dir, transform=None, *,
                 sample_rate=16000, n_mfcc=40):
        self.df = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.labels = sorted(self.df["category"].unique())
        self.label2idx = {label: idx for idx, label in enumerate(self.labels)}
        self.transform = transform
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc

    def __len__(self):
        """Return the number of rows in the metadata CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(features, label)`` tensors.

        ``features`` is a float32 tensor holding the MFCCs averaged over the
        frame axis; ``label`` is an int64 scalar tensor with the class index.
        """
        row = self.df.iloc[idx]
        file_path = os.path.join(self.audio_dir, row["filename"])
        y, sr = librosa.load(file_path, sr=self.sample_rate)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)
        # Collapse the frame axis so every clip yields a fixed-length vector.
        mfcc = np.mean(mfcc.T, axis=0)
        features = torch.tensor(mfcc, dtype=torch.float32)
        # Previously the transform was stored but never used.
        if self.transform is not None:
            features = self.transform(features)
        label = self.label2idx[row["category"]]
        return features, torch.tensor(label, dtype=torch.long)

# Step 4: Model
class LSTMModel(nn.Module):
    """Single-layer LSTM with a linear head producing one score per class.

    Each input row is treated as a sequence of length one, so the network
    sees one feature vector per example.
    """

    def __init__(self, input_size=40, hidden_size=128, output_size=50):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map ``x`` of shape [batch, feature] to scores of shape [batch, output_size]."""
        # Insert a length-1 sequence axis: [batch, feature] -> [batch, 1, feature].
        as_sequence = x[:, None]
        _, (final_hidden, _) = self.lstm(as_sequence)
        # final_hidden is [1, batch, hidden] for this single-layer LSTM;
        # take its only layer entry before the linear head.
        return self.fc(final_hidden[0])

# Step 5: Load model
import warnings

# Sorted class names: position i is the name reported for predicted index i.
labels = sorted(pd.read_csv(csv_file)["category"].unique())
model = LSTMModel(output_size=len(labels))
model_path = "lstm_audio.pth"
if os.path.exists(model_path):
    # NOTE(security): torch.load unpickles the file. Only load checkpoints
    # from a trusted source, or pass weights_only=True on PyTorch versions
    # that support it.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
else:
    # Without a checkpoint the network keeps its random initial weights;
    # make that visible instead of silently serving arbitrary predictions.
    warnings.warn(
        f"Checkpoint '{model_path}' not found; the model is using randomly "
        "initialised weights, so predictions will not be meaningful."
    )
# Inference mode for the prediction function below.
model.eval()

# Step 6: Predict function
def classify_sound(file):
    """Classify an audio file and return a human-readable prediction.

    Args:
        file: Path of the audio file to classify. May be ``None`` or empty
            when the UI is submitted without an upload.

    Returns:
        ``"Predicted class: <name>"`` on success, or a short explanatory
        message when no usable audio was supplied.
    """
    # The UI can invoke the callback with no file (nothing uploaded/cleared).
    if not file:
        return "No audio provided. Please upload an audio file."

    y, sr = librosa.load(file, sr=16000)
    # A file that decodes to zero samples has no frames to extract MFCCs from.
    if y.size == 0:
        return "The audio file contains no samples."

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    # Average over frames to get one fixed-length feature vector.
    mfcc = np.mean(mfcc.T, axis=0)
    # Add a batch axis: the model expects [batch, feature].
    input_tensor = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        output = model(input_tensor)
    predicted = torch.argmax(output, dim=-1).item()
    return f"Predicted class: {labels[predicted]}"

# Step 7: Gradio UI
# Build the interface from named pieces, then start serving it.
audio_input = gr.Audio(type="filepath", label="Upload Audio")
demo = gr.Interface(
    fn=classify_sound,
    inputs=audio_input,
    outputs="text",
    title="Environmental Sound Classification (ESC-50 + LSTM)",
)
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bacd1beffe23286343.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


