In [6]:
import torch
import torchaudio
import pandas as pd


class UrbanSound8KDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, file_path, processor, sample_rate, seconds=None):
        self.annotations = pd.read_csv(csv_file)
        self.file_path = file_path
        self.processor = processor
        self.sample_rate = sample_rate
        self.resampler = torchaudio.transforms.Resample(orig_freq=44100, new_freq=self.sample_rate)
        self.seconds = seconds * self.sample_rate if seconds else self.sample_rate * 5

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_file_path = os.path.join(self.file_path, 'fold' + str(self.annotations.iloc[index]['fold']), self.annotations.iloc[index]['slice_file_name'])
        label = self.annotations.iloc[index, 6]
        waveform, sample_rate = torchaudio.load(audio_file_path)
        
        # Convert to mono by averaging channels
        waveform = waveform.mean(dim=0)
        
        # pad/truncate waveform to target_length
        if self.seconds:
            if waveform.shape[0] < self.seconds:
                padding = torch.zeros(self.seconds - waveform.shape[0])
                waveform = torch.cat((waveform, padding))
            elif waveform.shape[0] > self.seconds:
                waveform = waveform[:self.seconds]

        # Resample from 44.1kHz to 16kHz
        waveform = self.resampler(waveform)
        
        # Now truncating to 4 seconds of audio (64000 samples at 16000Hz)
        inputs = self.processor(waveform, sampling_rate=self.sample_rate, max_length=self.seconds, return_tensors="pt", padding=True, truncation=True)
        
        return inputs.input_values[0], torch.tensor(label)


# Load the dataset
try:
    file_path = '/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/audio/'
except:
    file_path = 'C:/Users/PC/AppData/@FOLDER/@Project/UrbanSound8K/audio/'
try:
    csv_file = pd.read_csv('/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/metadata/UrbanSound8K.csv')
except:
    csv_file = pd.read_csv('C:/Users/PC/AppData/@FOLDER/@Project/UrbanSound8K/metadata/UrbanSound8K.csv')


# Create datasets
from sklearn.model_selection import train_test_split

model = VGGish.from_pretrained("facebook/vggish")
processor = VGGishProcessor()

#this model is trained with 16,000 Hz sample rate
train_dataset = UrbanSound8KDataset(csv_file=csv_file, file_path=file_path, processor=processor,sample_rate=16000)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH, shuffle=True)

val_dataset = UrbanSound8KDataset(csv_file=csv_file, file_path=file_path, processor=processor,sample_rate=16000)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=False)




# Now, `train_data` is your training set (70% of total),
# `val_set` is your validation set (15% of total), and
# `test_data` is your testing set (15% of total).
from tqdm import tqdm
device = 'cpu'
# Hyperparameters

os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'


EPOCHS = 100
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = torch.nn.CrossEntropyLoss()

model = model.to(device)
model = model.train()

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    losses = []
    correct_predictions = 0

    for data in tqdm(train_loader):
        input_values = data[0].to(device)
        labels = data[1].to(device)

        outputs = model(input_values, labels=labels)
        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    train_acc = correct_predictions.double() / len(train_loader.dataset)
    train_loss = np.mean(losses)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Evaluate on the validation set
    with torch.no_grad():
        val_loss = 0
        val_acc = 0
        for data in tqdm(val_loader):
            input_values = data[0].to(device)
            labels = data[1].to(device)

            outputs = model(input_values, labels=labels)
            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            val_loss += loss.item()
            val_acc += torch.sum(preds == labels)

        val_loss /= len(val_loader.dataset)
        val_acc = val_acc.double() / len(val_loader.dataset)
        print(f'Val loss {val_loss} accuracy {val_acc}')

    # Save the model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'model.pt')

print('Finished training')


NameError: name 'VGGish' is not defined

In [9]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torchvggish
import torch.nn as nn
from torch.optim import Adam


class UrbanSound8KDataset2(Dataset):
    def __init__(self, csv_file, file_path, sample_rate, seconds=None):
        self.annotations = csv_file
        self.file_path = file_path
        self.resampler = torchaudio.transforms.Resample(orig_freq=44100, new_freq=sample_rate)
        self.seconds = seconds * sample_rate if seconds else sample_rate * 5

    def __len__(self):
        return len(self.annotations)
    def __getitem__(self, index):
        audio_file_path = os.path.join(self.file_path, 'fold' + str(self.annotations.iloc[index]['fold']), self.annotations.iloc[index]['slice_file_name'])
        label = self.annotations.iloc[index, 6]
        
        waveform, sample_rate = torchaudio.load(audio_file_path)
        
        # Convert to mono by averaging channels
        waveform = waveform.mean(dim=0)
        
        # pad/truncate waveform to target_length
        if self.seconds:
            if waveform.shape[0] < self.seconds:
                padding = torch.zeros(self.seconds - waveform.shape[0])
                waveform = torch.cat((waveform, padding))
            elif waveform.shape[0] > self.seconds:
                waveform = waveform[:self.seconds]

        # Resample from 44.1kHz to 16kHz
        waveform = self.resampler(waveform)
        
        # Add channel dimension
        waveform = waveform.unsqueeze(0)
        
        return waveform, torch.tensor(label)




# Load the dataset
try:
    file_path = '/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/audio/'
except:
    file_path = 'C:/Users/PC/AppData/@FOLDER/@Project/UrbanSound8K/audio/'
try:
    csv_file = pd.read_csv('/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/metadata/UrbanSound8K.csv')
except:
    csv_file = pd.read_csv('C:/Users/PC/AppData/@FOLDER/@Project/UrbanSound8K/metadata/UrbanSound8K.csv')

BATCH = 16

train_dataset = UrbanSound8KDataset2(csv_file=csv_file, file_path=file_path, sample_rate=16000)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH, shuffle=True)


vggish_model = torchvggish.vggish().to('cpu')

vggish_model.classifier = nn.Sequential(
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10)  # num_labels는 레이블의 개수에 따라 수정해야 합니다.
)

vggish_model = vggish_model.train()

optimizer = Adam(vggish_model.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss()

EPOCHS = 10

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    losses = []
    correct_predictions = 0
    
    for data in tqdm(train_loader):
        inputs, labels = data
        inputs, labels = inputs.to('cpu'), labels.to('cpu')
        print(f'inputs.shape before unsqueeze: {inputs.shape}')
        inputs = inputs.unsqueeze(2)  # add height dimension
        print(f'inputs.shape after unsqueeze: {inputs.shape}')
        
        outputs = vggish_model(inputs)

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


    train_acc = correct_predictions.double() / len(train_loader.dataset)
    train_loss = np.mean(losses)
    print(f'Train loss {train_loss} accuracy {train_acc}')



Epoch 1/10
----------


  0%|          | 0/546 [00:00<?, ?it/s]

inputs.shape before unsqueeze: torch.Size([16, 1, 29025])
inputs.shape after unsqueeze: torch.Size([16, 1, 1, 29025])


  0%|          | 0/546 [00:00<?, ?it/s]


RuntimeError: Given input size: (64x1x29025). Calculated output size: (64x0x14512). Output size is too small