In [2]:
from scipy import signal
from scipy.io import wavfile

import torch
import torch.nn as nn
from torch.hub import load_state_dict_from_url
import torchvision.models as models
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

import numpy as np
import pandas as pd

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torchaudio
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import os
import csv

In [3]:
def wav_to_spectrogram(wav_file, n_fft=400, hop_length=160, n_mels=128):
    waveform, sample_rate = torchaudio.load(wav_file)
    spectrogram_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    spectrogram = spectrogram_transform(waveform)
    return spectrogram

def pad_spectrogram(spectrogram, max_len):
    c, h, w = spectrogram.size()
    if w < max_len:
        pad = max_len - w
        spectrogram = F.pad(spectrogram, (0, pad), mode='constant', value=0)
    return spectrogram


In [4]:
def load_data(data_dir, csv_file):
    df = pd.read_csv(csv_file)
    data = []
    for index, row in df.iterrows():
        wav_id, label = row['wav_id'], row['label']
        wav_path = os.path.join(data_dir, f"{wav_id}.wav")
        waveform, sample_rate = torchaudio.load(wav_path)
        spectrogram = torchaudio.transforms.MelSpectrogram()(waveform)
        data.append((wav_id, spectrogram, label))
    return data

In [5]:
import torchvision.models as models

class AudioResNet(nn.Module):
    def __init__(self, num_classes):
        super(AudioResNet, self).__init__()
        self.resnet = models.resnet18(pretrained=True)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        x = self.resnet(x)
        return x

In [6]:
def train(model, data, criterion, optimizer, num_epochs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, (wav_id, spectrogram, label) in enumerate(data):
            spectrogram = spectrogram.to(device).unsqueeze(0)  # Add batch dimension
            label = torch.tensor([label], dtype=torch.long).to(device)
            
            optimizer.zero_grad()
            outputs = model(spectrogram)
            loss = criterion(outputs, label)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            
            if (i + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {loss.item():.4f}')
        
        epoch_loss = running_loss / len(train_data)
        print(f'Epoch [{epoch+1}/{num_epochs}] Average Loss: {epoch_loss:.4f}')


In [12]:
def evaluate(model, data):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for wav_id ,spectrogram, label in data:
            spectrogram = spectrogram.to(device).unsqueeze(0)
            label = torch.tensor([label], dtype=torch.long).to(device)
            outputs = model(spectrogram)
            _, preds = torch.max(outputs, 1)
            all_labels.append(label.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    return np.concatenate(all_labels), np.concatenate(all_preds)

In [8]:
data_dir = "data/5차년도_2차" 
csv_file = "data/df_calculation.csv"

data = load_data(data_dir, csv_file)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)



In [9]:
labels = [item[2] for item in data]
num_classes = len(set(labels))

model = AudioResNet(num_classes)

print(f"Number of classes: {num_classes}")

# 손실 함수 및 옵티마이저 설정
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-4, eps=1e-8)



Number of classes: 4


In [14]:
# 학습 파라미터 설정
num_epochs = 10

# 모델 학습
train(model, train_data, criterion, optimizer, num_epochs)

Epoch [1/10], Step [10], Loss: 0.5845
Epoch [1/10], Step [20], Loss: 1.9826
Epoch [1/10], Step [30], Loss: 1.3603
Epoch [1/10], Step [40], Loss: 1.0992
Epoch [1/10], Step [50], Loss: 0.8799
Epoch [1/10], Step [60], Loss: 0.6825
Epoch [1/10], Step [70], Loss: 2.5797
Epoch [1/10], Step [80], Loss: 0.7460
Epoch [1/10], Step [90], Loss: 1.6337
Epoch [1/10], Step [100], Loss: 1.2357
Epoch [1/10], Step [110], Loss: 1.5568
Epoch [1/10], Step [120], Loss: 1.0352
Epoch [1/10], Step [130], Loss: 1.1182
Epoch [1/10], Step [140], Loss: 2.1246
Epoch [1/10], Step [150], Loss: 1.1371
Epoch [1/10], Step [160], Loss: 0.4857
Epoch [1/10], Step [170], Loss: 1.0740
Epoch [1/10], Step [180], Loss: 0.8666
Epoch [1/10], Step [190], Loss: 1.8489
Epoch [1/10], Step [200], Loss: 0.9942
Epoch [1/10], Step [210], Loss: 0.9472
Epoch [1/10], Step [220], Loss: 1.2416
Epoch [1/10], Step [230], Loss: 2.3200
Epoch [1/10], Step [240], Loss: 1.5063
Epoch [1/10], Step [250], Loss: 1.7142
Epoch [1/10], Step [260], Loss: 0.

In [None]:
torch.save(model, 'trained_model_new_audio.pt')

In [10]:
model = torch.load('trained_model_new_audio.pt')

In [13]:
train_labels, train_preds = evaluate(model, train_data)
test_labels, test_preds = evaluate(model, test_data)

train_f1 = f1_score(train_labels, train_preds, average='weighted')
train_acc = accuracy_score(train_labels, train_preds)
test_f1 = f1_score(test_labels, test_preds, average='weighted')
test_acc = accuracy_score(test_labels, test_preds)

print(f'Train F1 Score: {train_f1:.4f}, Train Accuracy: {train_acc:.4f}')
print(f'Test F1 Score: {test_f1:.4f}, Test Accuracy: {test_acc:.4f}')

# 예측값을 npy 파일로 저장
np.save('train_preds.npy', train_preds)
np.save('test_preds.npy', test_preds)

Train F1 Score: 0.2415, Train Accuracy: 0.3681
Test F1 Score: 0.2437, Test Accuracy: 0.3679
