In [3]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import soundfile
import torch.nn.functional as F
from tqdm import tqdm

In [218]:
# 데이터셋 경로 (임의 설정)
input_data_dir = "./raw_data/train"
output_data_dir = "./split_data/train"

In [184]:
def preprocess_all_wav_files(directory, output_directory, duration=1.0):
    for file_name in os.listdir(directory):
        if file_name.endswith('.wav'):
            file_path = os.path.join(directory, file_name)
            preprocess_and_split_wav(file_path, output_directory, duration)

In [185]:
# 데이터 전처리 및 분할 (학습을 위해 wav 파일을 1초 단위로 자름)
def preprocess_and_split_wav(file_path, output_dir, duration=1.0):
    y, sr = librosa.load(file_path, sr=None)
    total_samples = len(y)
    num_segments = int(np.ceil(total_samples / sr / duration))

    for i in range(num_segments):
        start = int(i * sr * duration)
        end = int(min((i + 1) * sr * duration, total_samples))
        
        segment = y[start:end]
        if len(segment) < sr * duration:
            padding = np.zeros(int(sr * duration - len(segment)))  
            segment = np.concatenate((segment, padding))
            
        output_file = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(file_path))[0]}_{i}.wav")
        soundfile.write(output_file, segment, sr)

In [191]:
# 모든 wav 파일에 대해 전처리 수행
preprocess_all_wav_files(input_data_dir, output_data_dir, duration=1.0)

In [177]:
class SoundDataset(Dataset):
    def __init__(self, data_dir, max_length=128):  
        self.samples = []
        self.target_classes = {'car_horn': 0, 'dog': 1, 'siren' : 2, 'screaming' : 3, 'talk' : 4}  # 클래스별 정수 레이블 매핑
        self.labels = []
        self.max_length = max_length

        for file_name in os.listdir(data_dir):
            if file_name.endswith('.wav'):
                self.samples.append(os.path.join(data_dir, file_name))
                if "경적" in file_name:  
                    self.labels.append(self.target_classes['car_horn'])  
                elif "동물" in file_name: 
                    self.labels.append(self.target_classes['dog'])
                elif "자동차" in file_name: 
                    self.labels.append(self.target_classes['siren'])
                elif "비명" in file_name: 
                    self.labels.append(self.target_classes['screaming'])
                elif "대화" in file_name: 
                    self.labels.append(self.target_classes['talk'])
                    
        self.labels = torch.tensor(self.labels, dtype=torch.long).to(device) 

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        file_path = self.samples[idx]
        y, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        
        if mfccs.shape[1] > self.max_length:
            mfccs = mfccs[:, :self.max_length]
        else:
            mfccs = np.pad(mfccs, ((0, 0), (0, self.max_length - mfccs.shape[1])), mode='constant')

        return torch.Tensor(mfccs).to('cuda'), self.labels[idx]

In [98]:
class SoundClassifier(nn.Module):
    def __init__(self, num_classes):
        super(SoundClassifier, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(128 * 16 * 16, 512) 
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, num_classes)   
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 128 * 16 * 16)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        #return F.softmax(x, dim=1)  # softmax를 사용한 결과값 출력
        return x

In [126]:
def train_model(model, dataloader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(dataloader):
            inputs = inputs.unsqueeze(1).to(device)  
            inputs = F.interpolate(inputs, size=(128, 128))  
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(dataloader.dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
        torch.save(model.state_dict(), "./model.pt")

In [219]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = SoundClassifier(num_classes=5).to(device)

#가중치 적용을 위한 각 라벨별 데이터 개수
nsumsample = [19050, 19069, 29527, 408, 14727]
normedWeights = [1 - (x / sum(nsumsample)) for x in nsumsample]
normedWeights = torch.FloatTensor(normedWeights).to(device)

criterion = nn.CrossEntropyLoss(normedWeights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [220]:
dataset = SoundDataset(output_data_dir)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)
train_model(model, dataloader, criterion, optimizer, num_epochs=10)
print("학습 완료")

100%|███████████████████████████████████████████| 81/81 [05:38<00:00,  4.18s/it]


Epoch [1/10], Loss: 0.9312


100%|███████████████████████████████████████████| 81/81 [05:14<00:00,  3.88s/it]


Epoch [2/10], Loss: 0.0919


100%|███████████████████████████████████████████| 81/81 [05:04<00:00,  3.76s/it]


Epoch [3/10], Loss: 0.0562


100%|███████████████████████████████████████████| 81/81 [05:01<00:00,  3.72s/it]


Epoch [4/10], Loss: 0.0380


100%|███████████████████████████████████████████| 81/81 [05:03<00:00,  3.75s/it]


Epoch [5/10], Loss: 0.0341


100%|███████████████████████████████████████████| 81/81 [05:06<00:00,  3.79s/it]


Epoch [6/10], Loss: 0.0265


100%|███████████████████████████████████████████| 81/81 [05:05<00:00,  3.77s/it]


Epoch [7/10], Loss: 0.0206


100%|███████████████████████████████████████████| 81/81 [05:14<00:00,  3.88s/it]


Epoch [8/10], Loss: 0.0216


100%|███████████████████████████████████████████| 81/81 [05:04<00:00,  3.76s/it]


Epoch [9/10], Loss: 0.0162


100%|███████████████████████████████████████████| 81/81 [05:02<00:00,  3.73s/it]


Epoch [10/10], Loss: 0.0151
학습 완료


In [226]:
#모델 저장
torch.save(model.state_dict(), "./model.pt")

In [221]:
def validate_model(model, dataloader, criterion):
    model.eval()  
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader):
            inputs = inputs.unsqueeze(1).to(device)
            inputs = F.interpolate(inputs, size=(128, 128))
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item() * inputs.size(0)
            
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    average_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples * 100

    print(f"Validation Loss: {average_loss:.4f}, Accuracy: {accuracy:.2f}%")

In [222]:
validation_data_dir = "./split_data/validation"  
validation_dataset = SoundDataset(validation_data_dir) 
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)  

In [224]:
#모델 검증
validate_model(model, validation_dataloader, criterion)

100%|█████████████████████████████████████████| 639/639 [01:20<00:00,  7.98it/s]

Validation Loss: 0.5231, Accuracy: 92.85%





In [261]:
#임의의 데이터 추론 정확도 실험 코드
resdataset = SoundDataset("./resdataset")
resdataloader = DataLoader(resdataset, batch_size=1024, shuffle=True)

with torch.no_grad():
    for inputs, labels in resdataloader:
        inputs = inputs.unsqueeze(1).to(device)
        inputs = F.interpolate(inputs, size=(128, 128))
        labels = labels.to(device)
        outputs_probs = resmodel(inputs)
        outputs_probs = F.softmax(outputs_probs, dim=1)
        predicted_class_idx = torch.argmax(outputs_probs, dim=1).item()
        print(predicted_class_idx) #가장 높은 확률의 인덱스
        print(max(outputs_probs[0])) #추론한 결과의 확률

2
tensor(0.5888, device='cuda:0')
