# **Urban Sound Classification**

In [41]:
import IPython.display as ipd
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

In [43]:
path = '/content/drive/MyDrive/Colab Notebooks/data/'
#path = 'D:\data/'
path_train = '/content/drive/MyDrive/Colab Notebooks/data/Train/'
df = pd.read_csv(path + 'train.csv')
test_df = pd.read_csv(path + 'test.csv')
df

Unnamed: 0,ID,Class
0,0,siren
1,1,street_music
2,2,drilling
3,3,siren
4,4,dog_bark
...,...,...
5430,8725,engine_idling
5431,8726,dog_bark
5432,8727,engine_idling
5433,8728,engine_idling


In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **데이터 전처리**

**범주형(Categorical) 데이터셋으로 변환**

In [45]:
# Converting classes into numeric format
df['numeric_class'] = df['Class'].astype('category').cat.codes
df

Unnamed: 0,ID,Class,numeric_class
0,0,siren,8
1,1,street_music,9
2,2,drilling,4
3,3,siren,8
4,4,dog_bark,3
...,...,...,...
5430,8725,engine_idling,5
5431,8726,dog_bark,3
5432,8727,engine_idling,5
5433,8728,engine_idling,5


**Train Dataset과 Validation Dataset으로 나누기**

In [47]:
def train_val_split(df):
    train_df = pd.DataFrame(columns = df.columns)
    val_df = pd.DataFrame(columns = df.columns)

    train_df = df[:int(df['ID'].count()*0.8)]
    val_df = df[4348:]
        
    return train_df, val_df

In [48]:
train_df, val_df = train_val_split(df)
train_df.shape, val_df.shape

((4348, 3), (1087, 3))

**진행 시각화(Visualization)**

In [49]:
import cv2
import sys

def drawProgressBar(current, total, string = '', barLen = 20):

    percent = current/total
    arrow = ">"
    if percent == 1:
        arrow = ""
   
    sys.stdout.write("\r")
    sys.stdout.write("Progress: [{:<{}}] {}/{}".format("=" * int(barLen * percent) + arrow, 
                                                         barLen, current, total) + string)
    sys.stdout.flush()

**동일한 크기의 오디오 클립 만들기**

In [50]:
def get_audio_same_len(wav, sr):
    if wav.shape[0] < 4 * sr:
        wav = np.pad(wav, int(np.ceil((4 * sr - wav.shape[0])/2)), mode = 'reflect')
    wav = wav[:4 * sr]
    
    return wav

**스펙트로그램 만들기**

In [68]:
def get_melspectrogram_db(wav, sr):
  
    wav = get_audio_same_len(wav, sr)
        
    spec = librosa.feature.melspectrogram(wav, sr, n_fft = 2048, hop_length = 512, 
                          n_mels = 128, fmin = 20, fmax = 8300)
    
    spec = librosa.power_to_db(spec, top_db = 80)
    return spec

**표준화와 정규화**

In [52]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

def standard_norm(spec):
    mMscaler = MinMaxScaler()
    sdscaler = StandardScaler()

    spec = sdscaler.fit_transform(spec)
    spec = mMscaler.fit_transform(spec)
    spec_scaled = spec*255

    return spec_scaled

In [53]:
BATCH_SIZE = 32

**음성 데이터 로딩(loading)**

In [54]:
def load_data(df):
    audio_data = []
    sample_rates = []
    labels = []
    
    tot = len(df)
    curr = 0
    
    for idx in df.index:
        try:
            file_name = str(df['ID'][idx]) + '.wav'
            wav, sr = librosa.load('/content/drive/MyDrive/Colab Notebooks/data/Train/' + file_name)
            
            wav = get_audio_same_len(wav, sr)
    
            audio_data.append(wav)
            sample_rates.append(sr)
            
            labels.append(df['numeric_class'][idx])
            
            curr += 1
            drawProgressBar(curr, tot, barLen = 40)
        
        except KeyboardInterrupt:
            print('KeyBoardInterrupt')
            break
        
        except Exception:
            print("Couldn't read file", df['ID'][idx])
            curr += 1
            
    print('\n')
    return np.stack(audio_data, axis = 0), np.array(sample_rates), np.array(labels)

In [55]:
train_data, train_sr, train_labels = load_data(train_df)
val_data, val_sr, val_labels = load_data(val_df)





In [56]:
train_data.shape, val_data.shape

((4348, 88200), (1087, 88200))

**데이터 변환(Coversion)과 Tensor Dataset 구축**

In [57]:
# Convert numpy arrays to torch tensors
train_data = torch.from_numpy(train_data)
train_labels = torch.from_numpy(train_labels).long()

val_data = torch.from_numpy(val_data)
val_labels = torch.from_numpy(val_labels).long()

# Create data loaders
train_data = data_utils.TensorDataset(train_data, train_labels)
val_data = data_utils.TensorDataset(val_data, val_labels)

## **Convolutional Neural Network(CNN)** on Spectrogram Images

In [58]:
set(train_sr), set(val_sr)

({22050}, {22050})

In [59]:
train_sr = 22050
val_sr = 22050

**DataLoader 구축하기**

In [90]:
def get_spectrogram_loader(audio_data, sr, batch_size, shuffle = False):

    hop_length = 512 # 샘플의 수
    n_fft = 2048 # 윈도우 # spectral resolution / window length

    audio_spec_img = []
    labels = []
    curr = 0
    tot = len(audio_data)

    for wav, label in audio_data:
        spec_img = standard_norm(get_melspectrogram_db(wav.numpy(), sr))
        spec_img = np.expand_dims(spec_img, axis = 0)
        audio_spec_img.append(spec_img)
        labels.append(label)

        curr += 1
        drawProgressBar(curr, tot, barLen = 40)

    audio_spec_img = torch.Tensor(audio_spec_img)
    audio_spec_img = audio_spec_img / 255
    
    labels = torch.Tensor(labels).long()

    audio_spec_img = data_utils.TensorDataset(audio_spec_img, labels)
    #audio_loader = data_utils.DataLoader(audio_spec_img, batch_size = batch_size, shuffle = shuffle)
    
    #return audio_loader
    return audio_spec_img

In [100]:
train_spec_dataset = get_spectrogram_loader(train_data, train_sr, BATCH_SIZE, shuffle = True)



In [101]:
train_spec_dataset[0][0].size()

torch.Size([1, 128, 173])

In [102]:
train_loader = data_utils.DataLoader(train_spec_dataset, batch_size = BATCH_SIZE, shuffle = False)

In [103]:
val_spec_dataset  = get_spectrogram_loader(val_data, val_sr, BATCH_SIZE)



In [104]:
val_spec_dataset[0][0].size()

torch.Size([1, 128, 173])

In [105]:
val_loader = data_utils.DataLoader(train_spec_dataset, batch_size = BATCH_SIZE, shuffle = False)

**CNN 모델 구축하기**

In [186]:
# 파라미터 설정 방법
input = torch.Tensor(1,1,128,173)
conv1 = nn.Conv2d(1, 8, (5, 6))
pool =  nn.MaxPool2d(2)
out=conv1(input)
out=pool(out)

out.shape

torch.Size([1, 8, 62, 84])

In [187]:
NUM_CLASSES = 10
LEARNING_RATE = 0.001
EPOCHS = 10

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        
        # Layer 1, Input shape (1, 128, 173) ->  Output shape (8, 62, 84)
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels = 1, out_channels = 8, kernel_size = (5, 6)), 
            nn.ReLU(), 
            nn.MaxPool2d(kernel_size = (2, 2)))
        
        # Layer 2, Input shape (8, 62, 84) -> Output shape (16, 30, 41)
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels = 8, out_channels = 16, kernel_size = (3, 3)), 
            nn.ReLU(), 
            nn.MaxPool2d(kernel_size = (2, 2)))
        
        # Layer 3, Input shape (16, 30, 41) -> Output shape (64, 10, 15)
        self.layer3 = nn.Sequential(
            nn.Conv2d(in_channels = 16, out_channels = 32, kernel_size = (6, 7)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = (6, 6)), 
            nn.ReLU(), 
            nn.MaxPool2d(kernel_size = (2, 2)))
        
        # Fully Connected layer 1, Input features 64 * 10 * 15 -> Output features 512
        self.fc1 = nn.Linear(in_features = 64 * 10 * 15, out_features = 512)
        
        # Fully Connected layer 2, Input features 512 -> Output features 256
        self.fc2 = nn.Linear(in_features = 512, out_features = 256)
        
        # Fully Connected layer 3, Input features 256 -> Output features 128
        self.fc3 = nn.Linear(in_features = 256, out_features = 128)
        
        # Fully Connected layer 4, Input features 128 -> Output features 10
        self.fc4 = nn.Linear(in_features = 128, out_features = NUM_CLASSES)
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        
        x = x.view(-1, self.num_flat_features(x))
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:]
        n_features = 1
        for s in size:
            n_features = n_features * s
        
        return n_features

**CNN 클래스 불러오기**

In [188]:
model = ConvNet()

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

**학습(Training)**

In [189]:
THRESHOLD = 0.001 
num_train_batches = len(train_loader)

for epoch in range(EPOCHS):
    print("Epoch " + str(epoch + 1) + ":")
    
    for i, batch in enumerate(train_loader):
        
        data, labels = batch
        
        outputs = model(data)
        loss = loss_fn(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total = labels.size(0)
        _, predicted = torch.max(outputs, dim = 1)
        correct = (predicted == labels).sum().item()
        accuracy = correct / total
        
        drawProgressBar((i + 1), num_train_batches, 
                              '\t loss: {:.4f} \t acc: {:.4f}'.format(round(loss.item(), 4), round(accuracy, 4)))
    
    print('\n\n')

Epoch 1:


Epoch 2:


Epoch 3:


Epoch 4:


Epoch 5:


Epoch 6:


Epoch 7:


Epoch 8:


Epoch 9:


Epoch 10:




**평가**

In [190]:
def evaluate(model, test_loader):

    model.eval()
    num_test_batches = len(test_loader)
    with torch.no_grad():
        correct = 0
        total = 0
        total_loss = 0
        for i, batch in enumerate(test_loader):
            inputs, labels = batch
            outputs = model(inputs)
            _, predicted = torch.max(outputs, dim = 1)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            drawProgressBar((i+1), num_test_batches)
        
        accuracy = correct/total
        test_loss = total_loss/num_test_batches
    
    return accuracy, test_loss

In [191]:
val_acc, val_loss = evaluate(model, val_loader)

print("\n\nValidation accuracy: {:.4f}".format(round(val_acc, 4)))
print("Validation loss: {:.4f}".format(round(val_loss, 4)))


Validation accuracy: 0.9121
Validation loss: 0.2446
