### spoken-digit-dataset
#### https://www.kaggle.com/divyanshu99/spoken-digit-dataset

In [1]:
import torch
import os
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
from scipy.io import wavfile


# Training hyper-parameters.
EPOCHS = 10
BATCH_SIZE = 50

# Directory holding the spoken-digit .wav recordings. It must end with "/"
# because file names are appended to it by plain string concatenation.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
PATH = "C:/Projects/keras_talk/keras/intern/0713/recordings/"

# os.listdir returns entries in arbitrary order; sorting keeps the
# position -> file mapping (which the train/test split relies on) stable
# between runs. Non-.wav entries are dropped because every listed file is
# later read as audio and labelled from its first character.
file_list = sorted(name for name in os.listdir(PATH) if name.lower().endswith(".wav"))


##### Zero-pad recordings shorter than N samples (longer recordings are truncated to N)

In [2]:

def padding(x_data, N):
    """Return the first channel of ``x_data`` as a 1-D array of exactly ``N`` samples.

    Parameters
    ----------
    x_data : np.ndarray
        Audio samples, either 1-D ``(samples,)`` or 2-D ``(samples, channels)``.
    N : int
        Target number of samples.

    Returns
    -------
    np.ndarray
        1-D array of length ``N`` with the same dtype as ``x_data``: the first
        ``N`` samples when the input is long enough, otherwise the input
        followed by zeros.
    """
    # Reduce multi-channel input to its first channel up front so that the
    # truncating path and the padding path both return the same 1-D shape.
    mono = x_data[:, 0] if x_data.ndim == 2 else x_data
    if len(mono) >= N:
        return mono[:N]
    # np.pad keeps the input dtype (appending a Python-int array would upcast).
    return np.pad(mono, (0, N - len(mono)))
    


##### Hold out 100 randomly chosen recordings as the test set

In [3]:


# Indices of the recordings held out for testing: 100 distinct positions
# drawn from range(1500). random.sample draws without replacement, so no
# manual "redraw until unseen" loop is needed.
test_idx = np.asarray(random.sample(range(1500), 100))


##### wav -> numpy array

In [4]:

        
x_train = []
y_train = []
x_test  = []
y_test  = []

# Set lookup is O(1); `i in <ndarray>` would rescan the whole array per file.
held_out = set(np.asarray(test_idx).tolist())

for i, file in enumerate(file_list):
    # wavfile.read returns (rate, data); only the sample array is used.
    digit_sound = padding(wavfile.read(PATH + file)[1], 5000)
    # The first character of the file name is used as the class label.
    digit = int(file[0])
    # padding() may hand back (samples, channels) data; keep the first channel.
    if len(digit_sound.shape) == 2:
        digit_sound = np.transpose(digit_sound)[0]

    if i in held_out:
        x_test.append(digit_sound)
        y_test.append(digit)
    else:
        x_train.append(digit_sound)
        y_train.append(digit)

# Shape (num_recordings, 1, 5000): one channel of 5000 samples per recording.
# -1 lets NumPy infer the recording count instead of hard-coding 1400 / 100,
# so the cell keeps working when the number of files differs.
# NOTE(review): samples are cast to float32 without any rescaling; the logged
# training losses (~46 on the first batch) suggest the inputs may need
# normalising -- confirm.
x_train = np.asarray(x_train, dtype=np.float32).reshape(-1, 1, 5000)
y_train = np.asarray(y_train)
x_test  = np.asarray(x_test, dtype=np.float32).reshape(-1, 1, 5000)
y_test  = np.asarray(y_test)



  


##### Dataset

In [5]:

# torch.as_tensor accepts ndarrays and tensors alike, so this cell can be
# re-run after the names have already been rebound to tensors
# (torch.from_numpy only accepts ndarrays and would raise on the second run).
x_test  = torch.as_tensor(x_test,  dtype=torch.float32)
x_train = torch.as_tensor(x_train, dtype=torch.float32)
y_test  = torch.as_tensor(y_test,  dtype=torch.long)
y_train = torch.as_tensor(y_train, dtype=torch.long)


dataset_train2 = TensorDataset(x_train, y_train)
dataset_test2  = TensorDataset(x_test , y_test )

# NOTE: despite their names, train_dataset / test_dataset are DataLoaders.
train_dataset = DataLoader(dataset = dataset_train2, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)
# Evaluation should see every held-out sample exactly once: no shuffling and
# no dropping of a trailing partial batch.
test_dataset  = DataLoader(dataset = dataset_test2 , batch_size = BATCH_SIZE, shuffle = False, drop_last = False)


##### Model

In [6]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim

class CNN_spoken_digit_dataset( nn.Module ):
    """1-D convolutional classifier producing log-probabilities over 10 classes.

    Expects input of shape ``(batch, 1, 5000)``. With that length the
    convolution/pooling stack yields 32 channels x 200 steps = 6400 features
    per sample, which is the input size of the first linear layer.
    """

    def __init__(self):
        super(CNN_spoken_digit_dataset, self).__init__()

        self.conv1 = nn.Conv1d( in_channels = 1,   out_channels = 128, kernel_size = 2 )
        self.conv2 = nn.Conv1d( in_channels = 128, out_channels = 64,  kernel_size = 2 )
        self.conv3 = nn.Conv1d( in_channels = 64,  out_channels = 32,  kernel_size = 2 )

        # stride (5) is larger than the window (2), so each pooling layer
        # shortens the sequence roughly five-fold.
        self.maxp1 = nn.MaxPool1d( kernel_size = 2, stride = 5 )
        self.maxp2 = nn.MaxPool1d( kernel_size = 2, stride = 5 )

        self.lay1  = nn.Linear( 6400, 128 )
        self.lay2  = nn.Linear( 128 , 64 )
        self.lay3  = nn.Linear( 64 , 10 )

        self.relu  = nn.ReLU()
        self.drop  = nn.Dropout()


    def forward(self, x):
        """Map ``x`` of shape ``(batch, 1, 5000)`` to ``(batch, 10)`` log-probabilities.

        Raises
        ------
        RuntimeError
            If the input length does not produce 6400 features per sample.
        """
        # Shape comments below assume an input length of 5000.
        output = self.conv1(x)               # (batch, 128, 4999)
        output = self.relu(output)
        output = self.maxp1(output)          # (batch, 128, 1000)
        output = self.drop(output)

        output = self.conv2(output)          # (batch, 64, 999)
        output = self.relu(output)

        output = self.conv3(output)          # (batch, 32, 998)
        output = self.relu(output)
        output = self.maxp2(output)          # (batch, 32, 200)

        # Flatten per sample. view(-1, 6400) would silently merge samples
        # across the batch dimension whenever the per-sample feature count is
        # not 6400; keeping the batch axis makes lay1 raise instead.
        output = output.flatten(start_dim=1) # (batch, 6400)
        output = self.drop(output)

        output = self.lay1(output)
        output = self.relu(output)
        output = self.drop(output)

        output = self.lay2(output)
        output = self.relu(output)
        output = self.drop(output)

        output = self.lay3(output)
        # Log-probabilities: pair with an NLL loss, not CrossEntropyLoss.
        output = F.log_softmax(output, dim=1)

        return output
        
        

##### Model training

In [7]:
from torch.autograd import Variable#
#torch.manual_seed(1)


model = CNN_spoken_digit_dataset()
# The model's forward pass already ends in log_softmax, so the matching
# criterion is NLLLoss; it is now actually used in the loop below.
criterion = torch.nn.NLLLoss()
# NOTE(review): the recorded run ends at ~11% test accuracy, i.e. chance level
# for 10 classes -- the learning rate and input scaling are worth revisiting.
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

train_loss = []   # loss of every training batch
train_acc  = []   # accuracy (percent) of every training batch


model.train()     # enable dropout
for epoch in range(EPOCHS):

    print('epoch'+str(epoch+1))
    for data, label in train_dataset:
        optimizer.zero_grad()
        output = model(data)

        loss = criterion(output, label)
        loss.backward()
        train_loss.append(loss.item())
        optimizer.step()

        # Divide by the real batch size rather than BATCH_SIZE so the figure
        # stays correct even if a batch is smaller than configured.
        total = label.size(0)
        preds = output.argmax(dim=1)
        correct = (preds == label).sum().item()
        accuracy = correct / total * 100
        train_acc.append( accuracy )

        print('\tLoss:{:.3f}\tAcc:{:.3f}'.format(loss.item(),accuracy))
        

epoch1
	Loss:46.091	Acc:12.000
	Loss:49.955	Acc:14.000
	Loss:26.718	Acc:8.000
	Loss:32.560	Acc:14.000
	Loss:40.198	Acc:4.000
	Loss:38.082	Acc:10.000
	Loss:18.174	Acc:12.000
	Loss:22.824	Acc:2.000
	Loss:17.322	Acc:6.000
	Loss:27.949	Acc:12.000
	Loss:17.478	Acc:10.000
	Loss:18.019	Acc:10.000
	Loss:11.218	Acc:12.000
	Loss:17.448	Acc:16.000
	Loss:16.741	Acc:14.000
	Loss:14.438	Acc:10.000
	Loss:13.607	Acc:10.000
	Loss:13.542	Acc:12.000
	Loss:11.187	Acc:6.000
	Loss:11.360	Acc:6.000
	Loss:12.291	Acc:8.000
	Loss:12.130	Acc:10.000
	Loss:14.176	Acc:6.000
	Loss:8.058	Acc:14.000
	Loss:7.797	Acc:8.000
	Loss:7.297	Acc:12.000
	Loss:7.506	Acc:14.000
	Loss:10.080	Acc:8.000
epoch2
	Loss:8.294	Acc:0.000
	Loss:6.636	Acc:6.000
	Loss:7.981	Acc:12.000
	Loss:4.558	Acc:8.000
	Loss:5.546	Acc:16.000
	Loss:6.542	Acc:4.000
	Loss:8.641	Acc:4.000
	Loss:5.272	Acc:4.000
	Loss:6.751	Acc:10.000
	Loss:6.953	Acc:10.000
	Loss:5.333	Acc:12.000
	Loss:5.449	Acc:10.000
	Loss:5.424	Acc:4.000
	Loss:3.455	Acc:20.000
	Loss:4.127	A

In [8]:
# Evaluation mode turns dropout off so predictions are deterministic.
model.eval()

correct = 0   # number of correctly classified test samples
total = 0     # number of test samples seen
with torch.no_grad():
    for data, label in test_dataset:
        output = model(data)
        # Predicted class = index of the largest log-probability.
        preds = output.argmax(dim=1)
        total += label.size(0)
        correct += int((preds == label).sum())
print('Test Accuracy: ', 100.*correct/total)

Test Accuracy:  11.0
