<a href="https://colab.research.google.com/github/KeerthanaPravallika/Digits-automated-speech-recognition/blob/main/Digits_automated_speech_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset link : https://github.com/Jakobovski/free-spoken-digit-dataset

In [None]:
#include the dataset 

#copying the zip file link and use wget command

!wget https://github.com/Jakobovski/free-spoken-digit-dataset/archive/refs/heads/master.zip 

#unzip the dataset folder
! unzip master.zip

# Preprocessing the data

**Spectrogram:** A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies over time.

**Converting the audio clips to spectrograms**

In [None]:
#importing necessary modules for converting into spectrograms

import numpy as np
from matplotlib import pyplot as plt

import os
from os import listdir
from os.path import isfile, join

import scipy.io.wavfile as wav

from keras.preprocessing import image
from keras.utils.np_utils import to_categorical

In [None]:
#Parameters:
  # Path of audio file
  # Destination folder path
  # Spectrogram dimensions
  # Number of overlap
  # colour scheme

def convert_wav_to_spectrogram(audio_path, dest_path, spectrogram_dim=(64, 64), noverlap=16, cmap='gray_r'):
  """Render one .wav file as a borderless spectrogram image and save it.

  Args:
    audio_path: Path of the audio file to read.
    dest_path: Path the spectrogram image is written to.
    spectrogram_dim: (width, height) of the figure in pixels at the figure's DPI.
    noverlap: Overlap value forwarded to ``Axes.specgram``.
    cmap: Colour map name forwarded to ``Axes.specgram``.

  Returns:
    None. The image is written to ``dest_path``.
  """
  # The file's sample rate is not used: the spectrogram is drawn with a fixed Fs below.
  _, samples = wav.read(audio_path)

  fig = plt.figure()
  try:
    # Convert the requested pixel size to inches using the figure's DPI
    dpi = fig.get_dpi()
    fig.set_size_inches((spectrogram_dim[0] / dpi, spectrogram_dim[1] / dpi))

    # A single axes object that fills the whole figure, with the frame switched off
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)

    # spectrogram creation
    ax.specgram(samples, cmap=cmap, Fs=2, noverlap=noverlap)

    # Remove tick locators so no tick marks take up space in the saved image
    ax.xaxis.set_major_locator(plt.NullLocator())
    ax.yaxis.set_major_locator(plt.NullLocator())

    # Saving the figure in the destination path
    fig.savefig(dest_path, bbox_inches="tight", pad_inches=0)
  finally:
    # pyplot keeps every figure alive until it is closed explicitly; without this,
    # converting a whole folder accumulates one open figure per audio file.
    plt.close(fig)


In [None]:
# Iterate over the recordings folder and convert each .wav file into a spectrogram image


def dir_to_spectrogram(audio_dir, spectrogram_dir, spectrogram_dimensions=(64, 64), noverlap=16, cmap='gray_r'):
  """Convert every .wav file directly inside ``audio_dir`` into a .png spectrogram.

  Args:
    audio_dir: Folder that holds the .wav recordings (trailing slash optional).
    spectrogram_dir: Folder the .png files are written to; created if missing.
    spectrogram_dimensions: Forwarded to ``convert_wav_to_spectrogram`` as ``spectrogram_dim``.
    noverlap: Forwarded to ``convert_wav_to_spectrogram``.
    cmap: Forwarded to ``convert_wav_to_spectrogram``.

  Returns:
    None. One image per recording is written to ``spectrogram_dir``.
  """
  wav_suffix = '.wav'

  # Create the destination up front so saving the first image cannot fail on a
  # missing folder. An empty string means "current directory" and needs no mkdir.
  if spectrogram_dir:
    os.makedirs(spectrogram_dir, exist_ok=True)

  for file_name in listdir(audio_dir):
    audio_path = join(audio_dir, file_name)
    # Only regular files whose name ENDS in .wav (not e.g. "notes.wav.txt")
    if not (isfile(audio_path) and file_name.endswith(wav_suffix)):
      continue
    # Swap only the trailing extension; the rest of the name is kept untouched
    spectrogram_path = join(spectrogram_dir, file_name[:-len(wav_suffix)] + '.png')
    convert_wav_to_spectrogram(audio_path, spectrogram_path, spectrogram_dim=spectrogram_dimensions, noverlap=noverlap, cmap=cmap)

In [None]:
# Locate the recordings. The GitHub archive unpacks into
# "free-spoken-digit-dataset-master/", so fall back to that location when no
# "recordings/" folder exists in the working directory.
audio_files_folder = "recordings/"
if not os.path.isdir(audio_files_folder):
  audio_files_folder = "free-spoken-digit-dataset-master/recordings/"

# The output folder does not exist on a fresh runtime; create it before converting.
spectrogram_folder = "Spectrograms/"
os.makedirs(spectrogram_folder, exist_ok=True)

dir_to_spectrogram(audio_files_folder, spectrogram_folder)

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing import image
from keras.utils.np_utils import to_categorical

In [None]:
# Each spectrogram is named "<digit>_<speaker>_<sample number>.png",
# e.g. "0_george_4.png": digit 0, speaker george, sample number 4.

imagesDir = "Spectrograms/"
trainset = []
testset = []

# os.listdir returns entries in arbitrary order. Sort them and apply a seeded
# shuffle so the split is reproducible, yet the samples are not left grouped by
# digit (the later fit call uses validation_split, which takes the tail of the data).
image_files = sorted(os.listdir(imagesDir))
np.random.default_rng(42).shuffle(image_files)

for file in image_files:
  name_parts = os.path.splitext(file)[0].split('_')
  # Skip anything that is not a spectrogram, e.g. a notebook checkpoint folder,
  # instead of failing on a name that does not have three "_"-separated parts.
  if not file.endswith('.png') or len(name_parts) != 3 or not name_parts[0].isdigit():
    continue

  label = int(name_parts[0])
  sample_number = name_parts[2]
  img = image.load_img(os.path.join(imagesDir, file))

  # Samples 0-4 of every digit/speaker pair form the test set; the rest is training data
  if sample_number in ('0', '1', '2', '3', '4'):
    testset.append([image.img_to_array(img), label])
  else:
    trainset.append([image.img_to_array(img), label])

In [None]:
# Separate every [image, label] pair into a feature list and a target list

# Training split: image arrays first, then their labels
X_train = [image_array for image_array, _ in trainset]
y_train = [digit for _, digit in trainset]

# Test split: same layout as the training split
X_test = [image_array for image_array, _ in testset]
y_test = [digit for _, digit in testset]

In [None]:
# Turn the four Python lists into numpy arrays, keeping the same variable names
X_train, y_train, X_test, y_test = (
  np.asanyarray(split) for split in (X_train, y_train, X_test, y_test)
)

In [None]:
# One-hot encode the digit labels. num_classes is fixed to 10 so both splits get
# ten columns even if one of them happens to miss a digit. The ndim guard keeps the
# cell safe to re-run: labels that are already one-hot are left untouched.
if y_train.ndim == 1:
  y_train = to_categorical(y_train, num_classes=10)
if y_test.ndim == 1:
  y_test = to_categorical(y_test, num_classes=10)

# Normalize the images by dividing by 255. Only divide while the values are still
# above 1, so running the cell a second time does not shrink them again.
if X_train.size and X_train.max() > 1:
  X_train /= 255
if X_test.size and X_test.max() > 1:
  X_test /= 255

# Model Designing

In [32]:
from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization # for creating CNN
from keras import models
import tensorflow as tf


In [39]:
# Input shape for the first Conv2D layer: axes 1-3 of X_train, i.e. everything except
# the sample axis. NOTE(review): the prediction cell reshapes to (1, 64, 64, 3), so this
# is presumably (64, 64, 3), not (64, 64, 1) - confirm.
data_shape = tuple(X_train.shape[axis] for axis in (1, 2, 3))

def basic_cnn():
  """Build and compile the CNN used to classify spoken-digit spectrograms.

  Layout: three 2x2 convolutions (32, 48 and 120 filters, ReLU), each followed by
  batch normalization; one 2x2 max-pooling layer; dropout and flatten; two dense
  blocks (128 and 64 units, ReLU, batch normalization, dropout); and a 10-unit
  softmax output, one unit per digit 0-9.

  Returns:
    The Sequential model, compiled with categorical cross-entropy loss, the
    Adadelta optimizer and the accuracy metric.
  """
  # Convolutional feature extractor; the first layer fixes the input shape
  feature_layers = [
    Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=data_shape),
    BatchNormalization(),
    Conv2D(48, kernel_size=(2, 2), activation='relu'),
    BatchNormalization(),
    Conv2D(120, kernel_size=(2, 2), activation='relu'),
    BatchNormalization(),
    # Down-sample the feature maps, then drop a quarter of the activations to limit overfitting
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
  ]

  # Fully connected classifier head
  classifier_layers = [
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    # Ten output units: one probability per digit
    Dense(10, activation='softmax'),
  ]

  model = Sequential(feature_layers + classifier_layers)
  model.compile(loss = 'categorical_crossentropy', optimizer='Adadelta', metrics=['accuracy'])
  return model

In [40]:
# Build a freshly compiled instance of the CNN and print its layer-by-layer summary
first_model = basic_cnn()
first_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_18 (Conv2D)          (None, 63, 63, 32)        416       
                                                                 
 batch_normalization_30 (Bat  (None, 63, 63, 32)       128       
 chNormalization)                                                
                                                                 
 conv2d_19 (Conv2D)          (None, 62, 62, 48)        6192      
                                                                 
 batch_normalization_31 (Bat  (None, 62, 62, 48)       192       
 chNormalization)                                                
                                                                 
 conv2d_20 (Conv2D)          (None, 61, 61, 120)       23160     
                                                                 
 batch_normalization_32 (Bat  (None, 61, 61, 120)     

In [None]:
# validation_split holds out the LAST 20% of the arrays, taken before Keras shuffles.
# Permute the training samples first (fixed seed for reproducibility) so the
# validation set does not depend on the order in which the files were listed.
train_order = np.random.default_rng(42).permutation(len(X_train))
first_model.fit(X_train[train_order], y_train[train_order], batch_size = 50, validation_split=0.2, epochs = 20, verbose = 1)

In [None]:
# Check accuracy on the held-out test split.
# evaluate reports the loss followed by the metrics set in compile (accuracy here).

first_model.evaluate(X_test, y_test)

In [None]:
# Save the trained model to a single .h5 file in the working directory
first_model.save("spoken_digit_recognition_.h5")

In [None]:
# Compare the true digit with the model's prediction for one test sample
index = 33
print('ground Truth',np.argmax(y_test[index]))
# expand_dims adds the batch axis that predict expects, so no image size or
# channel count is hard-coded here
print('Prediction' ,np.argmax(first_model.predict(np.expand_dims(X_test[index], axis=0))))