# DEEE725 Speech Signal Processing Lab
### 2023 Spring, Kyungpook National University 
### Instructor: Gil-Jin Jang

## Lab 01 Korean digit recognition using python-hmmlearn
version 2, 2023/03/24
source: [jayaram1125's github repository](https://github.com/jayaram1125/Single-Word-Speech-Recognition-using-GMM-HMM-)

__update description:__

1. assigns sound files 8 and 9 for test out of 0...9, the rest (0...7) are for training
    no random selection for reproducibility
2. folder structure change

> segmented/${username}/${dnum}/kdigits${trial}-${dnum}.wav
> > for example, for user "gjang", digit 2, recording trial 0 (1st)
> > "segmented/gjang/2/kdigits0-2.wav"

In [5]:
!pip install librosa
!pip install hmmlearn

Collecting hmmlearn
  Downloading hmmlearn-0.2.8-cp39-cp39-win_amd64.whl (110 kB)
     -------------------------------------- 110.1/110.1 kB 6.2 MB/s eta 0:00:00
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.2.8


In [6]:
# import necessary packages
import numpy as np
import matplotlib.pyplot as plt
#from scikits.talkbox.features import mfcc
#librosa.feature.mfcc(*, y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', lifter=0, **kwargs)[source]
from librosa.feature import mfcc
from scipy.io import wavfile
from hmmlearn import hmm
import numpy as np
import os
import warnings
import scipy.stats as sp
from time import time

warnings.filterwarnings("ignore")

__hyperparameters__ - CHANGE THEM TO IMPROVE PERFORMANCE
1. number of MFCC (feature dimension), try `num_mfcc` 6, 10, 13

2. Parameters needed to train GMMHMM: number of HMM states, number of Gaussian mixtures, diagonal or full covariance matrix, etc.

In [7]:
# 1. number of MFCC (feature dimension)
num_mfcc = 6
#num_mfcc = 10
#num_mfcc = 13
# 2. Parameters needed to train GMMHMM
m_num_of_HMMStates = 3  # number of states
m_num_of_mixtures = 2  # number of mixtures for each hidden state
m_covarianceType = 'diag'  # covariance type
m_n_iter = 10  # number of iterations
m_bakisLevel = 2

In [11]:
# extract MFCC features
def extmfcc(file):
    samplerate, d = wavfile.read(file)
    #features.append(mfcc(d, nwin=int(samplerate * 0.03), fs=samplerate, nceps= 6)[0])
    x = np.float32(d)
    hop=samplerate//100
    mc = mfcc(y=x, sr=samplerate, n_mfcc=num_mfcc, hop_length=hop, win_length=hop*2)
    return np.transpose(mc, (1,0))

__load data files__

1. find files: 
    for user `"gjang"`, digit 2, recording trial 0 (1st)
    `"segmented/gjang/2/kdigits0-2.wav"`
2. extract MFCC features for training and testing
    for each digit, indexes 4 and 9 for test, and the rest for training

In [33]:
#fpaths = []
#labels = []
spoken = []
m_trainingsetfeatures = []
m_trainingsetlabels = []
m_testingsetfeatures = []
m_testingsetlabels = []
n_folds = 5   # 0...3 for training, 4 for testing

apath = 'segmented'
count = 0
for username in os.listdir(apath):
    apath2 = apath + '/' + username    # example: segmented/gjang
    for ii in range(10):   #dnum in os.listdir(apath2):
        dnum = str(ii)
        apath3 = apath2 + '/' + dnum     # example: segmented/gjang/2
        if dnum not in spoken:
            spoken.append(dnum)
        for trial in range(10):
            file = apath3 + '/' + "kdigits{}-{}.wav".format(trial,dnum)      # segmented/gjang/2/kdigits0-2.wav
            mc = extmfcc(file)

            # display file names for the first 20 files only
            count += 1
            if count <= 20:
                print(file, dnum, end=' '); print(mc.shape, end=' ')
            elif count == 21:
                print('...'); print('')

            # 0...3 for training, 4 for testing
            if trial % n_folds == (n_folds-1):
                if count <= 20: print('testing')
                m_testingsetfeatures.append(mc)
                m_testingsetlabels.append(dnum)
            else:
                if count <= 20: print('training')
                m_trainingsetfeatures.append(mc)
                m_trainingsetlabels.append(dnum)


print('Words spoken:', spoken)
#print("number of labels and features = %d, %d" % ( len(labels), len(features) ))
#print("feature shape = ", end='')
#print(features[0].shape)

segmented/deokkyukwon/0/kdigits0-0.wav 0 (28, 6) training
segmented/deokkyukwon/0/kdigits1-0.wav 0 (26, 6) training
segmented/deokkyukwon/0/kdigits2-0.wav 0 (39, 6) training
segmented/deokkyukwon/0/kdigits3-0.wav 0 (38, 6) training
segmented/deokkyukwon/0/kdigits4-0.wav 0 (37, 6) testing
segmented/deokkyukwon/0/kdigits5-0.wav 0 (24, 6) training
segmented/deokkyukwon/0/kdigits6-0.wav 0 (29, 6) training
segmented/deokkyukwon/0/kdigits7-0.wav 0 (27, 6) training
segmented/deokkyukwon/0/kdigits8-0.wav 0 (31, 6) training
segmented/deokkyukwon/0/kdigits9-0.wav 0 (33, 6) testing
segmented/deokkyukwon/1/kdigits0-1.wav 1 (42, 6) training
segmented/deokkyukwon/1/kdigits1-1.wav 1 (35, 6) training
segmented/deokkyukwon/1/kdigits2-1.wav 1 (46, 6) training
segmented/deokkyukwon/1/kdigits3-1.wav 1 (30, 6) training
segmented/deokkyukwon/1/kdigits4-1.wav 1 (43, 6) testing
segmented/deokkyukwon/1/kdigits5-1.wav 1 (41, 6) training
segmented/deokkyukwon/1/kdigits6-1.wav 1 (42, 6) training
segmented/deokkyu

In [34]:
# gjang: shuffling the data (x)
# c = list(zip(features, labels))
# np.random.shuffle(c)
# features,labels = zip(*c)

In [35]:
# test and training for 100 files
ntest  = len(m_testingsetlabels)
ntrain = len(m_trainingsetlabels)
nfiles = ntest + ntrain

print("[training] number of labels and features = %d, %d" % 
        ( len(m_trainingsetlabels), len(m_trainingsetfeatures)) )
print("[test] number of labels and features = %d, %d" % 
        ( len(m_testingsetlabels), len(m_testingsetfeatures)) )

print ('Loading data completed')

[training] number of labels and features = 80, 80
[test] number of labels and features = 20, 20
Loading data completed


In [36]:
# model initialization
gmmhmmindexdict = {}
index = 0
for word in spoken:
    gmmhmmindexdict[word] = index
    index = index +1

def initByBakis(inumstates, ibakisLevel):
    startprobPrior = np.zeros(inumstates)
    startprobPrior[0: ibakisLevel - 1] = 1/float((ibakisLevel - 1))
    transmatPrior = getTransmatPrior(inumstates, ibakisLevel)
    return startprobPrior, transmatPrior

def getTransmatPrior(inumstates, ibakisLevel):
    transmatPrior = (1 / float(ibakisLevel)) * np.eye(inumstates)

    for i in range(inumstates - (ibakisLevel - 1)):
        for j in range(ibakisLevel - 1):
            transmatPrior[i, i + j + 1] = 1. / ibakisLevel

    for i in range(inumstates - ibakisLevel + 1, inumstates):
        for j in range(inumstates - i - j):
            transmatPrior[i, i + j] = 1. / (inumstates - i)

    return transmatPrior

m_startprobPrior ,m_transmatPrior = initByBakis(m_num_of_HMMStates,m_bakisLevel)

print("StartProbPrior=")
print(m_startprobPrior)

print("TransMatPrior=")
print(m_transmatPrior)

StartProbPrior=
[1. 0. 0.]
TransMatPrior=
[[0.5 0.5 0. ]
 [0.  0.5 0.5]
 [0.  0.  1. ]]


In [37]:
# acoustic model definition
class SpeechModel:
    def __init__(self,Class,label):
        self.traindata = np.zeros((0,num_mfcc))
        self.Class = Class
        self.label = label
        self.model  = hmm.GMMHMM(n_components = m_num_of_HMMStates, n_mix = m_num_of_mixtures, \
                transmat_prior = m_transmatPrior, startprob_prior = m_startprobPrior, \
                covariance_type = m_covarianceType, n_iter = m_n_iter)

In [38]:
# training GMMHMM Models 
start = time()

speechmodels = [None] * len(spoken)
for key in gmmhmmindexdict:
    speechmodels[gmmhmmindexdict[key]] = SpeechModel(gmmhmmindexdict[key],key)

for i in range(0,len(m_trainingsetfeatures)):
     for j in range(0,len(speechmodels)):
         if int(speechmodels[j].Class) == int(gmmhmmindexdict[m_trainingsetlabels[i]]):
            speechmodels[j].traindata = np.concatenate((speechmodels[j].traindata , m_trainingsetfeatures[i]))

for speechmodel in speechmodels:
    speechmodel.model.fit(speechmodel.traindata)

print ('Training completed -- {0} GMM-HMM models are built for {0} different types of words'.format(len(spoken)))
print('time elapsed: %.2f seconds' % ( time() - start ))
print (" "); print(" ")

Training completed -- 10 GMM-HMM models are built for 10 different types of words
time elapsed: 1.60 seconds
 
 


In [39]:
# testing
print("Prediction started")
m_PredictionlabelList = []

for i in range(0,len(m_testingsetfeatures)):
    scores = []
    for speechmodel in speechmodels:
         scores.append(speechmodel.model.score(m_testingsetfeatures[i]))
    id  = scores.index(max(scores))
    m_PredictionlabelList.append(speechmodels[id].Class)
    print(str(np.round(scores, 3)) + " " + str(max(np.round(scores, 3))) +" "+":"+ speechmodels[id].label)

accuracy = 0.0
count = 0
print("")
print("Prediction for Testing DataSet:")

for i in range(0,len(m_testingsetlabels)):
    print( "Label"+str(i+1)+":"+m_testingsetlabels[i])
    if gmmhmmindexdict[m_testingsetlabels[i]] == m_PredictionlabelList[i]:
       count = count+1

accuracy = 100.0*count/float(len(m_testingsetlabels))

print("")
print("accuracy ="+str(accuracy))
print("")

# end of testing

Prediction started
[-1013.845 -1650.752 -1244.184 -1159.703 -1170.113 -1320.368 -1089.595
 -1081.473 -1012.062 -1613.073] -1012.062 :8
[ -898.474 -1535.013 -1067.602 -1091.171 -1078.507 -1164.176 -1000.173
  -990.207  -939.912 -1450.546] -898.474 :0
[-1739.05  -1064.112 -1599.904 -1539.339 -1963.034 -1155.689 -1196.41
 -1485.434 -1456.624 -1215.284] -1064.112 :1
[-1760.859 -1133.983 -1528.318 -1560.974 -2145.242 -1177.463 -1185.216
 -1706.574 -1600.604 -1284.03 ] -1133.983 :1
[-1031.258 -1390.329  -922.507 -1018.829 -1164.36  -1084.701  -986.963
  -995.935  -952.857 -1423.447] -922.507 :2
[-1425.253 -1967.236 -1159.471 -1542.467 -1780.803 -1474.596 -1337.211
 -1493.621 -1330.411 -1879.173] -1159.471 :2
[-735.293 -898.693 -729.815 -605.07  -737.917 -690.665 -637.944 -702.311
 -649.195 -888.508] -605.07 :3
[-1092.775 -1294.946  -910.71   -880.545 -1233.509  -922.153  -805.37
 -1093.089  -899.186 -1318.008] -805.37 :6
[-596.718 -856.859 -750.311 -572.801 -528.321 -689.804 -644.934 -582.18

## End of Lab 01