-
Notifications
You must be signed in to change notification settings - Fork 2
/
melbankm.m
302 lines (296 loc) · 11.1 KB
/
melbankm.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
function [x,mc,mn,mx]=melbankm(p,n,fs,fl,fh,w)
%MELBANKM determine matrix for a mel/erb/bark-spaced filterbank [X,MN,MX]=(P,N,FS,FL,FH,W)
%
% Inputs:
% p number of filters in filterbank or the filter spacing in k-mel/bark/erb [ceil(4.6*log10(fs))]
% n length of fft
% fs sample rate in Hz
% fl low end of the lowest filter as a fraction of fs [default = 0]
% fh high end of highest filter as a fraction of fs [default = 0.5]
% w any sensible combination of the following:
% 'b' = bark scale instead of mel
% 'e' = erb-rate scale
% 'l' = log10 Hz frequency scale
% 'f' = linear frequency scale
%
% 'c' = fl/fh specify centre of low and high filters
% 'h' = fl/fh are in Hz instead of fractions of fs
% 'H' = fl/fh are in mel/erb/bark/log10
%
% 't' = triangular shaped filters in mel/erb/bark domain (default)
% 'n' = hanning shaped filters in mel/erb/bark domain
% 'm' = hamming shaped filters in mel/erb/bark domain
%
% 'z' = highest and lowest filters taper down to zero [default]
% 'y' = lowest filter remains at 1 down to 0 frequency and
% highest filter remains at 1 up to nyquist freqency
%
% 'u' = scale filters to sum to unity
%
% 's' = single-sided: do not double filters to account for negative frequencies
%
% 'g' = plot idealized filters [default if no output arguments present]
%
% Note that the filter shape (triangular, hamming etc) is defined in the mel (or erb etc) domain.
% Some people instead define an asymmetric triangular filter in the frequency domain.
%
% If 'ty' or 'ny' is specified, the total power in the fft is preserved.
%
% Outputs: x a sparse matrix containing the filterbank amplitudes
% If the mn and mx outputs are given then size(x)=[p,mx-mn+1]
% otherwise size(x)=[p,1+floor(n/2)]
% Note that the peak filter values equal 2 to account for the power
% in the negative FFT frequencies.
% mc the filterbank centre frequencies in mel/erb/bark
% mn the lowest fft bin with a non-zero coefficient
% mx the highest fft bin with a non-zero coefficient
% Note: you must specify both or neither of mn and mx.
%
% Examples of use:
%
% (a) Calcuate the Mel-frequency Cepstral Coefficients
%
% f=rfft(s); % rfft() returns only 1+floor(n/2) coefficients
% x=melbankm(p,n,fs); % n is the fft length, p is the number of filters wanted
% z=log(x*abs(f).^2); % multiply x by the power spectrum
% c=dct(z); % take the DCT
%
% (b) Calcuate the Mel-frequency Cepstral Coefficients efficiently
%
% f=fft(s); % n is the fft length, p is the number of filters wanted
% [x,mc,na,nb]=melbankm(p,n,fs); % na:nb gives the fft bins that are needed
% z=log(x*(f(na:nb)).*conj(f(na:nb)));
%
% (c) Plot the calculated filterbanks
%
% plot((0:floor(n/2))*fs/n,melbankm(p,n,fs)') % fs=sample frequency
%
% (d) Plot the idealized filterbanks (without output sampling)
%
% melbankm(p,n,fs);
%
% References:
%
% [1] S. S. Stevens, J. Volkman, and E. B. Newman. A scale for the measurement
% of the psychological magnitude of pitch. J. Acoust Soc Amer, 8: 185–19, 1937.
% [2] S. Davis and P. Mermelstein. Comparison of parametric representations for
% monosyllabic word recognition in continuously spoken sentences.
% IEEE Trans Acoustics Speech and Signal Processing, 28 (4): 357–366, Aug. 1980.
% Copyright (C) Mike Brookes 1997-2009
% Version: $Id: melbankm.m 713 2011-10-16 14:45:43Z dmb $
%
% VOICEBOX is a MATLAB toolbox for speech processing.
% Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 2 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You can obtain a copy of the GNU General Public License from
% http://www.gnu.org/copyleft/gpl.html or by writing to
% Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Note "FFT bin_0" assumes DC = bin 0 whereas "FFT bin_1" means DC = bin 1
if nargin < 6
w='tz'; % default options
if nargin < 5
fh=0.5; % max freq is the nyquist
if nargin < 4
fl=0; % min freq is DC
end
end
end
sfact=2-any(w=='s'); % 1 if single sided else 2
wr=' '; % default warping is mel
for i=1:length(w)
if any(w(i)=='lebf');
wr=w(i);
end
end
if any(w=='h') || any(w=='H')
mflh=[fl fh];
else
mflh=[fl fh]*fs;
end
if ~any(w=='H')
switch wr
case 'f' % no transformation
case 'l'
if fl<=0
error('Low frequency limit must be >0 for l option');
end
mflh=log10(mflh); % convert frequency limits into log10 Hz
case 'e'
mflh=frq2erb(mflh); % convert frequency limits into erb-rate
case 'b'
mflh=frq2bark(mflh); % convert frequency limits into bark
otherwise
mflh=frq2mel(mflh); % convert frequency limits into mel
end
end
melrng=mflh*(-1:2:1)'; % mel range
fn2=floor(n/2); % bin index of highest positive frequency (Nyquist if n is even)
if isempty(p)
p=ceil(4.6*log10(fs)); % default number of filters
end
if any(w=='c') % c option: specify fiter centres not edges
if p<1
p=round(melrng/(p*1000))+1;
end
melinc=melrng/(p-1);
mflh=mflh+(-1:2:1)*melinc;
else
if p<1
p=round(melrng/(p*1000))-1;
end
melinc=melrng/(p+1);
end
%
% Calculate the FFT bins corresponding to [filter#1-low filter#1-mid filter#p-mid filter#p-high]
%
switch wr
case 'f'
blim=(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
case 'l'
blim=10.^(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
case 'e'
blim=erb2frq(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
case 'b'
blim=bark2frq(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
otherwise
blim=mel2frq(mflh(1)+[0 1 p p+1]*melinc)*n/fs;
end
mc=mflh(1)+(1:p)*melinc; % mel centre frequencies
b1=floor(blim(1))+1; % lowest FFT bin_0 required might be negative)
b4=min(fn2,ceil(blim(4))-1); % highest FFT bin_0 required
%
% now map all the useful FFT bins_0 to filter1 centres
%
switch wr
case 'f'
pf=((b1:b4)*fs/n-mflh(1))/melinc;
case 'l'
pf=(log10((b1:b4)*fs/n)-mflh(1))/melinc;
case 'e'
pf=(frq2erb((b1:b4)*fs/n)-mflh(1))/melinc;
case 'b'
pf=(frq2bark((b1:b4)*fs/n)-mflh(1))/melinc;
otherwise
pf=(frq2mel((b1:b4)*fs/n)-mflh(1))/melinc;
end
%
% remove any incorrect entries in pf due to rounding errors
%
if pf(1)<0
pf(1)=[];
b1=b1+1;
end
if pf(end)>=p+1
pf(end)=[];
b4=b4-1;
end
fp=floor(pf); % FFT bin_0 i contributes to filters_1 fp(1+i-b1)+[0 1]
pm=pf-fp; % multiplier for upper filter
k2=find(fp>0,1); % FFT bin_1 k2+b1 is the first to contribute to both upper and lower filters
k3=find(fp<p,1,'last'); % FFT bin_1 k3+b1 is the last to contribute to both upper and lower filters
k4=numel(fp); % FFT bin_1 k4+b1 is the last to contribute to any filters
if isempty(k2)
k2=k4+1;
end
if isempty(k3)
k3=0;
end
if any(w=='y') % preserve power in FFT
mn=1; % lowest fft bin required (1 = DC)
mx=fn2+1; % highest fft bin required (1 = DC)
r=[ones(1,k2+b1-1) 1+fp(k2:k3) fp(k2:k3) repmat(p,1,fn2-k3-b1+1)]; % filter number_1
c=[1:k2+b1-1 k2+b1:k3+b1 k2+b1:k3+b1 k3+b1+1:fn2+1]; % FFT bin1
v=[ones(1,k2+b1-1) pm(k2:k3) 1-pm(k2:k3) ones(1,fn2-k3-b1+1)];
else
r=[1+fp(1:k3) fp(k2:k4)]; % filter number_1
c=[1:k3 k2:k4]; % FFT bin_1 - b1
v=[pm(1:k3) 1-pm(k2:k4)];
mn=b1+1; % lowest fft bin_1
mx=b4+1; % highest fft bin_1
end
if b1<0
c=abs(c+b1-1)-b1+1; % convert negative frequencies into positive
end
% end
if any(w=='n')
v=0.5-0.5*cos(v*pi); % convert triangles to Hanning
elseif any(w=='m')
v=0.5-0.46/1.08*cos(v*pi); % convert triangles to Hamming
end
if sfact==2 % double all except the DC and Nyquist (if any) terms
msk=(c+mn>2) & (c+mn<n-fn2+2); % there is no Nyquist term if n is odd
v(msk)=2*v(msk);
end
%
% sort out the output argument options
%
if nargout > 2
x=sparse(r,c,v);
if nargout == 3 % if exactly three output arguments, then
mc=mn; % delete mc output for legacy code compatibility
mn=mx;
end
else
x=sparse(r,c+mn-1,v,p,1+fn2);
end
if any(w=='u')
sx=sum(x,2);
x=x./repmat(sx+(sx==0),1,size(x,2));
end
%
% plot results if no output arguments or g option given
%
if ~nargout || any(w=='g') % plot idealized filters
ng=201; % 201 points
me=mflh(1)+(0:p+1)'*melinc;
switch wr
case 'f'
fe=me; % defining frequencies
xg=repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng);
case 'l'
fe=10.^me; % defining frequencies
xg=10.^(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
case 'e'
fe=erb2frq(me); % defining frequencies
xg=erb2frq(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
case 'b'
fe=bark2frq(me); % defining frequencies
xg=bark2frq(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
otherwise
fe=mel2frq(me); % defining frequencies
xg=mel2frq(repmat(linspace(0,1,ng),p,1).*repmat(me(3:end)-me(1:end-2),1,ng)+repmat(me(1:end-2),1,ng));
end
v=1-abs(linspace(-1,1,ng));
if any(w=='n')
v=0.5-0.5*cos(v*pi); % convert triangles to Hanning
elseif any(w=='m')
v=0.5-0.46/1.08*cos(v*pi); % convert triangles to Hamming
end
v=v*sfact; % multiply by 2 if double sided
v=repmat(v,p,1);
if any(w=='y') % extend first and last filters
v(1,xg(1,:)<fe(2))=sfact;
v(end,xg(end,:)>fe(p+1))=sfact;
end
if any(w=='u') % scale to unity sum
dx=(xg(:,3:end)-xg(:,1:end-2))/2;
dx=dx(:,[1 1:ng-2 ng-2]);
vs=sum(v.*dx,2);
v=v./repmat(vs+(vs==0),1,ng)*fs/n;
end
plot(xg',v','b');
set(gca,'xlim',[fe(1) fe(end)]);
xlabel(['Frequency (' xticksi 'Hz)']);
end