In [None]:
import matplotlib.pyplot as plt
from scipy.io import wavfile
import scipy
import numpy as np

import IPython.display as ipd

In [None]:
# Load the example recording: `fs` is the sample rate in Hz, `data` holds the samples.
filename = "./sounds/windowing_test.wav"
fs, data = wavfile.read(filename)
print(fs)
print(data.shape)  # a bare `data.shape` in the middle of a cell is never displayed

# Work on the first channel only. A single-channel file is read as a 1-D array,
# so the channel axis is only indexed when it actually exists.
if data.ndim > 1:
    data = data[:, 0]

In [None]:
# Embed an audio player for the selected channel, played back at the file's sample rate.
player = ipd.Audio(data=data, rate=fs)
ipd.display(player)

# Raw waveform; the x-axis is the sample index.
plt.plot(data)

In [None]:
# Analysis window: duration in milliseconds and the matching number of samples.
window_length_ms = 30
window_length = int(window_length_ms*fs/1000)
print(f"Window length in samples: {window_length}")

data_length = len(data)

# Draw the window's first sample uniformly at RANDOM so the whole window fits in the signal.
starting_position = np.random.randint(0, data_length - window_length)
window_end = starting_position + window_length

# Time axis of the window in milliseconds, one point per sample.
time_vector = np.linspace(0, window_length_ms, num=window_length)

plt.plot(time_vector, data[starting_position:window_end])
plt.ylabel("Amplitude")
plt.xlabel("Time (ms)")
plt.show()

In [None]:
# Zero padding added on each side of the window: a quarter of the window,
# expressed both in samples and in milliseconds.
zero_length = window_length // 4
print(zero_length)
zero_length_ms = window_length_ms/4
print(zero_length_ms)

# Random window start, as before.
starting_position = np.random.randint(0, data_length - window_length)
window_end = starting_position + window_length

# Time axis covering the padding before and after the window.
time_vector = np.linspace(
    start=-zero_length_ms,
    stop=window_length_ms+zero_length_ms,
    num=window_length+2*zero_length,
)

# Surround the selected samples with zeros.
zero_vector = np.zeros(zero_length)
data_vector = np.concatenate([zero_vector, data[starting_position:window_end], zero_vector])

plt.plot(time_vector, data_vector)
plt.ylabel("Amplitude")
plt.xlabel("Time (ms)")
plt.show()

In [None]:
# Zero padding is defined first so that everything below is built from values
# created in this cell rather than from whatever an earlier cell left behind.
zero_vector = np.zeros([zero_length, ])

# Hann window in its sin^2 form, sampled at half-sample offsets, then zero-extended.
windowing_fn = np.sin(np.pi * np.arange(0.5, window_length, 1)/window_length)**2
windowing_fn_extended = np.concatenate((zero_vector, windowing_fn, zero_vector))

# Random window start, kept at least `zero_length` samples away from both ends of the signal.
starting_position = zero_length + np.random.randint(data_length - window_length - 2*zero_length)
data_vector = np.concatenate((zero_vector, data[starting_position:(starting_position+window_length)], zero_vector))

# Rebuild the zero-extended time axis here: other cells assign a `time_vector`
# of a different length, which would not match `data_vector` if they ran last.
time_vector = np.linspace(-zero_length_ms,
                          window_length_ms+zero_length_ms,
                          window_length+2*zero_length)

plt.figure(figsize=[12, 6])

plt.subplot(221)
plt.plot(time_vector, data_vector)
plt.title("Original window (with zero extension)")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(222)
plt.plot(time_vector, windowing_fn_extended)
plt.title("Hann windowing function")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(223)
plt.plot(time_vector, data_vector*windowing_fn_extended)
plt.title("Windowed signal")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.tight_layout()
plt.show()

# Test each visualization

In [None]:
window_length_ms = 30
window_length = int(window_length_ms*fs/1000)

# Hann window (sin^2 form). It is also exposed as `window_fn`, the name this
# cell originally assigned, so either spelling refers to the same array.
windowing_fn = np.sin(np.pi*np.arange(0.5, window_length, 1)/window_length)**2
window_fn = windowing_fn

# The start must leave room for a full window. Adding an extra offset to the
# random draw could push the slice past the end of the signal, producing a
# short segment that cannot be multiplied with the window.
starting_position = np.random.randint(data_length - window_length)

data_vector = data[starting_position:(starting_position+window_length)]
time_vector = np.linspace(0, window_length_ms, window_length)

# Compute the spectrum once and reuse it for every panel.
windowed_signal = data_vector*windowing_fn
magnitude_spectrum = np.abs(scipy.fft.rfft(windowed_signal))
log_magnitude_spectrum = 20*np.log10(magnitude_spectrum)

# One frequency (in kHz) per rfft bin, from 0 up to fs/2.
frequency_vector = np.linspace(0, fs/2000, len(magnitude_spectrum))

plt.figure(figsize=[12, 8])

plt.subplot(321)
plt.plot(time_vector, data_vector)
plt.title("Original window")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(322)
plt.plot(time_vector, windowed_signal)
plt.title("Window signal")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(323)
plt.plot(frequency_vector, magnitude_spectrum)
plt.title("Magnitude spectrum")
plt.xlabel("Frequency (kHz)")
plt.ylabel("Magnitude $|X_k|$")

plt.subplot(324)
plt.plot(frequency_vector, magnitude_spectrum**2)
plt.title("Power Spectrum")
plt.xlabel("Frequency (kHz)")
plt.ylabel("Power $|X_k|^2$")

plt.subplot(325)
plt.plot(frequency_vector, log_magnitude_spectrum)
plt.title('Log-magnitude spectrum')
plt.xlabel('Frequency (kHz)')
plt.ylabel('Magnitude $20\\log_{10}|X_k|$')
ax = plt.axis()  # [xmin, xmax, ymin, ymax] of the full-band plot

plt.subplot(326)
plt.plot(frequency_vector, log_magnitude_spectrum)
plt.title('Log-magnitude spectrum zoomed to [0,8kHz]')
plt.xlabel('Frequency (kHz)')
plt.ylabel('Magnitude $20\\log_{10}|X_k|$ (dB)')
ax = [0, 8, ax[2], ax[3]]  # 0-8 kHz on x, same y-range as the full-band plot
plt.axis(ax)

plt.tight_layout()
plt.show()

# Speech features in the spectrum

## Envelope

In [None]:
from scipy.linalg import solve_toeplitz, toeplitz

In [None]:
# Hann window (sin^2 form); used for both the plotted spectrum and the LPC analysis.
windowing_fn = np.sin(np.pi*np.arange(0.5, window_length, 1)/window_length)**2

# The start must leave room for a full window, otherwise the slice is shorter
# than the window and the multiplication below fails.
starting_position = np.random.randint(data_length - window_length)

data_vector = data[starting_position:(starting_position+window_length)]
time_vector = np.linspace(0, window_length_ms, window_length)

windowed_signal = data_vector*windowing_fn
signal_spectrum = np.abs(scipy.fft.rfft(windowed_signal))

# One frequency (in kHz) per rfft bin, from 0 up to fs/2.
frequency_vector = np.linspace(0, fs/2000, len(signal_spectrum))

# Envelope calculation via linear prediction:
# the autocorrelation is the inverse transform of the power spectrum, ...
autocorrelation = scipy.fft.irfft(signal_spectrum**2)
lpc_order = int(fs/1000 + 2)
# ... the predictor solves the Toeplitz system R a = u with u = [1, 0, ..., 0], ...
u = np.zeros(lpc_order+1)
u[0] = 1
lpc_model = solve_toeplitz(autocorrelation[0:lpc_order+1], u)
lpc_model /= lpc_model[0]
# ... and the envelope is the inverse magnitude response of that predictor,
# rescaled so that its maximum matches the maximum of the signal spectrum.
envelope_spectrum = np.abs(scipy.fft.rfft(lpc_model, window_length))**-1
envelope_spectrum *= np.max(signal_spectrum)/np.max(envelope_spectrum)

log_signal_spectrum = 20*np.log10(signal_spectrum)
log_envelope_spectrum = 20*np.log10(envelope_spectrum)

plt.figure(figsize=[12, 8])

plt.subplot(221)
plt.plot(time_vector, data_vector)
plt.title("Original window")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(222)
plt.plot(time_vector, windowed_signal)
plt.title("Windowing signal")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(223)
plt.plot(frequency_vector, log_signal_spectrum, label="Spectrum")
plt.plot(frequency_vector, log_envelope_spectrum, linewidth=2, label="Envelope")
plt.legend()
plt.title("Log-magnitude spectrum and its envelope")
plt.xlabel("Frequency (kHz)")
plt.ylabel("Magnitude $20\\log_{10}|X_k|$ (dB)")
ax = plt.axis()

plt.subplot(224)
plt.plot(frequency_vector, log_signal_spectrum, label="Spectrum")
plt.plot(frequency_vector, log_envelope_spectrum, linewidth=2, label="Envelope")
plt.legend()
plt.title("Log-magnitude spectrum and its envelope, zoomed to [0,8kHz]")
plt.xlabel("Frequency (kHz)")
plt.ylabel("Magnitude $20\\log_{10}|X_k|$ (dB)")
ax = [0, 8, ax[2], ax[3]]  # 0-8 kHz on x, same y-range as the full-band plot
plt.axis(ax)

plt.tight_layout()
plt.show()

## Formants

In [None]:
from scipy.linalg import solve_toeplitz, toeplitz

# Hann window (sin^2 form); used for both the plotted spectrum and the LPC analysis.
windowing_fn = np.sin(np.pi*np.arange(0.5, window_length, 1)/window_length)**2

# The start must leave room for a full window, otherwise the slice is shorter
# than the window and the multiplication below fails.
starting_position = np.random.randint(data_length - window_length)

data_vector = data[starting_position:(starting_position+window_length)]
time_vector = np.linspace(0, window_length_ms, window_length)

signal_spectrum = np.abs(scipy.fft.rfft(data_vector*windowing_fn))

# One frequency (in kHz) per rfft bin, from 0 up to fs/2.
frequency_vector = np.linspace(0, fs/2000, len(signal_spectrum))

# Envelope calculation via linear prediction (autocorrelation method).
autocorrelation = scipy.fft.irfft(signal_spectrum**2)
lpc_order = int(fs/1000 + 2)
u = np.zeros(lpc_order+1)
u[0] = 1
lpc_model = solve_toeplitz(autocorrelation[0:lpc_order+1], u)
lpc_model /= lpc_model[0]
envelope_spectrum = np.abs(scipy.fft.rfft(lpc_model, window_length))**-1
envelope_spectrum *= np.max(signal_spectrum)/np.max(envelope_spectrum)

# Find the formant peaks on the ENVELOPE, not on the raw spectrum: the raw
# spectrum's first local maxima are individual harmonics or noise ripples.
# At a peak the first difference changes sign from positive to negative,
# so the difference of its sign is negative there.
diff_envelope = np.diff(envelope_spectrum)
sign_diff_envelope = np.sign(diff_envelope)
diff_sign_diff_envelope = np.diff(sign_diff_envelope)
peak_indices = np.flatnonzero(diff_sign_diff_envelope < 0) + 1
peak_indices = peak_indices[0:5]  # keep at most the five lowest-frequency peaks

log_signal_spectrum = 20*np.log10(signal_spectrum)
log_envelope_spectrum = 20*np.log10(envelope_spectrum)

plt.figure(figsize=[12, 4])

# Both panels show the same curves; the second one is restricted to 0-8 kHz.
panels = [
    (121, "Log-magnitude spectrum and its envelope", None),
    (122, "Log-magnitude spectrum zoomed to 8kHz", 8),
]
y_limits = None
for subplot_position, panel_title, max_frequency_khz in panels:
    plt.subplot(subplot_position)
    plt.plot(frequency_vector, log_signal_spectrum, label="Spectrum")
    plt.plot(frequency_vector, log_envelope_spectrum, linewidth=3, label="Envelope")
    plt.plot(frequency_vector[peak_indices], log_envelope_spectrum[peak_indices], marker='^', linestyle='', label="Peaks")
    for k, peak_index in enumerate(peak_indices):
        # Formants are numbered from F1 in both panels.
        plt.text(frequency_vector[peak_index], log_envelope_spectrum[peak_index] + 5, 'F'+str(k+1))
    plt.legend()
    plt.title(panel_title)
    plt.xlabel("Frequency (kHz)")
    plt.ylabel("Magnitude $20\\log_{10}|X_k|$ (dB)")
    if y_limits is None:
        # Auto-scaled limits of the first panel, plus headroom for the text labels.
        x_min, x_max, y_min, y_max = plt.axis()
        y_limits = (y_min, y_max + 5)
        plt.axis([x_min, x_max, y_limits[0], y_limits[1]])
    else:
        plt.axis([0, max_frequency_khz, y_limits[0], y_limits[1]])

plt.tight_layout()
plt.show()

## Fundamental frequency

In [None]:
windowing_fn = np.sin(np.pi*np.arange(0.5, window_length, 1)/window_length)**2

# Choose segment from random position in sample. The start must leave room for
# a full window, otherwise the slice is shorter than the window.
starting_position = np.random.randint(data_length - window_length)

# Without zero-extension, because that was just for illustration
data_vector = data[starting_position:(starting_position+window_length)]
time_vector = np.linspace(0, window_length_ms, window_length)

signal_spectrum = np.abs(scipy.fft.rfft(data_vector*windowing_fn))
log_signal_spectrum = 20*np.log10(signal_spectrum)

# One frequency (in kHz) per rfft bin, from 0 up to fs/2.
frequency_vector = np.linspace(0, fs/2000, len(signal_spectrum))

# Find the spectral peaks.
# At a peak the first difference changes sign from positive to negative,
# so the difference of its sign is negative there.
sign_diff_spectrum = np.sign(np.diff(signal_spectrum))
peak_indices = np.flatnonzero(np.diff(sign_diff_spectrum) < 0) + 1
peak_frequencies = frequency_vector[peak_indices]*1000  # in Hz

# Suppose F0 is in the range 80 to 400 Hz; the highest peak in that range then
# belongs to the comb structure. F0 is taken as the lowest-frequency peak in the
# range whose magnitude is within 6 dB (a factor 0.5) of that highest peak.
in_range = (peak_frequencies >= 80) & (peak_frequencies <= 400)
candidate_indices = peak_indices[in_range]
F0_index = None  # stays None when the window has no peak in the F0 range
if candidate_indices.size > 0:
    candidate_magnitudes = signal_spectrum[candidate_indices]
    # The threshold is derived from the in-range peaks themselves; indexing
    # `peak_indices` with a position inside the in-range subset would pick
    # an unrelated peak as the reference.
    within_6dB = np.flatnonzero(candidate_magnitudes > 0.5*np.max(candidate_magnitudes))
    F0_index = candidate_indices[within_6dB[0]]

plt.figure(figsize=[12, 6])

plt.subplot(221)
plt.plot(time_vector, data_vector)
plt.title("Original window")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(222)
plt.plot(frequency_vector, log_signal_spectrum, label="Spectrum")
plt.title("Log-magnitude spectrum")
plt.xlabel("Frequency (kHz)")  # this panel is plotted against kHz
plt.ylabel("Magnitude $20\\log_{10}|X_k|$")
plt.legend()
ax = plt.axis()
ax = [ax[0], ax[1], ax[2], ax[3]+5]
plt.axis(ax)

plt.subplot(223)
plt.plot(frequency_vector*1000, log_signal_spectrum, label="Spectrum")
if F0_index is not None:
    f0_hz = frequency_vector[F0_index]*1000
    plt.plot(f0_hz, log_signal_spectrum[F0_index], marker='v', linestyle='', label="Fundamental")
    plt.text(f0_hz, log_signal_spectrum[F0_index] + 5, "F0=" + str(f0_hz) + "Hz")
plt.legend()
plt.title("Log-magnitude spectrum zoomed to 2kHz")
plt.xlabel("Frequency (Hz)")
plt.ylabel("Magnitude $20\\log_{10}|X_k|$ (dB)")
ax = [0, 2000, ax[2], ax[3]+5]
plt.axis(ax)

plt.tight_layout()
plt.show()

## Harmonics of the fundamental

In [None]:
windowing_fn = np.sin(np.pi*np.arange(0.5, window_length, 1)/window_length)**2

# Choose segment from random position in sample. The start must leave room for
# a full window, otherwise the slice is shorter than the window.
starting_position = np.random.randint(data_length - window_length)

# Without the zero-extension
data_vector = data[starting_position:(starting_position+window_length)]
time_vector = np.linspace(0, window_length_ms, window_length)

# The FFT length is four times the window length (the input is zero-padded),
# which gives a denser frequency grid for locating the peaks.
signal_spectrum = np.abs(scipy.fft.rfft(data_vector*windowing_fn, n=window_length*4))
log_signal_spectrum = 20*np.log10(signal_spectrum)
frequency_vector = np.linspace(0, fs/2000, len(signal_spectrum))  # in kHz

# Find the spectral peaks.
# At a peak the first difference changes sign from positive to negative,
# so the difference of its sign is negative there.
sign_diff_spectrum = np.sign(np.diff(signal_spectrum))
peak_indices = np.flatnonzero(np.diff(sign_diff_spectrum) < 0) + 1
peak_frequencies = frequency_vector[peak_indices]*1000  # in Hz

# Suppose F0 is in the range 80 to 400 Hz; the highest peak in that range then
# belongs to the comb structure. F0 is taken as the lowest-frequency peak in the
# range whose magnitude is within 6 dB (a factor 0.5) of that highest peak.
in_range = (peak_frequencies >= 80) & (peak_frequencies <= 400)
candidate_indices = peak_indices[in_range]
F0_index = None  # stays None when the window has no peak in the F0 range
if candidate_indices.size > 0:
    candidate_magnitudes = signal_spectrum[candidate_indices]
    # The threshold is derived from the in-range peaks themselves; indexing
    # `peak_indices` with a position inside the in-range subset would pick
    # an unrelated peak as the reference.
    within_6dB = np.flatnonzero(candidate_magnitudes > 0.5*np.max(candidate_magnitudes))
    F0_index = candidate_indices[within_6dB[0]]

plt.figure(figsize=[12,6])

plt.subplot(221)
plt.plot(time_vector, data_vector)
plt.title("Original window")
plt.xlabel("Time (ms)")
plt.ylabel("Amplitude")

plt.subplot(222)
plt.plot(frequency_vector, log_signal_spectrum, label="Spectrum")
plt.legend()
plt.title("Log-magnitude spectrum")
plt.xlabel("Frequency (kHz)")
plt.ylabel("Magnitude $20\\log_{10}|X_k|$")
ax = plt.axis()
ax = [ax[0], ax[1], ax[2], ax[3]+5]
plt.axis(ax)

plt.subplot(223)
plt.plot(frequency_vector*1000, log_signal_spectrum, label="spectrum")
if F0_index is not None:
    # Integer multiples of F0 (in kHz) below 2 kHz, drawn as dashed vertical guides.
    f0_khz = frequency_vector[F0_index].item()
    F0vector = np.arange(f0_khz, 2, f0_khz)
    for h in F0vector:
        plt.plot(h*np.ones(2)*1000, [-100, 100], linestyle="--", color="gray")
plt.legend()
plt.title("Log-magnitude spectrum zoomed to 2kHz")
plt.xlabel("Frequency (Hz)")
plt.ylabel("Magnitude $20\\log_{10}|X_k|$ (dB)")
ax = [0, 2000, ax[2], ax[3]+5]
plt.axis(ax)

plt.tight_layout()
plt.show()

## Spectrogram

In [None]:
import matplotlib as mpl

In [None]:
# Hop between consecutive windows and the number of full windows that fit in the signal.
window_step = int(window_length/8)
window_count = int(np.floor((len(data)-window_length)/window_step)+1)

# Defined here so the cell does not depend on a window left over from another cell.
windowing_fn = np.sin(np.pi*np.arange(0.5, window_length, 1)/window_length)**2

# rfft of a length-N real input returns N//2 + 1 bins, for even and odd N alike.
spectrum_length = window_length//2 + 1

spectrogram = np.zeros((window_count, spectrum_length))
# Not used by the plots below; still assigned as in the original cell.
time_vector = np.linspace(0, window_length_ms, window_length)
frequency_vector = np.linspace(0, fs/2000, spectrum_length)

# One magnitude spectrum per row.
for k in range(window_count):
    starting_position = k*window_step

    data_vector = data[starting_position:(starting_position+window_length)]
    spectrogram[k, :] = np.abs(scipy.fft.rfft(data_vector*windowing_fn))

# Small offset added before the logarithm so that zero magnitudes stay finite.
black_eps = 1e-1
spectrogram_db = 20*np.log10(np.abs(np.transpose(spectrogram))+black_eps)

duration_s = len(data)/fs  # the image's time extent is expressed in seconds

default_figsize = mpl.rcParamsDefault["figure.figsize"]
mpl.rcParams["figure.figsize"] = [val*2 for val in default_figsize]

# Same image twice: full band first, then restricted to 0-8 kHz.
for plot_title, max_frequency_khz in (("Spectrogram of signal", None), ("Spectrogram zoomed to 8kHz", 8)):
    plt.imshow(spectrogram_db, aspect="auto", origin="lower", extent=[0, duration_s, 0, fs/2000])
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    if max_frequency_khz is not None:
        plt.axis([0, duration_s, 0, max_frequency_khz])
    plt.title(plot_title)
    plt.show()

In [None]:
from ipywidgets import *

In [None]:
def plot_spectrogram(window_length_ms, window_step_ms, zero_extension_factor=1):
    """Plot a log-magnitude spectrogram of the notebook-level signal, zoomed to 0-8 kHz.

    Reads the notebook globals ``data`` (the samples) and ``fs`` (the sample
    rate in Hz). Sets ``mpl.rcParams["figure.figsize"]`` to twice the default.

    Parameters
    ----------
    window_length_ms : number
        Length of each analysis window in milliseconds.
    window_step_ms : number
        Hop between the starts of consecutive windows in milliseconds.
    zero_extension_factor : number, optional
        FFT length as a multiple of the window length; values above 1
        zero-pad each windowed frame before the transform.

    Raises
    ------
    ValueError
        If the window, the step or the FFT length rounds down to zero samples,
        or if the signal is shorter than one window.
    """
    window_length = int(window_length_ms*fs/1000)
    window_step = int(window_step_ms*fs/1000)
    total_length = int(window_length*zero_extension_factor)

    # Reject degenerate settings up front instead of failing later with a
    # division by zero or an obscure FFT/shape error.
    if window_length < 1:
        raise ValueError("window_length_ms is too short: the window has no samples")
    if window_step < 1:
        raise ValueError("window_step_ms is too short: the step has no samples")
    if total_length < 1:
        raise ValueError("zero_extension_factor is too small: the FFT length is zero")

    window_count = (len(data) - window_length)//window_step + 1
    if window_count < 1:
        raise ValueError("the signal is shorter than one analysis window")

    # rfft with n=total_length returns total_length//2 + 1 bins, for even and odd n alike.
    spectrum_length = total_length//2 + 1
    windowing_fn = np.sin(np.pi*np.arange(0.5, window_length, 1)/window_length)**2

    # One magnitude spectrum per row.
    spectrogram = np.zeros((window_count, spectrum_length))
    for k in range(window_count):
        starting_position = k*window_step

        data_vector = data[starting_position:(starting_position+window_length)]
        spectrogram[k, :] = np.abs(scipy.fft.rfft(data_vector*windowing_fn, n=total_length))

    # Small offset added before the logarithm so that zero magnitudes stay finite.
    black_eps = 1e-1

    default_figsize = mpl.rcParamsDefault["figure.figsize"]
    mpl.rcParams["figure.figsize"] = [val*2 for val in default_figsize]

    duration_s = len(data)/fs  # the image's time extent is expressed in seconds
    plt.imshow(20*np.log10(np.abs(np.transpose(spectrogram))+black_eps), aspect="auto", origin="lower", extent=[0, duration_s, 0, fs/2000])
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    plt.axis([0, duration_s, 0, 8])
    plt.title("Spectrogram zoomed to 8kHz")
    plt.show()

# Slider ranges start at the smallest usable value: a zero window length, a
# zero step or a zero FFT-length factor cannot produce a spectrogram.
# The trailing semicolon suppresses the echoed function repr.
interact(plot_spectrogram, window_length_ms=(10, 100, 10), window_step_ms=(2, 20, 2), zero_extension_factor=(1, 100, 1));