In [3]:
!pip install -r requirements.txt

Collecting tensorflow==2.13.0 (from -r requirements.txt (line 1))
  Using cached tensorflow-2.13.0-cp39-cp39-win_amd64.whl.metadata (2.6 kB)
Collecting keras==2.13.1 (from -r requirements.txt (line 2))
  Using cached keras-2.13.1-py3-none-any.whl.metadata (2.4 kB)
Collecting numpy==1.24.3 (from -r requirements.txt (line 3))
  Using cached numpy-1.24.3-cp39-cp39-win_amd64.whl.metadata (5.6 kB)
Collecting scipy==1.10.1 (from -r requirements.txt (line 4))
  Using cached scipy-1.10.1-cp39-cp39-win_amd64.whl.metadata (58 kB)
Collecting pandas==1.5.3 (from -r requirements.txt (line 5))
  Using cached pandas-1.5.3-cp39-cp39-win_amd64.whl.metadata (12 kB)
Collecting matplotlib==3.8.2 (from -r requirements.txt (line 6))
  Using cached matplotlib-3.8.2-cp39-cp39-win_amd64.whl.metadata (5.9 kB)
Collecting h5py==3.9.0 (from -r requirements.txt (line 7))
  Using cached h5py-3.9.0-cp39-cp39-win_amd64.whl.metadata (2.5 kB)
Collecting tqdm==4.65.0 (from -r requirements.txt (line 8))
  Using cached tqd

ERROR: Could not find a version that satisfies the requirement tensorflow-io==0.33.0 (from versions: 0.18.0, 0.19.0, 0.19.1, 0.20.0, 0.21.0, 0.22.0, 0.23.0, 0.23.1, 0.24.0, 0.25.0, 0.26.0, 0.27.0, 0.28.0, 0.29.0, 0.30.0, 0.31.0)
ERROR: No matching distribution found for tensorflow-io==0.33.0


In [5]:
import os
import copy
import json
import importlib
import numpy as np
import tensorflow as tf
import IPython.display as ipd

import util_cochlea
import util_network


In [6]:
"""
Build tensorflow2 objects for the `network_model and the `cochlea_model`.

The `cochlea_model` has no learnable parameters and does not require a checkpoint to be loaded.
The `network_model` parameters are loaded from a checkpoint file.
"""
dir_model = 'models/tensorflow2/arch01'
fn_arch = os.path.join(dir_model, 'arch.json')          # 网络层定义
fn_config = os.path.join(dir_model, 'config.json')      # 网络层超参数定义
fn_ckpt = os.path.join(dir_model, 'ckpt_BEST')          #

with open(fn_arch, 'r') as f:
    list_layer_dict = json.load(f)
with open(fn_config, 'r') as f:
    CONFIG = json.load(f)

def model_input_to_output_mapping(x):
    y = x
    if CONFIG.get('kwargs_cochlea', {}):
        msg = "expected input with shape [batch, time, channel=2]"
        assert (len(y.shape) == 3) and (y.shape[-1] == 2), msg
        # Cochlear model for ear index 0
        y0, _ = util_cochlea.cochlea(y[..., 0], **copy.deepcopy(CONFIG['kwargs_cochlea']))
        # Cochlear model for ear index 1
        y1, _ = util_cochlea.cochlea(y[..., 1], **copy.deepcopy(CONFIG['kwargs_cochlea']))
        # Binaural cochlear model representation with shape [batch, freq, time, channel=2]
        y = tf.concat([y0[..., tf.newaxis], y1[..., tf.newaxis]], axis=-1)
        msg = "expected cochlear model output with shape [batch, freq, time, channel=2]"
        assert (len(y.shape) == 4) and (y.shape[-1] == 2), msg
    y, _ = util_network.build_network(y, list_layer_dict, n_classes_dict=CONFIG['n_classes_dict'])
    return y

tf.keras.backend.clear_session()
inputs = tf.keras.Input(shape=(48000, 2), batch_size=None, dtype=tf.float32)
model = tf.keras.Model(
    inputs=inputs,
    outputs=model_input_to_output_mapping(inputs))
model.load_weights(fn_ckpt)
print('Loaded: {}'.format(fn_ckpt))


[cochlea] converting audio to subbands using half_cosine_filterbank
[cochlea] half-wave rectified subbands
[tf_fir_resample] interpreted `tensor_input.shape` as [batch, freq=39, time=48000]
[tf_fir_resample] `kwargs_fir_lowpass_filter`: {'cutoff': 4000, 'numtaps': 4097, 'window': ['kaiser', 5.0]}
[fir_lowpass_filter] sr_filt = 48000 Hz
[fir_lowpass_filter] numtaps = 4097 samples
[fir_lowpass_filter] fir_dur = 0.08533333333333333 seconds
[fir_lowpass_filter] cutoff = 4000 Hz
[fir_lowpass_filter] window = ('kaiser', 5.0)
[cochlea] resampled subbands from 48000 Hz to 8000 Hz with filter: {'cutoff': 4000, 'numtaps': 4097, 'window': ['kaiser', 5.0]}
[cochlea] half-wave rectified resampled subbands
[cochlea] applied 0.3 power compression to subbands
[cochlea] converting audio to subbands using half_cosine_filterbank
[cochlea] half-wave rectified subbands
[tf_fir_resample] interpreted `tensor_input.shape` as [batch, freq=39, time=48000]
[tf_fir_resample] `kwargs_fir_lowpass_filter`: {'cutoff'

In [7]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 48000, 2)]           0         []                            
                                                                                                  
 tf.__operators__.getitem (  (None, 48000)                0         ['input_1[0][0]']             
 SlicingOpLambda)                                                                                 
                                                                                                  
 tf.__operators__.getitem_1  (None, 48000)                0         ['input_1[0][0]']             
  (SlicingOpLambda)                                                                               
                                                                                              

In [8]:
"""
Load audio examples
"""
sr = 48e3
example_stim = np.load('example_stimuli_and_tf1_model_outputs.npz', allow_pickle=True)
list_y = []
for itr_y, raw_y in enumerate(example_stim['orig_stim']):
    y = np.frombuffer(raw_y, dtype=float).reshape([-1, 2]).astype(np.float32)
    list_y.append(y)
list_y = np.array(list_y)
list_y = list_y[:, :int(sr), :]
print("Input shape : [batch, timesteps (sampled at 48 kHz), channel (left / right ear)] :", list_y.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'example_stimuli_and_tf1_model_outputs.npz'

In [5]:
"""
Evaluate model and print out localization judgment for each example sound.

0° azimuth and 0° elevation is directly ahead.
90° azimuth is directly to the left, 270° to the right.
"""
model_class_prob = tf.nn.softmax(model(list_y)['label_loc_int']).numpy()
model_class_pred = np.argmax(model_class_prob, axis=1)

def label_to_azim_elev(label):
    """
    Map localization class integer to source azimuth and elevation
    """
    elev = np.array((label // 72) * 10)
    azim = np.array((label % 72) * 5)
    return np.array(azim).astype(float), np.array(elev).astype(float)

model_azim, model_elev = label_to_azim_elev(model_class_pred)
for itr in range(list_y.shape[0]):
    y = list_y[itr]
    print(f"Model localization judgment: azimuth = {model_azim[itr]}°, elevation = {model_elev[itr]}°")
    ipd.display(ipd.Audio(rate=sr, data=y.T))


2024-05-13 06:09:06.413880: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600


Model localization judgment: azimuth = 165.0°, elevation = 50.0°


Model localization judgment: azimuth = 145.0°, elevation = 0.0°


Model localization judgment: azimuth = 215.0°, elevation = 60.0°


Model localization judgment: azimuth = 80.0°, elevation = 10.0°


Model localization judgment: azimuth = 265.0°, elevation = 20.0°


Model localization judgment: azimuth = 20.0°, elevation = 40.0°


Model localization judgment: azimuth = 185.0°, elevation = 50.0°


Model localization judgment: azimuth = 130.0°, elevation = 20.0°


In [13]:
import soundfile as sf
import numpy as np
import librosa

y, sr = sf.read(r"G:\GitHub\BinauralLocalizationCNN_tf2\audio\sin\sin_135°.wav")
#y, sr = librosa.load(r"G:\GitHub\BinauralLocalizationCNN_tf2\audio\sin\sin_125°.wav", sr=48000, mono=False)  # sr=48000 自动重采样
print(sr)
# assert sr == 48000, "必须使用采样率为 48kHz 的音频"
assert y.ndim == 2 and y.shape[1] == 2, "必须是双通道立体声"

# 如果长于 1 秒，裁剪前 1 秒；如果不足 1 秒，补零
if y.shape[0] > 48000:
    y = y[:48000]
elif y.shape[0] < 48000:
    pad = np.zeros((48000 - y.shape[0], 2), dtype=np.float32)
    y = np.concatenate([y, pad], axis=0)

y_input = np.expand_dims(y.astype(np.float32), axis=0)  # shape: [1, 48000, 2]

output = model(y_input)
probs = tf.nn.softmax(output['label_loc_int']).numpy()
label = np.argmax(probs, axis=1)[0]

def label_to_azim_elev(label):
    elev = (label // 72) * 10
    azim = (label % 72) * 5
    return azim, elev

azim, elev = label_to_azim_elev(label)
print(f"✅ 声源方向预测结果：Azimuth = {azim}°, Elevation = {elev}°")


44100
✅ 声源方向预测结果：Azimuth = 140°, Elevation = 10°
