<a href="https://colab.research.google.com/github/Hazzd12/CASA0018_coursework/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio preprocessing
First, we convert each raw audio file into a Mel spectrogram, a common representation of audio features that is particularly well suited as input for training convolutional neural networks (CNNs).

In [None]:
import librosa
import numpy as np

def audio_to_melspectrogram(audio_path, n_mels=128, target_frames=None):
    """Load an audio file and return its Mel spectrogram in decibels.

    Parameters
    ----------
    audio_path : str or path-like
        Location of the audio file; passed straight to ``librosa.load``.
    n_mels : int, default 128
        Number of Mel bands requested from ``librosa.feature.melspectrogram``.
    target_frames : int or None, default None
        When given, the time axis (axis 1) is truncated or zero-padded so the
        result has exactly this many frames. This lets spectrograms computed
        from clips of different durations be stacked into a single array or
        fed to a network with a fixed input size. ``None`` keeps the natural
        length of the clip (the original behaviour).

    Returns
    -------
    numpy.ndarray
        Power spectrogram converted to dB relative to its own maximum
        (``ref=np.max``).

    Raises
    ------
    ValueError
        If ``target_frames`` is given but is not positive.
    """
    if target_frames is not None and target_frames <= 0:
        raise ValueError("target_frames must be a positive integer")

    # sr=None asks librosa to keep the file's native sampling rate.
    y, sr = librosa.load(audio_path, sr=None)
    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)

    if target_frames is not None:
        n_frames = mels.shape[1]
        if n_frames >= target_frames:
            mels = mels[:, :target_frames]
        else:
            # Pad the *power* spectrogram with zeros before the dB conversion
            # so the padded frames are treated as silence by power_to_db.
            mels = np.pad(mels, ((0, 0), (0, target_frames - n_frames)))

    mels_db = librosa.power_to_db(mels, ref=np.max)
    return mels_db


In [None]:
# Assume the dataset is laid out as follows
audio_paths = ['path/to/audio1.wav', 'path/to/audio2.wav', ...]
scores = [3.5, 4.0, ...]  # ratings are assumed to lie between 0 and 5

# Turn every audio file into its Mel-spectrogram features
X_train = np.array(list(map(audio_to_melspectrogram, audio_paths)))
y_train = np.array(scores)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K


In [None]:
def create_base_network(input_shape):
    """Build the convolutional encoder used by both branches of the Siamese model.

    The network stacks three stages, each a 3x3 ``Conv2D`` (ReLU, 'same'
    padding) followed by a 2x2 ``MaxPooling2D`` ('same' padding), with 32, 64
    and 128 filters respectively. The result is flattened and projected by a
    128-unit ReLU ``Dense`` layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, forwarded to ``Input(shape=...)``.

    Returns
    -------
    Model
        A Keras model named ``"base_network"`` mapping an input to its
        128-dimensional embedding.
    """
    inputs = Input(shape=input_shape)

    features = inputs
    # Layers are created in the same order as a hand-unrolled stack would be.
    for n_filters in (32, 64, 128):
        features = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(features)
        features = MaxPooling2D((2, 2), padding='same')(features)

    flattened = Flatten()(features)
    embedding = Dense(128, activation='relu')(flattened)
    return Model(inputs, embedding, name="base_network")


In [None]:
def create_siamese_network(input_shape):
    """Assemble a two-input Siamese model on top of a shared base network.

    Both inputs go through the same ``create_base_network`` instance, so the
    branches share weights. The element-wise absolute difference of the two
    embeddings is fed to a single sigmoid unit.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample; used for both inputs and the base network.

    Returns
    -------
    Model
        Uncompiled Keras model taking ``[input_a, input_b]`` and producing one
        sigmoid output per pair.
    """
    shared_encoder = create_base_network(input_shape)

    input_a, input_b = Input(shape=input_shape), Input(shape=input_shape)

    # Run each input through the very same encoder (shared weights).
    processed_a, processed_b = (shared_encoder(branch) for branch in (input_a, input_b))

    # Element-wise |a - b| between the two embeddings.
    abs_difference = Lambda(lambda pair: K.abs(pair[0] - pair[1]))
    distance = abs_difference([processed_a, processed_b])

    outputs = Dense(1, activation='sigmoid')(distance)
    return Model([input_a, input_b], outputs)


In [None]:
# Assume each Mel spectrogram is 128x128 with a single channel
input_shape = (128, 128, 1)
siamese_model = create_siamese_network(input_shape)

siamese_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Assume the data has already been prepared
X = [pair_of_spectrograms, ...]  # each element is a tuple holding a pair of spectrograms
y = [label, ...]  # each label says whether the corresponding pair is similar

# Rearrange the data into the two-input format the model is trained on
X_train = [np.array([pair[side] for pair in X]) for side in (0, 1)]
y_train = np.array(y)

# Train the model
siamese_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)
