In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [2]:
def augment_audio(audio, sr):
    """Apply random augmentation"""
    if np.random.rand() < 0.3:  # Add noise
        audio = audio + 0.005 * np.random.randn(len(audio))
    if np.random.rand() < 0.3:  # Pitch shift
        audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=np.random.choice([-2, 2]))
    if np.random.rand() < 0.3:  # Time stretch
        rate = np.random.uniform(0.8, 1.2)
        audio = librosa.effects.time_stretch(y=audio, rate=rate)
    return audio

In [3]:
# ---------------------- Audio Loading & Augmentation ----------------------
def load_audio(file_path, sr=16000, augment=False):
    audio, _ = librosa.load(file_path, sr=sr)
    if augment:
        audio = augment_audio(audio, sr)
    return audio

In [4]:
# ---------------------- Feature Extraction ----------------------
def preprocess(audio, sr=16000):
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return mfccs.T[:80]

def pad_sequence(mfccs, max_length=80):
    return np.pad(mfccs, ((0, max(0, max_length - len(mfccs))), (0, 0)), mode='constant')

In [5]:

# ---------------------- Pair Creation ----------------------
def create_word_pairs(data_dir, max_pairs_per_class=100):
    pairs = []
    labels = []
    all_dirs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    
    for word_dir in all_dirs:
        word_path = os.path.join(data_dir, word_dir)
        audio_files = [os.path.join(word_path, f) for f in os.listdir(word_path) if f.endswith('.m4a') or f.endswith('.mp3')]

        # Positive pairs
        for i in range(len(audio_files)):
            for j in range(i + 1, len(audio_files)):
                pairs.append((audio_files[i], audio_files[j]))
                labels.append(1)
                if len(pairs) >= max_pairs_per_class:
                    break

        # Negative pairs
        for other_dir in all_dirs:
            if other_dir != word_dir:
                other_path = os.path.join(data_dir, other_dir)
                other_files = [os.path.join(other_path, f) for f in os.listdir(other_path) if f.endswith('.m4a') or f.endswith('.mp3')]
                if audio_files and other_files:
                    pairs.append((audio_files[0], other_files[0]))
                    labels.append(0)

    return pairs, labels

In [6]:
def process_pairs(pairs, augment=False):
    X1, X2 = [], []
    for file1, file2 in pairs:
        a1 = preprocess(load_audio(file1, augment=augment))
        a2 = preprocess(load_audio(file2, augment=augment))
        X1.append(pad_sequence(a1))
        X2.append(pad_sequence(a2))
    return np.array(X1), np.array(X2)

In [7]:
# from google.colab import files
# uploaded = files.upload()
import zipfile
import os

def unzip_file(zip_path, extract_to='.'):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
        print(f"Extracted all files to: {extract_to}")

# Example usage
zip_path = r'C:\Users\KASHF KAMAL\Documents\AudioListenMet\urdualphabets_voices.zip'
extract_to = 'dataurdu1'  

# Create output folder if it doesn't exist
os.makedirs(extract_to, exist_ok=True)

unzip_file(zip_path, extract_to)


Extracted all files to: dataurdu1


In [8]:
import tensorflow as tf

# Encoder for feature extraction
def build_encoder(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(128, 5, activation='relu'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(128, activation='relu')
    ])
    return model

# Custom Layer for Absolute Difference
class AbsDifference(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.abs(inputs[0] - inputs[1])

# Siamese Network
def build_siamese(input_shape):
    encoder = build_encoder(input_shape)

    input1 = tf.keras.Input(shape=input_shape)
    input2 = tf.keras.Input(shape=input_shape)

    encoded1 = encoder(input1)
    encoded2 = encoder(input2)

    diff = AbsDifference()([encoded1, encoded2])  # No Lambda issues
    output = tf.keras.layers.Dense(1, activation='sigmoid')(diff)

    model = tf.keras.Model(inputs=[input1, input2], outputs=output)
    return model

In [10]:
# ---------------------- Training ----------------------
SEED = 42
data_dir = r"C:\Users\KASHF KAMAL\Documents\AudioListenMet\dataurdu1\alphabets\alphabets"

pairs, labels = create_word_pairs(data_dir)
X1, X2 = process_pairs(pairs, augment=True)
y = np.array(labels)

X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=0.2, random_state=SEED
)

# input_shape = X1_train.shape[1:]
# model = build_siamese(input_shape)
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

input_shape = (X1_train.shape[1], X1_train.shape[2])  # Corrected input shape
model = build_siamese(input_shape)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

  audio, _ = librosa.load(file_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)





In [11]:
# Callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
]

In [12]:
# Train
history = model.fit(
    [X1_train, X2_train], y_train,
    validation_data=([X1_test, X2_test], y_test),
    batch_size=32,
    epochs=20,
    callbacks=callbacks
)

# Save model
model.save('siamese_model_optimized2.keras')

Epoch 1/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 67ms/step - accuracy: 0.8262 - loss: 0.5090 - val_accuracy: 0.8905 - val_loss: 0.2557 - learning_rate: 1.0000e-04
Epoch 2/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8979 - loss: 0.2627 - val_accuracy: 0.8994 - val_loss: 0.2144 - learning_rate: 1.0000e-04
Epoch 3/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.9223 - loss: 0.2182 - val_accuracy: 0.9408 - val_loss: 0.1843 - learning_rate: 1.0000e-04
Epoch 4/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9320 - loss: 0.1946 - val_accuracy: 0.9408 - val_loss: 0.2059 - learning_rate: 1.0000e-04
Epoch 5/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.9408 - loss: 0.1706 - val_accuracy: 0.9586 - val_loss: 0.1438 - learning_rate: 1.0000e-04
Epoch 6/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [13]:
import librosa
import numpy as np

def predict_similarity2(file1, file2, model):
    # Load and preprocess first audio
    audio1, sr1 = librosa.load(file1, sr=16000)
    a1 = preprocess(audio1, sr=sr1)
    a1 = pad_sequence(a1)
    
    # Load and preprocess second audio
    audio2, sr2 = librosa.load(file2, sr=16000)
    a2 = preprocess(audio2, sr=sr2)
    a2 = pad_sequence(a2)

    # Add batch dimension
    a1 = np.expand_dims(a1, axis=0)
    a2 = np.expand_dims(a2, axis=0)

    # Predict similarity
    pred = model.predict([a1, a2])[0][0]
    return pred


In [37]:
score = predict_similarity2(r"C:\Users\KASHF KAMAL\Documents\AudioListenMet\dataurdu\urdualphabets11\bay\bay_1.mp3",r"C:\Users\KASHF KAMAL\Documents\AudioListenMet\dataurdu\urdualphabets11\bay\bay_10.m4a",model)
print(f"Similarity Score: {score:.2f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Similarity Score: 0.91


  audio2, sr2 = librosa.load(file2, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [24]:
score = predict_similarity2(r"dataurdu1/alphabets/alphabets/ص/Gali 79A 22.m4a",r"dataurdu1/alphabets/alphabets/ص/20_sawad.mp3",model)
print(f"Similarity Score: {score:.2f}")


  audio1, sr1 = librosa.load(file1, sr=16000)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
Similarity Score: 0.20
