# Data and Environment Setup

In [1]:
import librosa, soundfile as sf, numpy as np, pandas as pd
from tqdm import tqdm
import os

DATA_DIR = "/kaggle/input/the-lj-speech-dataset/LJSpeech-1.1"
WAV_DIR = os.path.join(DATA_DIR, "wavs")
OUT_DIR = "processed_keras"
os.makedirs(OUT_DIR, exist_ok=True)

SAMPLE_RATE = 16000
N_MELS = 80

metadata = pd.read_csv(os.path.join(DATA_DIR, "metadata.csv"), sep="|", header=None)
metadata.columns = ["id", "raw", "text"]

def preprocess_audio(path):
    wav, sr = sf.read(path)
    if len(wav.shape) > 1:
        wav = np.mean(wav, axis=1)
    if sr != SAMPLE_RATE:
        wav = librosa.resample(y=wav, orig_sr=sr, target_sr=SAMPLE_RATE)
    wav = wav / np.max(np.abs(wav))
    mel = librosa.feature.melspectrogram(
        y=wav, sr=SAMPLE_RATE, n_mels=N_MELS, n_fft=400, hop_length=160
    )
    mel_db = np.log1p(mel)
    return mel_db.T.astype(np.float32), len(wav) / SAMPLE_RATE

rows = []
spec_dir = os.path.join(OUT_DIR, "specs")
os.makedirs(spec_dir, exist_ok=True)

for i, row in tqdm(metadata.iterrows(), total=len(metadata)):
    wav_path = os.path.join(WAV_DIR, row["id"] + ".wav")
    mel, dur = preprocess_audio(wav_path)
    np.save(f"{spec_dir}/{i:06d}.npy", mel)
    rows.append(
        {"spec_path": f"{spec_dir}/{i:06d}.npy", "text": row["text"], "duration": dur}
    )

manifest = pd.DataFrame(rows)
manifest.to_csv(os.path.join(OUT_DIR, "manifest.csv"), index=False)
print("Preprocessing done.")

100%|██████████| 13100/13100 [06:25<00:00, 33.98it/s]


Preprocessing done.


# Data Generator with tf.data

In [2]:
import tensorflow as tf
import string
import librosa, soundfile as sf, numpy as np, pandas as pd
from tqdm import tqdm
import os

DATA_DIR = "/kaggle/input/the-lj-speech-dataset/LJSpeech-1.1"
WAV_DIR = os.path.join(DATA_DIR, "wavs")
OUT_DIR = "processed_keras"
os.makedirs(OUT_DIR, exist_ok=True)

SAMPLE_RATE = 16000
N_MELS = 80


alphabet = list(string.ascii_lowercase) + [" ", "'", "<blank>"]
char2idx = {c: i for i, c in enumerate(alphabet)}
idx2char = {i: c for c, i in char2idx.items()}

def prepare_inputs(mel, text):
    input_len = [mel.shape[0] // 2]  # приблизно після Conv шарів
    label_len = [len(text)]
    return {
        "spectrogram": mel,
        "labels": text,
        "input_length": np.array(input_len, dtype=np.int32),
        "label_length": np.array(label_len, dtype=np.int32),
    }, np.zeros(1)

def prepare_tf_dataset(dataset, batch_size=8):
    def gen():
        for mel, text in dataset:
            x, y = prepare_inputs(mel, text)
            yield x, y

    output_signature = (
        {
            "spectrogram": tf.TensorSpec(shape=(None, N_MELS), dtype=tf.float32),
            "labels": tf.TensorSpec(shape=(None,), dtype=tf.int32),
            "input_length": tf.TensorSpec(shape=(1,), dtype=tf.int32),
            "label_length": tf.TensorSpec(shape=(1,), dtype=tf.int32),
        },
        tf.TensorSpec(shape=(1,), dtype=tf.float32),
    )

    ds = tf.data.Dataset.from_generator(gen, output_signature=output_signature)
    ds = ds.padded_batch(
        batch_size,
        padded_shapes=(
            {
                "spectrogram": [None, N_MELS],
                "labels": [None],
                "input_length": [1],
                "label_length": [1],
            },
            [1],
        ),
        padding_values=(
            {
                "spectrogram": 0.0,
                "labels": 0,
                "input_length": 0,
                "label_length": 0,
            },
            0.0,
        ),
    )
    return ds.prefetch(tf.data.AUTOTUNE)

def text_to_int(text):
    if not isinstance(text, str):
        text = str(text) if text is not None else ""
    text = text.lower()
    return [char2idx[c] for c in text if c in char2idx]

def int_to_text(seq):
    return "".join(idx2char.get(i, "") for i in seq)

df = pd.read_csv(os.path.join(OUT_DIR, "manifest.csv"))

def load_example(row):
    mel = np.load(row["spec_path"])
    text = text_to_int(row["text"])
    return mel, np.array(text, dtype=np.int32)

data = [load_example(row) for _, row in df.iterrows()]

split = int(len(data) * 0.9)
train_data = data[:split]
val_data = data[split:]

def generator(dataset):
    for mel, text in dataset:
        yield mel, text

train_ds = prepare_tf_dataset(train_data)
val_ds = prepare_tf_dataset(val_data)


2025-11-13 18:05:32.389722: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763057132.866480      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763057132.980715      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1763057155.104627      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1763057155.105360      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability:

# DeepSpeech2 Model (Keras)

In [3]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import layers, models

N_MELS = 80
alphabet = list(string.ascii_lowercase) + [" ", "'", "<blank>"]
char2idx = {c: i for i, c in enumerate(alphabet)}
idx2char = {i: c for c, i in char2idx.items()}

def DeepSpeech2_Keras(n_mels=N_MELS, n_classes=len(alphabet)):
    input_spectrogram = layers.Input(shape=(None, n_mels), name="spectrogram")
    labels = layers.Input(shape=(None,), dtype="int32", name="labels")
    input_len = layers.Input(shape=(1,), dtype="int32", name="input_length")
    label_len = layers.Input(shape=(1,), dtype="int32", name="label_length")

    x = layers.Reshape((-1, n_mels, 1))(input_spectrogram)
    x = layers.Conv2D(32, (11, 41), strides=(2, 2), padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(32, (11, 21), strides=(1, 2), padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    for _ in range(3):
        x = layers.Bidirectional(layers.GRU(512, return_sequences=True, dropout=0.2))(x)

    x = layers.Dense(512, activation="relu")(x)
    x = layers.Dropout(0.2)(x)
    y_pred = layers.Dense(n_classes + 1, activation="softmax", name="y_pred")(x)

    # --- CTC Loss Layer ---
    def ctc_lambda_func(args):
        y_pred, labels, input_len, label_len = args
        return K.ctc_batch_cost(labels, y_pred, input_len, label_len)

    loss_out = layers.Lambda(ctc_lambda_func, output_shape=(1,), name="ctc")(
        [y_pred, labels, input_len, label_len]
    )

    model = models.Model(
        inputs=[input_spectrogram, labels, input_len, label_len],
        outputs=loss_out
    )

    pred_model = models.Model(inputs=input_spectrogram, outputs=y_pred)
    return model, pred_model


# Train the Model with model.fit()

In [4]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, Callback
import numpy as np

model, pred_model = DeepSpeech2_Keras()

class StopWhenLossBelow(Callback):
    def __init__(self, threshold=0.1):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        loss = logs.get("loss")
        if loss is not None and loss < self.threshold:
            print(f"\nStopping: loss={loss:.4f} < {self.threshold}")
            self.model.stop_training = True


checkpoint_cb = ModelCheckpoint(
    "best_deepspeech2.weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

reduce_lr_cb = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    verbose=1,
    min_lr=1e-6
)

earlystop_cb = StopWhenLossBelow(threshold=0.1)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=lambda y_true, y_pred: y_pred
)
if os.path.exists("/kaggle/working/best_deepspeech2.weights.h5"):
    model.load_weights("best_deepspeech2.weights.h5")
BATCH_SIZE = 32
steps_per_epoch = len(train_data) // BATCH_SIZE
validation_steps = len(val_data) // BATCH_SIZE

EPOCHS = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=[checkpoint_cb, reduce_lr_cb, earlystop_cb],
    verbose=1
)

model.load_weights("best_deepspeech2.weights.h5")

Epoch 1/10


I0000 00:00:1763057169.541940      64 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 586ms/step - loss: 310.0273
Epoch 1: val_loss improved from inf to 321.15170, saving model to best_deepspeech2.weights.h5
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 608ms/step - loss: 309.9264 - val_loss: 321.1517 - learning_rate: 1.0000e-04
Epoch 2/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 373ms/step - loss: 204.3961
Epoch 2: val_loss improved from 321.15170 to 167.40765, saving model to best_deepspeech2.weights.h5
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 391ms/step - loss: 204.3408 - val_loss: 167.4077 - learning_rate: 1.0000e-04
Epoch 3/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 341ms/step - loss: 138.9478
Epoch 3: val_loss improved from 167.40765 to 129.66989, saving model to best_deepspeech2.weights.h5
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 359ms/step - loss: 138.9352 - val_loss: 129.




Epoch 5: val_loss improved from 100.64053 to 99.50379, saving model to best_deepspeech2.weights.h5
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - loss: 111.0330 - val_loss: 99.5038 - learning_rate: 1.0000e-04
Epoch 6/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step - loss: 102.6529
Epoch 6: val_loss improved from 99.50379 to 85.32074, saving model to best_deepspeech2.weights.h5
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 343ms/step - loss: 102.6435 - val_loss: 85.3207 - learning_rate: 1.0000e-04
Epoch 7/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323ms/step - loss: 87.5297
Epoch 7: val_loss improved from 85.32074 to 77.45393, saving model to best_deepspeech2.weights.h5
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 341ms/step - loss: 87.5147 - val_loss: 77.4539 - learning_rate: 1.0000e-04
Epoch 8/10
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

# Evaluate Model on Training and External Audio

In [5]:
import numpy as np
import tensorflow.keras.backend as K

model, pred_model = DeepSpeech2_Keras()

model.load_weights("best_deepspeech2.weights.h5")
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=lambda y_true, y_pred: y_pred
)

def greedy_decode_tf(logits):
    input_len = np.ones(logits.shape[0]) * logits.shape[1]
    decoded, _ = K.ctc_decode(logits, input_length=input_len, greedy=True)
    decoded = decoded[0].numpy()
    texts = ["".join(idx2char.get(i, "") for i in seq if i > 0) for seq in decoded]
    return texts

for i in range(3):
    mel, text = val_data[i]
    mel_in = np.expand_dims(mel, axis=0)
    pred = pred_model.predict(mel_in)
    decoded_text = greedy_decode_tf(pred)[0]
    out_text = "".join([idx2char.get(l, "") for l in text])
    print("GT:", out_text)
    print("Pred:", decoded_text)
    print("-" * 40)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
GT: many factors were undoubtedly involved in oswald's motivation for the assassination and the commission does not believe
Pred: mny fcters were oun dotedly im boled in oswlds motevtion for the ssssintion nd the commission dos not plive
----------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
GT: that it can ascribe to him any one motive or group of motives
Pred: bt it comi scrived to him ny one moted or gro r motrs
----------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
GT: it is apparent however that oswald was moved by an overriding hostility to his environment
Pred: it is pprend hewever tht oswld ws moed by in overriding hostility to his invirnment
----------------------------------------


# Calculate Word Error Rate (WER)

In [6]:
!pip install jiwer
from jiwer import wer

gts, preds = [], []
for i in range(10):  # 10 прикладів для демонстрації
    mel, text = val_data[i]
    mel_in = np.expand_dims(mel, axis=0)
    pred = pred_model.predict(mel_in)
    decoded_text = greedy_decode_tf(pred)[0]
    out_text = "".join([idx2char.get(l, "") for l in text])
    gts.append(out_text)
    preds.append(decoded_text)

print("WER:", wer(gts, preds))

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━

# Text Post-processing with SymSpell

In [7]:
!pip install symspellpy
from symspellpy import SymSpell, Verbosity
import os
sym_spell = SymSpell() 
word_freq = {}
for t in df["text"]:
    if not isinstance(t, str):
        continue
    for w in t.lower().split():
        word_freq[w] = word_freq.get(w, 0) + 1

dict_file = "symspell_dict.txt"
with open(dict_file, "w", encoding="utf-8") as f:
    for w, c in word_freq.items():
        f.write(f"{w} {c}\n")
        
sym_spell.load_dictionary(dict_file, term_index=0, count_index=1)

def symspell_correct(text, max_edit_distance=2):
    if not isinstance(text, str):
        return text
    words = text.split()
    corrected = []
    for i, w in enumerate(words):
        w_lower = w.lower()
        if w_lower in sym_spell.words:
            corrected.append(w_lower)
            continue
        suggestions = sym_spell.lookup(w_lower, Verbosity.ALL, max_edit_distance=max_edit_distance)
        if not suggestions:
            corrected.append(w_lower)
            continue
        best = max(suggestions, key=lambda s: s.count)
        if i > 0 and corrected:
            prev = corrected[-1]
            for s in suggestions:
                if s.term.startswith(prev[:2]) or s.term.endswith(prev[-2:]):
                    best = s
                    break
        corrected.append(best.term)
    return " ".join(corrected)

from jiwer import wer
for i in range(3):
    mel, text = val_data[i]
    mel_in = np.expand_dims(mel, axis=0)
    pred = pred_model.predict(mel_in)
    decoded_text = greedy_decode_tf(pred)[0]
    out_text = "".join([idx2char.get(l, "") for l in text])
    corrected_text = symspell_correct(decoded_text)
    print("GT:", out_text)
    print("Pred:", decoded_text)
    print("Corrected:",corrected_text)
    print("WER before SymSpell:", wer(out_text, decoded_text))
    print("WER after SymSpell:", wer(out_text, corrected_text))
    print("-" * 40)


Collecting symspellpy
  Downloading symspellpy-6.9.0-py3-none-any.whl.metadata (3.9 kB)
Collecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.6-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading symspellpy-6.9.0-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading editdistpy-0.1.6-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.4/158.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: editdistpy, symspellpy
Successfully installed editdistpy-0.1.6 symspellpy-6.9.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
GT: many factors were undoubtedly involved in oswald's motivation for the assassinati