In [19]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
from utils.helpers import check_cuda, ModelSaver
from utils.prepare_data import prepare_mfcc, create_audio_path_and_text, read_phonemes, prepare_input_data, phonemes_to_ids
import librosa

In [20]:
class ResidualTDNNBlock(nn.Module):
    """Two stacked TDNNBlocks with a scaled skip connection.

    Both inner blocks map ``channels -> channels`` and share the same
    ``kernel_size``, ``dilation`` and ``dropout`` settings. The block output is
    ``tdnn2(tdnn1(x)) + bypass_scale * x``.

    Args:
        channels: channel count of both the input and the output.
        kernel_size: temporal kernel width passed to both inner blocks.
        dilation: temporal dilation passed to both inner blocks.
        dropout: dropout probability for both inner blocks, or None to disable.
        bypass_scale: multiplier applied to the skip path before the sum.
    """

    def __init__(self, channels, kernel_size, dilation=1, dropout=None, bypass_scale=0.5):
        super().__init__()
        # Settings common to both inner blocks.
        shared = dict(kernel_size=kernel_size, dilation=dilation, dropout=dropout)
        self.tdnn1 = TDNNBlock(channels, channels, **shared)
        self.tdnn2 = TDNNBlock(channels, channels, **shared)
        self.bypass_scale = bypass_scale

    def forward(self, x):
        """Apply both inner blocks to ``x`` and add the down-weighted input back."""
        transformed = self.tdnn2(self.tdnn1(x))
        # The sum requires the inner blocks to leave the shape of x unchanged.
        return transformed + x * self.bypass_scale

class TDNNBlock(nn.Module):
    """Single TDNN layer: dilated Conv1d -> LayerNorm over channels -> ReLU -> dropout.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels (also the LayerNorm width).
        kernel_size: temporal kernel width. With the padding used here the
            time length is preserved only for odd values; an even value makes
            the output ``dilation`` frames longer than the input.
        dilation: spacing between kernel taps along the time axis.
        dropout: probability for ``nn.Dropout1d`` (drops whole channels), or
            None to disable dropout.
        activation: any truthy value enables the ReLU; a falsy value replaces
            it with ``nn.Identity`` (used for the final logits layer).
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1, dropout=None, activation = True):
        super().__init__()
        # Symmetric padding so that an odd kernel keeps the number of frames unchanged.
        padding = dilation * (kernel_size // 2)
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size,
                            dilation=dilation, padding=padding)
        self.ln = nn.LayerNorm(out_channels)
        # Truthiness test instead of `is True`, so flags such as 1 or np.True_
        # are not silently treated as "no activation".
        # The original author's note on this line read "wer 17" — presumably an
        # error rate measured with this configuration; unverified.
        self.relu = nn.ReLU() if activation else nn.Identity()
        self.dropout = nn.Dropout1d(dropout) if dropout is not None else nn.Identity()

    def forward(self, x):  # x: [B, C_in, T]
        """Run the layer on a channels-first batch and return ``[B, C_out, T]``."""
        out = self.conv(x)           # [B, C_out, T]
        # LayerNorm normalises the last dimension, so move channels last and back.
        out = out.transpose(1,2)
        out = self.ln(out)
        out = out.transpose(1,2)
        out = self.relu(out)
        out = self.dropout(out)
        return out

class TDNN(nn.Module):
    """Frame-level acoustic model built from TDNN and residual TDNN blocks.

    The stack is three dilated 400-channel TDNN layers, three residual blocks,
    two 300-channel pointwise layers, and a pointwise projection to
    ``output_dim`` without activation. The forward pass returns per-frame
    log-probabilities.

    Args:
        input_dim: number of features per input frame.
        output_dim: number of output classes per frame.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Context-building front end with growing dilation.
        self.tdnn1 = TDNNBlock(input_dim, 400, kernel_size=7, dilation=1)
        self.tdnn2 = TDNNBlock(400, 400, kernel_size=5, dilation=2, dropout=0.2)
        self.tdnn3 = TDNNBlock(400, 400, kernel_size=5, dilation=3, dropout=0.3)
        # Residual stages at constant width.
        self.res1 = ResidualTDNNBlock(400, kernel_size=3, dilation=2)
        self.res2 = ResidualTDNNBlock(400, kernel_size=3, dilation=1)
        self.res3 = ResidualTDNNBlock(400, kernel_size=3, dilation=1)
        # Pointwise (kernel_size=1) head; the last layer emits raw class scores.
        self.tdnn4 = TDNNBlock(400, 300, kernel_size=1, dropout=0.3)
        self.tdnn5 = TDNNBlock(300, 300, kernel_size=1, dropout=0.4)
        self.tdnn6 = TDNNBlock(300, output_dim, kernel_size=1, activation=False)

    def forward(self, x):  # x: [B, T, F]
        """Map a ``[B, T, F]`` feature batch to ``[B, T, output_dim]`` log-probabilities."""
        stages = (
            self.tdnn1, self.tdnn2, self.tdnn3,
            self.res1, self.res2, self.res3,
            self.tdnn4, self.tdnn5, self.tdnn6,
        )
        hidden = x.transpose(1, 2)  # [B, F, T] — the blocks are channels-first
        for stage in stages:
            hidden = stage(hidden)
        scores = hidden.transpose(1, 2)  # back to [B, T, output_dim]
        return F.log_softmax(scores, dim=-1)

In [None]:
# Extract 40 MFCCs from the test utterance, standardise each coefficient over
# time, and cache the result for the inference cell below.
# NOTE(review): this re-implements feature extraction inline; confirm it matches
# what `prepare_mfcc` produced for the training data.
y, sr = librosa.load("test.flac", sr=16000)  # audio is resampled to 16 kHz on load
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)  # shape: (n_mfcc, frames)
mfcc_mean = np.mean(mfcc, axis=1, keepdims=True)
mfcc_std = np.std(mfcc, axis=1, keepdims=True)
# A coefficient that is constant over time has zero std; dividing by it would
# fill that row with NaN/inf. Use 1.0 there so the row becomes all zeros.
# Rows with non-zero std are unaffected.
mfcc_std[mfcc_std == 0] = 1.0
mfcc = (mfcc - mfcc_mean) / mfcc_std
np.save("test.npy", mfcc)

In [22]:
# Restore the trained acoustic model from its checkpoint and switch it to
# inference mode (disables dropout).
# NOTE(review): `device` is not used by the cells visible here; the model and
# the features stay wherever `load_state` leaves them — confirm that is intended.
device = check_cuda()
saver = ModelSaver()
# 40 input features per frame (MFCCs), 87 output classes.
point_model = saver.load_state(
    model=TDNN(40, 87),
    path="models/2025-07-23_00-06/point_final_ep-69",
)
model = point_model
model.eval()

CUDA is available. Using GPU device: NVIDIA GeForce RTX 3050 Laptop GPU


TDNN(
  (tdnn1): TDNNBlock(
    (conv): Conv1d(40, 400, kernel_size=(7,), stride=(1,), padding=(3,))
    (ln): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
    (relu): ReLU()
    (dropout): Identity()
  )
  (tdnn2): TDNNBlock(
    (conv): Conv1d(400, 400, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,))
    (ln): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
    (relu): ReLU()
    (dropout): Dropout1d(p=0.2, inplace=False)
  )
  (tdnn3): TDNNBlock(
    (conv): Conv1d(400, 400, kernel_size=(5,), stride=(1,), padding=(6,), dilation=(3,))
    (ln): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
    (relu): ReLU()
    (dropout): Dropout1d(p=0.3, inplace=False)
  )
  (res1): ResidualTDNNBlock(
    (tdnn1): TDNNBlock(
      (conv): Conv1d(400, 400, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (ln): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
      (relu): ReLU()
      (dropout): Identity()
    )
    (tdnn2): TDNNBlock(
      (

In [23]:
# Load the phoneme inventory and build the lookup tables; `id2phoneme` is used
# below to turn decoded class ids back into phoneme symbols.
# NOTE(review): presumably `phonemes_to_ids` returns (phoneme -> id, id -> phoneme)
# in that order and reserves id 0 for the CTC blank — confirm in utils.prepare_data.
phonemes = read_phonemes("../utils/phonemes.txt")
phoneme2id,id2phoneme = phonemes_to_ids(phonemes)

def ctc_greedy_decode(log_probs, blank=0):
    """Greedy (best-path) CTC decoding.

    Takes the arg-max class of every frame, then drops blanks and any frame
    whose class equals that of the frame immediately before it.

    Args:
        log_probs: tensor laid out as [time, batch, classes].
        blank: class id of the CTC blank symbol.

    Returns:
        One list per batch item holding the surviving class ids, in order, as
        NumPy integer scalars.
    """
    best_path = log_probs.argmax(dim=-1).transpose(0, 1)  # [batch, time]

    results = []
    for row in best_path:
        frames = row.cpu().numpy()
        # Keep a frame only if it is not blank and differs from its predecessor;
        # the first frame has no predecessor.
        collapsed = [
            label
            for idx, label in enumerate(frames)
            if label != blank and (idx == 0 or label != frames[idx - 1])
        ]
        results.append(collapsed)
    return results

# Decode the cached test utterance: load features, run the model, collapse the
# per-frame predictions with greedy CTC, and print the phoneme symbols.
mfcc_array = np.load("test.npy")  # stored as (n_mfcc, frames)
print(mfcc_array.shape)
mfcc_batch = torch.from_numpy(mfcc_array).float().unsqueeze(0)  # add batch dim
print(mfcc_batch.shape)
features = mfcc_batch.transpose(1, 2)  # model expects [B, T, F]

# Gradients are not needed for inference.
with torch.no_grad():
    output = model(features)

log_probs = output.transpose(0, 1)  # [T, B, classes], as the decoder expects
preds = ctc_greedy_decode(log_probs, blank=0)
for seq in preds:
    print([id2phoneme[p] for p in seq])
    


(40, 81)
torch.Size([1, 40, 81])
['S', 'F', 'AW1', 'ER0', 'ER0', 'F', 'Y', 'UW1']
