In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import WavLMModel, AutoFeatureExtractor
from datasets import load_dataset
import numpy as np

# ————————————————————————————————————————————————————————————————————————
# PhonemeRecognizer: WavLM + CTC for phoneme speech recognition
# ————————————————————————————————————————————————————————————————————————

# Define a list of English phonemes (ARPABET format) + blank token for CTC
PHONEMES = ['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY',
           'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P',
           'R', 'S', 'SH', 'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH', 'SIL', 'SP']
# Add blank token for CTC
PHONEME_DICT = {p: i for i, p in enumerate(['<blank>'] + PHONEMES)}
NUM_PHONEMES = len(PHONEME_DICT)

class PhonemeRecognizer(nn.Module):
    def __init__(self, wavlm_model, num_phonemes=NUM_PHONEMES):
        super().__init__()
        self.wavlm = wavlm_model

        # Get the hidden size from the WavLM model
        hidden_size = self.wavlm.config.hidden_size

        # Add a dropout layer for regularization
        self.dropout = nn.Dropout(0.1)

        # Linear layer to map from WavLM hidden states to phoneme classes (including blank)
        self.phoneme_classifier = nn.Linear(hidden_size, num_phonemes)

    def forward(self, inputs):
        # Get WavLM embeddings
        outputs = self.wavlm(**inputs)
        hidden_states = outputs.last_hidden_state

        # Apply dropout
        hidden_states = self.dropout(hidden_states)

        # Apply the linear layer to get logits for each time step
        logits = self.phoneme_classifier(hidden_states)

        # Apply log softmax for CTC loss
        log_probs = F.log_softmax(logits, dim=-1)

        return log_probs

    def recognize(self, inputs, beam_width=100):
        """Perform phoneme recognition with beam search decoding"""
        self.eval()
        with torch.no_grad():
            # Forward pass to get log probabilities
            log_probs = self(inputs)

            # Convert to CPU for decoding
            log_probs_cpu = log_probs.cpu().detach().numpy()

            # Simple greedy decoding (for demonstration)
            # In a real system, you would use beam search with ctcdecode
            predictions = torch.argmax(log_probs, dim=-1).cpu().numpy()

            # Convert to phoneme sequences with CTC decoding rules (merge repeats, remove blanks)
            phoneme_sequences = []
            for pred_seq in predictions:
                seq = []
                prev = -1
                for p in pred_seq:
                    # Skip blanks (index 0) and repeated phonemes (CTC rules)
                    if p != 0 and p != prev:
                        # Convert index back to phoneme
                        phoneme = list(PHONEME_DICT.keys())[list(PHONEME_DICT.values()).index(p)]
                        seq.append(phoneme)
                    prev = p
                phoneme_sequences.append(seq)

            return phoneme_sequences

# ————————————————————————————————————————————————————————————————————————
# Method A: Using the PhonemeRecognizer for speech-to-phoneme ASR
# ————————————————————————————————————————————————————————————————————————

# 1. Load the feature extractor and model
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base-plus")
wavlm_model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus")

# Create the phoneme recognizer with the WavLM model
phoneme_recognizer = PhonemeRecognizer(wavlm_model)
phoneme_recognizer.eval()  # disable dropout, etc.

# 2. Load an example audio file (here using a small demo from `datasets`)
#    The `audio["array"]` is a NumPy array of floats; sampling_rate is an int.
ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
audio_sample = ds[0]["audio"]["array"]
sr = ds[0]["audio"]["sampling_rate"]

# 3. Preprocess (pad/truncate + batch‐dim)
inputs = feature_extractor(
    audio_sample,
    sampling_rate=sr,
    return_tensors="pt",        # => PyTorch tensors
    padding=True,               # pad to longest in batch
)

# 4. Inference for phoneme recognition
with torch.no_grad():
    # Get phoneme log probabilities
    log_probs = phoneme_recognizer(inputs)

    # Recognize phoneme sequence
    phoneme_sequences = phoneme_recognizer.recognize(inputs)

# Print output
print("Log probabilities shape:", log_probs.shape)  # (batch_size, seq_len, num_phonemes)
print("Recognized phoneme sequence:", phoneme_sequences[0])
print("Transcript for reference:", ds[0]["text"])



Log probabilities shape: torch.Size([1, 292, 42])
Recognized phoneme sequence: ['OY', 'IY', 'NG', 'UW', 'IY', 'AO', 'UW', 'SIL', 'T', 'SIL', 'S', 'HH', 'IY', 'V', 'P', 'JH', 'IY', 'UW', 'AO', 'UW', 'AW', 'AO', 'V', 'W', 'AW', 'P', 'JH', 'G', 'UW', 'AW', 'AY', 'UW', 'NG', 'V', 'EH', 'K', 'EH', 'W', 'IY', 'R', 'V', 'Y', 'B', 'F', 'B', 'IY', 'P', 'G', 'AW', 'W', 'AW', 'ZH', 'V', 'ZH', 'JH', 'SIL', 'K', 'NG', 'SH', 'OY', 'P', 'ER', 'P', 'W', 'EH', 'IY', 'HH', 'ZH', 'SIL', 'AW', 'V', 'ZH', 'SIL', 'K', 'IY', 'UW', 'P', 'DH', 'F', 'W', 'ZH', 'K', 'NG', 'EH', 'SIL', 'EH', 'G', 'CH', 'AW', 'L', 'AO', 'SIL', 'AW', 'V', 'ZH', 'OY', 'SH', 'AO', 'ZH', 'UH', 'V', 'UW', 'EH', 'HH', 'IY', 'M', 'UW', 'AY', 'G', 'ER', 'AY', 'CH', 'IY', 'G', 'IY', 'P', 'AY', 'W', 'K', 'SH', 'EH', 'R', 'EH', 'P', 'D', 'HH', 'UH', 'HH', 'AW', 'SH', 'SIL', 'W', 'AW', 'UW', 'AO', 'Y', 'HH', 'AO', 'IY', 'F', 'AO', 'W', 'AO', 'EH', 'V', 'P', 'Y', 'W', 'P', 'G', 'AW', 'AO', 'W', 'V', 'P', 'V', 'ZH', 'T', 'D', 'UH', 'T', 'ZH', '

In [9]:
import pandas as pd
from datasets import load_dataset, Audio, Features, Sequence, Value

# 1. Location of your CSV
# csv_file = "train_phonemes_clean.csv"  # replace with your path
csv_file = "ground_truth_it_coder_2.csv"  # replace with your path


# 2. Define initial features: audio paths as plain strings, phonemes as plain strings
features = Features({
    "file_name": Value("string"),
    "phoneme_sequence": Value("string"),
})

# 3. Load the CSV into a DatasetDict (default split is 'train')
ds_dict = load_dataset("csv", data_files=csv_file, features=features)
dataset = ds_dict["train"]

# 4. Rename the audio-path column to 'audio' (required by Audio feature)
dataset = dataset.rename_column("file_name", "audio")

# 5. Cast 'audio' to the Audio type (will load the file when you access it)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

# 6. Map + split phoneme strings into lists
def split_phonemes(example):
    # assume phonemes are space-separated, e.g. "AH0 T EH1 S T"
    example["phoneme_sequence"] = example["phoneme_sequence"].split()
    return example

dataset = dataset.map(split_phonemes)

# 7. Cast the phoneme_sequence column to a Sequence of strings
dataset = dataset.cast_column(
    "phoneme_sequence",
    Sequence(feature=Value("string"))
)

# Now 'dataset' has:
#   - dataset[i]["audio"] → { "array": np.ndarray, "sampling_rate": 16000 }
#   - dataset[i]["phoneme_sequence"] → list of strings
print(dataset)
print(dataset[0]["audio"])
print(dataset[0]["phoneme_sequence"])


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/932 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'phoneme_sequence'],
    num_rows: 932
})
{'path': 'Hackathon_ASR/2_Audiofiles/Decoding_IT_T1/1001_edugame2023_59aa8ecf74c44db2adf56d71d1705cf5_1de23ac3deaf4b4d8c7db6d0cc9d6bfe.wav', 'array': array([0.        , 0.        , 0.        , ..., 0.00378418, 0.00424194,
       0.        ], shape=(364544,)), 'sampling_rate': 16000}
['vuzo[PAD]seɡa[PAD]klofɛno[PAD]raviʎo[PAD]da[PAD]pe[PAD]tarse[PAD]doridzːa[PAD]prateʎa[PAD]aː[PAD]ɛrɾe[PAD]lo[PAD]beɲole[PAD]fla[PAD]vɛstro[PAD]kʊɲaripːo']


In [5]:
import librosa

In [7]:
wavfile = librosa.load("Hackathon_ASR/2_Audiofiles/Decoding_IT_T1/1001_edugame2023_59aa8ecf74c44db2adf56d71d1705cf5_1de23ac3deaf4b4d8c7db6d0cc9d6bfe.wav", sr=16000)[0]

In [8]:
wavfile.shape

(364544,)