In [24]:
import matplotlib.pyplot as plt
import numpy as np
import zipfile
from datasets import Dataset
import pandas as pd
import pathlib

In [25]:
import sys
sys.path.append('..')
from scripts.audio import audio_array_to_wav_file
from scripts.ipa import timit2ipa

### Step 0: Loading TIMIT 
Download the TIMIT.zip from: DARPA TIMIT on Kaggle, save to /.data

In [None]:
timit = zipfile.ZipFile('../.data/TIMIT.zip', 'r')
timit_files = timit.namelist()
print(timit_files)

In [None]:
# print README.DOC
timit_readme = timit.open('README.DOC')
for line in timit_readme:
    print(line.decode('utf-8').strip())

In [28]:
# list the file extensions for each data point
set(map(lambda x: x.split('.')[1], filter(lambda x: x.startswith('data/TRAIN'), timit_files)))

{'PHN', 'TXT', 'WAV', 'WRD'}

In [29]:
training_files = list(set(map(lambda x: x.split('.')[0], filter(lambda x: x.startswith('data/TRAIN'), timit_files))))
print(training_files)

['data/TRAIN/DR4/MGRP0/SA2', 'data/TRAIN/DR4/MMBS0/SX71', 'data/TRAIN/DR4/MSMS0/SI803', 'data/TRAIN/DR4/MSTF0/SA1', 'data/TRAIN/DR6/MSMR0/SX325', 'data/TRAIN/DR1/MTPF0/SX245', 'data/TRAIN/DR2/FDXW0/SX251', 'data/TRAIN/DR4/MMBS0/SX251', 'data/TRAIN/DR4/FLHD0/SX174', 'data/TRAIN/DR5/MMWB0/SA2', 'data/TRAIN/DR5/MWSH0/SX166', 'data/TRAIN/DR8/MBSB0/SA2', 'data/TRAIN/DR1/MCPM0/SA1', 'data/TRAIN/DR6/MEJL0/SX422', 'data/TRAIN/DR5/MMVP0/SI1914', 'data/TRAIN/DR3/FGRW0/SX162', 'data/TRAIN/DR7/MADD0/SX178', 'data/TRAIN/DR2/FLMC0/SX22', 'data/TRAIN/DR2/FMJB0/SI547', 'data/TRAIN/DR3/MRJB1/SI1020', 'data/TRAIN/DR2/MTDB0/SI2031', 'data/TRAIN/DR3/MLNS0/SA1', 'data/TRAIN/DR2/MKDT0/SA1', 'data/TRAIN/DR4/MGXP0/SX367', 'data/TRAIN/DR4/FJXP0/SI492', 'data/TRAIN/DR5/FEXM0/SA1', 'data/TRAIN/DR6/MAJP0/SX354', 'data/TRAIN/DR4/FLKM0/SA1', 'data/TRAIN/DR2/FMJB0/SA2', 'data/TRAIN/DR2/FTMG0/SX272', 'data/TRAIN/DR2/MDMT0/SA1', 'data/TRAIN/DR3/MHMR0/SX399', 'data/TRAIN/DR3/MDJM0/SI1455', 'data/TRAIN/DR2/MKDT0/SX443',

In [31]:
def timit_file_to_dict(filename):
    with timit.open(filename + '.PHN') as phn_file:
        timestamped_phonemes = []
        for line in phn_file.read().decode('utf-8').split('\n'):
            if line == '':
                continue
            start, end, phoneme = line.split()
            timestamped_phonemes.append((timit2ipa(phoneme), int(start) / 16_000, int(end) / 16_000))
    with timit.open(filename + '.TXT') as txt_file:
        transcription = txt_file.read().decode('utf-8').strip()
        # first two numbers are the start and end times
        transcription_start, transcription_end, *words = transcription.split()
        transcription = ' '.join(words)
    with timit.open(filename + '.WRD') as wrd_file:
        timestamped_words = []
        for line in wrd_file.read().decode('utf-8').split('\n'):
            if line == '':
                continue
            start, end, word = line.split()
            timestamped_words.append((word, int(start) / 16_000, int(end) / 16_000))
    path_segments = pathlib.Path(filename).parts
    speaker_initials = path_segments[-2][1:]
    return {'speaker_initials': speaker_initials, 'timestamped_phonemes': timestamped_phonemes, 'transcription': transcription, 'transcription_start': int(transcription_start) / 16_000, 'transcription_end': int(transcription_end) / 16_000, 'timestamped_words': timestamped_words, 'wav_filename': filename + '.WAV'}

In [32]:
parsed_0 = timit_file_to_dict(training_files[0])
print(parsed_0)
parsed_1 = timit_file_to_dict(training_files[1])
print(parsed_1)

{'speaker_initials': 'GRP0', 'timestamped_phonemes': [('', 0.0, 0.1325), ('', 0.1325, 0.1625), ('oʊ', 0.1625, 0.27), ('ɾ̃', 0.27, 0.29), ('æ', 0.29, 0.4525), ('s', 0.4525, 0.5351875), ('k', 0.5351875, 0.5975), ('m', 0.5975, 0.6325), ('i', 0.6325, 0.7525), ('t', 0.7525, 0.80675), ('', 0.80675, 0.8425), ('ɨ', 0.8425, 0.8975), ('k', 0.8975, 0.9275), ('', 0.9275, 0.9925), ('ɪ', 0.9925, 1.086125), ('ɹ', 1.086125, 1.180125), ('i', 1.180125, 1.2871875), ('æ', 1.2871875, 1.37725), ('ɾ̃', 1.37725, 1.416125), ('ɔ', 1.416125, 1.624), ('l', 1.624, 1.671), ('ɨ', 1.671, 1.7310625), ('ɹ', 1.7310625, 1.8485625), ('æ', 1.8485625, 2.0425), ('g', 2.0425, 2.0825), ('', 2.0825, 2.103875), ('l', 2.103875, 2.15525), ('aɪ', 2.15525, 2.2575), ('k', 2.2575, 2.368125), ('ð', 2.368125, 2.3925), ('æ', 2.3925, 2.6086875), ('t', 2.6086875, 2.7075625), ('', 2.7075625, 2.7551875), ('', 2.7551875, 3.39)], 'transcription': "Don't ask me to carry an oily rag like that.", 'transcription_start': 0.0, 'transcription_end': 3

In [33]:
WAV_HEADER_SIZE = 44
def zipped_wav_to_array(filename):
    with timit.open(filename) as wav_file:
        return np.frombuffer(wav_file.read(), dtype=np.int16)[WAV_HEADER_SIZE//2:]

In [34]:
def files_to_df(files):
    records = []
    for filename in files:
        parsed = timit_file_to_dict(filename)
        parsed['audio'] = zipped_wav_to_array(parsed['wav_filename'])
        del parsed['wav_filename']
        parsed['ipa'] = "".join(phoneme for phoneme, _, _ in parsed['timestamped_phonemes'])
        parsed['phoneme_starts'] = [start for _, start, _ in parsed['timestamped_phonemes']]
        parsed['phoneme_ends'] = [end for _, _, end in parsed['timestamped_phonemes']]
        del parsed['timestamped_phonemes']
        records.append(parsed)
    return pd.DataFrame(records)

In [35]:
train_df = files_to_df(training_files)
train_df.head()


Unnamed: 0,speaker_initials,transcription,transcription_start,transcription_end,timestamped_words,audio,ipa,phoneme_starts,phoneme_ends
0,GRP0,Don't ask me to carry an oily rag like that.,0.0,3.392,"[(don't, 0.1325, 0.29), (ask, 0.29, 0.5975), (...","[25971, 30303, 29285, 26995, 28271, 11552, 131...",oʊɾ̃æskmitɨkɪɹiæɾ̃ɔlɨɹæglaɪkðæt,"[0.0, 0.1325, 0.1625, 0.27, 0.29, 0.4525, 0.53...","[0.1325, 0.1625, 0.27, 0.29, 0.4525, 0.5351875..."
1,MBS0,Porcupines resemble sea urchins.,0.0,2.9825,"[(porcupines, 0.1525, 0.874625), (resemble, 0....","[25971, 30303, 29285, 26995, 28271, 11552, 131...",ɔɹkjʉpaɪnzɹizɛmbl̩ siʔɝttʃɨnz,"[0.0, 0.1525, 0.1825, 0.245375, 0.2925, 0.3475...","[0.1525, 0.1825, 0.245375, 0.2925, 0.3475, 0.3..."
2,SMS0,"Insulate, weatherstrip, double-glaze to the ma...",0.0,3.404812,"[(insulate, 0.1425, 0.7775), (weatherstrip, 0....","[25971, 30303, 29285, 26995, 28271, 11552, 131...",ʔɪnsəleɪtwɛðɚstɹɪpʌbl̩ gleɪztɨðɨmæksəməm,"[0.0, 0.1425, 0.188375, 0.2440625, 0.280625, 0...","[0.1425, 0.188375, 0.2440625, 0.280625, 0.3875..."
3,STF0,She had your dark suit in greasy wash water al...,0.0,3.456,"[(she, 0.136875, 0.3025), (had, 0.3025, 0.5725...","[25971, 30303, 29285, 26995, 28271, 11552, 131...",siɦæddʒɚdɑɹksʉtʔɨŋgɹisiwɑʃwɔɾɚʔɔljɪɝ,"[0.0, 0.136875, 0.241875, 0.3025, 0.3425, 0.49...","[0.136875, 0.241875, 0.3025, 0.3425, 0.4975, 0..."
4,SMR0,The full moon shone brightly that night.,0.0,2.316813,"[(the, 0.1474375, 0.194375), (full, 0.194375, ...","[25971, 30303, 29285, 26995, 28271, 11552, 131...",ðɨfʊlmunʃoʊnbɹaɪʔliðætnaɪt,"[0.0, 0.1474375, 0.1719375, 0.194375, 0.3275, ...","[0.1474375, 0.1719375, 0.194375, 0.3275, 0.383..."


### Getting vocabulary of TIMIT

In [46]:
def extract_phoneme_vocabulary(train_df, phoneme_column):
    """
    Extracts a unique vocabulary of phonemes from concatenated phoneme strings in a DataFrame.

    Args:
        train_df (pd.DataFrame): DataFrame containing phoneme sequences.
        phoneme_column (str): Name of the column containing phoneme sequences.

    Returns:
        set: A set of unique phonemes.
    """
    phoneme_vocabulary = set()

    for phoneme_sequence in train_df[phoneme_column]:
        # Split the phoneme sequence into smaller phonemes
        if isinstance(phoneme_sequence, str):
            # Assuming phonemes are space-separated; otherwise adjust this
            phonemes = list(phoneme_sequence.strip())  # Splits by each character
        else:
            raise ValueError(f"Unexpected format in phoneme sequence: {phoneme_sequence}")
        
        # Add each phoneme to the vocabulary
        phoneme_vocabulary.update(phonemes)

    return phoneme_vocabulary


In [47]:
timit_phoneme_vocab = extract_phoneme_vocabulary(train_df, 'ipa')
# Update: unfortunately this function fails to parse diphones so we will manually add them to the vocab: 
phonemes_to_add = {"aɪ", "aʊ", "dʒ", "eɪ", "l̩", "m̩", "n̩", "oʊ", "tʃ", "ŋ̍", "ɔɪ", "ə̥", "ɾ̃"}
timit_phoneme_vocab.update(phonemes_to_add)

print("Phoneme Vocabulary:", timit_phoneme_vocab)
print("Length of Vocab: ", len(timit_phoneme_vocab))

Phoneme Vocabulary: {'b', ' ', 'ə', 'l̩', 'ŋ̍', 'ʔ', 'd', '̍', 'ʊ', 'θ', '̩', 'p', 'ð', '̃', 'a', 'g', 'e', 'ɨ', 'n', 'w', 'ʒ', 't', 'f', 'ɾ', 'ʉ', 'u', 'ɝ', 'ŋ', 'h', 'ɾ̃', 'm', 's', 'v', 'm̩', 'dʒ', 'l', 'ɪ', 'tʃ', 'eɪ', 'o', 'ɹ', 'ʌ', 'z', '̥', 'k', 'aɪ', 'aʊ', 'ɔɪ', 'æ', 'ɔ', 'ɛ', 'n̩', 'ʃ', 'ɦ', 'ə̥', 'oʊ', 'ɚ', 'j', 'i', 'ɑ'}
Length of Vocab:  60


## Step 1: Creating Timit Files for Allosaurus
File 1: Wave file (titled WAVE no file extension) with \<utt_id\> \<full path to .wav\>

File 2: Phoneme file (titled TEXT no file extension) with \<utt_id> \<phoneme sequence\>

In [48]:
import os
import pandas as pd

def create_wav_and_phoneme_files(wav_files, wav_output, phoneme_output):
    """
    Creates two files: one for WAV paths and another for phoneme transcriptions.

    Args:
        wav_files (list): List of paths to WAV files.
        wav_output (str): Path to save the WAV file list.
        phoneme_output (str): Path to save the phoneme transcription file.
    """
    wav_records = []
    phoneme_records = []

    for wav_path in wav_files:
        parsed = timit_file_to_dict(wav_path)  
        # Extract unique utt_id from the filename
        utt_id = os.path.splitext(os.path.basename(wav_path))[0]
        phoneme_sequence = [phoneme for phoneme, _, _ in parsed['timestamped_phonemes']]

        # Clean up the phoneme sequence
        phoneme_sequence_cleaned = " ".join(phoneme_sequence)  # Join the phonemes with spaces

        # Extract the parent directory of the script
        parent_dir = os.path.dirname(os.path.abspath(wav_output))  # Parent directory where the output will be saved

        # Convert the relative path to absolute and add .WAV.wav
        full_wav_path = os.path.join(parent_dir, wav_path) + ".WAV.wav"

        # Add entry for the WAV file list
        wav_records.append((utt_id, full_wav_path))
        
        # Add entry for the phoneme file list
        phoneme_records.append((utt_id, phoneme_sequence_cleaned))

    # Save the WAV file list
    wav_df = pd.DataFrame(wav_records, columns=["utt_id", "file_path"])
    wav_df.to_csv(wav_output, sep=" ", index=False, header=False)

    # Save the phoneme file list
    with open(phoneme_output, "w") as phoneme_file:
        for utt_id, phoneme_sequence_cleaned in phoneme_records:
            phoneme_file.write(f"{utt_id} {phoneme_sequence_cleaned}\n")


In [49]:
# <TODO> create a allosaurus directory containing a train directory, validate directory, and your audio/wave.txt files

In [50]:

create_wav_and_phoneme_files(training_files, "../data/allosaurus_data/audio.txt", "../data/allosaurus_data/phoneme.txt")

In [None]:
# If you want test files to be created, you can use the following code: (make sure to create a test dir first)
 # create_wav_and_phoneme_files(test_files, "../data/allosaurus_data/test/audio.txt", "../data/allosaurus_data/test/phoneme.txt")

## Step 2: Splitting for Train/Val
Just split those first two files into 4 files for train/val

We will do a 80-20 split for the train-val files

In [51]:
import random

def split_txt_files(wav_file, phoneme_file, train_percentage=0.8):
    """
    Splits wav and phoneme TXT files into train and validation subsets.

    Args:
        wav_file (str): Path to the wav TXT file (utt_id and file paths).
        phoneme_file (str): Path to the phoneme TXT file (utt_id and transcriptions).
        train_percentage (float): Percentage of data for training (default: 80%).

    Outputs:
        Four files: train_wav.txt, val_wav.txt, train_phoneme.txt, val_phoneme.txt
    """
    # Read the wav file and phoneme file
    with open(wav_file, "r") as f:
        wav_lines = f.readlines()

    with open(phoneme_file, "r") as f:
        phoneme_lines = f.readlines()

    assert len(wav_lines) == len(phoneme_lines), "Mismatch in wav and phoneme file lengths!"
    # Verify that the utterance IDs match for every line
    for wav_line, phoneme_line in zip(wav_lines, phoneme_lines):    
        # Extract utterance ID from the WAV line
        wav_utt_id = wav_line.split()[0]  # Assumes the utterance ID is the first token
        # Extract utterance ID from the phoneme line
        phoneme_utt_id = phoneme_line.split()[0]  # Assumes the utterance ID is the first token

        # Assert that the IDs are equal
        assert wav_utt_id == phoneme_utt_id, f"Mismatch in utterance IDs: {wav_utt_id} vs {phoneme_utt_id}"
    
    # Pair up lines to maintain utt_id consistency
    combined = list(zip(wav_lines, phoneme_lines))
    random.shuffle(combined)  # Shuffle the data for randomness

    # Split into train and validation subsets
    split_idx = int(len(combined) * train_percentage)
    train_data = combined[:split_idx]
    val_data = combined[split_idx:]

    # Unpack the train and validation data
    train_wav, train_phoneme = zip(*train_data)
    val_wav, val_phoneme = zip(*val_data)

    # Save the train and validation splits
    with open("../data/allosaurus_data/train/wave", "w") as f:
        f.writelines(train_wav)

    with open("../data/allosaurus_data/validate/wave", "w") as f:
        f.writelines(val_wav)

    with open("../data/allosaurus_data/train/text", "w") as f:
        f.writelines(train_phoneme)

    with open("../data/allosaurus_data/validate/text", "w") as f:
        f.writelines(val_phoneme)



In [52]:
split_txt_files("../data/allosaurus_data/audio.txt", "../data/allosaurus_data/phoneme.txt", train_percentage=0.8)

## Step 3: Check the difference between Allosaurus and TIMIT
We will need to ensure that the Allosaurus vocab aligns to TIMIT's 

In [53]:
# Allosaurus vocabulary
allosaurus_vocab = {
    "a", "aː", "b", "d", "d̠", "e", "eː", "e̞", "f", "h", "i", "iː", "j", "k", "kʰ", "l", 
    "m", "n", "o", "oː", "p", "pʰ", "r", "s", "t", "tʰ", "t̠", "u", "uː", "v", "w", "x", 
    "z", "æ", "ð", "øː", "ŋ", "ɐ", "ɐː", "ɑ", "ɑː", "ɒ", "ɒː", "ɔ", "ɔː", "ɘ", "ə", 
    "əː", "ɛ", "ɛː", "ɜː", "ɡ", "ɪ", "ɪ̯", "ɯ", "ɵː", "ɹ", "ɻ", "ʃ", "ʉ", "ʉː", "ʊ", 
    "ʌ", "ʍ", "ʒ", "ʔ", "θ"
}
# Note: you can see they have aspirated t and plosives that might not be needed, you can adjust there probabilities if needed

# Compute differences
allosaurus_only = allosaurus_vocab - timit_phoneme_vocab
timit_only = timit_phoneme_vocab - allosaurus_vocab

# Print results
print("Symbols in Allosaurus but not in TIMIT:")
print(sorted(allosaurus_only))

print("\nSymbols in TIMIT but not in Allosaurus:")
print(sorted(timit_only))


Symbols in Allosaurus but not in TIMIT:
['aː', 'd̠', 'eː', 'e̞', 'iː', 'kʰ', 'oː', 'pʰ', 'r', 'tʰ', 't̠', 'uː', 'x', 'øː', 'ɐ', 'ɐː', 'ɑː', 'ɒ', 'ɒː', 'ɔː', 'ɘ', 'əː', 'ɛː', 'ɜː', 'ɡ', 'ɪ̯', 'ɯ', 'ɵː', 'ɻ', 'ʉː', 'ʍ']

Symbols in TIMIT but not in Allosaurus:
[' ', 'aɪ', 'aʊ', 'dʒ', 'eɪ', 'g', 'l̩', 'm̩', 'n̩', 'oʊ', 'tʃ', 'ŋ̍', 'ɔɪ', 'ə̥', 'ɚ', 'ɝ', 'ɦ', 'ɨ', 'ɾ', 'ɾ̃', '̃', '̍', '̥', '̩']


### Update the Allosaurus vocab accordingly so it has the full TIMIT vocab

On each new line in the file add these phonemes: [' ', 'aɪ', 'aʊ', 'dʒ', 'eɪ', 'g', 'l̩', 'm̩', 'n̩', 'oʊ', 'tʃ', 'ŋ̍', 'ɔɪ', 'ə̥', 'ɚ', 'ɝ', 'ɦ', 'ɨ', 'ɾ', 'ɾ̃', '̃', '̍', '̥', '̩']

In [79]:
# Update the vocab, run this or just make the file yourself
!python -m allosaurus.bin.write_phone --lang eng --output ./data/allosaurus_data/allosaurus_eng_inventory.txt
# Add the missing phonemes

In [80]:
# List of phonemes
phonemes = sorted(timit_only)

# File path
file_path = './data/allosaurus_data/allosaurus_eng_inventory.txt'

# Read the existing file content into a set of phonemes
with open(file_path, 'r') as file:
    existing_phonemes = set(file.read().splitlines())

# Write new phonemes to the file, only if they are not already present
with open(file_path, 'a') as file:
    for phoneme in phonemes:
        if phoneme not in existing_phonemes:
            file.write(f"{phoneme}\n")
            existing_phonemes.add(phoneme)

In [85]:
# Save the vocab 
!python -m allosaurus.bin.update_phone --lang eng --input ./data/allosaurus_data/allosaurus_eng_inventory.txt

In [86]:
!python -m allosaurus.bin.list_phone --lang eng

a aː b d d̠ e eː e̞ f h i iː j k kʰ l m n o oː p pʰ r s t tʰ t̠ u uː v w x z æ ð øː ŋ ɐ ɐː ɑ ɑː ɒ ɒː ɔ ɔː ɘ ə əː ɛ ɛː ɜː ɡ ɪ ɪ̯ ɯ ɵː ɹ ɻ ʃ ʉ ʉː ʊ ʌ ʍ ʒ ʔ θ aɪ aʊ dʒ eɪ g l̩ m̩ n̩ oʊ tʃ ŋ̍ ɔɪ ə̥ ɚ ɝ ɦ ɨ ɾ ɾ̃ ̃ ̍ ̥ ̩


## Step 4: Unzip TIMIT files to extract audio features from WAVE path
This is not the best way to extract, but its quick. Just delete the timit files after extracting the features and tokenizing 

Make sure the TIMIT directory is the same as the file paths provided in the audio.txt/WAVE file


In [None]:
# import zipfile
# from pathlib import Path

# # Path to your TIMIT ZIP file
# zip_path = "./.data/TIMIT.zip"
# extract_to = "../data/allosaurus_data/"  # Where to extract

# # Ensure the ZIP file exists
# assert Path(zip_path).exists(), f"{zip_path} does not exist!"

# # Extract the TIMIT data
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_to)

# print(f"Extracted {zip_path} to {extract_to}")


## Step 5: Features

In [88]:
# Create audio features
!python -m allosaurus.bin.prep_token --model=uni2005 --lang=eng --path=/home/arunasrivastava/ML/data/allosaurus_data/train
!python -m allosaurus.bin.prep_token --model=uni2005 --lang=eng --path=/home/arunasrivastava/ML/data/allosaurus_data/validate
# Create text features 
!python -m allosaurus.bin.prep_feat --model=uni2005 --path=/home/arunasrivastava/ML/data/allosaurus_data/train
!python -m allosaurus.bin.prep_feat --model=uni2005 --path=/home/arunasrivastava/ML/data/allosaurus_data/validate


100%|████████████████████████████████████| 3696/3696 [00:00<00:00, 58419.75it/s]
100%|██████████████████████████████████████| 924/924 [00:00<00:00, 58090.07it/s]
100%|███████████████████████████████████████| 3696/3696 [07:39<00:00,  8.04it/s]
100%|█████████████████████████████████████████| 924/924 [01:49<00:00,  8.46it/s]
[0m

## Step 6: Train

In [1]:
# training the model: (im using english universal model)
!python -m allosaurus.bin.adapt_model --pretrained_model uni25a --new_model uni25b --path /home/arunasrivastava/ML/data/allosaurus_data --lang eng --device_id -1 --epoch 20 --log /home/arunasrivastava/ML/data/allosaurus_data/log.txt

  model_state_dict = torch.load(str(path), map_location=torch.device('cpu'))
epoch[batch]: 00[0000] | train loss 0.70162 train per 0.20244
epoch[batch]: 00[0010] | train loss 0.63866 train per 0.18673
epoch[batch]: 00[0020] | train loss 0.61282 train per 0.17973
epoch0 | validate per : 0.21897
saving model
epoch[batch]: 01[0000] | train loss 0.53440 train per 0.16050
epoch[batch]: 01[0010] | train loss 0.61022 train per 0.18236
epoch[batch]: 01[0020] | train loss 0.58034 train per 0.17165
epoch1 | validate per : 0.21381
saving model
epoch[batch]: 02[0000] | train loss 0.63332 train per 0.19220
epoch[batch]: 02[0010] | train loss 0.54775 train per 0.16100
epoch[batch]: 02[0020] | train loss 0.56646 train per 0.17035
epoch2 | validate per : 0.21035
saving model
epoch[batch]: 03[0000] | train loss 0.51749 train per 0.16041
epoch[batch]: 03[0010] | train loss 0.53707 train per 0.15972
epoch[batch]: 03[0020] | train loss 0.52731 train per 0.15880
epoch3 | validate per : 0.20825
saving model

In [113]:
# to test the model
# python -m allosaurus.run [--lang <language name>] [--model <model name>] [--device_id <gpu_id>] [--output <output_file>] [--topk <int>] -i <audio file/directory>
!python -m allosaurus.run --lang eng --model uni25a -i /home/arunasrivastava/ML/data/ExamplesWithComments/TIMIT_sample_0.wav

  model_state_dict = torch.load(str(path), map_location=torch.device('cpu'))
ʔ aɪ ɹ eɪ t ʔ æ k ɚ s t ʌ m ð ə w eɪ ʔ ɨ ɾ i æ ɾ ɨ k l i
[0m

In [110]:
import sys, os
sys.path.append('..')
from scripts.eval_tests.panphon_model_eval import panphon_model_eval 

### Small Sample Evaluations
We will use a timit sample and see how it goes

In [116]:
# We can do a baby evaluation on this model by running the following command
# Call panphon_model_eval with label and predictedipa
pred_str = "ʔ aɪ ɹ eɪ t ʔ æ k ɚ s t ʌ m ð ə w eɪ ʔ ɨ ɾ i æ ɾ ɨ k l i" # <TODO> replace this with the predicted ipa string from above
print(pred_str)
label_str = "ðɨaɪɹeɪtʔækɚstɑmpəweɪʔɨɾiɑɾɨkli"
results = panphon_model_eval(label_str, pred_str)

# Output results
print("Evaluation Results:")
print(f"Feature edit distance: {results['feature_dist']}")
print(f"Weighted feature edit distance: {results['weighted_feature_dist']}")
print(f"Hamming distance: {results['hamming_feature_dist']}")
print(f"CER: {results['cer_score']}")

ʔ aɪ ɹ eɪ t ʔ æ k ɚ s t ʌ m ð ə w eɪ ʔ ɨ ɾ i æ ɾ ɨ k l i
Evaluation Results:
Feature edit distance: 1.6136363636363635
Weighted feature edit distance: 11.625
Hamming distance: 1.7727272727272727
CER: 0.967741935483871



    # non required options
    parser.add_argument('--batch_frame_size', type=int,   default=6000,  help='this indicates how many frame in each batch, if you get any memory related errors, please use a lower value for this size')
    parser.add_argument('--criterion',        type=str,   default='ctc', choices=['ctc'], help='criterion, only ctc now')
    parser.add_argument('--optimizer',        type=str,   default='sgd', choices=['sgd'], help='optimizer, only sgd now')
    parser.add_argument('--lr',               type=float, default=0.01,  help='learning rate')
    parser.add_argument('--grad_clip',        type=float, default=5.0,   help='grad clipping')
    parser.add_argument('--epoch',            type=int,   default=10,    help='number of epoch to run')
    parser.add_argument('--log',              type=str,   default='none',help='file to store training logs. do not save if none')
    parser.add_argument('--verbose',          type=bool,  default=True,  help='print all training logs on stdout')
    parser.add_argument('--report_per_batch', type=int,   default=10,    help='report training stats every N epoch')

### Evaluate the Model on the TIMIT test
We will use cer and a phonemic distance calculation score that is averaged across all test pairs