In [7]:
import numpy as np

import scipy.io.wavfile as wavfile
from torch.utils.data import Dataset

from IPython.display import clear_output # must not leak any of the data

import psstdata # pip install psstdata==1.0.1

# Load Data
Loading will prompt for credentials if they are not already saved in `~/.config/psstdata/settings.json`.

In [None]:
# Load the PSST data, using ../.data/psst-data as the local directory.
# The returned object exposes the `train`, `valid` and `test` attributes used below.
data = psstdata.load(local_dir="../.data/psst-data") # type: ignore
# Wipe anything the load call displayed so no dataset content stays in the saved notebook.
clear_output()

# Process Data

In [8]:
def audio_file_to_array(input_path, desired_sample_rate=16000):
    """Read a WAV file and return its samples at ``desired_sample_rate``.

    When the file's sample rate already matches, the samples are returned
    exactly as ``scipy.io.wavfile.read`` produced them. Otherwise they are
    resampled by linear interpolation (``np.interp``).

    Args:
        input_path: Path (or file object) accepted by ``scipy.io.wavfile.read``.
        desired_sample_rate: Target sample rate in Hz.

    Returns:
        A NumPy array with the same dtype as the data read from the file.
        Mono audio is 1-D; multi-channel audio keeps its
        ``(n_samples, n_channels)`` layout.

    Notes:
        This is plain linear interpolation; no low-pass filtering is applied
        before downsampling.
    """
    rate, data = wavfile.read(input_path)
    # Nothing to do when the rate matches; np.interp also cannot handle an
    # empty source array, so hand empty audio back untouched.
    if rate == desired_sample_rate or len(data) == 0:
        return data

    n_out = int(len(data) * desired_sample_rate / rate)
    # Output sample k lies at time k / desired_sample_rate, i.e. at the
    # fractional input index k * rate / desired_sample_rate. Positions past the
    # last input sample (possible when upsampling) are clamped by np.interp.
    positions = np.arange(n_out) * (rate / desired_sample_rate)
    source_index = np.arange(len(data))

    if data.ndim == 1:
        resampled = np.interp(positions, source_index, data)
    else:
        # np.interp is 1-D only, so interpolate each channel independently.
        resampled = np.stack(
            [
                np.interp(positions, source_index, data[:, channel])
                for channel in range(data.shape[1])
            ],
            axis=1,
        )

    # Round rather than truncate when going back to an integer sample format.
    if np.issubdtype(data.dtype, np.integer):
        resampled = np.rint(resampled)
    # Keep the file's own dtype so both code paths return the same sample format.
    return resampled.astype(data.dtype)
# Wipe this cell's output so no dataset content is left in the saved notebook.
clear_output()

In [5]:
# ARPAbet symbol -> IPA string. After the loop below has run, the values hold
# the simplified forms produced by IPA_SUBSTITUTIONS.
ARPABET2IPA = {
    # Vowels
    'AA':'ɑ','AE':'æ','AH':'ʌ','AH0':'ə','AO':'ɔ','AW':'aʊ','AY':'aɪ','EH':'ɛ',
    'ER':'ɝ','ER0':'ɚ','EY':'eɪ','IH':'ɪ','IH0':'ɨ','IY':'i','OW':'oʊ','OY':'ɔɪ',
    'UH':'ʊ','UW':'u',
    # Consonants
    'B':'b','CH':'tʃ','D':'d','DH':'ð','EL':'l̩ ','EM':'m̩','EN':'n̩','F':'f','G':'ɡ',
    'HH':'h','JH':'dʒ','K':'k','L':'l','M':'m','N':'n','NG':'ŋ','P':'p','Q':'ʔ',
    'R':'ɹ','S':'s','SH':'ʃ','T':'t','TH':'θ','V':'v','W':'w','WH':'ʍ','Y':'j',
    'Z':'z','ZH':'ʒ',
}

# Simplifications applied to every IPA value, in this order, as substring
# replacements (so 'l̩ ' loses both its syllabic mark and its trailing space).
IPA_SUBSTITUTIONS = {
    'ɝ': 'ɹ',   # Simplify stressed r-colored vowel to 'ɹ'
    'ɚ': 'ɹ',   # Simplify rhotacized schwa to 'ɹ'
    'l̩': 'l',   # Remove syllabic marker from 'l̩'
    'm̩': 'm',   # Remove syllabic marker from 'm̩'
    'n̩': 'n',   # Remove syllabic marker from 'n̩'
    '̩': '',     # Remove any remaining syllabic marker
    'ɨ': 'i',    # Replace high central unrounded vowel with high front unrounded vowel
    ' ': '',     # Remove stray spaces (e.g. the trailing one in the 'EL' entry)
}

# Iterate over a snapshot of the items: iterating the dict itself yields only
# keys, and unpacking a key such as 'AA' into two names silently splits it.
for arpabet, ipa in list(ARPABET2IPA.items()):
    for old, new in IPA_SUBSTITUTIONS.items():
        ipa = ipa.replace(old, new)
    ARPABET2IPA[arpabet] = ipa
# Wipe this cell's output so no dataset content is left in the saved notebook.
clear_output()

In [23]:
class PSSTDataset(Dataset):
    """Torch ``Dataset`` over one split of the module-level ``data`` object.

    Each item is an ``(ipa, audio)`` pair: the utterance transcript converted
    token by token through ``ARPABET2IPA``, and the samples that
    ``audio_file_to_array`` returns for the utterance's audio file.
    """

    # Accepted split names; each matches an attribute of `data`.
    _SPLITS = ("train", "valid", "test")

    def __init__(self, split="train"):
        """Select the utterances of one split.

        Args:
            split: One of ``"train"``, ``"valid"`` or ``"test"``.

        Raises:
            ValueError: If ``split`` is not one of the accepted names.
        """
        # Validate first so an unknown name never reaches the attribute lookup.
        if split not in self._SPLITS:
            raise ValueError(f"Unknown split: {split}")
        self.utterances = getattr(data, split)

    def __len__(self):
        """Return the number of utterances in the selected split."""
        return len(self.utterances)

    def __getitem__(self, index):
        """Return ``(ipa, audio)`` for the utterance at ``index``.

        Transcript tokens missing from ``ARPABET2IPA`` are rendered as
        ``'<unk>'``. The audio file is read on every call.
        """
        utterance = self.utterances[index]
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so stray spaces do not turn into spurious '<unk>' tokens.
        tokens = utterance.transcript.split()
        ipa = "".join(ARPABET2IPA.get(token, '<unk>') for token in tokens)
        audio = audio_file_to_array(utterance.filename_absolute)
        return ipa, audio

# Wipe this cell's output so no dataset content is left in the saved notebook.
clear_output()

In [24]:
# One dataset wrapper per PSST split.
train = PSSTDataset(split="train")
valid = PSSTDataset(split="valid")
test = PSSTDataset(split="test")
# Wipe this cell's output so no dataset content is left in the saved notebook.
clear_output()

In [30]:
import matplotlib.pyplot as plt

# Fetch the first training item once: each train[i] access reads the audio file
# again (and resamples it when needed), so indexing twice doubles that work.
first_item = train[0]
print(first_item)
plt.plot(first_item[1])  # element 1 of the (ipa, audio) pair is the sample array
plt.show()
# Wipe the printed item and the figure so no dataset content is left in the saved notebook.
clear_output()