In [161]:
import os
import IPython
import matplotlib
import matplotlib.pyplot as plt
import requests
import torch
import torchaudio
import playsound
from os.path import join as pjoin

In [162]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")
bundle = torchaudio.pipelines.HUBERT_ASR_LARGE
print(f"Sample Rate: {bundle.sample_rate}")
print(f"Labels: {bundle.get_labels()}")
model = bundle.get_model().to(device)
print(model.__class__)

Using device cuda
Sample Rate: 16000
Labels: ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
<class 'torchaudio.pipelines._wav2vec2.utils._Wav2Vec2Model'>


In [163]:
class GreedyCTCDecoder(torch.nn.Module):
    def __init__(self, labels, blank=0):
        super().__init__()
        self.labels = labels
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Given a sequence emission over labels, get the best path string
        Args:
        emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.
        Returns:
        str: The resulting transcript
        """
        indices = torch.argmax(emission, dim=-1) # [num_seq,]
        indices = torch.unique_consecutive(indices, dim=-1)
        indices = [i for i in indices if i != self.blank]
        return "".join([self.labels[i] for i in indices])

In [164]:
cleaned_corpus_path = "donateacry-corpus-master/donateacry_corpus_cleaned_and_updated_data"
label_list = os.listdir(cleaned_corpus_path)[:-1]
print(label_list)

file_label = []
file_path = []
file_label_test = []
file_path_test = []
for l in label_list:
    number_label = 0
    data_dir = pjoin(cleaned_corpus_path, l)
    for file in os.listdir(data_dir):
        path = pjoin(data_dir, file)
        if number_label < 5:
            file_path.append(path)
            file_label.append(l)
        else:
            file_path_test.append(path)
            file_label_test.append(l)
        number_label += 1
        if number_label >= 8:
            break

['hungry', 'discomfort', 'burping', 'belly_pain', 'tired']


In [165]:
aud_vecs = []
prev_cry = ""
for path, cry in zip(file_path, file_label):
    if cry != prev_cry:
        prev_cry = cry
        print("\n------------------------------")
        print(cry)
        print("------------------------------")
    waveform, sample_rate = torchaudio.load(path)
    waveform = waveform.to(device)
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

    with torch.inference_mode():
        emission, _ = model(waveform)

    decoder = GreedyCTCDecoder(labels=bundle.get_labels())
    aud_vecs.append(emission[0])
    transcript = decoder(emission[0])
    print(transcript)


------------------------------
hungry
------------------------------
AY|THE|OTO|IA|THE|
AA|A|SAI||SE|SE|IN|A|BR|MES|IS||
W|MISSE|A||A|OY
YO|CAL|TEM|NETOE|A|FARM|TO|NA||
E|MUSTERED|THE|MOCE|THE|OTHER|PAR|M||

------------------------------
discomfort
------------------------------
AMOST|ME|BACKE|DI|I|GOSE|
I|RE|WE|ARR||AA|
SELA|AA|SAID|ATI|AOLA|
TO|ER|EN|FORAN||RI|HOW|CAN|HE|EN|A|MEN|PE|
WEA|ALL|RIGHT|AT|EAY|SAI|MISSUS|DENRY|DRAWGING|HER|EAISES|WA|WE|

------------------------------
burping
------------------------------
LE|AA|THEYARE|RON|
EAN|LE|
TUB|EO|GRN|POT|DOIN|O|CLU|
WY|EHUT|O|SHO|A|
E|A|NATI|ASED|NAT|I|WHY||

------------------------------
belly_pain
------------------------------
|MERCME|
|DI|THE|PEOPEY||
HAVE|YOU|AA|EW|
IS|TE|HE|POVET|BA|O|OT|A|RFEC|
BOSHTE|TOMTETEN|TE|TEI|A|NOW|EE|THE|GET|LES|E|E|TE|N

------------------------------
tired
------------------------------
WAL|OFF|WO|THE|TOOK|THEIRMADENS|AN|TE|SAETE|FEANME|YEON|
SHRIED|OSPETAN|ARE|YOU|GO|DOWN|ETTYE|COSKITO|KODE|

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/neighbors/_base.py", line 1131, in fit
    X, y = self._validate_data(X, y, accept_sparse="csr",
  File "/usr/lib/python3/dist-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/lib/python3/dist-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/usr/lib/python3/dist-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/usr/lib/python3/dist-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/usr/lib/python3/dist-packages/sklearn/utils/validation.py", line 598, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/home/mia

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.