In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
from transformers import pipeline
from datasets import load_dataset
import sounddevice as sd
import soundfile as sf

<IPython.core.display.Javascript object>


<br />  <font size="+2">1. Audio Classification</font> <br /> <br /> 
- Predicts the class of a raw waveform or an audio file.
- Command recognition or keyword spotting classifies utterances into a predefined set of commands. 

In [3]:
classifier = pipeline(
    task="audio-classification", model="superb/wav2vec2-base-superb-ks"
)

#'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', 'silence'



<IPython.core.display.Javascript object>

In [4]:
classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")

[{'score': 0.9972336888313293, 'label': '_unknown_'},
 {'score': 0.0019911823328584433, 'label': 'left'},
 {'score': 0.0003051071835216135, 'label': 'yes'},
 {'score': 0.00021083909086883068, 'label': 'down'},
 {'score': 0.0001140652530011721, 'label': 'stop'}]

<IPython.core.display.Javascript object>

In [5]:
dataset = load_dataset("anton-l/superb_demo", "ks", split="test")
classifier(dataset[0]["file"], top_k=2)

Found cached dataset superb_demo (/Users/sridharkannam/.cache/huggingface/datasets/anton-l___superb_demo/ks/1.9.0/77d23894ff429329a7fe80f9007cabb0deec321316f8dda1a1e9d10ffa089d08)
[W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.


[{'score': 0.9999943971633911, 'label': '_silence_'},
 {'score': 2.49855179390579e-06, 'label': 'left'}]

<IPython.core.display.Javascript object>

In [6]:
for data in dataset:
    print(classifier(data["file"], top_k=1))

[{'score': 0.9999943971633911, 'label': '_silence_'}]
[{'score': 0.9844678044319153, 'label': '_unknown_'}]
[{'score': 0.9983683228492737, 'label': '_unknown_'}]
[{'score': 0.9946255087852478, 'label': '_unknown_'}]
[{'score': 0.9979941844940186, 'label': 'down'}]
[{'score': 0.9999951124191284, 'label': 'go'}]
[{'score': 0.2914532721042633, 'label': 'up'}]
[{'score': 0.9973770380020142, 'label': 'no'}]


<IPython.core.display.Javascript object>

In [13]:
# Extract data and sampling rate from file
data, fs = sf.read(dataset[-5]["file"], dtype="float32")
sd.play(data, fs)

<IPython.core.display.Javascript object>

In [15]:
dataset = load_dataset("anton-l/superb_demo", "ks", split="test")
dataset[5]

Found cached dataset superb_demo (/Users/sridharkannam/.cache/huggingface/datasets/anton-l___superb_demo/ks/1.9.0/77d23894ff429329a7fe80f9007cabb0deec321316f8dda1a1e9d10ffa089d08)


{'file': '/Users/sridharkannam/.cache/huggingface/datasets/downloads/extracted/a825fceea0ebca08b8ce8f05031914994ed6671301d2851938b965f956fd1646/go/0c40e715_nohash_0.wav',
 'audio': {'path': '/Users/sridharkannam/.cache/huggingface/datasets/downloads/extracted/a825fceea0ebca08b8ce8f05031914994ed6671301d2851938b965f956fd1646/go/0c40e715_nohash_0.wav',
  'array': array([-0.00985718, -0.02383423, -0.02752686, ..., -0.02755737,
         -0.01998901, -0.01361084], dtype=float32),
  'sampling_rate': 16000},
 'label': 9}

<IPython.core.display.Javascript object>


<br />  <font size="+2">2. Speech Recognition</font> <br /> <br /> 

Extract spoken text contained within audio.



In [20]:
transcriber = pipeline(model="openai/whisper-base")

<IPython.core.display.Javascript object>

In [21]:
transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")



{'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'}

<IPython.core.display.Javascript object>

In [22]:
transcriber(
    "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
    return_timestamps=True,
)
# utterance level timestamps



{'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.',
 'chunks': [{'timestamp': (0.0, 5.6),
   'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat'},
  {'timestamp': (5.6, 10.12),
   'text': ' mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'}]}

<IPython.core.display.Javascript object>


<br />  <font size="+2">3. Zero shot classification</font> <br /> <br /> 


In [16]:
classifier = pipeline(
    task="zero-shot-audio-classification", model="laion/clap-htsat-unfused"
)


<IPython.core.display.Javascript object>

In [17]:
dataset = load_dataset("ashraq/esc50")
dataset["train"]["audio"]

Found cached dataset parquet (/Users/sridharkannam/.cache/huggingface/datasets/ashraq___parquet/ashraq--esc50-1000c3b73cc1500f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

[{'path': None,
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([-0.01184082, -0.10336304, -0.14141846, ...,  0.06985474,
          0.04049683,  0.00274658]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([-0.00695801, -0.01251221, -0.01126099, ...,  0.215271  ,
         -0.00875854, -0.28903198]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([0.53897095, 0.39627075, 0.26739502, ..., 0.09729004, 0.11227417,
         0.07983398]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([-0.00036621, -0.0007019 , -0.00079346, ...,  0.00317383,
          0.00222778,  0.00158691]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([-9.46044922e-04, -6.71386719e-04, -6.10351562e-05, ...,
         -2.13623047e-03, -2.62451172e-03, -3.17382812e-03]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([0.00012207, 0.00018311, 0.00012207, ..., 0.        , 0.        ,
         0.     

<IPython.core.display.Javascript object>

In [18]:
audio = next(iter(dataset["train"]["audio"]))["array"]
classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])

[{'score': 0.9995999932289124, 'label': 'Sound of a dog'},
 {'score': 0.0004000737681053579, 'label': 'Sound of vaccum cleaner'}]

<IPython.core.display.Javascript object>

In [19]:
sd.play(audio)

<IPython.core.display.Javascript object>

https://huggingface.co/datasets/ashraq/esc50

<br />  <font size="+2">4. Emotion Recognition</font> <br /> <br /> 


In [26]:
import requests

API_TOKEN = "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"
API_URL = "https://api-inference.huggingface.co/models/harshit345/xlsr-wav2vec-speech-emotion-recognition"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def query(filename):
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()


dataset = load_dataset("anton-l/superb_demo", "ks", split="test")

output = query(dataset[1]["file"])

output

Found cached dataset superb_demo (/Users/sridharkannam/.cache/huggingface/datasets/anton-l___superb_demo/ks/1.9.0/77d23894ff429329a7fe80f9007cabb0deec321316f8dda1a1e9d10ffa089d08)


[{'score': 0.21224874258041382, 'label': 'happiness'},
 {'score': 0.20905514061450958, 'label': 'sadness'},
 {'score': 0.19708998501300812, 'label': 'disgust'},
 {'score': 0.19202306866645813, 'label': 'fear'},
 {'score': 0.18958303332328796, 'label': 'anger'}]

<IPython.core.display.Javascript object>

In [29]:
# Extract data and sampling rate from file
data, fs = sf.read(dataset[1]["file"], dtype="float32")
sd.play(data, fs)

<IPython.core.display.Javascript object>