## 0 - Imports 


In [1]:
# Hugging Face
from datasets import load_dataset
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

# Audio handling
import torch
import numpy as np
import soundfile as sf
import librosa

# Audio Recording
import sounddevice as sd
from scipy.io.wavfile import write

# Transcription
from jiwer import wer

# Misc
import time


## 1 - Load Datasets

In [2]:
#Load the training portion of each language - takes a couple minutes
fleurs_en = load_dataset("google/fleurs", "en_us", split="train",trust_remote_code = True)
fleurs_es = load_dataset("google/fleurs", "es_419", split="train", trust_remote_code = True)
fleurs_ja = load_dataset("google/fleurs", "ja_jp", split="train", trust_remote_code = True)


In [3]:
# See outputs
sample_en = fleurs_en[0]
sample_en.keys()

sample_en

#key_features = ['audio' , 'transcription', 'language']

{'id': 903,
 'num_samples': 108800,
 'path': 'C:\\Users\\jking36\\.cache\\huggingface\\datasets\\downloads\\extracted\\00f4c8069bdbe2061746b64297f5f7dd1af5f4ac9c3f84e1cc27006199d80190\\10004088536354799741.wav',
 'audio': {'path': 'train/10004088536354799741.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.15904617e-06, -3.03983688e-06, -3.27825546e-06], shape=(108800,)),
  'sampling_rate': 16000},
 'transcription': 'a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward',
 'raw_transcription': 'A tornado is a spinning column of very low-pressure air, which sucks the surrounding air inward and upward.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

## 2 - Obtain ASR Model


In [4]:
device = 0 if torch.cuda.is_available() else -1 

asr = pipeline(
    'automatic-speech-recognition',
    model = 'openai/whisper-small',
    device = device
)

# Or Directly import model

model_name = 'openai/whisper-small'

processor = WhisperProcessor.from_pretrained(model_name)
whisper = WhisperForConditionalGeneration.from_pretrained(model_name)
whisper.eval()



Device set to use cpu


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [5]:
#Use English sample to test asr

audio = sample_en['audio']['array']

testing = asr(audio)

print(f'Predicted Text: {testing["text"]} - Predicted Language: {testing.get("language")}')
print(f'True ENG Text: {sample_en["transcription"]} - True Language: {sample_en["language"]}')

`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


Predicted Text:  A tornado is a spinning column of very low pressure air, which sucks the surrounding air inward and upward. - Predicted Language: None
True ENG Text: a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward - True Language: English


In [6]:
#Function for Language Detection
def detect_language(dataset,n=50):
    true = 0
    for i in range(n):
        sample = dataset[i]
        prediction = asr(sample['audio']['array'])
        if prediction.get('language') == sample['language']:
            true += 1
        print(prediction.get('language'))
        print(sample['language'])
    return true/n

In [7]:
print('English Language Detection Accuracy: ', detect_language(fleurs_en,1))

None
English
English Language Detection Accuracy:  0.0


## 3 - Record Audio for Model
    Function to record audio clip for Language Detection and Transcription

In [8]:
#### RECORD AUDIO FILE .wav
SAMPLE_RATE = 16000
DURATION = 30 #seconds
OUTPUT = 'recording.wav'

print('Recording in process...')

audio = sd.rec(
    int(DURATION * SAMPLE_RATE),
    samplerate = SAMPLE_RATE,
    channels = 1,
    dtype = np.float32
)

sd.wait()

write(OUTPUT,SAMPLE_RATE,audio)
print('Audio Recording Saved --> "recording.wav"')


Recording in process...
Audio Recording Saved --> "recording.wav"


In [9]:
# Transcribe Audio Recording
# --> RUNTIME = 13 sec
recording, sr = librosa.load('recording.wav',sr = 16000)
transcribed = asr(recording)
print(f"Transcription of Recording audio: {transcribed['text']}")



Transcription of Recording audio:  Yeah, so it's almost New Year's Eve and I am practicing with my language detection transcription. Let's also see if it can do a mix of languages. So I'm gonna say Ego dekimasu ka arigatou gozaimasu. And we're gonna see if that works. So three, two, one, see ya.
