## 0 - Imports 


In [25]:
# Hugging Face
from datasets import load_dataset
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

# Data Analysis
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score


# Audio handling
import torch
import soundfile as sf
import librosa

# Audio Recording
import sounddevice as sd
from scipy.io.wavfile import write

# Transcription
from jiwer import wer
from sklearn.feature_extraction.text import TfidfVectorizer


# Misc
import time
import os
import joblib


## 1 - Load Datasets

In [2]:
#Load the training portion of each language - takes a couple minutes
fleurs_en = load_dataset("google/fleurs", "en_us", split="train",trust_remote_code = True)
fleurs_es = load_dataset("google/fleurs", "es_419", split="train", trust_remote_code = True)
fleurs_ja = load_dataset("google/fleurs", "ja_jp", split="train", trust_remote_code = True)


In [3]:
# See outputs
sample_en = fleurs_en[0]
sample_en.keys()

sample_en

#key_features = ['audio' , 'transcription', 'language']

{'id': 903,
 'num_samples': 108800,
 'path': 'C:\\Users\\jking36\\.cache\\huggingface\\datasets\\downloads\\extracted\\00f4c8069bdbe2061746b64297f5f7dd1af5f4ac9c3f84e1cc27006199d80190\\10004088536354799741.wav',
 'audio': {'path': 'train/10004088536354799741.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.15904617e-06, -3.03983688e-06, -3.27825546e-06], shape=(108800,)),
  'sampling_rate': 16000},
 'transcription': 'a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward',
 'raw_transcription': 'A tornado is a spinning column of very low-pressure air, which sucks the surrounding air inward and upward.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

## 2 - Obtain ASR Model


In [4]:
device = 0 if torch.cuda.is_available() else -1 

asr = pipeline(
    'automatic-speech-recognition',
    model = 'openai/whisper-small',
    device = device
)

# Or Directly import model

model_name = 'openai/whisper-small'

processor = WhisperProcessor.from_pretrained(model_name)
whisper = WhisperForConditionalGeneration.from_pretrained(model_name)
whisper.eval()



Device set to use cpu


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [5]:
#Use English sample to test asr

audio = sample_en['audio']['array']

testing = asr(audio)

print(f'Predicted Text: {testing["text"]} - Predicted Language: {testing.get("language")}')
print(f'True ENG Text: {sample_en["transcription"]} - True Language: {sample_en["language"]}')

`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


Predicted Text:  A tornado is a spinning column of very low pressure air, which sucks the surrounding air inward and upward. - Predicted Language: None
True ENG Text: a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward - True Language: English


In [6]:
#Function for Language Detection
def detect_language(dataset,n=50):
    true = 0
    for i in range(n):
        sample = dataset[i]
        prediction = asr(sample['audio']['array'])
        if prediction.get('language') == sample['language']:
            true += 1
        print(prediction.get('language'))
        print(sample['language'])
    return true/n

In [7]:
print('English Language Detection Accuracy: ', detect_language(fleurs_en,1))

None
English
English Language Detection Accuracy:  0.0


## 3 - Record Audio for Model
    Function to record audio clip for Language Detection and Transcription

In [8]:
#### RECORD AUDIO FILE .wav
SAMPLE_RATE = 16000
DURATION = 30 #seconds
OUTPUT = 'recording.wav'

print('Recording in process...')

audio = sd.rec(
    int(DURATION * SAMPLE_RATE),
    samplerate = SAMPLE_RATE,
    channels = 1,
    dtype = np.float32
)

sd.wait()

write(OUTPUT,SAMPLE_RATE,audio)
print('Audio Recording Saved --> "recording.wav"')


Recording in process...
Audio Recording Saved --> "recording.wav"


In [9]:
# Transcribe Audio Recording
# --> RUNTIME = 13 sec
recording, sr = librosa.load('recording.wav',sr = 16000)
transcribed = asr(recording)
ttext = transcribed['text']
print(f"Transcription of Recording audio: {ttext}")



Transcription of Recording audio:  Testing ASR model. Going to build classification, model to detect language in binary. Let's also test multiple languages. Ego, Dekimaska, and Vamos a la playa. Thank you.


## 4 - Language Detection Model 
- Basic Classification Model trained on fleurs data
- Goal: Take in transcription of audio recording and accurately display the language
- For: The toggle between languages once we get to the translation model

In [None]:
#Create Training Data - combination of language data

def fleurs_df(dataset):
    return pd.DataFrame({'text': dataset['transcription'],
                         'language': dataset['language']})

df_en = fleurs_df(fleurs_en)
df_es = fleurs_df(fleurs_es)
df_jp = fleurs_df(fleurs_ja)

data = pd.concat([df_en,df_es,df_jp],ignore_index=True)

data

# data.loc[data['language']=='Japanese']


Unnamed: 0,text,language
0,a tornado is a spinning column of very low-pre...,English
1,former u.s. speaker of the house newt gingrich...,English
2,the island was first inhabited by the taínos a...,English
3,these nerve impulses can be sent so quickly th...,English
4,on september 24 1759 arthur guinness signed a ...,English
...,...,...
7685,映画をdvd形式に変換しようとするときに最も一般的な問題の1つはオーバースキャンです,Japanese
7686,"地球上で非常に希少な岩石の一部は オンス当たり11,000～22,500米ドルで取引されてお...",Japanese
7687,一部の研究は 世界的な流行になる前に この病気による致死性を低下させなければならないと示唆し...,Japanese
7688,寺院の一部として ジグラットと呼ばれるピラミッド型の特殊な塔が建てられることがありました,Japanese


In [24]:
#Train/Test Split

X = data['text']
y= data['language']

X_train, X_dev, y_train, y_dev = train_test_split(X,y,test_size=.2,random_state = 42, stratify = y)


In [26]:
# Vectorizer - 

vectorized = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2,5),
    min_df=2,
    max_features=20000
)

X_train_v = vectorized.fit(X_train)
X_dev_v = vectorized.fit(X_dev)

X_train_v


0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'char'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'


In [None]:
# Random Forest Classifier
def Detection(X_train, X_dev, y_train, y_dev, model_path = 'detection.pkl'):
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

    model.fit(X_train,y_train)
    #Predictions
    y_pred = model.predict(X_dev)
    prob = model.predict_proba(X_dev)[:,1]

    #Scoring - F1/accuracy
    f1 = f1_score(y_dev,prob)
    accuracy = accuracy_score(y_dev,prob)
    print(f'F1 Score: {f1} ---- Accuracy: {accuracy}')

    #Save Model
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")

    return model


In [28]:
# Run Training Model 

Detection(X_train_v,X_dev_v, y_train, y_dev)


TypeError: float() argument must be a string or a real number, not 'TfidfVectorizer'