<a href="https://colab.research.google.com/github/Jayaprakash1618/Spoken_language_identification/blob/main/Spoken_language_identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q omegaconf torchaudio pydub

import os
import sys

# Fetch the model repository only when no directory with that name exists in
# the current working directory (shallow, quiet clone).
# NOTE(review): after the %cd below the working directory is the checkout
# itself, so re-running this cell will not find the directory and will clone
# again inside it -- confirm whether the cell is meant to be re-runnable.
if not os.path.exists('Spoken_language_identification'):
  !git clone  -q --depth 1 https://github.com/SpeechFlow-io/Spoken_language_identification.git

# Work from inside the checkout so the relative paths used by later cells
# (vocab/, saved_models/, test_audios/) resolve.
%cd Spoken_language_identification

#Spoken_language_identification import
import tensorflow as tf
import librosa
import json
from vocab.vocab import Vocab

# imports for uploading/recording
import torchaudio
import io
import base64
import tempfile
from typing import Optional
from pydub import AudioSegment
from google.colab import files
from google.colab import output
from IPython import display as _display

from IPython.display import Audio, clear_output, display
import ipywidgets as widgets
from scipy.io import wavfile
import numpy as np


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/79.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m71.7/79.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/117.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
/content/Spoken_language_identification


In [None]:
# Label vocabulary and the exported SavedModel; both are reused by later cells.
vocab = Vocab("vocab/vocab.txt")
model = tf.saved_model.load('./saved_models/lang14/pb/2')

# Sample clips ship in ./test_audios: chinese.wav, english.wav, french.wav,
# german.wav, italian.wav, japanese.wav, korean.wav, portuguese.wav,
# russian.wav, spanish.wav and vietnamese.wav. Point audio_file at any of them.
audio_file = './test_audios/english.wav'

# Load the clip at 16 kHz; the returned sample rate is not needed.
signal, _ = librosa.load(audio_file, sr=16000)

# predict_pb yields the predicted label index and its score.
prediction = model.predict_pb(signal)
lang_id, prob = prediction
language = vocab.token_list[lang_id.numpy()]
probability = prob.numpy()*100

report = "{} is predicted as {} and it's probability={:.2f}% "
print(report.format(audio_file, language, probability))

./test_audios/english.wav is predicted as english and it's probability=99.96% 


In [None]:
from google.colab import output
def read_audio(path: str,
               target_sr: int = 16000):
    """Load an audio file as a single-channel waveform at ``target_sr``.

    Args:
        path: Location of the audio file, passed to ``torchaudio.load``.
        target_sr: Sample rate the returned waveform should have.

    Returns:
        The waveform tensor with its leading (channel) dimension squeezed out.
    """
    waveform, source_sr = torchaudio.load(path)

    # More than one entry along dim 0 means several channels: average them,
    # keeping the dimension so the resampler still sees a 2-D tensor.
    has_multiple_channels = waveform.size(0) > 1
    if has_multiple_channels:
        waveform = waveform.mean(dim=0, keepdim=True)

    if source_sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=source_sr,
                                                   new_freq=target_sr)
        waveform = resampler(waveform)
        source_sr = target_sr

    # Sanity check: at this point the rate must match the requested one.
    assert source_sr == target_sr
    return waveform.squeeze(0)


def record_audio(seconds: int = 3,
                 normalize_db: float = 0.1):
    """Record from the browser microphone and return the decoded waveform.

    A JavaScript ``record(time)`` helper is injected into the notebook output;
    it captures audio with ``MediaRecorder`` for ``time`` milliseconds and
    resolves to a base64 data URL, which is decoded here.

    Args:
        seconds: Recording length in seconds.
        normalize_db: Forwarded to ``audio_bytes_to_np``.

    Returns:
        Whatever ``audio_bytes_to_np`` returns for the recorded bytes.
    """
    # Browser-side recorder; evaluated in the Colab output frame.
    record_js_code = """
      const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
      const b2text = blob => new Promise(resolve => {
        const reader = new FileReader()
        reader.onloadend = e => resolve(e.srcElement.result)
        reader.readAsDataURL(blob)
      })
      var record = time => new Promise(async resolve => {
        stream = await navigator.mediaDevices.getUserMedia({ audio: true })
        recorder = new MediaRecorder(stream)
        chunks = []
        recorder.ondataavailable = e => chunks.push(e.data)
        recorder.start()
        await sleep(time)
        recorder.onstop = async ()=>{
          blob = new Blob(chunks)
          text = await b2text(blob)
          resolve(text)
        }
        recorder.stop()
      })
      """
    print('Starting recording for {} seconds...'.format(seconds))
    recorder_script = _display.Javascript(record_js_code)
    _display.display(recorder_script)

    # The JS helper expects milliseconds.
    duration_ms = seconds * 1000.0
    data_url = output.eval_js('record(%d)' % duration_ms)
    print('Finished recording!')

    # A data URL is "<header>,<base64 payload>"; keep the payload only.
    encoded_payload = data_url.split(',')[1]
    recorded_bytes = base64.b64decode(encoded_payload)
    return audio_bytes_to_np(recorded_bytes, normalize_db=normalize_db)


def audio_bytes_to_np(wav_data: bytes,
                      normalize_db: Optional[float] = 0.1):
    """Decode encoded audio bytes into a waveform via ``read_audio``.

    The bytes are parsed with pydub, the DC offset is removed, the level is
    optionally peak-normalized, and the result is written to a temporary WAV
    file that ``read_audio`` loads back.

    Args:
        wav_data: Encoded audio in any container pydub can open.
        normalize_db: Headroom in dB passed to pydub's ``normalize``; pass
            ``None`` to skip normalization.

    Returns:
        The value returned by ``read_audio`` for the temporary WAV file.
    """
    audio = AudioSegment.from_file(io.BytesIO(wav_data))
    # pydub segments are immutable: every effect returns a NEW segment, so the
    # result has to be rebound or the processing is silently discarded.
    audio = audio.remove_dc_offset()
    if normalize_db is not None:
        audio = audio.normalize(headroom=normalize_db)
    # Round-trip through a temporary WAV file so read_audio can load it; the
    # file is removed automatically when the context exits.
    with tempfile.NamedTemporaryFile(suffix='.wav') as temp_wav_file:
        fname = temp_wav_file.name
        audio.export(fname, format='wav')
        wav = read_audio(fname)
    return wav


def upload_audio(normalize_db: Optional[float] = None):
    """Open the Colab upload dialog and load the first uploaded file.

    Args:
        normalize_db: Accepted for signature parity; not used here.

    Returns:
        The result of ``read_audio`` for the first uploaded file name, or
        ``None`` when nothing was uploaded.
    """
    uploaded = files.upload()
    # The upload result is keyed by file name; take the first key, if any.
    first_name = next(iter(uploaded.keys()), None)
    if first_name is None:
        return None
    return read_audio(first_name)


In [None]:
#
# Colab form inputs (the trailing "#@param" annotations drive the form UI and
# must stay on the same line as the assignment).
# record_or_upload selects the branch taken at the bottom of this cell.
record_or_upload = "Upload (.mp3 or .wav!)" #@param ["Record", "Upload (.mp3 or .wav!)"]
# Recording length in seconds, passed to record_audio.
record_seconds =   6#@param {type:"number", min:5, max:50, step:1}
# Rate used for the playback widget and for writing recorded.wav.
sample_rate = 16000

def _recognize(audio):
  """Show a player for ``audio`` and print the model's language prediction.

  ``audio`` is handed both to the IPython ``Audio`` widget and to
  ``librosa.load``; the callers in this cell pass a file path.
  """
  player = Audio(audio, rate=sample_rate, autoplay=False)
  display(player)

  # Load at 16 kHz for the model; the returned rate is not needed.
  signal, _ = librosa.load(audio, sr=16000)
  prediction = model.predict_pb(signal)
  lang_id, prob = prediction

  language = vocab.token_list[lang_id.numpy()]
  probability = prob.numpy()*100
  summary = "The probability of {} is {:.2f}% "
  print(summary.format(language, probability))

def _record_audio(b):
  """Button callback: record speech, save it as recorded.wav and classify it.

  ``b`` is the widget that fired the click; it is not used.
  """
  clear_output()
  waveform = record_audio(record_seconds)
  # Scale the float waveform by 32767 and store it as 16-bit PCM.
  pcm16 = (32767*waveform).numpy().astype(np.int16)
  wavfile.write('recorded.wav', sample_rate, pcm16)
  _recognize('recorded.wav')

def _upload_audio(b):
  """Upload an audio file, convert MP3 to WAV if needed, and classify it.

  Args:
    b: Unused; present so the function has the same shape as a widget
      callback.

  Returns:
    The path of the WAV file that was classified, or ``None`` when the upload
    dialog was closed without selecting a file.

  Raises:
    subprocess.CalledProcessError: If ffmpeg fails to convert an MP3 upload.
  """
  import subprocess  # local import: only this function needs it

  clear_output()
  audio_files = files.upload()
  if not audio_files:
    # Upload cancelled: nothing to classify.
    print("No file was uploaded.")
    return None

  audio = next(iter(audio_files))
  stem, extension = os.path.splitext(audio)
  if extension.lower() == '.mp3':
    wav_audio = stem + '.wav'
    # Argument list instead of a shell string, so quotes or spaces in the
    # uploaded file name cannot break or inject into the command.
    # -y overwrites a WAV left over from a previous run instead of prompting.
    subprocess.run(
        ['ffmpeg', '-y', '-i', audio,
         '-ab', '256k', '-ar', '16000', '-ac', '1',
         '-f', 'wav', wav_audio],
        check=True)
  else:
    # Anything that is not MP3 is handed to the recognizer as uploaded.
    wav_audio = audio

  _recognize(wav_audio)
  return wav_audio

# Upload mode runs immediately; record mode waits for a button click.
if record_or_upload != "Record":
  audio = _upload_audio("")
else:
  button = widgets.Button(description="Record Speech")
  button.on_click(_record_audio)
  display(button)
