# Team AudioAlchemists - Inference Notebook - Wav2Vec2

- Team Members: `Syed Mostofa Monsur Dipto` | `Sakib Chowdhury` | `Md Boktiar Mahbub Murad`

# Weights

1. [Model weights](https://www.kaggle.com/datasets/mbmmurad/checkpoint-450000ind2-aug)
2. [Language Model](https://www.kaggle.com/datasets/mbmmurad/final-lm-benai/)
3. [Dependencies](https://www.kaggle.com/datasets/shahruk10/csefest2022dlsprintdeps/versions/9)

# Install dependencies

In [1]:
# List the current working directory (quick sanity check before copying dependencies).
!ls 

__notebook__.ipynb


In [2]:
# Copy the offline dependency bundle from the input dataset into ./deps,
# which is the location the install commands in the next cell read from.
!cp -r ../input/csefest2022dlsprintdeps ./deps

In [3]:
# Install every dependency from the local ./deps copy (no package index is contacted
# for these paths). pygtrie and hypothesis are installed ahead of pyctcdecode,
# presumably because pyctcdecode requires them - keep this order.
!pip install ./deps/pygtrie-2.5.0/pygtrie-2.5.0
!pip install ./deps/exceptiongroup-1.0.0rc8-py3-none-any.whl
!pip install ./deps/hypothesis-6.54.4-py3-none-any.whl
!pip install ./deps/pyctcdecode-0.4.0-py2.py3-none-any.whl
!pip install ./deps/pypi-kenlm-0.1.20220713/pypi-kenlm-0.1.20220713
!pip install ./deps/bnunicodenormalizer-0.0.23/bnunicodenormalizer-0.0.23
!pip install ./deps/python-Levenshtein-0.12.2/python-Levenshtein-0.12.2
!pip install ./deps/jiwer-2.3.0-py3-none-any.whl

# Make the bundled KenLM `lmplz` binary executable.
# NOTE(review): `lmplz` is not invoked by any cell visible in this notebook.
!chmod +x ./deps/kenlm/kenlm/bin/lmplz

Processing ./deps/pygtrie-2.5.0/pygtrie-2.5.0
  Preparing metadata (setup.py) ... [?25l- done
[?25hBuilding wheels for collected packages: pygtrie
  Building wheel for pygtrie (setup.py) ... [?25l- \ done
[?25h  Created wheel for pygtrie: filename=pygtrie-2.5.0-py3-none-any.whl size=20944 sha256=7e720f66fd9b13fcda075fe994ae395348c735cc9d844627401abdaf3cb5d842
  Stored in directory: /root/.cache/pip/wheels/97/76/3c/04d3f51356d58b1de1abd51542fec46dec27fc231e6c73de07
Successfully built pygtrie
Installing collected packages: pygtrie
Successfully installed pygtrie-2.5.0
[0mProcessing ./deps/exceptiongroup-1.0.0rc8-py3-none-any.whl
Installing collected packages: exceptiongroup
Successfully installed exceptiongroup-1.0.0rc8
[0mProcessing ./deps/hypothesis-6.54.4-py3-none-any.whl
Installing collected packages: hypothesis
Successfully installed hypothesis-6.54.4
[0mProcessing ./deps/pyctcdecode-0.4.0-py2.py3-none-any.whl
Installing collected packages: pyctcdecode
S

# Imports

In [4]:
from typing import Dict, List, Tuple, Any, Union

import os
import re
import json
import glob
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from pyctcdecode import build_ctcdecoder

import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader, IterableDataset

import pyctcdecode

from bnunicodenormalizer import Normalizer 
from datasets import load_metric

# Word-level normalizer from bnunicodenormalizer; `normalize` below calls it once per word.
bnorm = Normalizer()
# WER / CER metric scripts loaded from the local dependency dataset rather than by name.
# NOTE(review): neither `wer` nor `cer` is used by the cells visible in this notebook.
wer = load_metric("../input/csefest2022dlsprintdeps/metrics/metrics/wer.py")
cer = load_metric("../input/csefest2022dlsprintdeps/metrics/metrics/cer.py")

In [5]:
# Tokenizer vocabulary (JSON mapping token -> id); lives next to the processor files loaded below.
vocabPath = '/kaggle/input/ckpt-270k/check_de_model/vocab.json'
# ARPA-format 5-gram language model handed to the CTC decoder.
lmPath = "/kaggle/input/final-lm-benai/5gram.arpa"
# Directory holding the fine-tuned Wav2Vec2ForCTC checkpoint.
ckptPath = "/kaggle/input/checkpoint-450000ind2-aug"

# Sample submission CSV; its `id` column is read into `test_df` further down.
sampleSubmissionPath = '/kaggle/input/bengaliai-speech/sample_submission.csv'

# NOTE(review): this points at a different dataset ("dlsprint") than the
# bengaliai-speech test directory used in the Inference section - confirm it is still needed.
testDataDir = '../input/dlsprint/test_files'


# Load Model

In [6]:
# Load the Wav2Vec2 processor (tokenizer + feature extractor) from the same
# directory that `vocabPath` points into.
processor = Wav2Vec2Processor.from_pretrained("/kaggle/input/ckpt-270k/check_de_model")

In [7]:
# Token -> id mapping from the tokenizer.
vocab_dict = processor.tokenizer.get_vocab()
# Re-order the mapping by ascending token id so the label list handed to the
# decoder is laid out in token-id order.
sorted_vocab_dict = dict(sorted(vocab_dict.items(), key=lambda token_and_id: token_and_id[1]))

# Beam-search CTC decoder backed by the KenLM n-gram model at `lmPath`.
decoder = pyctcdecode.build_ctcdecoder(
    labels=list(sorted_vocab_dict),
    kenlm_model_path=lmPath,
)

Loading the LM will be faster if you build a binary file.
Reading /kaggle/input/final-lm-benai/5gram.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [8]:
# Read the raw vocabulary file: maps token (character) -> integer id.
with open(vocabPath, 'r', encoding="utf-8") as vocabFile:
    vocabc2n = json.load(vocabFile)

# Reverse lookup (id -> token) and the plain list of tokens in file order.
vocabn2c = { v:k for k,v in vocabc2n.items() }
vocab = list(vocabc2n.keys())

# Convenience aliases for the two halves of the processor.
tokenizer = processor.tokenizer

feature_extractor = processor.feature_extractor


# Load the fine-tuned CTC model, move it to the GPU and switch to eval mode.
# NOTE: `.cuda()` is unconditional, so this cell requires a CUDA device.
model = Wav2Vec2ForCTC.from_pretrained(ckptPath)
model.cuda()
model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen

In [9]:
# Load the sample submission and turn each `id` into a path under `testDataDir`.
# NOTE(review): `test_df` is not referenced by any later cell visible here; the
# Inference section globs the test directory directly instead.
test_df = pd.read_csv(sampleSubmissionPath)
test_df['id'] = [ os.path.join(testDataDir, x) for x in test_df['id'] ]

# Utilities

In [10]:
class AudioConverter:
    """
    AudioConverter offers helpers to load audio files and convert them to a
    target sampling rate.
    """

    # Candidate keyword arguments for torchaudio's resampling kernel.
    # NOTE: `loadAudio` currently calls the resampler without forwarding any of
    # these, so they are reference presets only.
    resampleFilterParams = {
        "fast": {  # Fast and less accurate but still MSE = ~2e-5 compared to librosa.
            "lowpass_filter_width": 16,
            "rolloff": 0.85,
            "resampling_method": "kaiser_window",
            "beta": 8.555504641634386,
        },
        "best": {  # Twice as slow, and a little bit more accurate.
            "lowpass_filter_width": 64,
            "rolloff": 0.9475937167399596,
            "resampling_method": "kaiser_window",
            "beta": 14.769656459379492,
        },
    }

    def __init__(
        self,
        sampleRate: int,
    ):
        """
        Initializes AudioConverter.

        Parameters
        ----------
        sampleRate: int
            Sampling rate to convert audio to, if required.
        """
        self.sampleRate = sampleRate

    @classmethod
    def loadAudio(
        cls, audioPath: str, sampleRate: Union[int, None] = None, returnTensor: bool = True, resampleType: str = "fast",
    ) -> Union[torch.Tensor, np.ndarray]:
        """
        Uses torchaudio to load and resample (if necessary) an audio file and
        returns the samples as either a torch.Tensor or a numpy array.

        Parameters
        ----------
        audioPath: str
            Path to the audio file (wav / mp3 / flac).

        sampleRate: int, optional
            Sampling rate to convert audio to. If None, or if the file is
            already at this rate, the audio is returned without resampling.

        returnTensor: bool, optional
            If True, the audio samples are returned as a torch.Tensor.
            Otherwise, they are converted with `Tensor.numpy()`.

        resampleType: str, optional
            Either "fast" or "best". Kept for interface compatibility: the
            matching `resampleFilterParams` entry is NOT passed to the
            resampler at present, so the resampler's defaults apply.

        Returns
        -------
        Union[torch.Tensor, np.ndarray]
            Audio waveform scaled between +/- 1.0 as either a numpy.float32 array,
            or torch.Tensor, with shape (channels, numSamples)
        """
        x, sr = torchaudio.load(audioPath)

        # Resample only when a target rate was requested AND it differs from the
        # file's native rate. (The previous `or` condition was also true for
        # sampleRate=None, which passed None to the resampler as the new rate.)
        if sampleRate is not None and sr != sampleRate:
            x = F.resample(x, sr, sampleRate)

        if returnTensor:
            return x

        return x.numpy()

In [11]:
def normalize(sen):
    """
    Normalizes a sentence word by word with the module-level `bnorm`.

    The sentence is split on whitespace, every word is passed through `bnorm`
    and words whose 'normalized' value is None are dropped. The surviving
    words are re-joined with single spaces and every U+2047 character is
    replaced by a hyphen.

    Parameters
    ----------
    sen: str
        Sentence to normalize.

    Returns
    -------
    str
        The normalized sentence.
    """
    kept = []
    for token in sen.split():
        fixed = bnorm(token)['normalized']
        if fixed is not None:
            kept.append(fixed)

    return " ".join(kept).replace("\u2047", "-")

# Infer on a single audio file

In [12]:
def infer(audioPath):
    """
    Transcribes a single audio file.

    The file is loaded at 16 kHz, its first channel is run through the
    module-level `processor` and `model` (on the GPU), the logits are decoded
    with the module-level `decoder`, and the text is passed through
    `normalize` before a closing "।" is appended.

    Parameters
    ----------
    audioPath: str
        Path to the audio file to transcribe.

    Returns
    -------
    str
        Normalized transcription ending in "।".
    """
    # loadAudio returns (channels, numSamples); only channel 0 is transcribed.
    waveform = AudioConverter.loadAudio(audioPath, sampleRate=16000, returnTensor=False)
    first_channel = waveform[0]
    features = processor(first_channel, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        input_values = features.input_values.float().cuda()
        logits = model(input_values).logits

    # Drop the batch dimension and hand the per-frame scores to the decoder on CPU.
    frame_scores = logits.squeeze(0).cpu().numpy()
    beams = decoder.decode_beams(frame_scores)
    # First field of the first returned beam is used as the transcription text.
    best_text = beams[0][0]

    return normalize(best_text) + "।"


# Smoke test: transcribe one clip from the training set and display the result.
infer('/kaggle/input/bengaliai-speech/train_mp3s/0001565ed181.mp3')

'উপাসকেরও কার্যকর ধর্ম সম্বন্ধে নিজস্ব ধারণা ও আদর্শ রহিয়াছে।'

# Infer on a batch of data

In [13]:
def batch_infer(audio_paths, batch_size):
    """
    Transcribes several audio files, one at a time, with a progress bar.

    Parameters
    ----------
    audio_paths: iterable of str
        Paths of the audio files to transcribe.

    batch_size: int
        Currently unused: every file is sent through `infer` individually.

    Returns
    -------
    list of str
        One transcription per input path, in input order.
    """
    transcripts = []
    for audio_path in tqdm(audio_paths):
        transcripts.append(infer(audio_path))
    return transcripts

# Infer on a directory

In [14]:
def directory_infer(audio_dir):
    """
    Transcribes every entry directly inside a directory.

    Parameters
    ----------
    audio_dir: str
        Directory whose entries (matched by `audio_dir + '/*'`) are transcribed
        in sorted path order.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'id' (file base name, extension included) and
        'sentence' (the transcription returned by `batch_infer`).
    """
    audio_paths = glob.glob(audio_dir + '/*')
    audio_paths.sort()

    sentences = batch_infer(audio_paths, 10)
    file_names = list(map(os.path.basename, audio_paths))

    return pd.DataFrame({'id': file_names, 'sentence': sentences})

# Inference

In [15]:
# Transcribe every file in the competition test directory and preview the result.
test_dir = "/kaggle/input/bengaliai-speech/test_mp3s"
sub = directory_infer(test_dir)
sub.head()

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,id,sentence
0,0f3dac00655e.mp3,একটু বয়স হলে একটি বিদেশী।
1,a9395e01ad21.mp3,কী কারণে তুমি এতাবৎকাল পর্যন্ত এ দারুণ দৈব দুর...
2,bf36ea8b718d.mp3,এ কারণে সরকার নির্ধারিত হারে পরিবহন জনিত ক্ষতি...


In [16]:
# Keep only the part of each file name before the first "." (e.g. "abc.mp3" -> "abc").
sub['id'] = sub['id'].str.split(".").str[0]
sub.head()

Unnamed: 0,id,sentence
0,0f3dac00655e,একটু বয়স হলে একটি বিদেশী।
1,a9395e01ad21,কী কারণে তুমি এতাবৎকাল পর্যন্ত এ দারুণ দৈব দুর...
2,bf36ea8b718d,এ কারণে সরকার নির্ধারিত হারে পরিবহন জনিত ক্ষতি...


In [17]:
# Write the submission file without the DataFrame index column.
sub.to_csv("submission.csv",index=False)