# Inference code for YellowKing's model from  DL Sprint 2022
https://www.kaggle.com/code/sameen53/yellowking-dlsprint-inference

In [1]:
import os
import numpy as np
from tqdm.auto import tqdm
from glob import glob
from transformers import pipeline
import pandas as pd
import librosa
import IPython
from datasets import load_metric
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
import gc
import wave
from scipy.io import wavfile
import scipy.signal as sps
import pyctcdecode

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CHANGE ACCORDINGLY
BATCH_SIZE = 16
TEST_DIRECTORY = 'test_mp3s'
paths = glob(os.path.join(TEST_DIRECTORY,'*.mp3'))
print(paths[:2])

['test_mp3s/0f3dac00655e.mp3', 'test_mp3s/a9395e01ad21.mp3']


In [3]:
class CFG:
    my_model_name = 'YellowKing_model'
    processor_name = 'YellowKing_processor'

In [4]:
from transformers import Wav2Vec2ProcessorWithLM

processor = Wav2Vec2ProcessorWithLM.from_pretrained(CFG.processor_name)


In [5]:
my_asrLM = pipeline("automatic-speech-recognition", model=CFG.my_model_name ,feature_extractor =processor.feature_extractor, tokenizer= processor.tokenizer,decoder=processor.decoder ,device=0)


AssertionError: Torch not compiled with CUDA enabled

In [None]:
speech, sr = librosa.load('/kaggle/input/bengaliai-speech/test_mp3s/0f3dac00655e.mp3', sr=processor.feature_extractor.sampling_rate)

In [None]:
my_asrLM([speech]*2, chunk_length_s=112, stride_length_s=None)

[{'text': 'একটু বয়স হলে একটি বিদেশি।'}, {'text': 'একটু বয়স হলে একটি বিদেশি।'}]

In [None]:
my_asrLM

<transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline at 0x7827e12c9f90>

**Following Sample Submission:**

In [None]:
class AudioDataset(Dataset):
    def __init__(self, paths):
        self.paths = paths
    def __len__(self):
        return len(self.paths)
    def __getitem__(self,idx):
        speech, sr = librosa.load(self.paths[idx], sr=processor.feature_extractor.sampling_rate) 
#         print(speech.shape)
        return speech

In [None]:
dataset = AudioDataset(paths)
dataset[0]

array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
       -2.2192553e-06,  8.4718761e-07, -2.2282691e-07], dtype=float32)

In [None]:
device = 'cuda:0'

In [None]:
def collate_fn_padd(batch):
    '''
    Padds batch of variable length

    note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    '''
    ## get sequence lengths
    lengths = torch.tensor([ t.shape[0] for t in batch ])
    ## padd
    batch = [ torch.Tensor(t) for t in batch ]
    batch = torch.nn.utils.rnn.pad_sequence(batch)
    ## compute mask
    mask = (batch != 0)
    return batch, lengths, mask


In [None]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=8, collate_fn=collate_fn_padd)

In [None]:
preds_all = []
for batch, lengths, mask in dataloader:
    preds = my_asrLM(list(batch.numpy().transpose()))
    preds_all+=preds

In [None]:
from bnunicodenormalizer import Normalizer 


bnorm = Normalizer()
def normalize(sen):
    _words = [bnorm(word)['normalized']  for word in sen.split()]
    return " ".join([word for word in _words if word is not None])

def dari(sentence):
    try:
        if sentence[-1]!="।":
            sentence+="।"
    except:
        print(sentence)
    return sentence

In [None]:
df= pd.DataFrame(
    {
        "id":[p.split(os.sep)[-1].replace('.mp3','') for p in paths],
        "sentence":[p['text']for p in preds_all]
    }
)
df.sentence= df.sentence.apply(lambda x:normalize(x))
df.sentence= df.sentence.apply(lambda x:dari(x))

In [None]:
df

Unnamed: 0,id,sentence
0,a9395e01ad21,কী কারণে তুমি এতাবৎ কাল পর্যন্ত এ দারুন দৈব দু...
1,0f3dac00655e,একটু বয়স হলে একটি বিদেশি।
2,bf36ea8b718d,এ কারণে সরকার নির্ধারিত হারে পরিবহন জনিত ক্ষতি...


In [None]:
df.to_csv("submission.csv", index=False)
