In [1]:
import transformers
from transformers import BartForConditionalGeneration
from datasets import Audio, load_dataset
import pandas 
import numpy
import tqdm
import torch
import librosa
from IPython.display import Audio
from transformers import WhisperProcessor
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict
from dataclasses import dataclass
from typing import Any, Dict, List, Union

device='cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


# Loading Input, Model

In [2]:
fleurs = DatasetDict()

fleurs["test"] = load_dataset("google/fleurs", "hi_in", split="test")

print(fleurs)

Found cached dataset fleurs (/root/.cache/huggingface/datasets/google___fleurs/hi_in/2.0.0/af82dbec419a815084fa63ebd5d5a9f24a6e9acdf9887b9e3b8c6bbd64e0b7ac)


DatasetDict({
    test: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 418
    })
})


In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

from datasets import load_dataset

# load model and processor

processor = WhisperProcessor.from_pretrained("openai/whisper-small")

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small",output_hidden_states=True).cuda()

model.config.forced_decoder_ids =  processor.get_decoder_prompt_ids(language="hindi", task="transcribe")

## Testing one input

In [4]:
i=0
audio=fleurs['test'][i]['audio']['array']
sr=fleurs['test'][i]['audio']['sampling_rate']
transcription=fleurs['test'][i]['transcription']
labels=torch.tensor(processor.tokenizer(fleurs['test'][i]['transcription']).input_ids).cuda()
input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features.cuda()
predicted_ids = model.generate(input_features)

# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
print(transcription)



['<|startoftranscript|><|hi|><|transcribe|><|notimestamps|> अग्वो में आज्द्टर केंद्रख होता है, जिसका मतला भी आजा की उन्मे थोडे या बिना किसी जटके से तुटनें की प्रवत्ती होती है.<|endoftext|>']


# Preparing Data

In [5]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch
fleurs_processed = fleurs.map(prepare_dataset, remove_columns=fleurs.column_names["test"], num_proc=8)

                                                                                                                                       

## Defining DataCollator for batched inputs

In [6]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt",return_attention_mask=True).to(device)

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels.to(device)

        return batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
data_collator([fleurs_processed['test'][2]])['input_features'].shape

torch.Size([1, 80, 3000])

In [8]:
with torch.no_grad():
    input_batch=data_collator([fleurs_processed['test'][2]])
    output=model(**input_batch)
    raw_embeddings=output['encoder_hidden_states'][0]

    ## Note, There is no implementatoin here that accounts for attention mask. This needs to be done later in batching
    ## In particular, the embeddings have tp be multiplied with the attention mask before averaging. 
    mean_pooled = raw_embeddings.sum(axis=1) / input_batch['attention_mask'].sum(axis=-1).unsqueeze(-1)

In [9]:
mean_pooled

tensor([[-3.2079e-01, -4.0981e-01, -5.3685e-01, -1.8853e+00, -1.2916e-01,
          1.3243e+01, -2.4261e-01, -1.3180e-01, -1.1218e-01, -1.5345e-01,
         -1.1883e-01, -4.6020e-01,  1.6376e+01, -9.7700e-02, -7.0510e-01,
         -3.1490e-01, -1.9867e-01,  2.5166e+00, -6.8524e-01,  1.2408e+01,
         -1.8797e-01, -1.9481e-01, -3.5793e-01, -1.8992e-02, -3.0768e-01,
         -4.3020e-01, -1.3105e-01, -2.0492e-01, -1.0976e-01, -2.4197e-01,
         -6.5150e-01, -2.9447e-01, -1.2303e-01, -1.1385e-01, -1.3264e-01,
         -2.0747e-01, -3.1276e-01, -4.7822e-01, -2.5880e-01, -1.2137e-01,
         -1.5816e-01, -6.0059e-01, -2.2619e-01, -5.1787e-01, -1.5846e-01,
         -1.2401e-01, -7.6439e-01, -4.1079e-02, -4.2156e-01, -1.6939e-01,
         -1.6054e-01, -1.4394e-01, -3.7949e-01, -6.8554e-01, -1.6783e-01,
         -1.1361e-01, -1.4607e-01,  1.2074e+01, -2.6806e-01, -2.9099e-01,
         -6.5096e-02,  3.6805e+00,  9.1759e+00, -4.0561e-01, -1.3031e-01,
         -4.5442e-01,  2.4091e-02, -5.

In [76]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

from datasets import load_dataset

# load model and processor

processor = WhisperProcessor.from_pretrained("openai/whisper-small")

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small",output_hidden_states=True).cuda()


class generate_layer_embeddings:
    def __init__(language_id):
        self.model=model
        self.model.config.forced_decoder_ids =  processor.get_decoder_prompt_ids(language=language_id, task="transcribe")
        fleurs = DatasetDict()
        fleurs["test"] = load_dataset("google/fleurs", "hi_in", split="test")
        fleurs["train"] = load_dataset("google/fleurs", "hi_in", split="train")

    

torch.Size([1, 80])