In [None]:
from datasets import load_from_disk
from transformers import WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration
import evaluate
import torch
import numpy as np
metric = evaluate.load("wer")
from torch.utils.data import Dataset, DataLoader

class PrepareDatasetAsInput:
    
    def __init__(self, feature_extractor, tokenizer_en, tokenizer_fr):
        self.feature_extractor = feature_extractor
        self.tokenizer_en = tokenizer_en
        self.tokenizer_fr = tokenizer_fr
            
    def prepare_dataset(self, batch):
        # load and resample audio data from 48 to 16kHz
        audio = batch["audio"]

        # compute log-Mel input features from input audio array
        batch["input_features"] = self.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

        # encode target text to label ids **** CHANGED FROM **sentence** TO **transcription**
        # if french, than use french tokenizer, english otherwise
        tokenizer = self.tokenizer_en
        if "lang" in batch:
            if batch["lang"] == "fr":
                tokenizer = self.tokenizer_fr

        batch["labels_fullts"] = tokenizer(batch["full_ts"]).input_ids
        batch["labels_shortts"] = tokenizer(batch["short_ts"]).input_ids

        return batch
    
    def prepare_dataset_self_prompt(self,batch):
        # load and resample audio data from 48 to 16kHz
        audio = batch["audio"]

        # compute log-Mel input features from input audio array
        batch["input_features"] = self.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

        # encode target text to label ids **** CHANGED FROM **sentence** TO **transcription**
        # if french, than use french tokenizer, english otherwise
        tokenizer = self.tokenizer_en
        if "lang" in batch:
            if batch["lang"] == "fr":
                tokenizer = self.tokenizer_fr
        
        # make prompt from the lables
        batch['fullts_prompt_ids'] = self.tokenizer_en.get_prompt_ids(batch["full_ts"]).tolist() # YOU NEED TO ADD TOLIST() because array cant be combined with list in the next lines
        batch['shortts_prompt_ids'] = self.tokenizer_en.get_prompt_ids(batch["short_ts"]).tolist() # YOU NEED TO ADD TOLIST() because array cant be combined with list in the next lines
        
        batch["labels_fullts"] = tokenizer(batch["full_ts"]).input_ids # building labels ids with prompt and tokens together
        batch["labels_shortts"] = tokenizer(batch["short_ts"]).input_ids

        return batch

In [2]:
class ComputeMetrics:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def compute_metrics(self,pred_text, reference_text):
        pred_ids = pred_text
        label_ids = reference_text

        # replace -100 with the pad_token_id
        label_ids[label_ids == -100] = self.tokenizer.pad_token_id

        # we do not want to group tokens when computing the metrics
        pred_str = self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = self.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        wer = 100 * metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

    def compute_metrics2(self,pred_str,label_str):

        wer = 100 * metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

In [30]:
dataset_path = './data/atco/en_ruzyne_test_ds'
# model_def = './test3-tiny-prompt/checkpoint-980'
model_def = './checkpoint-980'
# model_def = 'openai/whisper-tiny'

In [29]:
torch.cuda.synchronize()
torch.cuda.empty_cache()

In [31]:
model = WhisperForConditionalGeneration.from_pretrained(model_def).cuda()
processor = WhisperProcessor.from_pretrained(model_def)
prepare_dataset = PrepareDatasetAsInput(processor.feature_extractor, processor.tokenizer, processor.tokenizer) #TODO handle FR

In [32]:
# setting the parameters of model for correct working
model.generation_config.language = "english"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

In [33]:
    
# load and prepare the dataset
dataset = load_from_disk(dataset_path)
ds_ready = dataset.map(prepare_dataset.prepare_dataset_self_prompt, remove_columns=dataset.column_names, num_proc=1)
ds_ready = ds_ready.rename_columns({'labels_fullts':'labels','fullts_prompt_ids':'prompt_ids'})

print(ds_ready)

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Dataset({
    features: ['input_features', 'prompt_ids', 'shortts_prompt_ids', 'labels', 'labels_shortts'],
    num_rows: 70
})


In [34]:
# test modelu bez promptu
# dulezite - kdyz byl tiny model natrenovany malo (jedna epocha), tak vracel dlouhe vety, ktere se opakovaly
out1 = model.generate(torch.tensor(ds_ready[0]['input_features']).unsqueeze(0).cuda()).detach().cpu()
print(processor.decode(out1[0]))

Badassie's you and Lufre's lost a place


In [35]:
# test modelu s promptem (1 nebo 10 epoch vyjde na stejno zda se)
out2 = model.generate(
    torch.tensor(ds_ready[0]['input_features']).unsqueeze(0).cuda(),
    prompt_ids=torch.tensor(ds_ready[0]['prompt_ids']).cuda()
).detach().cpu()
print(processor.decode(out2[0]))

Rar CSA One Delta Zulu established
CSA One Delta Zulu roger contact Ruzyne Tower one three four decimal five six zero
three four five six zero CSA One Delta Zulu  pekný deň


In [14]:
# test modelu, kdy je vstup a prompt jsou od jinych nahravek
# vysledek - neni to uplne tak, ze ten model prepsal ten prompt, ale dost se to blizi tomu
out3 = model.generate(
    torch.tensor(ds_ready[2]['input_features']).unsqueeze(0).cuda(),
    prompt_ids=torch.tensor(ds_ready[0]['prompt_ids']).cuda()
).detach().cpu()
print(processor.decode(out3[0]))
print('real')
print(processor.decode(ds_ready[1]['labels'],skip_special_tokens=True))

Rar CSA One Delta Zulu established
CSA One Delta Zulu  pekný deň
real
Oscar Kilo Uniform Tango Charlie re- release from frequency flying information service available at one two six decimal one Praha Information   dobrý den


In [18]:


# Prepare decoder prompts (pad them to the same length)
batch_decoder_inputs = processor.tokenizer.pad({"input_ids": ds_ready[0:2]['prompt_ids']}, padding=True, return_tensors="pt")

# Generate
outputs = model.generate(
    input_features=torch.tensor(ds_ready[0:2]['input_features']).cuda(),
    decoder_input_ids=batch_decoder_inputs["input_ids"].cuda(),
    decoder_attention_mask=batch_decoder_inputs["attention_mask"],  # Important for handling padding
)

In [49]:
# print(processor.decode(out1[0]))
print(processor.decode(ds_ready[0]['labels']))

<|startoftranscript|><|notimestamps|>Radar CSA One Delta Zulu established
CSA One Delta Zulu roger contact Ruzyne Tower one three four decimal five six zero

three four five six zero CSA One Delta Zulu  pekný deň<|endoftext|>


In [10]:
from typing import Any, Dict, List, Union
from dataclasses import dataclass
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors

        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        batch["labels"] = labels_batch
        
        return batch

In [11]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [23]:
dataloader = DataLoader(ds_ready, batch_size=3, collate_fn=data_collator)
metr = ComputeMetrics(processor.tokenizer)

In [None]:
out = model.generate(input_features).detach().cpu()

In [None]:

# Iterate over batches
all_preds = []
all_lables = []
for batch in dataloader:
    input_features = batch["input_features"].cuda()
    out = model.generate(input_features).detach().cpu()
    all_preds.extend(processor.batch_decode(out, skip_special_tokens=True))
    all_lables.extend(processor.batch_decode(batch["labels"]['input_ids'], skip_special_tokens=True))
    # Free memory
    del batch
    torch.cuda.empty_cache()


In [53]:
metr.compute_metrics(all_preds, all_lables)    

TypeError: 'int' object is not subscriptable

In [35]:
metr.compute_metrics2(all_preds, all_lables)    

{'wer': 76.61662817551964}

In [None]:
val = metr.compute_metrics(out[0].unsqueeze(0), batch["labels"]['input_ids'][0].unsqueeze(0))
val1 = metr.compute_metrics(out[1].unsqueeze(0), batch["labels"]['input_ids'][1].unsqueeze(0))
val2= metr.compute_metrics(out[2].unsqueeze(0), batch["labels"]['input_ids'][2].unsqueeze(0))
print(val,val1,val2,(val1['wer']+val2['wer']+val['wer'])/3)

In [57]:
batch = torch.tensor(ds_ready['input_features'][:5]).cuda()

In [None]:
out=model.generate(batch)

In [59]:
del batch
del model
torch.cuda.empty_cache()
torch.cuda.synchronize()

In [10]:
processor.tokenizer.decode(ds_ready[0]['labels'], skip_special_tokens=True)

'Radar CSA One Delta Zulu established\nCSA One Delta Zulu roger contact Ruzyne Tower one three four decimal five six zero\n\nthree four five six zero CSA One Delta Zulu  pekný deň'

In [11]:
out.shape

torch.Size([3, 42])

In [14]:
processor.tokenizer.decode(out[0], skip_special_tokens=True)

' Radar CC1DZ, established CC1DZ, roger, contact roger tower 134.560 134.560, CC1DZ, take me again'

In [31]:
metr = ComputeMetrics(processor.tokenizer)

In [16]:
out[0]

tensor([ 9654,   289, 12630,    16,    35,    57,    11,  7545, 12630,    16,
           35,    57,    11,   744,  1321,    11,  3385,   744,  1321, 10567,
         3705,    19,    13,    20,  4550,  3705,    19,    13,    20,  4550,
           11, 12630,    16,    35,    57,    11,   747,   385,   797, 50257,
        50257, 50257], device='cuda:0')

In [17]:
torch.tensor(ds_ready[0]['labels'])

tensor([50258, 50363, 48444,   289,   383,  8886,  1485, 18183,  1176, 12845,
         7545,   198,    34,  8886,  1485, 18183,  1176, 12845,   744,  1321,
         3385, 15702,  1229,   716, 17877,   472,  1045,  1451, 26601,  1732,
         2309,  4018,   198,   198, 27583,  1451,  1732,  2309,  4018,   383,
         8886,  1485, 18183,  1176, 12845,   220,   520,  5457, 11822,   368,
          129,   230, 50257])

In [18]:
metr.compute_metrics(out[0].unsqueeze(0), torch.tensor(ds_ready[0]['labels']).unsqueeze(0))

{'wer': 93.54838709677419}

In [44]:
out_test = processor.tokenizer.batch_decode(out, skip_special_tokens=True)
label_test = processor.tokenizer.batch_decode(ds_ready['labels'][:3], skip_special_tokens=True)

In [45]:
label_test.__len__()

3

In [46]:
metr.compute_metrics2(out_test,label_test)

{'wer': 78.37837837837837}