# Loading my pretrained model and running inference

In [1]:
from src.utils import *
import torch
from datasets import load_from_disk
from transformers import WhisperProcessor, Seq2SeqTrainer, Seq2SeqTrainingArguments
import numpy as np
from functools import partial
import evaluate

### Tokenizer

In [2]:
# Directory holding the configuration files, and the vocabulary file inside it.
configs_path = "config-data"
vocab_path = configs_path + "/ca-vocab.json"

# Build the custom tokenizer from the vocabulary file.
# NOTE(review): special_tokens_on_vocab_file=False presumably means the file
# lists only the words and the tokenizer adds the special tokens itself — confirm.
myTokenizer = MyCustomTokenizer(
    vocab_file=vocab_path,
    special_tokens_on_vocab_file=False,
)

# Show the resulting token -> id mapping.
print(myTokenizer.vocab)


{'agafar': 0, 'llençar': 1, 'deixar': 2, 'caixa': 3, 'pilota': 4, '<|startoftranscript|>': 5, '<|endoftext|>': 6, '<|transcribe|>': 7, '<|ca|>': 8, '<|notimestamps|>': 9, '<|startofprev|>': 10}


### Loading my pretrained model 

In [3]:

# Build the Whisper model configured for the custom tokenizer.
model = create_my_whisper_model(myTokenizer=myTokenizer)

# Load the fine-tuned weights into the model.
# map_location="cpu" lets a checkpoint saved from a GPU be deserialized on a
# host without CUDA; load_state_dict then copies the values into the model's
# existing parameters, so the model's own device is unaffected.
# weights_only=True restricts unpickling to tensors and plain containers.
# The call stays the last expression so the key-matching report is displayed.
model.load_state_dict(
    torch.load(
        "whisper-small-ca/model-small-ca.bin",
        map_location="cpu",
        weights_only=True,
    )
)


embeding tokens layer and projection layer share the same weights: True
shape of embedding tokens layer:  torch.Size([11, 768])
changing model's config vocab_size: 11


<All keys matched successfully>

In [4]:
# Load the dataset previously saved to disk and show its splits and columns.
dataset_dict = load_from_disk("Audios/dataset1")
print(dataset_dict)

# Show the "tags" field of the first training example.
first_train_example = dataset_dict["train"][0]
print(first_train_example["tags"])

DatasetDict({
    train: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 900
    })
    test: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 100
    })
})
deixar,caixa,deixar,pilota


In [5]:
# Base checkpoint, language and task used to build the Whisper processor.
model_id = "openai/whisper-small"
language = "ca"
task = "transcribe"

# Only the processor's feature extractor is used below; tokenization is done
# by the custom tokenizer.
processor = WhisperProcessor.from_pretrained(model_id, language=language, task=task)

# Bind the feature extractor and tokenizer so the function can be passed to map().
prepare_dataset_fn = partial(prepare_dataset, feature_extractor=processor.feature_extractor, myTokenizer=myTokenizer)

# Inference only needs the test split, so preprocess just that split instead of
# mapping the whole DatasetDict and discarding the train split afterwards.
# The original columns are dropped so that only what prepare_dataset returns remains.
test_split = dataset_dict["test"]

# NOTE: despite its name, dataset_dict_prepared is a single Dataset (the test split).
dataset_dict_prepared = test_split.map(
    prepare_dataset_fn,
    remove_columns=test_split.column_names,
    num_proc=4,
)

In [6]:

# Sanity check of the prepared test split, using its first example.
print(dataset_dict_prepared)

# Fetch the first row once and reuse it for every check below.
first_example = dataset_dict_prepared[0]
first_labels = first_example["labels"]

# Raw label ids.
print(first_labels)
# Shape of the input features.
print(np.array(first_example["input_features"]).shape)
# Labels decoded with and without the special tokens.
print(myTokenizer.decode(first_labels))
print(myTokenizer.decode(first_labels, skip_special_tokens=True))


Dataset({
    features: ['input_features', 'labels'],
    num_rows: 100
})
[5, 8, 7, 9, 2, 3, 0, 4, 6]
(80, 3000)
<|startoftranscript|> <|ca|> <|transcribe|> <|notimestamps|> deixar caixa agafar pilota <|endoftext|>
deixar caixa agafar pilota


In [7]:
# Bind the model and tokenizer into the metric function.
# NOTE(review): compute_metrics_fn is not passed to the trainer in this cell;
# it is kept in case a later cell uses it — confirm or remove.
compute_metrics_fn = partial(compute_metrics, model=model, myTokenizer=myTokenizer)

# Collator that batches examples using the feature extractor and custom tokenizer.
data_collator = MyDataCollator(
    feature_extractor=processor.feature_extractor,
    tokenizer=myTokenizer,
)

# Prediction-only configuration: no training, no checkpoints, no logging.
training_args = Seq2SeqTrainingArguments(
    output_dir="temp_dir",
    do_train=False,
    do_eval=False,
    per_device_eval_batch_size=16,
    # Careful: with this set to False, compute_metrics_fn fails because it
    # receives logits instead of generated tokens.
    predict_with_generate=False,
    save_strategy="no",
    logging_strategy="no",
    # Enable half precision only when CUDA is available; fp16=True is rejected
    # by transformers on hosts without a supported accelerator.
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
)

# Run prediction over the whole prepared test split and collect the logits.
pred_output = trainer.predict(dataset_dict_prepared)



Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [8]:
# Feed the logits through the CRF-style (Viterbi) decoding step and score the result with WER.

# NOTE(review): predictions is indexed at 0 to pick the emission scores;
# presumably it is a tuple whose first element holds the logits because
# predict_with_generate=False — confirm.
emission_scores = pred_output.predictions[0]

# Replace -100 with the pad token id once, on a copy, so that
# pred_output.label_ids is left untouched and re-running this cell is idempotent.
labels = pred_output.label_ids.copy()
labels[labels == -100] = model.config.pad_token_id

transitions_file = f"{configs_path}/transitions_file.txt"

log_transition_matrix = generate_transition_matrix(transitions_file=transitions_file, myTokenizer=myTokenizer)

decoded_sequences_list = []
decoded_labels_list = []

# TODO: for now decoding is done one example at a time; it should be done in
# batches (maybe with a DataLoader).
for es, label_ids in zip(emission_scores, labels):
    # TODO: review viterbi(); the original author noted that it works poorly.
    decoded_sequence = viterbi(log_transition=log_transition_matrix, myTokenizer=myTokenizer, emission_scores=es)
    # Alternative decoder left by the author:
    # decoded_sequence = decode_with_max(emission_scores=es, myTokenizer=myTokenizer)
    print("Decoded sequence:", decoded_sequence)

    decoded_labels = myTokenizer.decode(label_ids, skip_special_tokens=True)
    print("Labels Decoded sequence:", decoded_labels)

    decoded_sequences_list.append(decoded_sequence)
    decoded_labels_list.append(decoded_labels)

# Word error rate between decoded predictions and decoded references, as a percentage.
metric = evaluate.load("wer")

wer = 100 * metric.compute(predictions=decoded_sequences_list, references=decoded_labels_list)

print("Word Error Rate (WER) in %:", wer)

Decoded sequence: deixar caixa agafar pilota
Labels Decoded sequence: deixar caixa agafar pilota
Decoded sequence: llençar pilota deixar pilota
Labels Decoded sequence: llençar pilota deixar pilota
Decoded sequence: deixar caixa
Labels Decoded sequence: deixar caixa
Decoded sequence: llençar caixa llençar pilota
Labels Decoded sequence: llençar caixa llençar pilota
Decoded sequence: llençar caixa deixar pilota
Labels Decoded sequence: llençar caixa deixar pilota
Decoded sequence: deixar pilota
Labels Decoded sequence: deixar pilota
Decoded sequence: llençar caixa agafar pilota
Labels Decoded sequence: llençar caixa agafar pilota
Decoded sequence: deixar pilota llençar caixa
Labels Decoded sequence: deixar pilota llençar caixa
Decoded sequence: llençar caixa agafar pilota
Labels Decoded sequence: llençar caixa agafar pilota
Decoded sequence: llençar pilota agafar caixa
Labels Decoded sequence: llençar pilota agafar caixa
Decoded sequence: deixar caixa agafar caixa
Labels Decoded sequenc