# Loading my pretrained model and running inference

In [1]:
from src.utils import MyCustomTokenizer, MyDataCollator, create_my_whisper_model, prepare_dataset, compute_metrics, crf
import torch
from datasets import load_from_disk
from transformers import WhisperProcessor, Seq2SeqTrainer, Seq2SeqTrainingArguments
import numpy as np
from functools import partial
import evaluate

### Tokenizer

In [2]:
# Directory holding the notebook's configuration files; reused by later cells.
configs_path = "config-data"
vocab_path = f"{configs_path}/ca-vocab.json"

# Build the custom tokenizer from the vocabulary file.
# NOTE(review): special_tokens_on_vocab_file=False presumably means the special
# tokens are added by the tokenizer itself rather than read from the file --
# confirm in src.utils.
myTokenizer = MyCustomTokenizer(
    vocab_file=vocab_path,
    special_tokens_on_vocab_file=False,
)

# Show the resulting token -> id mapping.
print(myTokenizer.vocab)


{'agafar': 0, 'llençar': 1, 'deixar': 2, 'caixa': 3, 'pilota': 4, '<|startoftranscript|>': 5, '<|endoftext|>': 6, '<|transcribe|>': 7, '<|ca|>': 8, '<|notimestamps|>': 9, '<|startofprev|>': 10}


### Loading my pretrained model 

In [3]:

# Build the model around the custom tokenizer, then restore the fine-tuned weights.
model = create_my_whisper_model(myTokenizer=myTokenizer)

# map_location="cpu" makes the checkpoint loadable on hosts that lack the
# device it was saved from (e.g. a CUDA checkpoint on a CPU-only machine).
# load_state_dict copies the values into the model's existing parameters, so
# the model's own device placement is unaffected.
# weights_only=True restricts unpickling to tensors and plain containers.
model.load_state_dict(
    torch.load(
        "whisper-small-ca/model-small-ca.bin",
        map_location="cpu",
        weights_only=True,
    )
)

# Sanity check: the decoder embedding and the output projection should still be
# the very same tensor object after loading (prints True when they are tied).
print(model.model.decoder.embed_tokens.weight is model.proj_out.weight)


embeding tokens layer and projection layer share the same weights: True
shape of embedding tokens layer:  torch.Size([11, 768])
changing model's config vocab_size: 11
True


In [4]:
# Load the dataset previously saved to disk and show its splits.
dataset_dict = load_from_disk("Audios/dataset1")
print(dataset_dict)

# Peek at the "tags" field of the first training example.
first_train_example = dataset_dict["train"][0]
print(first_train_example["tags"])

DatasetDict({
    train: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 900
    })
    test: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 100
    })
})
deixar,caixa,deixar,pilota


In [5]:
# Base checkpoint whose processor (feature extractor) is reused for preprocessing.
model_id = "openai/whisper-small"
language = "ca"
task = "transcribe"

processor = WhisperProcessor.from_pretrained(model_id, language=language, task=task)

# Bind the feature extractor and the custom tokenizer so that map() only has to
# pass each example.
prepare_dataset_fn = partial(
    prepare_dataset,
    feature_extractor=processor.feature_extractor,
    myTokenizer=myTokenizer,
)

# Only the test split is prepared. The columns to drop are taken from that same
# split (not from "train"), so the call stays correct even if the splits ever
# carry different columns.
test_split = dataset_dict["test"]
dataset_dict_prepared = test_split.map(
    prepare_dataset_fn,
    remove_columns=test_split.column_names,
    num_proc=1,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
# Fetch the first prepared example once and reuse it for every check below.
first_example = dataset_dict_prepared[0]
first_labels = first_example["labels"]

print(dataset_dict_prepared)
print(first_labels)

# Shape of the audio features for this example.
print(np.array(first_example["input_features"]).shape)

# Decode the label ids back to text, with and without the special tokens.
print(myTokenizer.decode(first_labels))
print(myTokenizer.decode(first_labels, skip_special_tokens=True))

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 100
})
[5, 8, 7, 9, 2, 3, 0, 4, 6]
(80, 3000)
<|startoftranscript|> <|ca|> <|transcribe|> <|notimestamps|> deixar caixa agafar pilota <|endoftext|>
deixar caixa agafar pilota


In [7]:
# Metric callback bound to the model and tokenizer. It is not passed to the
# trainer below, so it plays no part in this prediction run.
compute_metrics_fn = partial(compute_metrics, model=model, myTokenizer=myTokenizer)

data_collator = MyDataCollator(
    feature_extractor=processor.feature_extractor,
    tokenizer=myTokenizer,
)

# Inference-only configuration: no training, no evaluation loop, no
# checkpoints and no automatic logging.
inference_kwargs = dict(
    output_dir="temp_dir",          # scratch directory; can be deleted afterwards
    do_train=False,                 # no training is performed
    do_eval=False,                  # no evaluation pass
    per_device_eval_batch_size=16,  # batch size used for prediction
    predict_with_generate=False,    # False to get the logits; True to generate instead
    save_strategy="no",             # disable checkpoint saving
    logging_strategy="no",          # disable automatic logging
)
training_args = Seq2SeqTrainingArguments(**inference_kwargs)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=dataset_dict_prepared,
)

# Full evaluation is left disabled. To run it:
#   eval_results = trainer.evaluate()
#   print("Resultados de evaluación:", eval_results)

# Run prediction on the first two prepared examples only.
first_two_examples = dataset_dict_prepared.select(range(2))
pred_output = trainer.predict(first_two_examples)

print("Loggits:", pred_output.predictions[0])


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Loggits: [[[ 4.50201941e+00  4.96576834e+00  2.54502797e+00  4.31525230e+00
    2.15152788e+00 -2.48287272e+00  5.57102442e-01  3.38520098e+00
    2.66460266e+01  4.54913235e+00  4.88377190e+00]
  [ 3.82566690e+00  3.89053583e+00  5.08556128e+00  4.46134615e+00
    3.86609745e+00  4.38014317e+00  3.92341900e+00  2.10325356e+01
    5.92154264e+00  3.05594921e+00  3.68958783e+00]
  [-3.94085407e+00 -3.96751714e+00 -1.82416773e+00 -4.29596424e+00
   -3.67236328e+00 -4.56119585e+00  1.69887722e+00 -6.71873474e+00
   -5.32996655e+00  1.67842941e+01 -4.19086885e+00]
  [-2.11385512e+00 -4.90395737e+00  1.94160843e+01  1.90426946e+00
    1.03237402e+00 -5.82694054e-01 -3.92123580e+00 -3.46455574e+00
   -4.32737780e+00 -1.03182340e+00 -6.42246842e-01]
  [ 8.91955137e-01  1.48907936e+00  5.10175800e+00  2.10894508e+01
    4.82513905e+00 -6.25683784e-01 -2.38726139e+00  3.73377919e+00
    3.09208989e+00  1.60822511e-01 -1.68424666e-01]
  [ 1.49282675e+01 -9.12228107e-01 -2.35168219e+00 -3.2114515

In [8]:
# First element of the prediction tuple: the per-example score arrays that are
# handed to the CRF as emission scores.
emission_scores = pred_output.predictions[0]

labels = pred_output.label_ids

transitions_file = f"{configs_path}/transitions_file.txt"

# The metric does not depend on the example, so load it once rather than on
# every loop iteration.
metric = evaluate.load("wer")

for i, es in enumerate(emission_scores):  # one pass per element of the batch
    # crf (from src.utils) turns one example's emission scores into the text
    # that is scored against the reference below.
    decoded_sequence = crf(
        transitions_file=transitions_file,
        myTokenizer=myTokenizer,
        emission_scores=es,
    )
    print("Decoded sequence:", decoded_sequence)

    # Reference text for the same example, with special tokens stripped.
    decoded_labels = myTokenizer.decode(labels[i], skip_special_tokens=True)
    print("Labels Decoded sequence:", decoded_labels)

    # Per-example word error rate, expressed as a percentage.
    wer = 100 * metric.compute(predictions=[decoded_sequence], references=[decoded_labels])

    print("Word Error Rate (WER) in %:", wer)

Decoded sequence: deixar caixa agafar pilota
Labels Decoded sequence: deixar caixa agafar pilota
Word Error Rate (WER) in %: 0.0
Decoded sequence: llençar pilota deixar pilota
Labels Decoded sequence: llençar pilota deixar pilota
Word Error Rate (WER) in %: 0.0
