In [1]:
from utils import MyCustomTokenizer, MyDataCollator, create_my_whisper_model, prepare_dataset, compute_metrics, crf
import torch
from datasets import load_from_disk
from transformers import WhisperProcessor, Seq2SeqTrainer, Seq2SeqTrainingArguments
import numpy as np
from functools import partial
import evaluate

### Tokenizer

In [2]:

# Directory containing the fine-tuned Catalan Whisper checkpoint.
saved_model_path = "whisper-small-ca"
# Vocabulary file handed to the custom tokenizer.
vocab_path = "ca-vocab.json"
# special_tokens_on_vocab_file=False: presumably the tokenizer adds the Whisper
# special tokens itself instead of reading them from the file (the printed vocab
# lists them after the command words) -- TODO confirm against utils.
myTokenizer = MyCustomTokenizer(vocab_file=vocab_path, special_tokens_on_vocab_file=False)
print(myTokenizer.vocab)


{'agafar': 0, 'llençar': 1, 'deixar': 2, 'caixa': 3, 'pilota': 4, '<|startoftranscript|>': 5, '<|endoftext|>': 6, '<|transcribe|>': 7, '<|ca|>': 8, '<|notimestamps|>': 9, '<|startofprev|>': 10}


### Loading my pretrained model 

In [3]:

# Build the model through the project helper, handing it the custom tokenizer.
model = create_my_whisper_model(myTokenizer=myTokenizer)

# Restore the fine-tuned weights. The path is derived from `saved_model_path`
# so the checkpoint directory is configured in a single place.
# map_location="cpu" lets a checkpoint that was saved on a GPU be loaded on a
# CPU-only machine; load_state_dict then copies the tensors into the model.
checkpoint_file = f"{saved_model_path}/model-small-ca"
state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)

# Sanity check: prints True when the decoder input embedding and the output
# projection share the very same weight tensor.
print(model.model.decoder.embed_tokens.weight is model.proj_out.weight)


[['ag', 'afar'], ['llen', 'Ã§ar'], ['de', 'ix', 'ar'], ['ca', 'ixa'], ['p', 'il', 'ota'], ['<|startoftranscript|>'], ['<|endoftext|>'], ['<|transcribe|>'], ['<|ca|>'], ['<|notimestamps|>'], ['<|startofprev|>']]
[[559, 47030], [19191, 26378], [1479, 970, 289], [496, 19195], [79, 388, 5377], [50258], [50257], [50359], [50270], [50363], [50361]]
True
torch.Size([11, 768])
torch.Size([11, 768])
11
True


In [4]:
# Load the dataset saved on disk and show its structure.
dataset_dict = load_from_disk("Audios/dataset1")
print(dataset_dict)

# Peek at the tags of the first training example.
first_train_example = dataset_dict["train"][0]
print(first_train_example["tags"])

DatasetDict({
    train: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 900
    })
    test: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 100
    })
})
deixar,caixa,llençar,pilota


In [5]:
# Base checkpoint whose processor (feature extractor) is reused here.
model_id = "openai/whisper-small"
language = "ca"
task = "transcribe"

processor = WhisperProcessor.from_pretrained(model_id, language=language, task=task)

# Bind the feature extractor and the custom tokenizer so that `map` only has to
# supply the example itself.
prepare_dataset_fn = partial(
    prepare_dataset,
    feature_extractor=processor.feature_extractor,
    myTokenizer=myTokenizer,
)

# Only the test split is prepared. The columns to drop are taken from that same
# split, so the call stays correct even if the splits' schemas ever diverge.
test_split = dataset_dict["test"]
dataset_dict_prepared = test_split.map(
    prepare_dataset_fn,
    remove_columns=test_split.column_names,
    num_proc=1,
)

In [6]:
# Overview of the prepared split.
print(dataset_dict_prepared)

# Look at the first prepared example: raw label ids, the shape of its input
# features, and the labels decoded with and without special tokens.
first_example = dataset_dict_prepared[0]
first_labels = first_example["labels"]
print(first_labels)
print(np.array(first_example["input_features"]).shape)
print(myTokenizer.decode(first_labels))
print(myTokenizer.decode(first_labels, skip_special_tokens=True))

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 100
})
[5, 8, 7, 9, 1, 3, 2, 4, 6]
(80, 3000)
<|startoftranscript|> <|ca|> <|transcribe|> <|notimestamps|> llençar caixa deixar pilota <|endoftext|>
llençar caixa deixar pilota


In [7]:
# Metric callback pre-bound to the model and tokenizer.
# NOTE(review): it is not passed to the trainer below, so it is unused in this cell.
compute_metrics_fn = partial(compute_metrics, model=model, myTokenizer=myTokenizer)

data_collator = MyDataCollator(
    feature_extractor=processor.feature_extractor,
    tokenizer=myTokenizer,
)

# The trainer is used purely as a batched inference harness.
training_args = Seq2SeqTrainingArguments(
    output_dir="temp_dir",          # scratch directory; can be deleted afterwards
    do_train=False,                 # no training is performed
    do_eval=False,                  # no evaluation loop either
    per_device_eval_batch_size=16,  # batch size used for prediction
    predict_with_generate=False,    # False to get the logits; True would run generation
    save_strategy="no",             # disable checkpoint saving
    logging_strategy="no",          # disable automatic logging
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=dataset_dict_prepared,
    data_collator=data_collator,
)

# Optional full evaluation (disabled):
# eval_results = trainer.evaluate()
# print("Evaluation results:", eval_results)

# Predict on the first two prepared examples only.
pred_output = trainer.predict(dataset_dict_prepared.select(range(2)))

# The first entry of `predictions` is treated as the logits (see the printed
# output: one row of 11 scores per decoding step).
print("Loggits:", pred_output.predictions[0])


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Loggits: [[[ 3.85254955e+00  5.13223171e+00  2.29882765e+00  4.06304741e+00
    2.20220661e+00 -3.06696320e+00 -1.32876790e+00  3.93809748e+00
    2.40637169e+01  3.68621731e+00  5.06297731e+00]
  [ 3.88227797e+00  3.95090318e+00  5.14947605e+00  4.54010534e+00
    3.92293596e+00  4.53559875e+00  3.91867304e+00  2.15303001e+01
    6.07940149e+00  3.12380934e+00  3.78774238e+00]
  [-3.26619816e+00 -3.16114259e+00 -1.16760170e+00 -3.46700048e+00
   -2.92175484e+00 -4.29721594e+00  2.21208978e+00 -6.21102810e+00
   -4.87519646e+00  1.77827187e+01 -3.65769672e+00]
  [-4.38157082e+00  1.47170153e+01 -1.02587211e+00 -6.36096001e+00
   -4.23857784e+00 -6.08483410e+00 -1.24054785e+01 -6.25136566e+00
   -7.00956881e-01 -4.15676975e+00 -4.38167191e+00]
  [ 1.44269741e+00  1.01453257e+00  6.39746237e+00  2.12195206e+01
    4.22571564e+00 -8.75395060e-01 -2.83021259e+00  2.73089552e+00
    4.22566414e+00  1.07082641e+00  2.58942425e-01]
  [-1.93256032e+00 -2.37850046e+00  1.90279999e+01  3.2896862

In [8]:
# Per-example emission scores (the logits) and the reference label ids.
emission_scores = pred_output.predictions[0]

labels = pred_output.label_ids

transitions_file = "transitions_file.txt"

# Load the WER metric once; it does not change between examples.
metric = evaluate.load("wer")

for i, es in enumerate(emission_scores):  # one pass per element of the batch
    # Decode the emission scores with the CRF helper and the transitions file.
    decoded_sequence = crf(transitions_file=transitions_file, myTokenizer=myTokenizer, emission_scores=es)
    print("Decoded sequence:", decoded_sequence)

    # NOTE(review): if label rows can contain padding ids (e.g. -100), confirm
    # that myTokenizer.decode tolerates them.
    decoded_labels = myTokenizer.decode(labels[i], skip_special_tokens=True)
    print("Labels Decoded sequence:", decoded_labels)

    # Word error rate for this single example, as a percentage.
    wer = 100 * metric.compute(predictions=[decoded_sequence], references=[decoded_labels])

    print("Word Error Rate (WER) in %:", wer)

Decoded sequence: llençar caixa deixar pilota
Labels Decoded sequence: llençar caixa deixar pilota
Word Error Rate (WER) in %: 0.0
Decoded sequence: agafar caixa llençar caixa
Labels Decoded sequence: agafar caixa llençar caixa
Word Error Rate (WER) in %: 0.0
