archivos tokados con prints modeling_whisper.py, utils.py

In [14]:
from utils import *
from functools import partial
import numpy as np
from transformers import WhisperProcessor
from datasets import load_from_disk
import torch

### Creamos objeto de MyCustomTokenizer

In [None]:
model_id = "openai/whisper-small"
language = "ca"
task = "transcribe"

processor = WhisperProcessor.from_pretrained(model_id, language=language, task=task)

myTokenizer = MyCustomTokenizer(vocab_file="ca-vocab.json")

print(myTokenizer.tokenize("agafar,caixa"))

print(myTokenizer.vocab)

['agafar', 'caixa']
{'agafar': 0, 'llençar': 1, 'deixar': 2, 'caixa': 3, 'pilota': 4, '<|startoftranscript|>': 5, '<|endoftext|>': 6, '<|transcribe|>': 7, '<|ca|>': 8, '<|notimestamps|>': 9, '<|startofprev|>': 10}


## Data

### Data loading

In [None]:
dataset_dict = load_from_disk("Audios/dataset1")
print(dataset_dict)
print(dataset_dict["train"][0]["tags"])

DatasetDict({
    train: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 900
    })
    test: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 100
    })
})
deixar,caixa,llençar,pilota


### Data preprocessing

In [None]:
prepare_dataset_fn = partial(prepare_dataset, feature_extractor=processor.feature_extractor, myTokenizer=myTokenizer)

dataset_dict_prepared = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict.column_names["train"], num_proc=4)

In [None]:
print(dataset_dict_prepared)
print(dataset_dict_prepared["train"][0]["labels"])
print(np.array(dataset_dict_prepared["train"][0]["input_features"]).shape)
print(myTokenizer.decode(dataset_dict_prepared["train"][0]["labels"]))
print(myTokenizer.decode(dataset_dict_prepared["train"][0]["labels"], skip_special_tokens=True))

print(dataset_dict_prepared["test"][0]["labels"])
print(np.array(dataset_dict_prepared["test"][0]["input_features"]).shape)
print(myTokenizer.decode(dataset_dict_prepared["test"][0]["labels"]))
print(myTokenizer.decode(dataset_dict_prepared["test"][0]["labels"], skip_special_tokens=True))

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 100
    })
})
[5, 8, 7, 9, 2, 3, 1, 4, 6]
(80, 3000)
<|startoftranscript|> <|ca|> <|transcribe|> <|notimestamps|> deixar caixa llençar pilota <|endoftext|>
deixar caixa llençar pilota
[5, 8, 7, 9, 1, 3, 2, 4, 6]
(80, 3000)
<|startoftranscript|> <|ca|> <|transcribe|> <|notimestamps|> llençar caixa deixar pilota <|endoftext|>
llençar caixa deixar pilota


### Data Collator

In [None]:
my_datacollator = MyDataCollator(feature_extractor=processor.feature_extractor, tokenizer=myTokenizer)

aux = []
for i in range(5):
    aux.append(dataset_dict_prepared["train"][i])
batch = my_datacollator(aux)
print(batch["labels"])
print(batch["input_features"].shape)

tensor([[8, 7, 9, 2, 3, 1, 4, 6],
        [8, 7, 9, 2, 4, 0, 4, 6],
        [8, 7, 9, 1, 4, 2, 3, 6],
        [8, 7, 9, 2, 4, 0, 3, 6],
        [8, 7, 9, 2, 3, 0, 4, 6]])
torch.Size([5, 80, 3000])


## Model

### Loading model and changing configs for our vocabulary and switching embedding layer

In [None]:
model = create_my_whisper_model(myTokenizer=myTokenizer)

[['ag', 'afar'], ['llen', 'Ã§ar'], ['de', 'ix', 'ar'], ['ca', 'ixa'], ['p', 'il', 'ota'], ['<|startoftranscript|>'], ['<|endoftext|>'], ['<|transcribe|>'], ['<|ca|>'], ['<|notimestamps|>'], ['<|startofprev|>']]
[[559, 47030], [19191, 26378], [1479, 970, 289], [496, 19195], [79, 388, 5377], [50258], [50257], [50359], [50270], [50363], [50361]]
True
torch.Size([11, 768])
torch.Size([11, 768])
11


## Training

### Metrics

In [None]:

compute_metrics_fn = partial(
    compute_metrics,
    model=model,
    myTokenizer=myTokenizer,
)


### Setting Arguments and initializing trainer

In [10]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="whisper-small-ca",  # cambia el nombre del repositorio si lo deseas
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # aumenta de 2x por cada 2x de disminución en el batch size
    learning_rate=1e-4,
    warmup_steps=100,
    num_train_epochs=5,             # Entrenamiento durante 10 epochs
    #gradient_checkpointing=True,
    fp16=True,
    eval_strategy="epoch",     # Evalúa al final de cada epoch
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    logging_steps=25,
    report_to=["tensorboard"],
    metric_for_best_model="wer",
    greater_is_better=False,
    save_strategy="no",  # Esto evitará que el modelo se guarde automáticamente
    save_steps=None  # Si no quieres que se guarde en pasos
)


In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict_prepared["train"],
    eval_dataset=dataset_dict_prepared["test"],
    data_collator=my_datacollator,
    compute_metrics=compute_metrics_fn,
    processing_class=myTokenizer,
)

### Training

In [12]:
trainer.train()
print("entrenamiento completado")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Wer
1,0.2232,0.080034,7.837838
2,0.3554,0.029342,2.432432
3,0.0191,0.026362,1.081081
4,0.0022,0.011089,1.351351
5,0.0017,0.003021,0.540541


predicted:  ['llençar caixa deixar caixa', 'agafar caixa llençar caixa', 'llençar caixa deixar caixa', 'llençar pilota', 'agafar caixa agafar pilota', 'llençar caixa', 'agafar pilota', 'deixar pilota llençar caixa', 'agafar caixa', 'deixar pilota llençar pilota', 'agafar pilota agafar caixa', 'llençar caixa', 'llençar pilota', 'llençar pilota agafar pilota', 'deixar caixa llençar pilota', 'llençar pilota', 'llençar pilota agafar caixa', 'llençar pilota deixar caixa', 'deixar pilota', 'agafar caixa agafar pilota', 'agafar caixa llençar pilota', 'deixar caixa llençar pilota', 'agafar pilota deixar pilota', 'llençar caixa llençar caixa', 'llençar caixa agafar pilota', 'deixar pilota llençar pilota', 'llençar pilota', 'agafar caixa deixar pilota', 'agafar pilota', 'deixar pilota llençar pilota', 'deixar caixa llençar pilota', 'llençar pilota', 'llençar caixa deixar caixa', 'agafar caixa deixar caixa', 'agafar pilota deixar caixa', 'llençar pilota agafar caixa', 'deixar caixa llençar caixa'

### Saving model locally

In [None]:
print(model.state_dict)
path = "whisper-small-ca/model-small-ca.bin"
torch.save(model.state_dict(), path)
print(f"modelo guardado en {path}")

<bound method Module.state_dict of WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, ou

### Saving Model in Huggingface