Archivos tocados con prints: modeling_whisper.py, utils.py

In [1]:
from src.utils import *
from functools import partial
import numpy as np
from transformers import WhisperProcessor
from datasets import load_from_disk
import torch

### Creamos objeto de MyCustomTokenizer

In [2]:
# Whisper checkpoint plus the language/task used to configure the processor.
model_id = "openai/whisper-small"
language = "ca"
task = "transcribe"

# The pretrained processor is loaded here; later cells use its feature extractor.
processor = WhisperProcessor.from_pretrained(
    model_id,
    language=language,
    task=task,
)

# The custom tokenizer reads its vocabulary from a JSON file in the config folder.
configs_path = "config-data"
myTokenizer = MyCustomTokenizer(vocab_file=f"{configs_path}/ca-vocab.json")

# Sanity check: tokenize a comma-separated tag string, then show the vocabulary.
print(myTokenizer.tokenize("agafar,caixa"))

print(myTokenizer.vocab)

['agafar', 'caixa']
{'agafar': 0, 'llençar': 1, 'deixar': 2, 'caixa': 3, 'pilota': 4, '<|startoftranscript|>': 5, '<|endoftext|>': 6, '<|transcribe|>': 7, '<|ca|>': 8, '<|notimestamps|>': 9, '<|startofprev|>': 10}


## Data

### Data loading

In [3]:
# Load the dataset splits previously saved to disk.
dataset_dict = load_from_disk("Audios/dataset1")
print(dataset_dict)

# Peek at the tag string of the first training example.
first_train_example = dataset_dict["train"][0]
print(first_train_example["tags"])

DatasetDict({
    train: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 900
    })
    test: Dataset({
        features: ['audio_sentence', 'audio_rate', 'text_sentence', 'tags'],
        num_rows: 100
    })
})
deixar,caixa,deixar,pilota


### Data preprocessing

In [4]:
# Bind the feature extractor and tokenizer so `map` only has to pass each example.
prepare_dataset_fn = partial(
    prepare_dataset,
    feature_extractor=processor.feature_extractor,
    myTokenizer=myTokenizer,
)

# Remove every original column so only what `prepare_dataset` produces remains.
original_columns = dataset_dict.column_names["train"]
dataset_dict_prepared = dataset_dict.map(
    prepare_dataset_fn,
    remove_columns=original_columns,
    num_proc=4,
)

In [5]:
print(dataset_dict_prepared)

# For the first example of each split, show: the label ids, the shape of the
# input features, and the labels decoded with and without special tokens.
for split_name in ("train", "test"):
    first_example = dataset_dict_prepared[split_name][0]
    label_ids = first_example["labels"]

    print(label_ids)
    print(np.array(first_example["input_features"]).shape)
    print(myTokenizer.decode(label_ids))
    print(myTokenizer.decode(label_ids, skip_special_tokens=True))

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 100
    })
})
[5, 8, 7, 9, 2, 3, 2, 4, 6]
(80, 3000)
<|startoftranscript|> <|ca|> <|transcribe|> <|notimestamps|> deixar caixa deixar pilota <|endoftext|>
deixar caixa deixar pilota
[5, 8, 7, 9, 2, 3, 0, 4, 6]
(80, 3000)
<|startoftranscript|> <|ca|> <|transcribe|> <|notimestamps|> deixar caixa agafar pilota <|endoftext|>
deixar caixa agafar pilota


### Data Collator

In [6]:
my_datacollator = MyDataCollator(
    feature_extractor=processor.feature_extractor,
    tokenizer=myTokenizer,
)

# Collate the first five training examples into one batch as a smoke test.
aux = [dataset_dict_prepared["train"][i] for i in range(5)]
batch = my_datacollator(aux)

# Inspect the collated label tensor and the shape of the stacked features.
print(batch["labels"])
print(batch["input_features"].shape)

tensor([[   8,    7,    9,    2,    3,    2,    4,    6],
        [   8,    7,    9,    2,    3,    1,    3,    6],
        [   8,    7,    9,    2,    3,    2,    4,    6],
        [   8,    7,    9,    2,    4,    0,    3,    6],
        [   8,    7,    9,    1,    3,    6, -100, -100]])
torch.Size([5, 80, 3000])


## Model

### Loading model and changing configs for our vocabulary and switching embedding layer

In [7]:
# Build the Whisper model through the project helper, handing it the custom
# tokenizer. Per the output recorded below this cell, the helper reports that
# the token-embedding and projection layers share weights, that the embedding
# has shape [11, 768], and that the config's vocab_size is changed to 11.
model = create_my_whisper_model(myTokenizer=myTokenizer)

embeding tokens layer and projection layer share the same weights: True
shape of embedding tokens layer:  torch.Size([11, 768])
changing model's config vocab_size: 11


## Training

### Metrics

In [8]:

# Pre-bind the model and tokenizer; the trainer receives this callable as its
# `compute_metrics` argument.
compute_metrics_fn = partial(compute_metrics, model=model, myTokenizer=myTokenizer)


### Setting Arguments and initializing trainer

In [9]:
from transformers import Seq2SeqTrainingArguments

# Training configuration, grouped by concern. Values are unchanged.
training_args = Seq2SeqTrainingArguments(
    output_dir="whisper-small-ca",
    # --- optimisation ---
    num_train_epochs=6,  # train for this many epochs
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    learning_rate=1e-4,
    warmup_steps=100,
    fp16=True,
    # gradient_checkpointing=True,  # left disabled
    # --- evaluation ---
    eval_strategy="epoch",  # evaluate at the end of every epoch
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # --- logging ---
    logging_steps=25,
    report_to=["tensorboard"],
    # --- checkpointing ---
    save_strategy="no",
    save_steps=None,
)


In [10]:
from transformers import Seq2SeqTrainer

# Wire the model, prepared splits, collator and metric callback into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    processing_class=myTokenizer,
    data_collator=my_datacollator,
    compute_metrics=compute_metrics_fn,
    train_dataset=dataset_dict_prepared["train"],
    eval_dataset=dataset_dict_prepared["test"],
)

### Training

In [11]:
# Run the training loop configured above. The log recorded below this cell
# shows one evaluation row per epoch, with the Wer column reaching 0.0 at
# epoch 5.
trainer.train()
# Completion marker printed once training returns.
print("entrenamiento completado")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Wer
1,0.2402,0.114012,10.928962
2,0.0713,0.097227,2.73224
3,0.0237,0.005385,0.819672
4,0.0014,0.009207,0.546448
5,0.0,0.000539,0.0
6,0.0,0.000534,0.0


predicted:  ['deixar caixa agafar pilota', 'llençar pilota deixar pilota', 'deixar caixa', 'llençar caixa llençar pilota', 'llençar caixa deixar pilota', 'deixar pilota', 'llençar caixa agafar pilota', 'deixar pilota', 'llençar caixa agafar pilota', 'llençar pilota agafar caixa', 'deixar caixa', 'llençar pilota agafar caixa', 'llençar pilota llençar caixa', 'deixar pilota', 'deixar caixa agafar caixa', 'agafar caixa deixar pilota', 'agafar caixa', 'llençar caixa agafar pilota', 'llençar caixa agafar pilota', 'agafar caixa llençar pilota', 'deixar pilota', 'deixar caixa agafar pilota', 'agafar pilota', 'agafar pilota llençar pilota', 'agafar pilota agafar caixa', 'llençar caixa', 'llençar pilota deixar caixa', 'deixar pilota llençar caixa', 'deixar pilota', 'agafar pilota deixar caixa', 'agafar caixa', 'agafar pilota', 'llençar caixa agafar pilota', 'deixar caixa agafar pilota', 'deixar caixa llençar pilota', 'deixar pilota deixar pilota', 'llençar pilota deixar pilota', 'agafar caixa d

### Saving model locally

In [12]:
import os

# Show the model architecture. The original `print(model.state_dict)` printed
# the *bound method object* (the call parentheses were missing), which only
# displayed the architecture by accident, wrapped in "<bound method ...>".
print(model)

path = "whisper-small-ca/model-small-ca.bin"

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(path), exist_ok=True)

# Persist only the weights (state dict), not the whole pickled module.
torch.save(model.state_dict(), path)
print(f"modelo guardado en {path}")

<bound method Module.state_dict of WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, ou