In [None]:
! pip install -r requirements.txt

Collecting sympy==1.13.1 (from torch->-r requirements.txt (line 3))
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.14.0
    Uninstalling sympy-1.14.0:
      Successfully uninstalled sympy-1.14.0
Successfully installed sympy-1.13.1


: 

In [1]:
from finetunning_v2.data_manager import DataManager
from finetunning_v2.tokenizer import DataTokenizer
from finetunning_v2.mt5_lora import build_mt5_lora
from finetunning_v2.data_collator import DataCollatorT5
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from finetunning_v2.callbacks import (
    PrintExamplesCallback,
    PerplexityCallback,
    MemoryCallback,
    GradNormCallback,
    LearningRateCallback
)

: 

# 1. Entrenamiento con Train Entero

In [7]:
dm = DataManager(languages=['en', 'am', 'ar', 'de', 'es', 'hi', 'ru', 'uk', 'zh'])
dm.load_main_dataset()
dm.compute_overlap_feature()
dm.stratified_split(stratify_by=["language"])  # como acordamos
dm.create_curriculum_datasets()  # easy/medium/hard/full a nivel texto

# 2. Tokenizer
dtok = DataTokenizer(
    tokenizer_name="google/mt5-base",
    prefix="detoxify_keep_meaning: ",
    max_input_length=128,
    max_target_length=128,
    logger=dm.logger
)

tokenized_splits = dtok.tokenize_splits(dm.splits)
train_dataset = tokenized_splits["train"]
eval_dataset = tokenized_splits["val"]

# 3. Collator
collator = DataCollatorT5(tokenizer=dtok.tokenizer)

# 4. Model + LoRA
model = build_mt5_lora(base_model_name="google/mt5-base", device="cuda")

# 5. TrainingArguments (los que definimos antes)
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_detox_baseline",
    overwrite_output_dir=True,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=50,
    log_level="info",
    save_total_limit=4,
    save_strategy="epoch",
    eval_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=128,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# 6. Trainer (aquí enchufarías también los callbacks que definimos)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=dtok.tokenizer,
    data_collator=collator,
    callbacks=[
        PrintExamplesCallback(dtok.tokenizer, every_steps=50),
        PerplexityCallback(),
        MemoryCallback(),
        GradNormCallback(every_steps=50),
        LearningRateCallback(),
    ]
)

[11:14:57] [INFO] Inicializando DataManager para idiomas: ['en', 'am', 'ar', 'de', 'es', 'hi', 'ru', 'uk', 'zh']
loading file spiece.model from cache at C:\Users\mario\.cache\huggingface\hub\models--google--mt5-base\snapshots\2eb15465c5dd7f72a8f7984306ad05ebc3dd1e1f\spiece.model
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\mario\.cache\huggingface\hub\models--google--mt5-base\snapshots\2eb15465c5dd7f72a8f7984306ad05ebc3dd1e1f\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\mario\.cache\huggingface\hub\models--google--mt5-base\snapshots\2eb15465c5dd7f72a8f7984306ad05ebc3dd1e1f\tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at C:\Users\mario\.cache\huggingface\hub\models--google--mt5-base\snapshots\2eb15465c5dd7f72a8f7984306ad05ebc3dd1e1f\config.json
Model config MT5Co

Tokenizing dataset (num_proc=4):   0%|          | 0/2880 [00:00<?, ? examples/s]

[11:15:50] [INFO]  → Tokenizing split `val` (360 samples)
[11:15:50] [INFO] Tokenizing dataset with 360 samples...


Tokenizing dataset (num_proc=4):   0%|          | 0/360 [00:00<?, ? examples/s]

[11:16:05] [INFO]  → Tokenizing split `test` (360 samples)
[11:16:05] [INFO] Tokenizing dataset with 360 samples...


Tokenizing dataset (num_proc=4):   0%|          | 0/360 [00:00<?, ? examples/s]

loading configuration file config.json from cache at C:\Users\mario\.cache\huggingface\hub\models--google--mt5-base\snapshots\2eb15465c5dd7f72a8f7984306ad05ebc3dd1e1f\config.json
Model config MT5Config {
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 250112
}

loading weights file pytorch_model.b

AssertionError: Torch not compiled with CUDA enabled

In [6]:
trainer.train()

***** Running training *****
  Num examples = 2,880
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 3,600
  Number of trainable parameters = 1,769,472
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 