In [12]:
# Imports
import torch
import numpy as np
import time
import os
import re
import json
import pandas as pd
from datasets import Dataset

# IMPORTANTE: Configurar para usar solo 1 GPU ANTES de importar transformers
# Esto previene errores NCCL en sistemas con múltiples GPUs
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Para debugging si es necesario

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import warnings
warnings.filterwarnings('ignore')

# Verificar GPU
device = "cpu"#"cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memoria disponible: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Número de GPUs visibles: {torch.cuda.device_count()}")

Usando dispositivo: cpu
GPU: NVIDIA GeForce GTX 970
Memoria disponible: 4.22 GB
Número de GPUs visibles: 1


In [13]:

lang_code = {
    "asturiano": {
        "tatoeba": "ast",
        "opus": "ast"
    },
    "aranes": {
        "tatoeba": "oci",
        "opus": "oc"
    },
    "aragones": {
        "tatoeba": "arg",
        "opus": "an"
    }
}


In [14]:
class LanguageDatasets():
    def __init__(self, language, initialize=True):
        if language not in ["aragones", "asturiano", "occitano"]:
            raise KeyError("Lenguaje no contemplado")
        self.raw_datasets = {}
        self.language = language
        self.language_codes = lang_code[language]
        self.json = []
        if initialize:
            self.start()

    @property
    def hf_dataset(self):
        """Devuelve un Dataset de HuggingFace a partir de self.json"""
        return Dataset.from_list(self.json)

    def tokenize(self, tokenizer, max_length=512):
        """
        Aplica un tokenizer externo al dataset.
        Devuelve un HuggingFace Dataset tokenizado listo para entrenamiento.
        """
            # Aseguramos que el tokenizer tenga pad_token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        def _tokenize(example):
            result = tokenizer(
                example["text"],
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            result["labels"] = result["input_ids"].copy()
            return result

        return self.hf_dataset.map(_tokenize)

    def start(self):
        print(f"Descargando tatoeba para {self.language}:")
        try:
            self.read_tatoeba_url(
                f"https://downloads.tatoeba.org/exports/per_language/"
                f"{self.language_codes['tatoeba']}/"
                f"{self.language_codes['tatoeba']}_sentences_detailed.tsv.bz2"
            )
            print("Completado con éxito")
        except Exception as e:
            print("No se pudo completar por:", e)

        print(f"Cargando txt locales para {self.language}:")
        self.read_folder(f"datasets/{self.language}")

    def read_tatoeba_url(self, url):
        df = pd.read_csv(
            url,
            sep="\t",
            compression="bz2",
            header=None,
            names=["id", "lang", "text", "author", "created_at", "updated_at"]
        )
        if df.iloc[0]["lang"] != self.language_codes["tatoeba"]:
            raise ValueError("El dataset descargado no corresponde al idioma esperado")
        elif "tatoeba" in self.raw_datasets:
            raise ValueError("El dataset tatoeba ya está cargado")

        json_data = self.pandas_to_json(df)
        start = len(self.json)
        self.json += json_data
        end = len(self.json)
        self.raw_datasets["tatoeba"] = {"start": start, "end": end}
        return df

    def read_folder(self, directory):
        for file in os.listdir(directory):
            if file.endswith(".txt"):
                try:
                    self.read_local_file(directory, file)
                    print(f"Archivo {file} cargado")
                except Exception as e:
                    print(f"Fallo al cargar el archivo {file}: {e}")

    def read_local_file(self, directory, file):
        dataset_name = file.split(".")[0]
        if dataset_name in self.raw_datasets.keys():
            raise ValueError(f"El dataset {directory}/{file} ya está cargado")

        json_data = []
        with open(os.path.join(directory, file), "r", encoding="utf-8") as f:
            for line in f:
                clean_line = self.clean_text(line)
                if clean_line:
                    json_data.append({"text": clean_line})

        start = len(self.json)
        self.json += json_data
        end = len(self.json)
        self.raw_datasets[dataset_name] = {"start": start, "end": end}
        return json_data

    def clean_text(self, text):
        text = re.sub(r"[^\w\s\n]", " ", text)
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"(.)\1{5,}", r"\1"*5, text)
        def limit_word_reps(match):
            word = match.group(1)
            return " ".join([word]*5)
        text = re.sub(r"\b(\w+)( \1){5,}\b", limit_word_reps, text)
        text = text.lower()
        return text.strip()

    def pandas_to_json(self, df, clean=True, save=False):
        json_data = []
        for t in df["text"].tolist():
            clean_line = self.clean_text(t) if clean else t
            if clean_line:
                json_data.append({"text": clean_line})
        if save:
            with open(save, "w", encoding="utf-8") as f:
                f.write(json.dumps(json_data, ensure_ascii=False))
        return json_data


In [15]:
from transformers import AutoTokenizer
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Inicializamos dataset
ast = LanguageDatasets("asturiano")

MODEL_NAME = "Qwen/Qwen2.5-3B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenizamos dataset
tokenized_dataset = ast.tokenize(tokenizer)

# Ajustamos labels para ignorar el relleno en la pérdida
def mask_labels(example):
    example["labels"] = [
        (id if mask == 1 else -100)
        for id, mask in zip(example["input_ids"], example["attention_mask"])
    ]
    return example

tokenized_dataset = tokenized_dataset.map(mask_labels)


Descargando tatoeba para asturiano:
Completado con éxito
Cargando txt locales para asturiano:


Map:   0%|          | 0/814 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

### Collation

In [16]:
# Data collator para padding dinámico
# Alternativa a padding="max_length": permite diferentes longitudes por batch
# Más eficiente en memoria cuando las secuencias tienen longitudes variadas
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,     # Tokenizer para aplicar padding
    padding=True,            # Aplicar padding dinámico
    max_length=None,         # Sin límite adicional (usa el del tokenizer)
    pad_to_multiple_of=None  # Sin redondeo de longitud
)

# Métrica de evaluación
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Calcula métricas de evaluación durante el entrenamiento.
    
    Args:
        eval_pred: Tuple con (predictions, labels)
                   predictions: logits del modelo (shape: [batch_size, num_labels])
                   labels: etiquetas verdaderas (shape: [batch_size])
    
    Returns:
        Dict con métricas calculadas
    """
    predictions, labels = eval_pred
    # Convertir logits a clases predichas (argmax sobre dimensión de clases)
    predictions = np.argmax(predictions, axis=1)
    # Calcular accuracy comparando predicciones con labels verdaderos
    return accuracy_metric.compute(predictions=predictions, references=labels)

### Preparar modelo Base

In [17]:
from transformers import AutoModelForCausalLM

base_model_id = "Qwen/Qwen2.5-3B"

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto"   # usa GPU si está disponible
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


### Configurar Lora

In [18]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # módulos típicos en Qwen/Mistral
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Aplicar LoRA al modelo
model_lora = get_peft_model(model, lora_config)

# Imprimir información del modelo
print("\nInformación del modelo con LoRA:")
model_lora.print_trainable_parameters()

# Contar parámetros manualmente para comparación
total_params_lora = sum(p.numel() for p in model_lora.parameters())
trainable_params_lora = sum(p.numel() for p in model_lora.parameters() if p.requires_grad)

print(f"\nRESUMEN:")
print(f"Parámetros totales: {total_params_lora:,}")
print(f"Parámetros entrenables: {trainable_params_lora:,}")
print(f"Porcentaje entrenable: {100 * trainable_params_lora / total_params_lora:.2f}%")
print(f"\n¡Solo entrenamos ~1% de los parámetros del modelo!")
print(f"\nCon rank r={lora_config.r}:")
print(f"  - Cada matriz LoRA añade: d×r + r×k parámetros")
print(f"  - Para attention de dim 768: ~{(768*8 + 8*768)*2:,} params por capa")


Información del modelo con LoRA:
trainable params: 3,686,400 || all params: 3,089,625,088 || trainable%: 0.1193

RESUMEN:
Parámetros totales: 3,089,625,088
Parámetros entrenables: 3,686,400
Porcentaje entrenable: 0.12%

¡Solo entrenamos ~1% de los parámetros del modelo!

Con rank r=8:
  - Cada matriz LoRA añade: d×r + r×k parámetros
  - Para attention de dim 768: ~24,576 params por capa


### Entrenamiento

In [19]:
# Configuración de entrenamiento
training_args_lora = TrainingArguments(
    output_dir="./results_lora",
    num_train_epochs=3,              # Número de épocas completas de entrenamiento
    per_device_train_batch_size=8,   # Batch size para entrenamiento
    per_device_eval_batch_size=16,   # Batch size para evaluación (puede ser mayor)
    learning_rate=2e-4,              # Learning rate (típicamente más alto con LoRA: 1e-4 a 3e-4)
    weight_decay=0.01,               # Regularización L2
    eval_strategy="no",           # Evaluar al final de cada época
    save_strategy="no",              # No guardar checkpoints (para rapidez)
    logging_steps=50,                # Log cada 50 steps
    report_to="none",                # No reportar a wandb/tensorboard
    fp16=torch.cuda.is_available(),  # Mixed precision training si hay GPU
)

# Crear Trainer
trainer_lora = Trainer(
    model=model_lora,
    args=training_args_lora,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# Entrenar y medir tiempo/memoria
torch.cuda.empty_cache() if torch.cuda.is_available() else None

# Métricas de memoria
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    start_memory_lora = torch.cuda.memory_allocated() / 1e9  # GB
    print(f"Memoria GPU inicial: {start_memory_lora:.2f} GB")

start_time = time.time()

print("\nIniciando entrenamiento con LoRA...")
print("   (Esto debería ser más rápido que full fine-tuning)\n")

trainer_lora.train()

end_time = time.time()
training_time_lora = end_time - start_time

# Reportar uso de memoria
if torch.cuda.is_available():
    peak_memory_lora = torch.cuda.max_memory_allocated() / 1e9  # GB
    current_memory_lora = torch.cuda.memory_allocated() / 1e9  # GB
    print(f"\nREPORTE DE MEMORIA:")
    print(f"   Memoria inicial: {start_memory_lora:.2f} GB")
    print(f"   Memoria actual: {current_memory_lora:.2f} GB")
    print(f"   Memoria pico durante entrenamiento: {peak_memory_lora:.2f} GB")
    print(f"   Memoria adicional usada: {peak_memory_lora - start_memory_lora:.2f} GB")
else:
    peak_memory_lora = 0
    print(f"\nEjecutando en CPU (no hay métricas de memoria GPU)")

print(f"\nTIEMPO DE ENTRENAMIENTO:")
print(f"   {training_time_lora:.2f} segundos ({training_time_lora/60:.2f} minutos)")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Memoria GPU inicial: 0.00 GB

Iniciando entrenamiento con LoRA...
   (Esto debería ser más rápido que full fine-tuning)



### Evaluar

In [None]:
# Evaluar
print("\nEvaluando modelo con LoRA...")
eval_results_lora = trainer_lora.evaluate()

print("\n" + "=" * 70)
print("RESULTADOS - LORA (PEFT)")
print("=" * 70)
print(f"Accuracy: {eval_results_lora['eval_accuracy']:.4f}")
print(f"Loss: {eval_results_lora['eval_loss']:.4f}")
print(f"Parámetros entrenables: {trainable_params_lora:,} ({100 * trainable_params_lora / total_params_lora:.2f}%)")
print(f"Tiempo: {training_time_lora:.2f}s ({training_time_lora/60:.2f} min)")
if torch.cuda.is_available():
    print(f"Memoria pico: {peak_memory_lora:.2f} GB")
print("=" * 70)

print("\nLoRA permite hacer fine-tuning de modelos grandes eficientemente")
print("   Ideal para: recursos limitados, múltiples tareas, experimentación rápida")


## Usar Modelo

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base_model_id = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16
)


In [None]:
from peft import PeftModel

# Ruta donde guardaste tu LoRA tras el entrenamiento
lora_path = "./lora-asturiano"

model = PeftModel.from_pretrained(model, lora_path)


In [None]:
prompt = "¿Cómo ta el cielu al atapecer en Xixón?"

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
