In [1]:
import os
import re
import json
import pandas as pd
from datasets import Dataset

# Per-language identifiers used by each corpus provider.
# "tatoeba" is the code used in Tatoeba's per-language export URLs and in the
# "lang" column of the downloaded file; "opus" is the code reserved for OPUS.
# "aranes" and "occitano" are two names for the same entry so that both the
# table and the names accepted by LanguageDatasets resolve to the same codes.
lang_code = {
    "asturiano": {
        "tatoeba": "ast",
        "opus": "ast"
    },
    "aranes": {
        "tatoeba": "oci",
        "opus": "oc"
    },
    "occitano": {
        "tatoeba": "oci",
        "opus": "oc"
    },
    "aragones": {
        "tatoeba": "arg",
        "opus": "an"
    }
}


In [None]:
class LanguageDatasets():
    """Gathers text corpora for one language and exposes them as a HuggingFace Dataset.

    Every source appends ``{"text": ...}`` records to ``self.json``;
    ``self.raw_datasets`` maps each source name to the ``start``/``end``
    offsets of the slice it occupies inside ``self.json``.
    """

    def __init__(self, language, initialize=True):
        """Set up empty storage and optionally load every source.

        Args:
            language: Key of the module-level ``lang_code`` table.
            initialize: When true, ``start()`` is run immediately.

        Raises:
            KeyError: If ``language`` has no entry in ``lang_code``.
        """
        # Validate against the code table itself so the accepted names cannot
        # drift away from the entries that actually exist.
        if language not in lang_code:
            raise KeyError("Lenguaje no contemplado")
        self.raw_datasets = {}
        self.language = language
        self.language_codes = lang_code[language]
        self.json = []
        if initialize:
            self.start()

    @property
    def hf_dataset(self):
        """Return a HuggingFace Dataset built from ``self.json``."""
        return Dataset.from_list(self.json)

    def tokenize(self, tokenizer, max_length=512):
        """Tokenize the dataset with an external tokenizer.

        Each example is truncated/padded to ``max_length`` and receives a
        ``labels`` column equal to ``input_ids`` with padded positions set to
        -100, so padding does not contribute to the training loss.

        Args:
            tokenizer: Callable tokenizer exposing ``pad_token``/``eos_token``.
                Its ``pad_token`` is set to ``eos_token`` when missing.
            max_length: Fixed sequence length of every tokenized example.

        Returns:
            The tokenized HuggingFace Dataset.
        """
        # Padding to a fixed length requires a pad token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        def _tokenize(example):
            result = tokenizer(
                example["text"],
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            if "attention_mask" in result:
                # Mask by attention rather than by token id: the pad token may
                # be the EOS token, which must stay a valid target elsewhere.
                result["labels"] = [
                    token_id if attended == 1 else -100
                    for token_id, attended in zip(
                        result["input_ids"], result["attention_mask"]
                    )
                ]
            else:
                result["labels"] = list(result["input_ids"])
            return result

        return self.hf_dataset.map(_tokenize)

    def start(self):
        """Load every source: the Tatoeba export, then local ``.txt`` files.

        A failed Tatoeba download is reported and does not stop the local load.
        """
        print(f"Descargando tatoeba para {self.language}:")
        try:
            code = self.language_codes["tatoeba"]
            self.read_tatoeba_url(
                "https://downloads.tatoeba.org/exports/per_language/"
                f"{code}/{code}_sentences_detailed.tsv.bz2"
            )
            print("Completado con éxito")
        except Exception as e:
            print("No se pudo completar por:", e)

        print(f"Cargando txt locales para {self.language}:")
        self.read_folder(f"datasets/{self.language}")

    def _register_dataset(self, name, records):
        """Append ``records`` to ``self.json`` and store the slice they occupy under ``name``."""
        start = len(self.json)
        self.json += records
        self.raw_datasets[name] = {"start": start, "end": len(self.json)}

    def read_tatoeba_url(self, url):
        """Download a Tatoeba per-language export and add its sentences.

        Args:
            url: Location of a bz2-compressed, tab-separated export.

        Returns:
            The raw DataFrame that was read.

        Raises:
            ValueError: If Tatoeba is already loaded, the file is empty, or its
                first row is tagged with a different language code.
        """
        import csv  # only needed here, for the quoting constant

        # Check before downloading so a repeated call costs nothing
        if "tatoeba" in self.raw_datasets:
            raise ValueError("El dataset tatoeba ya está cargado")

        df = pd.read_csv(
            url,
            sep="\t",
            compression="bz2",
            header=None,
            names=["id", "lang", "text", "author", "created_at", "updated_at"],
            # The export is plain TSV without quoting; treating '"' as a quote
            # character would merge rows whenever a sentence contains one.
            quoting=csv.QUOTE_NONE,
        )
        if df.empty:
            raise ValueError("El dataset descargado está vacío")
        if df.iloc[0]["lang"] != self.language_codes["tatoeba"]:
            raise ValueError("El dataset descargado no corresponde al idioma esperado")

        self._register_dataset("tatoeba", self.pandas_to_json(df))
        return df

    def read_folder(self, directory):
        """Load every ``.txt`` file found directly inside ``directory``.

        A missing directory is reported and skipped. Files are visited in
        sorted order so the offsets in ``self.raw_datasets`` are reproducible.
        A file that fails to load is reported and the remaining files are
        still processed.
        """
        if not os.path.isdir(directory):
            print(f"No existe el directorio {directory}; no se cargan archivos locales")
            return

        for file in sorted(os.listdir(directory)):
            if file.endswith(".txt"):
                try:
                    self.read_local_file(directory, file)
                    print(f"Archivo {file} cargado")
                except Exception as e:
                    print(f"Fallo al cargar el archivo {file}: {e}")

    def read_local_file(self, directory, file):
        """Read one text file, one cleaned record per non-empty line.

        Args:
            directory: Folder containing the file.
            file: File name; its name without the final extension becomes the
                dataset name.

        Returns:
            The list of ``{"text": ...}`` records added.

        Raises:
            ValueError: If a dataset with the same name is already loaded.
        """
        # Strip only the last extension so "a.v1.txt" and "a.v2.txt" do not
        # collapse into the same dataset name.
        dataset_name = os.path.splitext(file)[0]
        if dataset_name in self.raw_datasets:
            raise ValueError(f"El dataset {directory}/{file} ya está cargado")

        json_data = []
        with open(os.path.join(directory, file), "r", encoding="utf-8") as f:
            for line in f:
                clean_line = self.clean_text(line)
                if clean_line:
                    json_data.append({"text": clean_line})

        self._register_dataset(dataset_name, json_data)
        return json_data

    def clean_text(self, text):
        """Normalize a string: drop punctuation, collapse repeats, lowercase, trim.

        Characters repeated more than five times in a row and words repeated
        more than five times in a row are cut down to five occurrences.
        """
        # Replace everything that is neither a word character nor whitespace
        text = re.sub(r"[^\w\s\n]", " ", text)
        text = re.sub(r"[ \t]+", " ", text)
        # Lowercase before collapsing so "Ja ja ja ..." counts as repetition
        text = text.lower()
        text = re.sub(r"(.)\1{5,}", r"\1" * 5, text)

        def limit_word_reps(match):
            word = match.group(1)
            return " ".join([word] * 5)

        text = re.sub(r"\b(\w+)( \1){5,}\b", limit_word_reps, text)
        return text.strip()

    def pandas_to_json(self, df, clean=True, save=False):
        """Convert the ``text`` column of ``df`` into ``{"text": ...}`` records.

        Args:
            df: DataFrame with a ``text`` column.
            clean: When true, every value goes through ``clean_text``.
            save: ``False`` to skip writing, or a file path where the records
                are written as JSON.

        Returns:
            The list of records; missing and empty values are skipped.

        Raises:
            TypeError: If ``save`` is truthy but not a path.
        """
        json_data = []
        for t in df["text"].tolist():
            if not isinstance(t, str):
                # Missing cells arrive as NaN/None and cannot be cleaned
                if pd.isna(t):
                    continue
                t = str(t)
            clean_line = self.clean_text(t) if clean else t
            if clean_line:
                json_data.append({"text": clean_line})

        if save:
            # open(True, "w") would open file descriptor 1 (stdout) and close it
            if not isinstance(save, (str, bytes, os.PathLike)):
                raise TypeError("save debe ser una ruta de archivo")
            with open(save, "w", encoding="utf-8") as f:
                json.dump(json_data, f, ensure_ascii=False)
        return json_data


In [None]:
from transformers import AutoTokenizer
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Build the Asturian corpus; with the default initialize=True the constructor
# runs start(), which downloads Tatoeba and reads the local .txt files
ast = LanguageDatasets("asturiano")

# Load the Qwen2.5-3B tokenizer and give it a pad token if it has none
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the corpus
tokenized_dataset = ast.tokenize(tokenizer)

# Rewrite the labels so that padding is ignored by the loss
def mask_labels(example):
    """Set ``example["labels"]`` to the input ids with unattended positions as -100.

    A position keeps its token id only where ``attention_mask`` equals 1.
    The example is modified in place and also returned.
    """
    masked = []
    for token_id, attended in zip(example["input_ids"], example["attention_mask"]):
        masked.append(token_id if attended == 1 else -100)
    example["labels"] = masked
    return example

# Replace the labels of every example with the padding-masked version built by mask_labels
tokenized_dataset = tokenized_dataset.map(mask_labels)


2025-11-17 12:30:14.669076: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Descargando tatoeba para asturiano:
Completado con éxito
Cargando txt locales para asturiano:


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

: 

### Preparar modelo Base

In [None]:
from transformers import BitsAndBytesConfig

# Checkpoint to fine-tune
base_model_id = "Qwen/Qwen2.5-3B"

# 4-bit NF4 quantization with double quantization and bfloat16 as compute dtype;
# fp32 CPU offload is enabled alongside the automatic device map used below
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the quantized base model, letting the library decide device placement
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)


config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Error named symbol not found at line 57 in file /src/csrc/ops.cu


### Configurar LoRA

In [None]:
from peft import prepare_model_for_kbit_training

# The base model was loaded in 4-bit: prepare it for k-bit training before
# attaching the adapter, as recommended by PEFT for quantized models
model = prepare_model_for_kbit_training(model)

# Rank-8 LoRA on the attention query/value projections
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # assumes the loaded model names its projections this way — TODO confirm for the chosen checkpoint
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


### Entrenamiento

In [None]:
# Directory where the trained adapter is written; the inference section of this
# notebook loads the adapter from "./lora-asturiano"
lora_output_dir = "./lora-asturiano"

# Effective batch size per device is 2 * 4 = 8 sequences per optimizer step
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    # NOTE(review): the quantization config uses bfloat16 as compute dtype while
    # mixed precision here is fp16 — confirm this combination is intended
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Persist the adapter: without this nothing is ever written to the path the
# inference section reads from
model.save_pretrained(lora_output_dir)


## Usar Modelo

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Must be the same checkpoint the LoRA adapter was trained on; the training
# section of this notebook fine-tunes "Qwen/Qwen2.5-3B", and an adapter cannot
# be applied to a different architecture
base_model_id = "Qwen/Qwen2.5-3B"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token when the tokenizer defines none
    tokenizer.pad_token = tokenizer.eos_token

# Half-precision base model with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16
)


In [None]:
from peft import PeftModel

# Path where the LoRA adapter was saved after training
lora_path = "./lora-asturiano"

# Wrap the base model with the adapter stored at that path
model = PeftModel.from_pretrained(
    model,
    lora_path,
)


In [None]:
prompt = "¿Cómo ta el cielu al atapecer en Xixón?"

# The model is placed with device_map="auto", so move the inputs to wherever
# the model actually lives instead of assuming a CUDA device
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Nucleus sampling; the pad token id is passed explicitly because the tokenizer
# may be using EOS as its pad token
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
