In [2]:
from datasets import load_dataset
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

MODEL_NAME = "microsoft/phi-3.5-mini-instruct"

final_dataset = []

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# -----------------------------
# 1. XSum - resúmenes concisos
# -----------------------------
print("Cargando XSum...")
xsum = load_dataset("xsum", split="train", trust_remote_code=True)
for item in xsum:
    final_dataset.append({
        "instruction": "Resume el siguiente texto de forma concisa",
        "input": item["document"],
        "output": item["summary"]
    })


Cargando XSum...


In [4]:
# -----------------------------
# 2. CNN/DailyMail - resúmenes descriptivos
# -----------------------------
print("Cargando CNN/DailyMail...")
cnn_dm = load_dataset("cnn_dailymail", "3.0.0", split="train", trust_remote_code=True)
for item in cnn_dm:
    final_dataset.append({
        "instruction": "Resume el siguiente texto",
        "input": item["article"],
        "output": item["highlights"]
    })


Cargando CNN/DailyMail...


In [5]:
# -----------------------------
# 3. ASSET - simplificación de texto
# -----------------------------
print("Cargando ASSET...")
try:
    asset = load_dataset("facebook/asset", split="train", trust_remote_code=True)
    for item in asset:
        final_dataset.append({
            "instruction": "Simplifica el siguiente texto",
            "input": item["original"],
            "output": item["simplification"]
        })
except Exception:
    print("⚠️ ASSET no disponible directamente en Hugging Face, debes descargarlo manualmente.")

Cargando ASSET...
⚠️ ASSET no disponible directamente en Hugging Face, debes descargarlo manualmente.


In [6]:
# -----------------------------
# 4. WikiLarge - simplificación Wikipedia
# -----------------------------
print("Cargando WikiLarge...")
try:
    wikilarge = load_dataset("bogdancazan/wikilarge-text-simplification", split="train", trust_remote_code=True)
    for item in wikilarge:
        final_dataset.append({
            "instruction": "Simplifica el siguiente texto",
            "input": item["original"],
            "output": item["simplified"]
        })
except Exception:
    print("⚠️ WikiLarge no disponible directamente en Hugging Face, debes descargarlo manualmente.")

Cargando WikiLarge...
⚠️ WikiLarge no disponible directamente en Hugging Face, debes descargarlo manualmente.


In [7]:
import json

# -----------------------------
# 5. Textos del museo (archivo local)
# -----------------------------
print("Cargando textos del museo...")
with open("C:\\Users\\yleob\\ReactNative\\QuetzAI\\server\\dataset\\Museo\\museo_textos.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():  # skip empty lines
            try:
                obj = json.loads(line)
                final_dataset.append({
                    "instruction": obj["instruction"],
                    "input": obj["input"],
                    "output": obj["output"]
                })
            except json.JSONDecodeError as e:
                print(f"Error decoding line: {e}")

Cargando textos del museo...
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expecting value: line 1 column 1 (char 0)
Error decoding line: Expec

In [None]:
# -----------------------------
# 6. Guardar dataset combinado
# -----------------------------
print("Guardando dataset combinado...")
with open("dataset_combinado.jsonl", "w", encoding="utf-8") as f:
    for item in final_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

Guardando dataset combinado...


In [None]:
# -----------------------------
# 7. Preparar dataset para entrenamiento
# -----------------------------
print("Preparando dataset para entrenamiento...")
dataset = load_dataset("json", data_files="dataset_combinado.jsonl")["train"]

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def format_dataset(example):
    prompt = f"{example['instruction']}\n\n{example['input']}\n\nRespuesta:"
    return tokenizer(prompt, text_target=example['output'], truncation=True)

tokenized_dataset = dataset.map(format_dataset)

In [None]:
# -----------------------------
# 8. Cargar modelo con QLoRA
# -----------------------------
print("Cargando modelo base...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    load_in_4bit=True,
    device_map="auto",
    attn_implementation="eager"
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


In [None]:
# -----------------------------
# 9. Configuración de entrenamiento
# -----------------------------
training_args = TrainingArguments(
    output_dir="./lora-phi3.5",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

In [None]:
# -----------------------------
# 10. Entrenar y guardar adaptador
# -----------------------------
print("Entrenando modelo...")
trainer.train()

print("Guardando adaptador LoRA...")
model.save_pretrained("./lora-phi3.5")

print("Entrenamiento completado")