In [None]:
# Log in to Hugging Face
from huggingface_hub import login

# No token is passed here, so login() is presumably expected to prompt for one
# interactively — confirm. Nothing secret is stored in the notebook itself.
login()

In [None]:
# Install dependencies
!pip install -q transformers datasets peft accelerate bitsandbytes

In [None]:
# Upload training file
from google.colab import files

# Upload the training file through Colab and keep the call's result in `uploaded`.
# NOTE(review): a later cell opens "chess_commentary_dataset.jsonl" by name, so the
# uploaded file presumably has to carry exactly that name — confirm.
uploaded = files.upload()

In [None]:
# Load dataset
from datasets import Dataset
import json

# Load the .jsonl file manually: one JSON object per line.
# Whitespace-only lines are skipped so json.loads() is never handed an empty record.
with open("chess_commentary_dataset.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Build the Hugging Face dataset from the list of records
dataset = Dataset.from_list(data)

# Hold out 10% of the records for validation.
# The fixed seed keeps the train/test split identical across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Load tokenizer and QLoRA-configured model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType

model_name = "Waterhorse/chessgpt-chat-v1"

# 4-bit quantization settings for QLoRA-style fine-tuning:
#   - NF4 quantization of the base weights (set explicitly rather than relying on the default)
#   - double quantization of the quantization constants
#   - float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16"
)

# Tokenizer and quantized base model come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


In [None]:
# Apply LoRA with PEFT
from peft import prepare_model_for_kbit_training

# Prepare the quantized base model for training before the adapters are attached
# (PEFT's recommended step when fine-tuning a k-bit quantized model).
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings for causal language modeling:
# rank-8 adapters, scaling alpha of 32, 10% dropout on the adapter path.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, peft_config)

In [None]:
# Preprocessing function for CausalLM (input and output concatenated)

# Ensure a pad token is set: fall back to EOS only when the tokenizer has none of
# its own, so an existing dedicated pad token is never overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preprocessing function
def preprocess(examples):
    """Tokenize a batch of prompt/response pairs for causal-LM fine-tuning.

    Args:
        examples: batched mapping with "input" and "output" columns, each a
            list of strings of the same length (the shape `Dataset.map`
            passes with `batched=True`).

    Returns:
        The tokenizer output (input_ids, attention_mask) padded/truncated to
        512 tokens, with a "labels" entry holding a copy of input_ids.
    """
    prompts = examples["input"]
    responses = examples["output"]
    # Interpolate the zipped pair (p, r) into the template; prompt and response
    # are concatenated into one training sequence.
    full_texts = [f"<s>Input:\n{p}\n\nResponse:\n{r}</s>" for p, r in zip(prompts, responses)]

    tokenized = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_attention_mask=True
    )
    # Copy each row so later edits to labels can never alias input_ids
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
    return tokenized

def mask_padding_labels(batch):
    """Set labels to -100 at padded positions so the loss ignores them.

    The attention mask (not the pad token id) decides what counts as padding:
    the pad token is the EOS token here, so masking by id would also hide any
    genuine EOS token inside the sequence.
    """
    batch["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
    ]
    return batch

# Apply to each split of DatasetDict: tokenize first, then mask the padded label positions.
# NOTE(review): the Trainer cell uses DataCollatorForLanguageModeling(mlm=False), which
# may rebuild labels from input_ids at batch time — confirm which masking takes effect.
tokenized_data = {
    split: ds.map(
        preprocess,
        batched=True,
        remove_columns=['input', 'output'],
        load_from_cache_file=False
    ).map(
        mask_padding_labels,
        batched=True,
        load_from_cache_file=False
    )
    for split, ds in dataset.items()
}

In [None]:
# Sanity-check the preprocessing on a single training example
sample = dataset['train'][0]
# preprocess() expects batched columns (lists of strings), so wrap the single
# example in a batch of one; passing the bare row would zip over characters.
processed = preprocess({key: [value] for key, value in sample.items()})
decoded = tokenizer.decode(processed['input_ids'][0])
print(decoded)

import numpy as np
# Token-length statistics, measured with the same template preprocess() uses so
# the numbers are comparable with its max_length=512 truncation limit.
lengths = [
    len(tokenizer(f"<s>Input:\n{ex['input']}\n\nResponse:\n{ex['output']}</s>")['input_ids'])
    for ex in dataset['train']
]
print(f"Max: {max(lengths)}, Avg: {np.mean(lengths)}")

# Labels of the first tokenized training row, for checking where -100 appears
print(tokenized_data["train"][0]["labels"])

In [None]:
!pip install matplotlib

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the training cell's output_dir
# ("/content/drive/MyDrive/EchidnaAI") lives under this mount point.
drive.mount('/content/drive')

In [None]:
# Training setup
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_callback import TrainerCallback
import matplotlib.pyplot as plt

# Callback that collects the loss values reported in Trainer log events
class LossLoggerCallback(TrainerCallback):
    """Accumulates training and evaluation losses from `on_log` events."""

    def __init__(self):
        # global_step captured each time a training loss is recorded
        self.steps = []
        # values found under the 'loss' key of a log event
        self.train_losses = []
        # values found under the 'eval_loss' key of a log event
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store 'loss' (with the current global step) and 'eval_loss' when present."""
        # A missing payload is treated like an empty one: nothing is recorded
        payload = {} if logs is None else logs

        if 'loss' in payload:
            self.train_losses.append(payload['loss'])
            self.steps.append(state.global_step)

        if 'eval_loss' in payload:
            self.eval_losses.append(payload['eval_loss'])

# Initialize callback
loss_logger = LossLoggerCallback()

# Evaluate on the held-out split every N optimizer steps *during* training,
# so each eval loss belongs to a real training step.
eval_every_n_steps = 50

# Training arguments with logging
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/EchidnaAI",
    per_device_train_batch_size=8, #4
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=1, #3  (max_steps below takes precedence when it is positive)
    max_steps=300,
    logging_steps=25,
    # Named `evaluation_strategy` in older transformers releases
    eval_strategy="steps",
    eval_steps=eval_every_n_steps,
    save_steps=300,
    save_total_limit=1,
    save_strategy="steps",
    fp16=True,
    push_to_hub=False,
    report_to="none",  # No TensorBoard
    disable_tqdm=False
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    callbacks=[loss_logger]
)

trainer.train()

# Each evaluation leaves an entry in the trainer's log history carrying both
# the step it ran at and its eval_loss, which gives correctly aligned x values.
eval_history = [
    (entry["step"], entry["eval_loss"])
    for entry in trainer.state.log_history
    if "eval_loss" in entry
]

# Graph
plt.figure(figsize=(10, 5))
plt.plot(loss_logger.steps, loss_logger.train_losses, label="Training Loss")
if eval_history:
    eval_x, eval_y = zip(*eval_history)
    plt.plot(eval_x, eval_y, label="Eval Loss", linestyle="--")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.legend()
plt.title("Training vs Eval Loss")
plt.grid(True)
plt.show()

In [None]:
# Inference example
input_text = "Player moved Nf3. Evaluation dropped from +0.3 to -1.2. Best move was e4. Explain the mistake."
# Wrap the request in the same "Input / Response" template used for fine-tuning
prompt = f"<s>Input:\n{input_text}\n\nResponse:\n"
# Put the inputs on whatever device the model was placed on instead of assuming "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
model.eval()
# max_new_tokens bounds the generated continuation only; max_length would also
# count the prompt tokens and could leave little or no room for the answer.
output_ids = model.generate(
    **inputs,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id
)
# Decode only the newly generated tokens, not the echoed prompt
prompt_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True))

In [None]:
# Second inference example: rewrite a commentary from first to third person.
# The quoted sentence is kept inside a single-quoted literal so its double
# quotes do not terminate the string.
input_text = (
    "Rewrite the following coaching commentary from first person to third person:\n"
    '"I thought I could save the rook, but there was no way.".'
)
# Wrap the request in the same "Input / Response" template used for fine-tuning
prompt = f"<s>Input:\n{input_text}\n\nResponse:\n"
# Put the inputs on whatever device the model was placed on instead of assuming "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
model.eval()
# max_new_tokens bounds the generated continuation only, independent of prompt length
output_ids = model.generate(
    **inputs,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id
)
# Decode only the newly generated tokens, not the echoed prompt
prompt_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True))

In [None]:
# Publish the trainer's model to the Hugging Face Hub.
# NOTE(review): TrainingArguments sets push_to_hub=False, so this explicit call is the
# only upload; the target repository name is presumably derived from output_dir — confirm.
trainer.push_to_hub()