In [2]:
print('Test')

Test


In [3]:
from datasets import Dataset
import json
from pathlib import Path

# Ruta de la carpeta donde guardaste los chunks generados previamente
carpeta = Path("Chunks_Dataset")

# Lista para almacenar todos los datos
registros = []

# Cargar todos los archivos JSON que contienen los chunks
for archivo_json in carpeta.rglob("*.json"):
    with open(archivo_json, "r", encoding="utf-8") as f:
        data = json.load(f)
        registros.append(data)

# Verificar cuántos registros hemos cargado
len(registros), registros[0]


  from .autonotebook import tqdm as notebook_tqdm


(102,
 {'id': '1_Biological_Psychology_chunk_0',
  'book_id': '1_Biological_Psychology',
  'source_file': 'Biological Psychology.txt',
  'source_path': 'Convert_Books\\TestBookClean\\Biological Psychology.txt',
  'chunk_index': 0,
  'text': 'It is often said that Man is unique among animals. It is worth looking at this term unique before we discuss our subject proper. The word may in this context have two slightly different meanings. It may mean: Man is strikingly different - he is not identical with any animal. This is of course true. It is true also of all other animals: Each species, even each individual is unique in this sense. But the term is also often used in a more absolute sense: Man is so different, so "essentially different" (whatever that means) that the gap between him and animals cannot possibly be bridged - he is something altogether new. \nUsed in this absolute sense the term is scientifically meaningless. Its use also reveals and may reinforce conceit, and it leads to 

In [4]:
dataset = Dataset.from_list(registros)
dataset



Dataset({
    features: ['id', 'book_id', 'source_file', 'source_path', 'chunk_index', 'text', 'meta'],
    num_rows: 102
})

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Cargar el tokenizer de GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id


# Función para tokenizar el texto
def tokenizar(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Tokenizar todos los registros
tokenized = dataset.map(tokenizar, batched=True)

# Eliminar columnas innecesarias
tokenized = tokenized.remove_columns([col for col in tokenized.column_names if col != "input_ids"])
tokenized.set_format("torch")

tokenized



Map: 100%|██████████| 102/102 [00:00<00:00, 229.23 examples/s]


Dataset({
    features: ['input_ids'],
    num_rows: 102
})

In [6]:
from transformers import GPT2LMHeadModel

# Cargar el modelo GPT-2 preentrenado
model = GPT2LMHeadModel.from_pretrained("gpt2")



In [7]:
import re

# Congelar las primeras 6 capas (0-5) de GPT-2 si se detecta el índice
for name, param in model.named_parameters():
    # Buscar patrón ".h.<num>." en el nombre del parámetro
    m = re.search(r'\.h\.(\d+)\.', name)
    if not m:
        # no corresponde a una capa 'h.<n>' (p. ej. ln_f, wte, wpe...), saltar
        continue
    capa = int(m.group(1))
    if capa < 6:   # Congelamos capas 0–5
        param.requires_grad = False

print("Capas 0–5 congeladas. Solo las capas 6–11 serán entrenadas.")




Capas 0–5 congeladas. Solo las capas 6–11 serán entrenadas.


In [8]:
# ...existing code...
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Argumentos comunes
common_args = dict(
    output_dir="./modelo_final",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir="./logs"
)

# Intentamos crear TrainingArguments con opciones modernas; si falla, usamos fallback mínimo
try:
    args = TrainingArguments(
        **common_args,
        overwrite_output_dir=True,
        per_device_eval_batch_size=2,
        learning_rate=5e-5,
        weight_decay=0.01,
        logging_steps=50,
        evaluation_strategy="no",   # puede fallar en versiones antiguas
        save_strategy="epoch",
        save_total_limit=2,
        fp16=False
    )
except TypeError:
    # Fallback para versiones antiguas de transformers
    args = TrainingArguments(**common_args)

# Data collator para causal LM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Crear Trainer y entrenar
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    eval_dataset=tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()
# ...existing code...




  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=153, training_loss=3.142376669092116, metrics={'train_runtime': 39.0365, 'train_samples_per_second': 7.839, 'train_steps_per_second': 3.919, 'total_flos': 79955361792000.0, 'train_loss': 3.142376669092116, 'epoch': 3.0})

In [9]:
# ...existing code...
import torch

# seleccionar dispositivo y mover modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# ...existing code...

In [None]:
def chat_psicologia():
    print("=== Chat Psicología (Escribe SALIR para terminar) ===\n")

    while True:
        pregunta = input("Tú: ")

        if pregunta.lower().strip() == "salir":
            print("Fin del chat.")
            break

        prompt = f"""
    Actúa como profesor de psicología experto.
Responde únicamente usando tu entrenamiento en los libros.

Pregunta:
{pregunta}

Respuesta:
"""
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        output = model.generate(
            **inputs,
            max_new_tokens=250,
            temperature=0.7
        )

        print("\nModelo:", tokenizer.decode(output[0], skip_special_tokens=True), "\n")




In [None]:
chat_psicologia()

=== Chat Psicología (Escribe SALIR para terminar) ===


Modelo: 
Actúa como profesor de psicología experto.
Responde únicamente usando tu entrenamiento en los libros.

Pregunta:
what do dendritic spines do?

Respuesta:

What do dendrites do?

Pregunta:

What do dendrites do?
Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?

Pregunta:
What do dendrites do?

Pregunta:


What do dendrites do?
Pregunta:

What do dendrites do?

Pregunta:

What do dendrites do?
Pregunta:

What do dendrites do?

Pregunta 


Modelo: 
Actúa como profesor de psicología experto.
Responde únicamente usando tu entrenamiento en los libros.

Pregunta:
what do the mitochondria do? 

Respuesta:

The mitochondria are the same thing as 