
 # Workflow: Fine-Tuning de ProtGPT2 y Generación
 
 Este notebook cubre el proceso completo:
 1.  Instalar dependencias.
 2.  Descargar el script de entrenamiento `run_clm.py` de Hugging Face.
 3.  Cargar datos desde un CSV (usando pandas).
 4.  Formatear y guardar los datos en `training.txt` y `validation.txt`.
 5.  Ejecutar el fine-tuning (reentrenamiento).
 6.  Cargar el nuevo modelo ajustado (fine-tuned).
 7.  Generar secuencias con el nuevo modelo.
 8.  Calcular la Perplejidad (PPL) de las secuencias generadas.

In [12]:
!pip install transformers torch accelerate datasets pandas scikit-learn biopython
!pip install transformers torch accelerate datasets pandas scikit-learn biopython evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [7]:
import os

import requests

# Download the causal-language-modeling example script from the transformers
# repository and save it next to the notebook as run_clm.py.
# NOTE(review): the URL points at the moving "main" branch; consider pinning it
# to the tag that matches the installed transformers version — confirm.
url = "https://raw.githubusercontent.com/huggingface/transformers/main/examples/pytorch/language-modeling/run_clm.py"
r = requests.get(url, timeout=30)
# Fail loudly on HTTP errors; without this an error page would be written to
# run_clm.py and only surface later as a confusing failure when it is executed.
r.raise_for_status()
with open("run_clm.py", "wb") as f:
    f.write(r.content)

# Sanity check: list the working directory, which should now contain run_clm.py.
print(os.listdir("."))


['appx', 'bin', 'chrome_100_percent.pak', 'chrome_200_percent.pak', 'Code.exe', 'Code.VisualElementsManifest.xml', 'd3dcompiler_47.dll', 'ffmpeg.dll', 'icudtl.dat', 'libEGL.dll', 'libGLESv2.dll', 'LICENSES.chromium.html', 'locales', 'policies', 'resources', 'resources.pak', 'run_clm.py', 'snapshot_blob.bin', 'tools', 'unins000.dat', 'unins000.exe', 'unins000.msg', 'v8_context_snapshot.bin', 'vk_swiftshader.dll', 'vk_swiftshader_icd.json', 'vulkan-1.dll']


In [8]:
# %%python
# --- 3. Cargar y Preparar los Datos ---
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from Bio import SeqIO # Usaremos esto solo para el formato, pero adaptado

# --- File configuration ---
# Input CSV read with pandas, and the name of the column holding the sequences.
CSV_INPUT_FILE = "D:/source/Proyecto Integrador/glp-1_drug_discovery/data/processed/descriptores_cdhit.csv"  # <--- CHANGE THIS TO YOUR FILE!
SEQUENCE_COLUMN = "sequence"     # <--- CHANGE THIS TO YOUR COLUMN NAME!

# Output text files for the train/validation splits; the same names are later
# passed to run_clm.py as --train_file / --validation_file.
TRAIN_FILE_TXT = "training.txt"
VAL_FILE_TXT = "validation.txt"
# Separator token written on its own line before every sequence in the output files.
SPECIAL_TOKEN = "<|endoftext|>"
TEST_SPLIT_SIZE = 0.10  # 10% for validation (used as train_test_split's test_size)
RANDOM_SEED = 42  # used as train_test_split's random_state so the split is reproducible

# Regular expression accepting only strings made of one or more of the 20
# canonical amino-acid one-letter codes (uppercase).
valid_aa_regex = re.compile(r"^[ACDEFGHIKLMNPQRSTVWY]+$")

# --- 3.1. Load the sequences from the CSV ---
try:
    df = pd.read_csv(CSV_INPUT_FILE)
except FileNotFoundError:
    # Missing input file: report it and fall back to an empty list so the
    # remaining steps of this cell still run (they will find nothing to do).
    print(f"¡ERROR! No se encontró el archivo {CSV_INPUT_FILE}.")
    sequences = []
else:
    print(f"DataFrame cargado. Columnas: {df.columns.tolist()}")

    if SEQUENCE_COLUMN in df.columns:
        # Drop missing values and coerce everything to str before cleaning.
        sequences = (
            df[SEQUENCE_COLUMN]
            .dropna()
            .astype(str)
            .tolist()
        )
        print(f"Se extrajeron {len(sequences)} secuencias de la columna '{SEQUENCE_COLUMN}'.")
    else:
        # A missing column is a configuration error: stop the cell here.
        print(f"¡ERROR! No se encontró la columna '{SEQUENCE_COLUMN}' en el CSV.")
        raise KeyError(f"Columna '{SEQUENCE_COLUMN}' no encontrada.")

# --- 3.2. Validate and clean the sequences ---
# Keeps only sequences that, once normalised, consist exclusively of the 20
# canonical amino-acid letters; everything else is reported and skipped.
valid_sequences = []
for seq_str in sequences:
    # Remove *all* whitespace (spaces, tabs, "\r", "\n"), not just spaces and
    # "\n": a stray "\r" or tab inside a cell would otherwise make an otherwise
    # valid sequence fail the regex and be dropped.
    seq_str = "".join(seq_str.split()).upper()
    # fullmatch requires the whole string to match; the pattern's "+" already
    # rejects the empty string, so no separate emptiness check is needed.
    if valid_aa_regex.fullmatch(seq_str):
        valid_sequences.append(seq_str)
    else:
        print(f"  > Omitiendo secuencia (vacía o con caracteres inválidos): {seq_str[:30]}...")

print(f"Se procesaron {len(valid_sequences)} secuencias válidas.")

# --- 3.4. Helper that writes one split to disk ---
# Defined unconditionally so it exists regardless of which branch below runs.
def write_to_file(filename, seq_list):
    """Write sequences to a UTF-8 text file in the training format.

    Each sequence is emitted as two lines: a line containing SPECIAL_TOKEN
    followed by a line containing the sequence itself.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        seq_list: Iterable of sequence strings to write.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for seq in seq_list:
            f.write(f"{SPECIAL_TOKEN}\n")
            f.write(f"{seq}\n")
    print(f"Archivo guardado: {filename}")


# --- 3.3. Train/validation split ---
# At least two sequences are required: with a single one, train_test_split
# would leave the training set empty and raise ValueError.
if len(valid_sequences) >= 2:
    train_seqs, val_seqs = train_test_split(
        valid_sequences,
        test_size=TEST_SPLIT_SIZE,
        random_state=RANDOM_SEED
    )
    print(f"División: {len(train_seqs)} para entrenamiento, {len(val_seqs)} para validación.")

    # --- 3.5. Write the files ---
    write_to_file(TRAIN_FILE_TXT, train_seqs)
    write_to_file(VAL_FILE_TXT, val_seqs)
elif valid_sequences:
    print("Solo se encontró 1 secuencia válida; se necesitan al menos 2 para dividir en entrenamiento y validación. No se crearán archivos de entrenamiento.")
else:
    print("No se encontraron secuencias válidas para procesar. No se crearán archivos de entrenamiento.")

print("¡Proceso de preparación de datos completado!")

DataFrame cargado. Columnas: ['ID', 'AAC_A', 'AAC_C', 'AAC_D', 'AAC_E', 'AAC_F', 'AAC_G', 'AAC_H', 'AAC_I', 'AAC_K', 'AAC_L', 'AAC_M', 'AAC_N', 'AAC_P', 'AAC_Q', 'AAC_R', 'AAC_S', 'AAC_T', 'AAC_V', 'AAC_W', 'AAC_Y', 'CKSAAGP_alphaticr.alphaticr.gap0', 'CKSAAGP_alphaticr.aromatic.gap0', 'CKSAAGP_alphaticr.postivecharger.gap0', 'CKSAAGP_alphaticr.negativecharger.gap0', 'CKSAAGP_alphaticr.uncharger.gap0', 'CKSAAGP_aromatic.alphaticr.gap0', 'CKSAAGP_aromatic.aromatic.gap0', 'CKSAAGP_aromatic.postivecharger.gap0', 'CKSAAGP_aromatic.negativecharger.gap0', 'CKSAAGP_aromatic.uncharger.gap0', 'CKSAAGP_postivecharger.alphaticr.gap0', 'CKSAAGP_postivecharger.aromatic.gap0', 'CKSAAGP_postivecharger.postivecharger.gap0', 'CKSAAGP_postivecharger.negativecharger.gap0', 'CKSAAGP_postivecharger.uncharger.gap0', 'CKSAAGP_negativecharger.alphaticr.gap0', 'CKSAAGP_negativecharger.aromatic.gap0', 'CKSAAGP_negativecharger.postivecharger.gap0', 'CKSAAGP_negativecharger.negativecharger.gap0', 'CKSAAGP_negativ

In [15]:
# Ajusta los parámetros según tu GPU y necesidades.

# Directorio donde se guardará el modelo
OUTPUT_MODEL_DIR = "C:/temp/protgpt2_finetuned"

# ¡Asegúrate de tener GPU disponible!
!python run_clm.py \
    --model_name_or_path nferruz/ProtGPT2 \
    --tokenizer_name nferruz/ProtGPT2 \
    --train_file {TRAIN_FILE_TXT} \
    --validation_file {VAL_FILE_TXT} \
    --output_dir {OUTPUT_MODEL_DIR} \
    --do_train \
    --do_eval \
    --learning_rate 1e-6 \
    --num_train_epochs 3 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --save_strategy "epoch" \
    --eval_strategy "epoch" \
    --fp16=True \
    --logging_steps 100

print(f"¡Entrenamiento completado! Modelo guardado en {OUTPUT_MODEL_DIR}")

10/27/2025 15:45:13 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalSt

Using custom data configuration default-5d3022183dd3c5bc
Generating dataset text (C:/Users/Dev/.cache/huggingface/datasets/text/default-5d3022183dd3c5bc/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99)
Downloading and preparing dataset text/default to C:/Users/Dev/.cache/huggingface/datasets/text/default-5d3022183dd3c5bc/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99...
Downloading took 0.0 min
Checksum Computation took 0.0 min
Generating train split

Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 404 examples [00:00, 11482.99 examples/s]
Generating validation split

Generating validation split: 0 examples [00:00, ? examples/s]
Generating validation split: 46 examples [00:00, 6360.24 examples/s]
Unable to verify splits sizes.
Dataset text downloaded and prepared to C:/Users/Dev/.cache/huggingface/datasets/text/default-5d3022183dd3c5bc/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99. 

In [10]:
# Show the contents of the current working directory (relies on `os` having
# been imported by an earlier cell).
print(os.listdir(os.curdir))

['appx', 'bin', 'chrome_100_percent.pak', 'chrome_200_percent.pak', 'Code.exe', 'Code.VisualElementsManifest.xml', 'd3dcompiler_47.dll', 'ffmpeg.dll', 'icudtl.dat', 'libEGL.dll', 'libGLESv2.dll', 'LICENSES.chromium.html', 'locales', 'policies', 'resources', 'resources.pak', 'run_clm.py', 'snapshot_blob.bin', 'tools', 'training.txt', 'unins000.dat', 'unins000.exe', 'unins000.msg', 'v8_context_snapshot.bin', 'validation.txt', 'vk_swiftshader.dll', 'vk_swiftshader_icd.json', 'vulkan-1.dll']
