In [1]:
import os
import sys
import torch
import numpy as np
import pandas as pd
from pathlib import Path

# Machine Learning y Transformers
from sklearn.model_selection import train_test_split
from transformers import (
    XLNetLMHeadModel,
    XLNetTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset

# Working directory of the running notebook; the project root is taken to be
# its parent directory.
notebook_dir = Path.cwd()
directorio_base = notebook_dir.parent

# Put the project root on sys.path so the local `src` package can be imported.
sys.path.append(str(directorio_base))

from src.plotting import plot_pca_3d

# Make sure Weights & Biases stays disabled when it is not being used.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
# Top-level project folders, all derived from the project root.
directorio_datos = directorio_base.joinpath("data")
directorio_modelos = directorio_base.joinpath("models")
directorio_modelos_automl = directorio_modelos.joinpath("pycaret")
raw_data_dir = directorio_datos.joinpath("raw")
processed_data_dir = directorio_datos.joinpath("processed")

# AutoML (PyCaret) model used for the activity predictions.
ruta_mejor_modelo_final = directorio_modelos_automl.joinpath(
    "modelos_GLP1_no_pca", "mejor_modelo_final"
)
# Fine-tuned generative model.
ruta_modelo_protxlnet = directorio_modelos.joinpath("prot_xlnet_finetuned")

# Descriptors of the peptides with known activity.
ruta_125_ec50 = processed_data_dir.joinpath("descriptores_125.csv")

# Descriptors of the peptides without known activity.
ruta_peptidos_eval = processed_data_dir.joinpath("descriptores_cdhit.csv")

In [3]:
# Load the processed descriptor tables, indexed by peptide ID.
# Dots in column names are replaced by underscores in both tables.
df_125_conocidos = pd.read_csv(ruta_125_ec50).set_index('ID')
df_125_conocidos.columns = df_125_conocidos.columns.str.replace('.', '_', regex=False)
# pEC50 = -log10(EC50 * 1e-12).
# NOTE(review): the 1e-12 factor suggests EC50_T2 is stored in pM — confirm.
df_125_conocidos['pEC50'] = -np.log10(df_125_conocidos['EC50_T2'].mul(1e-12))

df_glp1 = pd.read_csv(ruta_peptidos_eval).set_index('ID')
df_glp1.columns = df_glp1.columns.str.replace('.', '_', regex=False)



In [4]:
# Preview the first rows of the table of peptides with known activity.
df_125_conocidos.head()

Unnamed: 0_level_0,AAC_A,AAC_C,AAC_D,AAC_E,AAC_F,AAC_G,AAC_H,AAC_I,AAC_K,AAC_L,...,NMBroto_BEGF750103_lag1,NMBroto_BEGF750103_lag2,NMBroto_BEGF750103_lag3,NMBroto_BHAR880101_lag1,NMBroto_BHAR880101_lag2,NMBroto_BHAR880101_lag3,sequence,EC50_T2,EC50_LOG_T2,pEC50
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_pep1,0.033333,0.0,0.1,0.1,0.066667,0.066667,0.033333,0.0,0.033333,0.066667,...,-0.02798,-0.182783,0.054222,0.190428,-0.142437,0.090372,HSQGTFTSDYSKYLDSRRAQDFVQWLEEGE,563.0,-9.25,9.249492
seq_pep2,0.033333,0.0,0.1,0.1,0.066667,0.066667,0.033333,0.0,0.033333,0.066667,...,0.00091,-0.316149,0.170202,0.157133,-0.144228,0.115217,HSQGTFTSDYSKYLDSRRAEDFVQWLENGE,552.0,-9.26,9.258061
seq_pep3,0.034483,0.0,0.103448,0.068966,0.068966,0.034483,0.034483,0.0,0.034483,0.068966,...,-0.004817,-0.250582,0.18155,0.098041,-0.203722,0.127012,HSQGTFTSDYSKYLDSRRAEDFVQWLENT,252.0,-9.6,9.598599
seq_pep4,0.055556,0.0,0.083333,0.027778,0.055556,0.166667,0.027778,0.0,0.027778,0.055556,...,0.22509,-0.097965,0.052838,0.377701,0.150231,0.286987,HSQGTFTSDYSKYLDSRRAEDFVQWLVAGGSGSGSG,6.03,-11.22,11.219683
seq_pep5,0.066667,0.0,0.1,0.066667,0.066667,0.066667,0.033333,0.0,0.033333,0.066667,...,0.088858,-0.190213,0.020097,0.069381,-0.184796,0.222087,HSQGTFTSDYSKYLDSRRAQDFVQWLEAEG,238.0,-9.62,9.623423


In [5]:
# Preview the first rows of the table of peptides without known activity.
df_glp1.head()

Unnamed: 0_level_0,AAC_A,AAC_C,AAC_D,AAC_E,AAC_F,AAC_G,AAC_H,AAC_I,AAC_K,AAC_L,...,NMBroto_BEGF750102_lag1,NMBroto_BEGF750102_lag2,NMBroto_BEGF750102_lag3,NMBroto_BEGF750103_lag1,NMBroto_BEGF750103_lag2,NMBroto_BEGF750103_lag3,NMBroto_BHAR880101_lag1,NMBroto_BHAR880101_lag2,NMBroto_BHAR880101_lag3,sequence
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AF-A0A060VXS0-F1,0.1,0.0,0.066667,0.066667,0.033333,0.066667,0.033333,0.0,0.066667,0.066667,...,0.037142,-0.508484,0.112768,0.224928,-0.142711,0.120597,0.09933,-0.372417,0.041586,HAEGTYTSDMSSYLQDQAAKEFVSWLKNGR
AF-A0A060VY52-F1,0.1,0.0,0.066667,0.066667,0.033333,0.066667,0.033333,0.0,0.066667,0.066667,...,0.037142,-0.508484,0.112768,0.145803,-0.195554,0.086696,0.178218,-0.292699,0.046946,HAEGTYTSDVSSYLQDQAAKEFVSWLKNGR
AF-A0A060WDT4-F1,0.1,0.0,0.133333,0.0,0.033333,0.066667,0.033333,0.0,0.066667,0.1,...,-0.029432,-0.341003,0.010969,-0.055908,-0.413757,-0.005239,0.086834,-0.334601,-0.073197,HADGTYTSDVSTYLQDQAAKDFVSWLKSGL
AF-A0A087VEU7-F1,0.133333,0.0,0.033333,0.1,0.033333,0.1,0.033333,0.066667,0.033333,0.066667,...,0.068401,-0.469261,-0.028003,0.179833,-0.232789,0.257321,0.296206,-0.151547,-0.086574,HAEGTYTSDITSYLEGQAAKEFIAWLVNGR
AF-A0A087XPV4-F1,0.1,0.0,0.133333,0.0,0.066667,0.066667,0.033333,0.033333,0.1,0.066667,...,0.094549,-0.460741,0.040947,0.156853,-0.296064,-0.234672,0.149114,-0.209235,-0.250359,HADGTFTSDVSSYLKDQAIKDFVAQLKSGQ


In [6]:
# Activity prediction for the GLP-1 peptides.

# Load the saved PyCaret regression model from disk.
from pycaret.regression import load_model, predict_model
modelo_pycaret = load_model(ruta_mejor_modelo_final)
# Bare last expression: shows the loaded object as the cell output.
modelo_pycaret

Transformation Pipeline and Model Successfully Loaded


In [7]:
# Score the GLP-1 peptides with the loaded model and expose the prediction
# under the same column name ('pEC50') used for the measured activities.
df_predicciones_glp1 = predict_model(modelo_pycaret, data=df_glp1).rename(
    columns={'prediction_label': 'pEC50'}
)
df_predicciones_glp1.head(10)

Unnamed: 0_level_0,AAC_A,AAC_C,AAC_D,AAC_E,AAC_F,AAC_G,AAC_H,AAC_I,AAC_K,AAC_L,...,NMBroto_BEGF750102_lag2,NMBroto_BEGF750102_lag3,NMBroto_BEGF750103_lag1,NMBroto_BEGF750103_lag2,NMBroto_BEGF750103_lag3,NMBroto_BHAR880101_lag1,NMBroto_BHAR880101_lag2,NMBroto_BHAR880101_lag3,sequence,pEC50
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AF-A0A060VXS0-F1,0.1,0.0,0.066667,0.066667,0.033333,0.066667,0.033333,0.0,0.066667,0.066667,...,-0.508484,0.112768,0.224928,-0.142711,0.120597,0.09933,-0.372417,0.041586,HAEGTYTSDMSSYLQDQAAKEFVSWLKNGR,8.729368
AF-A0A060VY52-F1,0.1,0.0,0.066667,0.066667,0.033333,0.066667,0.033333,0.0,0.066667,0.066667,...,-0.508484,0.112768,0.145803,-0.195554,0.086696,0.178218,-0.292699,0.046946,HAEGTYTSDVSSYLQDQAAKEFVSWLKNGR,8.988079
AF-A0A060WDT4-F1,0.1,0.0,0.133333,0.0,0.033333,0.066667,0.033333,0.0,0.066667,0.1,...,-0.341003,0.010969,-0.055908,-0.413757,-0.005239,0.086834,-0.334601,-0.073197,HADGTYTSDVSTYLQDQAAKDFVSWLKSGL,9.318927
AF-A0A087VEU7-F1,0.133333,0.0,0.033333,0.1,0.033333,0.1,0.033333,0.066667,0.033333,0.066667,...,-0.469261,-0.028003,0.179833,-0.232789,0.257321,0.296206,-0.151547,-0.086574,HAEGTYTSDITSYLEGQAAKEFIAWLVNGR,9.398033
AF-A0A087XPV4-F1,0.1,0.0,0.133333,0.0,0.066667,0.066667,0.033333,0.033333,0.1,0.066667,...,-0.460741,0.040947,0.156853,-0.296064,-0.234672,0.149114,-0.209235,-0.250359,HADGTFTSDVSSYLKDQAIKDFVAQLKSGQ,9.803035
AF-A0A091DI12-F1,0.133333,0.0,0.033333,0.1,0.066667,0.1,0.033333,0.033333,0.066667,0.066667,...,-0.530788,-0.04763,0.16739,-0.286063,0.189741,0.250639,-0.305108,-0.179249,HAEGTFTSDVSSYLEGQAAKEFIAWLVKGR,10.234846
AF-A0A091N9Y7-F1,0.033333,0.0,0.1,0.033333,0.1,0.033333,0.066667,0.033333,0.133333,0.066667,...,-0.143338,-0.219515,-0.1579,-0.401052,0.154808,0.038722,-0.216945,0.023633,HSEGTFTSDFTRYLDKMKAKDFVHWLINTK,9.780633
AF-A0A091P079-F1,0.034483,0.0,0.057471,0.057471,0.045977,0.045977,0.011494,0.022989,0.08046,0.091954,...,0.151567,0.134447,0.000482,-0.007747,0.045643,0.039176,0.142432,0.085655,MKMKSVYFIAGLLLMIVQGSWQNPLQDTEEKSRSFKASQSEPLDES...,9.102588
AF-A0A0F8AUA0-F1,0.133333,0.0,0.1,0.0,0.066667,0.066667,0.033333,0.033333,0.1,0.066667,...,-0.429825,0.067043,-0.01986,-0.094131,-0.228686,-0.009579,-0.200217,-0.346726,HADGTFTSDVSSYLKQQAIKDFVARLKAGQ,10.139276
AF-A0A0H4A7I9-F1,0.1,0.0,0.066667,0.1,0.1,0.033333,0.033333,0.033333,0.066667,0.066667,...,-0.389142,0.039464,0.1238,-0.018384,0.067122,0.189003,-0.391515,-0.086112,HADGTFTSDVASYLERQTVKAFIKFLQEES,9.557083


In [8]:
# Pick the most active sequences as seeds for generating new sequences.
# Both tables are left sorted by pEC50, highest first.
df_125_conocidos = df_125_conocidos.sort_values(by='pEC50', ascending=False)
df_predicciones_glp1 = df_predicciones_glp1.sort_values(by='pEC50', ascending=False)

# Stack the 10 best measured peptides on top of the 10 best predicted ones,
# then order the combined table by activity again.
df_todas_actividades = pd.concat(
    [df_125_conocidos.head(10), df_predicciones_glp1.head(10)],
    axis=0,
).sort_values(by='pEC50', ascending=False)
df_todas_actividades.head(50)

Unnamed: 0_level_0,AAC_A,AAC_C,AAC_D,AAC_E,AAC_F,AAC_G,AAC_H,AAC_I,AAC_K,AAC_L,...,NMBroto_BEGF750103_lag1,NMBroto_BEGF750103_lag2,NMBroto_BEGF750103_lag3,NMBroto_BHAR880101_lag1,NMBroto_BHAR880101_lag2,NMBroto_BHAR880101_lag3,sequence,EC50_T2,EC50_LOG_T2,pEC50
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_pep117,0.137931,0.0,0.034483,0.103448,0.068966,0.068966,0.034483,0.034483,0.068966,0.068966,...,0.146081,-0.260145,0.256426,0.190963,-0.335632,-0.156167,HAEGTFTSDVSSYLEGQAAKEFIAWLVKR,0.96,-12.02,12.017729
seq_pep26,0.076923,0.0,0.051282,0.128205,0.051282,0.102564,0.0,0.025641,0.025641,0.102564,...,0.319468,0.005239,-0.012149,0.284983,-0.065844,0.122248,YSEGTFTSDYSKLLEEEAVRDFIEWLLAGGPSSGAPPPS,1.03,-11.99,11.987163
seq_pep7,0.068966,0.0,0.034483,0.137931,0.068966,0.068966,0.0,0.034483,0.034483,0.137931,...,0.308208,0.194179,0.178745,0.10022,-0.256994,0.033664,YSQGTFTSDYSKYLEEEAVRLFIEWLLAG,1.06,-11.97,11.974694
seq_pep11,0.1,0.0,0.05,0.025,0.05,0.15,0.025,0.0,0.05,0.075,...,0.204633,0.032283,0.272735,0.360519,0.151019,0.076209,HSQGTFTSDYSKYLDSRAAAKFVQWLLNGGPSSGAPPEGG,1.49,-11.83,11.826814
seq_pep93,0.068966,0.0,0.068966,0.068966,0.068966,0.034483,0.034483,0.034483,0.034483,0.068966,...,0.092625,-0.04922,0.099761,0.111584,-0.335907,-0.10385,HSQGTFTSDYSKYLDSRAASEFVQWLISE,1.57,-11.8,11.8041
seq_pep58,0.166667,0.0,0.033333,0.1,0.066667,0.1,0.033333,0.033333,0.066667,0.066667,...,0.255451,-0.428717,0.219653,0.219475,-0.368515,-0.194881,HAEGTFTSDVASYLEGQAAKEFIAWLVKGR,1.76,-11.75,11.754487
seq_pep27,0.051282,0.0,0.051282,0.076923,0.051282,0.102564,0.0,0.025641,0.051282,0.102564,...,0.192723,-0.080814,0.004414,0.36499,0.038718,0.069637,YSEGTFTSDYSKLLERQAIDEFVNWLLKGGPSSGAPPPS,1.93,-11.71,11.714443
seq_pep115,0.137931,0.0,0.034483,0.103448,0.068966,0.103448,0.034483,0.034483,0.068966,0.068966,...,0.121037,-0.176862,0.185694,0.241324,-0.371814,-0.244886,HAEGTFTSDVSSYLEGQAAKEFIAWLKGR,2.12,-11.67,11.673664
seq_pep56,0.166667,0.0,0.033333,0.1,0.066667,0.1,0.033333,0.033333,0.066667,0.066667,...,0.078577,-0.074029,0.167106,0.192582,-0.221056,-0.310421,HAEGTFTSDVSAYLEGQAAKEFIAWLVKGR,2.32,-11.63,11.634512
seq_pep10,0.1,0.05,0.05,0.0,0.05,0.15,0.025,0.0,0.025,0.075,...,0.26703,0.087523,0.179742,0.328377,0.083261,0.121555,HSQGTFTSDYSKYLDSRAAACFVQWLLNGGPSSGAPPCGG,2.33,-11.63,11.632644


In [9]:
# Generation of new sequences with ProtXLNet.
from src.ProtXLNet_generator import generate_peptide_variants, generate_peptide_variants_fast

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Usando dispositivo: {device}")

# Tokenizer and LM-head model are both read from the fine-tuned checkpoint folder.
print(f"Cargando modelo desde: {ruta_modelo_protxlnet}")
tokenizer = XLNetTokenizer.from_pretrained(ruta_modelo_protxlnet)
model = XLNetLMHeadModel.from_pretrained(ruta_modelo_protxlnet)
# Last expression of the cell: moves the model to `device` and displays it.
model.to(device)

Usando dispositivo: cuda
Cargando modelo desde: d:\source\Proyecto Integrador\glp-1_drug_discovery\models\prot_xlnet_finetuned


XLNetLMHeadModel(
  (transformer): XLNetModel(
    (word_embedding): Embedding(37, 1024)
    (layer): ModuleList(
      (0-29): 30 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=1024, out_features=4096, bias=True)
          (layer_2): Linear(in_features=4096, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): ReLU()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_loss): Linear(in_features=1024, out_features=37, bias=True)
)

In [10]:
# Length of every seed sequence, computed once and reduced to its extremes.
longitudes_semilla = df_todas_actividades['sequence'].str.len()
longitud_maxima = longitudes_semilla.max()
longitud_minima = longitudes_semilla.min()

In [11]:
# Hand-written seed sequences in which some positions are replaced by '*'.
# NOTE: a later cell rebinds `sequences_base` to the sequences of
# `df_todas_actividades`, so this list is not what reaches the generator
# in a top-to-bottom run.
sequences_base = [
    "HAEGTFTSDVSSYLEGQA**FIAWLVK*",
    "HADGT***DVSAYLK*QAIKDFVAKLKSGQ",
    "HSEGTFTSDFSSYLDYKATKEFIAQLTKGL",
    "HSEGTFTSDFSSY*EGKAAKEFIAWLVKGL",
    "HADGTFT***DMS*YLTDKAIRDFVARLKAGQ",
    "HSEGTF**NDV*RLLEEKATSEFIAWLLKGL",
]

In [12]:
# Show the longest sequence length found in df_todas_actividades.
longitud_maxima



40

In [13]:
# Show the shortest sequence length found in df_todas_actividades.
longitud_minima

29

In [14]:
# Seed sequences for generation: the 'sequence' column of the combined
# top-activity table, as a plain Python list (replaces any earlier value).
sequences_base = df_todas_actividades["sequence"].to_list()

In [15]:
# Display the seed sequences that are passed to the generator as prompts.
sequences_base

['HAEGTFTSDVSSYLEGQAAKEFIAWLVKR',
 'YSEGTFTSDYSKLLEEEAVRDFIEWLLAGGPSSGAPPPS',
 'YSQGTFTSDYSKYLEEEAVRLFIEWLLAG',
 'HSQGTFTSDYSKYLDSRAAAKFVQWLLNGGPSSGAPPEGG',
 'HSQGTFTSDYSKYLDSRAASEFVQWLISE',
 'HAEGTFTSDVASYLEGQAAKEFIAWLVKGR',
 'YSEGTFTSDYSKLLERQAIDEFVNWLLKGGPSSGAPPPS',
 'HAEGTFTSDVSSYLEGQAAKEFIAWLKGR',
 'HAEGTFTSDVSAYLEGQAAKEFIAWLVKGR',
 'HSQGTFTSDYSKYLDSRAAACFVQWLLNGGPSSGAPPCGG',
 'HTDGTFTSDVSAYLNDRAVKEFVARLKSGQ',
 'HADGTFTSDVSAYLKEQAIKDFVAKLKAGQ',
 'HSEGTFTSDFSSYLDGKAAKEFVAWLVKSL',
 'HADGTFTSDVSSYLTDQAIRDFVARLKAGR',
 'HADGTFTSDVSAYLKEQAIKDFVAKLKSGQ',
 'HSEGTFTSDFSSYLDYKATKEFIAQLTKGL',
 'HSEGTFTSDFSSYLEGKAAKEFIAWLVKGL',
 'HADGTFTSDMSSYLTDKAIRDFVARLKAGQ',
 'HSEGTFTNDVTRLLEEKATSEFIAWLLKGL',
 'HADGTFTSDISSYLESQAAKEFIAWLANGG']

In [17]:

print("\nIniciando la generaci√≥n de variantes con la funci√≥n importada...")

# Ask the project's generator for variants of every seed sequence. The length
# bounds passed in are the shortest/longest lengths observed among the seeds.
nuevas_variantes = generate_peptide_variants_fast(
    prompt_sequences=sequences_base,
    model=model,
    tokenizer=tokenizer,
    num_variants_per_seq=5,  # 5 variants requested per seed sequence
    top_k=5,
    min_length=longitud_minima,
    max_length=longitud_maxima,
)




Iniciando la generaci√≥n de variantes con la funci√≥n importada...
Generando 100 variantes en lotes de 32...


Generando:   0%|          | 0/4 [00:00<?, ?it/s]


shape len: 40

shape len: 40

shape len: 32

shape len: 32

Generaci√≥n completada. Se obtuvieron 123 variantes √∫nicas.


In [17]:
# Display the variants returned by the generator.
nuevas_variantes

['TAEGTFTSDVSAYLEGQAAKEFIAWLVKGR',
 'HXDGTFTSDISSYLESQAAKEFIAWLANGG',
 'YSEGNFTSDYSKLLERQAIDEFVNWLLKGGPSSGAPPP',
 'YSQGTFTSDYSKFLEEEAVRLFIEWLLAGH',
 'HSCGTFTSDYSKYLDSRAAACFVQWLLNGGPSSGAPPC',
 'HAWGTFTSDMSSYLTDKAIRDFVARLKAGQ',
 'HADGTFTSDVSSYLFDQAIRDFVARLKAGR',
 'HADGTFTSDMSSYLTDKASRDFVARLKAGQ',
 'HADGTFTSDVSAYLKEQAIKDFVFKLKAGQ',
 'HADGTFTSDVSAYLKLQAIKDFVAKLKSGQ',
 'HSEGTFTNDVTLLLEEKATSEFIAWLLKGL',
 'HSEGTFTSDFSSYLEGKGAKEFIAWLVKGL',
 'YSEGTFTSDYSKLLEEEAVRDWIEWLLAGGPSSGAPPP',
 'HAEGTFTSDVSSYLEGQAAKEFIAWLCKR',
 'HSEGTFTSDFSSYLEGKAAKEFIAWLVRGLY',
 'HIEGTFTSDVSAYLEGQAAKEFIAWLVKGR',
 'HADGTFTSDVSAYLKEQIIKDFVAKLKAGQ',
 'HSEGTFTNDVTRLLEEKATSEFIAWLMKGL',
 'HADGTFTSDISSYLESQAAKEFITWLANGGY',
 'HSEGTFTNDVTRLLWEKATSEFIAWLLKGLD',
 'HAEGTFTSDVSSILEGQAAKEFIAWLVKR',
 'HDEGTFTSDVSSYLEGQAAKEFIAWLKGR',
 'YSSGTFTSDYSKYLEEEAVRLFIEWLLAG',
 'HSEGTFTNDVTRLLWEKATSEFIAWLLKGL',
 'HSEGTFTSDFSSYLDMKAAKEFVAWLVKSL',
 'YSQGSFTSDYSKYLEEEAVRLFIEWLLAG',
 'HQQGTFTSDYSKYLDSRAASEFVQWLISE',
 'HSEGTHTSDFSSYLDYKATKEFIAQLTKGL',

In [18]:
# Put the generated variants in a one-column table ('sequence') for inspection.
df = pd.DataFrame({"sequence": nuevas_variantes})
df

Unnamed: 0,sequence
0,TAEGTFTSDVSAYLEGQAAKEFIAWLVKGR
1,HXDGTFTSDISSYLESQAAKEFIAWLANGG
2,YSEGNFTSDYSKLLERQAIDEFVNWLLKGGPSSGAPPP
3,YSQGTFTSDYSKFLEEEAVRLFIEWLLAGH
4,HSCGTFTSDYSKYLDSRAAACFVQWLLNGGPSSGAPPC
...,...
125,YSEGTXTSDYSKLLEEEAVRDFIEWLLAGGPSSGAPPP
126,YSQGTFTSDYSKYLEEEAVRLFIEWLLXGH
127,YSQGTFTSDYSKFLEEEAVRLFIEWLLAG
128,HTDGTFTSDVSAYRNDRAVKEFVARLKSGQ


In [19]:
# Shortest sequence length in the current `df`.
df['sequence'].str.len().min()

29

In [20]:
# Longest sequence length in the current `df`.
df['sequence'].str.len().max()

38

In [None]:
# Drop the notebook's reference to the generation model and hand cached CUDA
# memory back to the driver.
#
# * `model` is rebound to None rather than `del`-eted: the previous
#   `if model is not None: del model` raised NameError when this cell was run
#   a second time, because the name no longer existed.
# * `tokenizer` is intentionally kept: the batching cell further down still
#   calls `tokenizer(...)`, so deleting it here broke a top-to-bottom run.
# NOTE(review): the model was shown as a cell output when it was loaded, so
# IPython's output cache may still hold a reference to it — GPU memory is
# presumably only released once that entry is cleared (e.g. `%reset -f out`);
# confirm.
import gc

model = None
gc.collect()  # reclaim unreachable objects before emptying the CUDA cache
if device.type == 'cuda':
    torch.cuda.empty_cache()

In [None]:
import random

# Inputs for the manual mutation steps in the following cells: how many
# mutated copies to make per seed, and the seed sequences themselves.
num_variants_per_seq = 5
prompt_sequences = sequences_base

In [None]:
# Substitution alphabet: 20 residue letters plus the '*' symbol.
AMINO_ACIDS = [*"ACDEFGHIKLMNPQRSTVWY*"]
# Fixed seed so the random point mutations are repeatable.
random.seed(1321)

In [None]:
# Build `num_variants_per_seq` single-point mutants of every non-empty seed.
# Each mutant draws one random position and replaces its symbol with a
# different one chosen at random from AMINO_ACIDS.
mutated_prompts = []
for base_seq in prompt_sequences:
    if base_seq:
        for _ in range(num_variants_per_seq):
            pos = random.randrange(len(base_seq))
            actual = base_seq[pos]
            candidatos = [simbolo for simbolo in AMINO_ACIDS if simbolo != actual]
            reemplazo = random.choice(candidatos)
            mutated_prompts.append(base_seq[:pos] + reemplazo + base_seq[pos + 1:])


        

In [None]:
from tqdm.auto import tqdm

# Tokenize the mutated prompts in fixed-size batches and print every encoded
# batch. Relies on `tokenizer` and `device` from the model-loading cell.
batch_size: int = 32
for inicio in tqdm(range(0, len(mutated_prompts), batch_size), desc="Generando"):
    batch_prompts = mutated_prompts[inicio:inicio + batch_size]
    inputs = tokenizer(
        batch_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    inputs = inputs.to(device)
    print(inputs)


In [None]:
# Put the mutated prompts in a one-column table ('sequence') for inspection.
# This rebinds `df`, replacing whatever it referred to before.
df = pd.DataFrame({"sequence": mutated_prompts})
df

In [None]:
# Shortest sequence length in the current `df`.
df['sequence'].str.len().min()

In [None]:
# Longest sequence length in the current `df`.
df['sequence'].str.len().max()