# MODELO DE APRENDIZAJE BERT PARA PREDICCIÓN DE AMPs

## INSTALACIÓN E IMPORTACIÓN DE PAQUETES DE TRABAJO

In [None]:
# Shell escape: install the Hugging Face `transformers` package (unpinned) into the notebook runtime.
!pip install transformers



In [None]:
import os
import pandas as pd
import numpy as np
import torch
import re

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from torch.utils.data import Dataset, DataLoader

# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

from transformers import AutoTokenizer, Trainer, TrainingArguments, BertForSequenceClassification, AdamW

cuda


In [None]:
# Lower TensorFlow's C++ log verbosity. The variable is assigned before the
# import on the next line so it is already set when TensorFlow loads.
# NOTE(review): `tf` is not referenced anywhere else in the visible notebook — confirm the import is still needed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
import tensorflow as tf

## FUNCIÓN DE TOKENIZACIÓN

In [None]:
# Clase mix_data para procesar cualquier DataFrame (df o ddf)
class mix_data():
    """Map-style dataset that tokenizes amino-acid sequences on demand.

    The object exposes ``__len__`` and ``__getitem__`` so it can be handed to a
    Hugging Face ``Trainer`` as ``train_dataset`` / ``eval_dataset``.

    Args:
        df: DataFrame with an ``aa_seq`` column (sequence strings) and a
            ``label`` column (values castable to ``int``).
        tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        max_len: Length every tokenized sample is padded/truncated to.
    """

    def __init__(self, df, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        # Case is preserved: residues are upper-case single letters.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.max_len = max_len
        self.seqs, self.labels = self.get_seqs_labels(df)

    def get_seqs_labels(self, df):
        """Return ``(sequences, labels)`` as two parallel lists taken from ``df``."""
        seqs = list(df['aa_seq'])
        labels = list(df['label'].astype(int))
        return seqs, labels

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return a dict of tensors including ``labels``."""
        # Drop any whitespace already present in the raw sequence.
        residues = "".join(self.seqs[idx].split())
        # Map the rare residues U/Z/O/B to X, the same substitution that
        # `predict_amp` applies at inference time, so that training and
        # prediction see identically preprocessed input.
        residues = re.sub(r'[UZOB]', 'X', residues)
        # The tokenizer expects one residue per whitespace-separated token.
        seq = " ".join(residues)
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_len)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def __setitem__(self, idx, sample):
        """Overwrite position ``idx`` of every list attribute named in ``sample``.

        Keys of ``sample`` that are not attributes of this object, or whose
        attribute is not a list (e.g. ``max_len``), are deliberately ignored.
        """
        for key in sample:
            if key in self.__dict__:
                if isinstance(self.__dict__[key], list):
                    self.__dict__[key][idx] = sample[key]
                else:
                    # Non-list attributes cannot be indexed; leave them untouched.
                    pass



## CARGADO DE DATOS

In [None]:
# Training split: fetch the labelled CSV, shuffle the rows with a fixed seed,
# and wrap the result in a `mix_data` dataset.

data_url = 'https://raw.githubusercontent.com/JavierColubi/Javier-Colubi-github/refs/heads/main/train_amp-BERT.txt'
df = pd.read_csv(data_url, index_col=0).sample(frac=1, random_state=0)
print(df.head(7))
print(f'El número de secuencias totales en el archivo es: {len(df)}')
train_dataset = mix_data(df)

                                                            aa_seq  aa_len  \
AP02151          YEALVTSILGKLTGLWHNDSVDFMGHICYFRRRPKIRRFKLYHEGK...      95   
AP01951                                          FLPLVLGALSGILPKIL      17   
AP00972                                        FLSLIPHAINAVGVHAKHF      19   
AP01261                                           IIEKLVNTALGLLSGL      16   
AP01298                                       GLFTLIKCAYQLIAPTVACN      20   
AP01802                                     RPWAGNGSVHRYTVLSPRLKTQ      22   
UniRef50_Q9UTR1                                SKENSYVEKLLYKQRFYAS      19   

                 label  
AP02151              1  
AP01951              1  
AP00972              1  
AP01261              1  
AP01298              1  
AP01802              1  
UniRef50_Q9UTR1      0  
El número de secuencias totales en el archivo es: 3556


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Validation split: fetch the labelled CSV, shuffle the rows with a fixed seed,
# and wrap the result in a `mix_data` dataset.

data_url2 = 'https://raw.githubusercontent.com/JavierColubi/Javier-Colubi-github/refs/heads/main/validacion2_20.txt'
ddf = pd.read_csv(data_url2, index_col=0).sample(frac=1, random_state=0)
print(ddf.head(7))
print(f'El número de secuencias totales en el archivo es: {len(ddf)}')

val_dataset = mix_data(ddf)

                                                            aa_seq  aa_len  \
DRAMP00346       AITCGQVSSALGPCAAYAKGSGTSPSAGCCSGVKRLAGLARSTADK...      90   
DRAMP03908                                         RKCLRWQWEMRKYGG      15   
DRAMP00094                     VNYGNGVSCSKTKCSVNWGIITHQAFRVTSGVASG      35   
DRAMP00071       KTVNYGNGLYCNQKKCWVNWSETATTIVNNSIMNGLTGGNAGWHSGGRA      49   
non_AMPEP74023                                       DAHNEDEEHAEGS      13   
non_AMPEP135992                                      QGGWPRPGPEIPP      13   
non_AMPEP79757                                         QGGWPRNPIPP      11   

                 label  
DRAMP00346           1  
DRAMP03908           1  
DRAMP00094           1  
DRAMP00071           1  
non_AMPEP74023       0  
non_AMPEP135992      0  
non_AMPEP79757       0  
El número de secuencias totales en el archivo es: 710


## FUNCIÓN DE EVALUACIÓN DEL ENTRENAMIENTO

In [None]:
def compute_metrics(pred):
    """Compute accuracy, macro-averaged and per-class precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        dict: ``accuracy``, ``precision_macro``, ``recall_macro``, ``f1_macro``
        and, for every class value ``c`` seen in the labels or predictions,
        ``precision_label_c``, ``recall_label_c`` and ``f1_label_c``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)

    # The keys below are named "*_macro", so the average must be 'macro';
    # 'binary' would merely repeat the positive-class scores.
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, preds, average='macro'
    )

    # Pin the class order explicitly so each per-class key carries the real
    # class value even when one class is missing from this evaluation set.
    classes = np.union1d(labels, preds)
    per_class = precision_recall_fscore_support(labels, preds, labels=classes, average=None)

    metrics = {
        'accuracy': acc,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
    }

    # Key order: all precisions, then all recalls, then all F1 scores.
    for metric_name, values in zip(('precision', 'recall', 'f1'), per_class[:3]):
        for cls, value in zip(classes, values):
            metrics[f'{metric_name}_label_{int(cls)}'] = value

    return metrics

## MODELO DE ENTRENAMIENTO

In [None]:
# define the model initializing function for Trainer in huggingface

def model_init():
    """Return a freshly loaded sequence-classification model from the ProtBERT-BFD checkpoint."""
    checkpoint = 'Rostlab/prot_bert_bfd'
    return BertForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Fine-tune on `train_dataset`; evaluate on `val_dataset` and checkpoint after every epoch.

training_args = TrainingArguments(
    # --- bookkeeping ---
    run_name="neuropep_BERT",
    output_dir='./results',
    logging_dir='./logs',
    seed=0,
    # --- optimisation ---
    do_train=True,
    num_train_epochs=10,
    learning_rate=5e-5,
    warmup_steps=0,
    weight_decay=0.1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,  # 32 micro-batches of one sequence per optimizer step
    fp16=True,
    fp16_opt_level="O2",
    # --- per-epoch logging, evaluation and checkpointing ---
    do_eval=True,
    logging_strategy='epoch',
    evaluation_strategy="epoch",
    save_strategy='epoch',
    # --- best-checkpoint selection: highest validation accuracy wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
)

trainer = Trainer(
    args=training_args,
    model_init=model_init,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()



pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Label 0,Precision Label 1,Recall Label 0,Recall Label 1,F1 Label 0,F1 Label 1
0,0.4232,0.753955,0.739437,0.763975,0.692958,0.726736,0.719072,0.763975,0.785915,0.692958,0.751009,0.726736
1,0.3669,0.713062,0.704225,0.850242,0.495775,0.626335,0.644135,0.850242,0.912676,0.495775,0.755245,0.626335
2,0.4588,0.767894,0.757746,0.786834,0.707042,0.744807,0.734015,0.786834,0.808451,0.707042,0.769437,0.744807
3,0.3066,0.673976,0.753521,0.766272,0.729577,0.747475,0.741935,0.766272,0.777465,0.729577,0.759285,0.747475
4,0.2784,0.780443,0.721127,0.704961,0.760563,0.731707,0.740061,0.704961,0.68169,0.760563,0.709677,0.731707
5,0.3392,0.842782,0.726761,0.703797,0.783099,0.741333,0.755556,0.703797,0.670423,0.783099,0.710448,0.741333
6,0.2281,0.787673,0.71831,0.686747,0.802817,0.74026,0.762712,0.686747,0.633803,0.802817,0.692308,0.74026
8,0.1471,0.990921,0.740845,0.719794,0.788732,0.752688,0.766355,0.719794,0.692958,0.788732,0.727811,0.752688
9,0.122,1.110215,0.702817,0.665899,0.814085,0.732573,0.76087,0.665899,0.591549,0.814085,0.66561,0.732573


TrainOutput(global_step=1110, training_loss=0.2848445119084539, metrics={'train_runtime': 4802.9632, 'train_samples_per_second': 7.404, 'train_steps_per_second': 0.231, 'total_flos': 1.6151956376832e+16, 'train_loss': 0.2848445119084539, 'epoch': 9.988751406074242})

## MÉTRICAS DE EVALUACIÓN

In [None]:
# Dump the trainer's raw log history in one go.
print(trainer.state.log_history)

# Then walk it entry by entry to relate optimizer steps to epochs.
for log in trainer.state.log_history:
    if 'epoch' not in log:
        continue
    print(f"Step: {log['step']}, Epoch: {log['epoch']}, Metrics: {log}")

[{'loss': 0.4232, 'grad_norm': 272.37188720703125, 'learning_rate': 4.531531531531532e-05, 'epoch': 0.9988751406074241, 'step': 111}, {'eval_loss': 0.7539548873901367, 'eval_accuracy': 0.7394366197183099, 'eval_precision_macro': 0.7639751552795031, 'eval_recall_macro': 0.6929577464788732, 'eval_f1_macro': 0.7267355982274741, 'eval_precision_label_0': 0.7190721649484536, 'eval_precision_label_1': 0.7639751552795031, 'eval_recall_label_0': 0.7859154929577464, 'eval_recall_label_1': 0.6929577464788732, 'eval_f1_label_0': 0.7510094212651414, 'eval_f1_label_1': 0.7267355982274741, 'eval_runtime': 8.856, 'eval_samples_per_second': 80.172, 'eval_steps_per_second': 10.05, 'epoch': 0.9988751406074241, 'step': 111}, {'loss': 0.3669, 'grad_norm': 25.343387603759766, 'learning_rate': 4.04054054054054e-05, 'epoch': 1.9977502812148482, 'step': 222}, {'eval_loss': 0.7130619883537292, 'eval_accuracy': 0.704225352112676, 'eval_precision_macro': 0.8502415458937198, 'eval_recall_macro': 0.495774647887323

In [None]:
# List the checkpoint directories written under ./results.
ls ./results

[0m[01;34mcheckpoint-1000[0m/  [01;34mcheckpoint-1110[0m/  [01;34mcheckpoint-333[0m/  [01;34mcheckpoint-555[0m/  [01;34mcheckpoint-777[0m/
[01;34mcheckpoint-111[0m/   [01;34mcheckpoint-222[0m/   [01;34mcheckpoint-444[0m/  [01;34mcheckpoint-666[0m/  [01;34mcheckpoint-889[0m/


In [None]:
# Report training-set metrics for one saved checkpoint. Point the path below at
# the checkpoint whose validation accuracy was highest.
from transformers import AutoModelForSequenceClassification

# Checkpoint directories are named after the optimizer step at which they were saved.
specific_epoch_checkpoint = "./results/checkpoint-333"

# `local_files_only=True` loads from the local path without contacting the Hub.
model = AutoModelForSequenceClassification.from_pretrained(specific_epoch_checkpoint, local_files_only=True)
# Put the model on the device selected at the top of the notebook.
model.to(device)
# Swap the reloaded model into the trainer so `predict` uses it.
trainer.model = model

# Score the training split with the reloaded checkpoint.
train_predictions, train_label_ids, train_metrics = trainer.predict(train_dataset)
print(f"Metrics for {specific_epoch_checkpoint}:", train_metrics)

Metrics for epoch X: {'test_loss': 0.3120515048503876, 'test_accuracy': 0.9041057367829022, 'test_precision_macro': 0.9614643545279383, 'test_recall_macro': 0.8419572553430821, 'test_f1_macro': 0.8977511244377812, 'test_precision_label_0': 0.8594297148574287, 'test_precision_label_1': 0.9614643545279383, 'test_recall_label_0': 0.9662542182227222, 'test_recall_label_1': 0.8419572553430821, 'test_f1_label_0': 0.9097167063807254, 'test_f1_label_1': 0.8977511244377812, 'test_runtime': 188.0471, 'test_samples_per_second': 18.91, 'test_steps_per_second': 2.366}


In [None]:
# Score the validation split with the model currently attached to the trainer.
val_predictions, val_label_ids, val_metrics = trainer.predict(test_dataset=val_dataset)
print("Validation Metrics:", val_metrics)

Validation Metrics: {'test_loss': 0.7678731679916382, 'test_accuracy': 0.7577464788732394, 'test_precision_macro': 0.786833855799373, 'test_recall_macro': 0.7070422535211267, 'test_f1_macro': 0.744807121661721, 'test_precision_label_0': 0.7340153452685422, 'test_precision_label_1': 0.786833855799373, 'test_recall_label_0': 0.8084507042253521, 'test_recall_label_1': 0.7070422535211267, 'test_f1_label_0': 0.7694369973190348, 'test_f1_label_1': 0.744807121661721, 'test_runtime': 38.7067, 'test_samples_per_second': 18.343, 'test_steps_per_second': 2.299}


## GUARDADO Y CARGADO DEL MODELO

In [None]:
# Optional: persist the fine-tuned model to Google Drive (Colab only).

from google.colab import drive
drive.mount('/content/drive')
# Make every parameter tensor contiguous in memory before it is serialised.
for name, param in trainer.model.named_parameters():
    if param.is_contiguous():
        continue
    param.data = param.data.contiguous()
trainer.save_model('/content/drive/MyDrive/Datos Finales para Memoria/modelos/AMP-BERT/Fine-tuned_model/')

Mounted at /content/drive


In [None]:
# Load the tokenizer and the fine-tuned classifier used by the prediction cells below.
#
# IMPORTANT: Google Drive must be mounted and must contain your own fine-tuned
# model at the path given here.
from google.colab import drive
drive.mount('/content/drive')

# Tokenizer comes from the base checkpoint; the weights come from the saved fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(
    'Rostlab/prot_bert_bfd',
    do_lower_case=False,
)
model = BertForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/Datos Finales para Memoria/modelos/AMP-BERT/Fine-tuned_model"
)

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

## TEST DE VERIFICACIÓN DEL ENTRENAMIENTO

### TEST DE VERIFICACIÓN CON AMPs

In [None]:
import re
import torch

# Función para leer un archivo FASTA
def read_fasta(file_path):
    """Parse a FASTA file into a list of ``(identifier, sequence)`` tuples.

    A line starting with ``>`` opens a new record; its text after the ``>`` is
    the identifier. All following non-header lines are stripped and
    concatenated into the sequence. Records whose sequence is empty are
    dropped. Sequence text that precedes the first header is returned with an
    empty identifier.
    """
    records = []
    header = ""
    residues = []
    with open(file_path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith(">"):
                # Sequence line: keep it unless it is blank.
                if text:
                    residues.append(text)
                continue
            # Header line: close the record collected so far, if any.
            if residues:
                records.append((header, "".join(residues)))
                residues = []
            header = text[1:]
        # Flush the record still open at end of file.
        if residues:
            records.append((header, "".join(residues)))
    return records
# Función para hacer predicciones de AMP/non-AMP para múltiples secuencias y guardar probabilidades
def predict_amp(sequences, output_file):
    """Classify every sequence as AMP / non-AMP and write a TSV report.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sequences: Iterable of ``(identifier, sequence)`` pairs.
        output_file: Path of the TSV to write. Columns: identifier, sequence,
            predicted class, softmax probability of class 0 (non-AMP) and of
            class 1 (AMP).

    Side effects:
        Prints one line per sequence and, at the end, the number of sequences
        assigned to each class.
    """
    amp_count = 0
    non_amp_count = 0
    with open(output_file, 'w') as out_f:
        # One header cell per data column (rows carry two probabilities).
        out_f.write("Identifier\tSequence\tPrediction\tProbability_non-AMP\tProbability_AMP\n")

        for identifier, input_seq in sequences:
            # One residue per token; rare residues U/Z/O/B are mapped to X.
            input_seq_spaced = re.sub(r'[UZOB]', 'X', ' '.join(input_seq))
            # Keep the inputs on the same device as the model.
            input_seq_tok = tokenizer(input_seq_spaced, return_tensors='pt').to(model.device)

            # Inference only: do not record an autograd graph.
            with torch.no_grad():
                logits = model(**input_seq_tok)[0]
                # Softmax over the two classes gives per-class probabilities.
                y_prob = torch.softmax(logits, dim=1).detach().cpu().numpy()

            y_pred_class = int(y_prob.argmax(axis=1)[0])
            prob_class1, prob_class2 = y_prob[0]  # (non-AMP, AMP)

            if y_pred_class == 1:
                input_class = 'AMP'
                amp_count += 1
            else:
                input_class = 'non-AMP/OTHERS'
                non_amp_count += 1

            out_f.write(f"{identifier}\t{input_seq}\t{input_class}\t{prob_class1:.4f}\t{prob_class2:.4f}\n")
            print(f"Processed {identifier}: {input_class} (Probabilities: {prob_class1:.4f}, {prob_class2:.4f})")
    print(f"El número de secuencias de AMPs es: {amp_count}")
    print(f"El número de secuencias de non-AMP es: {non_amp_count}")

            # Determinar clase (AMP o non-AMP)
            #if y_pred:
            #    input_class = 'AMP'
            #else:
            #    input_class = 'non-AMP'

            # Escribir el identificador, secuencia, predicción y probabilidad en el archivo de salida
           # out_f.write(f"{identifier}\t{input_seq}\t{input_class}\t{prob:.4f}\n")
           # print(f"Processed {identifier}: {input_class} (Probability: {prob:.4f})")

# Input: FASTA file with the AMP verification sequences.
fasta_file = '/content/drive/MyDrive/data_final/general_amps_DRAMP_cdhit_80_NO_REPES_recortado1892seq.fasta'
sequences = read_fasta(fasta_file)

# Output: TSV that receives the predictions and probabilities.
output_file = '/content/drive/MyDrive/Datos Finales para Memoria/resultados_verificacion_general_AMPs_model2.tsv'

# Run the classifier over every sequence and write the report.
predict_amp(sequences, output_file)

Processed DRAMP00032: non-AMP/OTHERS (Probabilities: 0.9294, 0.0706)
Processed DRAMP00089: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00106: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00127: AMP (Probabilities: 0.0273, 0.9727)
Processed DRAMP00129: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00189: AMP (Probabilities: 0.0458, 0.9542)
Processed DRAMP00190: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00191: non-AMP/OTHERS (Probabilities: 0.9306, 0.0694)
Processed DRAMP00204: non-AMP/OTHERS (Probabilities: 0.9304, 0.0696)
Processed DRAMP00254: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00384: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00425: non-AMP/OTHERS (Probabilities: 0.9301, 0.0699)
Processed DRAMP00437: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00454: AMP (Probabilities: 0.0271, 0.9729)
Processed DRAMP00766: non-AMP/OTHERS (Probabilities: 0.9300, 0.0700)
Processed DRAMP01004: non-AMP/OTHERS (Probabilities: 0.9296, 0.0704)
Proces

### TEST DE VERIFICACIÓN CON NON-AMPs

In [None]:
import re
import torch

# Función para leer un archivo FASTA
def read_fasta(file_path):
    """Read a FASTA file and return its records as ``(identifier, sequence)`` tuples.

    Header lines begin with ``>``; the remainder of the line is the
    identifier. The stripped lines that follow are joined into the sequence.
    A header with no sequence yields no record, and sequence text appearing
    before any header is returned under an empty identifier.
    """
    with open(file_path, 'r') as fasta:
        stripped_lines = [entry.strip() for entry in fasta]

    parsed = []
    name = ""
    body = ""
    # A trailing sentinel header flushes the last record through the same path
    # as every other record.
    for entry in stripped_lines + [">"]:
        if entry.startswith(">"):
            if body:
                parsed.append((name, body))
                body = ""
            name = entry[1:]
        else:
            body += entry
    return parsed
# Función para hacer predicciones de AMP/non-AMP para múltiples secuencias y guardar probabilidades
def predict_amp(sequences, output_file):
    """Classify every sequence as AMP / non-AMP and write a TSV report.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sequences: Iterable of ``(identifier, sequence)`` pairs.
        output_file: Path of the TSV to write. Columns: identifier, sequence,
            predicted class, softmax probability of class 0 (non-AMP) and of
            class 1 (AMP).

    Side effects:
        Prints one line per sequence and, at the end, the number of sequences
        assigned to each class.
    """
    amp_count = 0
    non_amp_count = 0
    with open(output_file, 'w') as out_f:
        # One header cell per data column (rows carry two probabilities).
        out_f.write("Identifier\tSequence\tPrediction\tProbability_non-AMP\tProbability_AMP\n")

        for identifier, input_seq in sequences:
            # One residue per token; rare residues U/Z/O/B are mapped to X.
            input_seq_spaced = re.sub(r'[UZOB]', 'X', ' '.join(input_seq))
            # Keep the inputs on the same device as the model.
            input_seq_tok = tokenizer(input_seq_spaced, return_tensors='pt').to(model.device)

            # Inference only: do not record an autograd graph.
            with torch.no_grad():
                logits = model(**input_seq_tok)[0]
                # Softmax over the two classes gives per-class probabilities.
                y_prob = torch.softmax(logits, dim=1).detach().cpu().numpy()

            y_pred_class = int(y_prob.argmax(axis=1)[0])
            prob_class1, prob_class2 = y_prob[0]  # (non-AMP, AMP)

            if y_pred_class == 1:
                input_class = 'AMP'
                amp_count += 1
            else:
                input_class = 'non-AMP/OTHERS'
                non_amp_count += 1

            out_f.write(f"{identifier}\t{input_seq}\t{input_class}\t{prob_class1:.4f}\t{prob_class2:.4f}\n")
            print(f"Processed {identifier}: {input_class} (Probabilities: {prob_class1:.4f}, {prob_class2:.4f})")
    print(f"El número de secuencias de AMPs es: {amp_count}")
    print(f"El número de secuencias de non-AMP es: {non_amp_count}")

            # Determinar clase (AMP o non-AMP)
            #if y_pred:
            #    input_class = 'AMP'
            #else:
            #    input_class = 'non-AMP'

            # Escribir el identificador, secuencia, predicción y probabilidad en el archivo de salida
           # out_f.write(f"{identifier}\t{input_seq}\t{input_class}\t{prob:.4f}\n")
           # print(f"Processed {identifier}: {input_class} (Probability: {prob:.4f})")

# Input: FASTA file with the non-AMP verification sequences.
fasta_file = '/content/drive/MyDrive/data/non_amp_ampep_cdhit90.fasta'
sequences = read_fasta(fasta_file)

# Output: TSV that receives the predictions and probabilities.
output_file = '/content/drive/MyDrive/Datos Finales para Memoria/resultados/RESULTADO_verificacion_non-AMPs_model2.tsv'

# Run the classifier over every sequence and write the report.
predict_amp(sequences, output_file)

Processed non_AMPEP82130: non-AMP/OTHERS (Probabilities: 0.9299, 0.0701)
Processed non_AMPEP54142: non-AMP/OTHERS (Probabilities: 0.9297, 0.0703)
Processed non_AMPEP137679: non-AMP/OTHERS (Probabilities: 0.9305, 0.0695)
Processed non_AMPEP77131: non-AMP/OTHERS (Probabilities: 0.9303, 0.0697)
Processed non_AMPEP100173: non-AMP/OTHERS (Probabilities: 0.9292, 0.0708)
Processed non_AMPEP49224: non-AMP/OTHERS (Probabilities: 0.9286, 0.0714)
Processed non_AMPEP9780: non-AMP/OTHERS (Probabilities: 0.9300, 0.0700)
Processed non_AMPEP100390: non-AMP/OTHERS (Probabilities: 0.9286, 0.0714)
Processed non_AMPEP119666: non-AMP/OTHERS (Probabilities: 0.9297, 0.0703)
Processed non_AMPEP2676: non-AMP/OTHERS (Probabilities: 0.9291, 0.0709)
Processed non_AMPEP1965: non-AMP/OTHERS (Probabilities: 0.9305, 0.0695)
Processed non_AMPEP133341: non-AMP/OTHERS (Probabilities: 0.9296, 0.0704)
Processed non_AMPEP144088: non-AMP/OTHERS (Probabilities: 0.9300, 0.0700)
Processed non_AMPEP162641: non-AMP/OTHERS (Proba

## TEST DE PREDICCIÓN DE POTENCIAL ANTIMICROBIANO EN NEUROPÉPTIDOS HUMANOS

In [None]:
import re
import torch

# Función para leer un archivo FASTA
def read_fasta(file_path):
    """Load a FASTA file as a list of ``(identifier, sequence)`` tuples.

    The identifier is the header line without its leading ``>``; the sequence
    is the concatenation of the stripped lines up to the next header. Headers
    without sequence text produce no record, and sequence text found before
    the first header is stored under an empty identifier.
    """
    collected = []
    tag, acc = "", ""
    with open(file_path, 'r') as src:
        for row in map(str.strip, src):
            is_header = row[:1] == ">"
            # A new header closes the record that was being accumulated.
            if is_header and acc:
                collected.append((tag, acc))
            if is_header:
                tag, acc = row[1:], ""
            else:
                acc = acc + row
        # Emit whatever record is still open at end of file.
        if acc:
            collected.append((tag, acc))
    return collected

# Función para hacer predicciones de AMP/non-AMP para múltiples secuencias y guardar probabilidades
def predict_amp(sequences, output_file, threshold=0.84):
    """Score every sequence for AMP potential and write a TSV report.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sequences: Iterable of ``(identifier, sequence)`` pairs. The header
            written to the file names three leading columns (Species, Family,
            Identifier), so ``identifier`` is presumably a tab-separated
            triple — verify against the input FASTA headers.
        output_file: Path of the TSV to write.
        threshold: A sequence is labelled ``'AMP'`` when its score is strictly
            greater than this value; defaults to 0.84.

    Side effects:
        Prints one line per processed sequence.
    """
    with open(output_file, 'w') as out_f:
        out_f.write("Species\tFamily\tIdentifier\tPrediction\tProbability_AMP\n")
        for identifier, input_seq in sequences:
            # One residue per token; rare residues U/Z/O/B are mapped to X.
            input_seq_spaced = re.sub(r'[UZOB]', 'X', ' '.join(input_seq))
            # Keep the inputs on the same device as the model.
            input_seq_tok = tokenizer(input_seq_spaced, return_tensors='pt').to(model.device)

            # Inference only: do not record an autograd graph.
            with torch.no_grad():
                logits = model(**input_seq_tok)[0]
                # NOTE(review): the score is the element-wise sigmoid of the
                # class-1 logit, not a softmax over both classes, so it is not
                # normalised against class 0 — confirm this is intended.
                y_prob = torch.sigmoid(logits)[:, 1].detach().cpu().numpy()

            prob = y_prob[0]
            input_class = 'AMP' if prob > threshold else 'non-AMP'

            out_f.write(f"{identifier}\t{input_class}\t{prob:.4f}\n")
            print(f"Processed {identifier}: {input_class} (Probability: {prob:.4f})")

# Input: FASTA file with the human neuropeptide sequences.
fasta_file = '/content/drive/MyDrive/data/Human_neuropep_Neuropedia_test.fasta'
sequences = read_fasta(fasta_file)

# Output: TSV that receives the predictions and probabilities.
output_file = '/content/drive/MyDrive/Datos Finales para Memoria/predictions_with_probabilities_neuro.tsv'

# Run the classifier over every sequence and write the report.
predict_amp(sequences, output_file)


Processed Homo sapiens (Human)	Vasopressin/oxytocin gene family	Neurophysin1: AMP (Probability: 0.8433)
Processed Homo sapiens (Human)	Calcitonin gene family	Calcitonin gene-related peptide 1: non-AMP (Probability: 0.1944)
Processed Homo sapiens (Human)	CRH-related gene family	Urotensin-2B: AMP (Probability: 0.8433)
Processed Homo sapiens (Human)	Calcitonin gene family	Calcitonin gene-related peptide 2: AMP (Probability: 0.8427)
Processed Homo sapiens (Human)	Somatostatin gene family	Somatostatin-14: AMP (Probability: 0.8433)
Processed Homo sapiens (Human)	F- and Y-amide gene family	Neuropeptide AF: non-AMP (Probability: 0.1977)
Processed Homo sapiens (Human)	Kinin and tensin gene family	C-terminal-flanking peptide: non-AMP (Probability: 0.1975)
Processed Homo sapiens (Human)	GnRH family	GnRH-associated peptide 2: non-AMP (Probability: 0.1955)
Processed Homo sapiens (Human)	Vasopressin/oxytocin gene family	Neurophysin 2: AMP (Probability: 0.8433)
Processed Homo sapiens (Human)	Neurexop

In [None]:
# Función para leer un archivo de predicciones y filtrar aquellos con etiqueta 'Neuropeptide'
def print_res(file_path):
    """Print every row of a prediction TSV labelled ``'AMP'`` and the total count.

    The first line of the file is treated as a header and skipped. For each
    remaining line the last three tab-separated fields are read as
    identifier, prediction and probability; any leading fields (such as
    species and family) are ignored. Blank lines and lines with fewer than
    three fields are skipped, and an empty file yields a count of zero.

    Args:
        file_path: Path of the TSV file to read.
    """
    amp_count = 0
    with open(file_path, 'r') as f:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(f, None)
        for line in f:
            columns = line.strip().split("\t")
            if len(columns) < 3:
                # Blank or truncated line: nothing to report.
                continue
            identifier, prediction, prob_class = columns[-3:]
            if prediction == 'AMP':
                print(f"Identifier: {identifier}, Prediction: {prediction}, Probabilities: {prob_class}")
                amp_count += 1
        print(f"\nEL NÚMERO DE SECUENCIAS CON POTENCIAL ANTIMICROBIANO ES: {amp_count}")
# Path of the prediction TSV to summarise.
result_file = '/content/drive/MyDrive/Datos Finales para Memoria/predictions_with_probabilities_neuro.tsv'  # output file generated earlier

# Print the sequences predicted as AMP together with their total count.
print_res(result_file)


Identifier: Neurophysin1, Prediction: AMP, Probabilities: 0.8433
Identifier: Urotensin-2B, Prediction: AMP, Probabilities: 0.8433
Identifier: Calcitonin gene-related peptide 2, Prediction: AMP, Probabilities: 0.8427
Identifier: Somatostatin-14, Prediction: AMP, Probabilities: 0.8433
Identifier: Neurophysin 2, Prediction: AMP, Probabilities: 0.8433
Identifier: Agouti-related protein, Prediction: AMP, Probabilities: 0.8432
Identifier: CNP-53, Prediction: AMP, Probabilities: 0.8432
Identifier: Relaxin-3 A chain, Prediction: AMP, Probabilities: 0.8429
Identifier: Nociceptin, Prediction: AMP, Probabilities: 0.8433
Identifier: CNP-22, Prediction: AMP, Probabilities: 0.8433
Identifier: Apelin-31, Prediction: AMP, Probabilities: 0.8433
Identifier: BNP(3-29), Prediction: AMP, Probabilities: 0.8433
Identifier: BNP(3-30), Prediction: AMP, Probabilities: 0.8433
Identifier: BNP(3-32), Prediction: AMP, Probabilities: 0.8433
Identifier: Resistin, Prediction: AMP, Probabilities: 0.8433
Identifier: Ape