In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import torch
import numpy as np
import matplotlib.pyplot as plt
from balanced_loss import Loss
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModel
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from pathlib import Path

from data.constants import LOCAL_MODELS_PATH, CHECKPOINTS_PATH, TEST_DF_PATH, LOCAL_MODEL_OUTPUT_FILES_PATH


BASE_MODEL_PATH = LOCAL_MODELS_PATH / 't5-small'
MAX_LEN = 512
batch_size = 8
TEST_BATCH_SIZE = batch_size

SEED = 2


model_name = BASE_MODEL_PATH.name
# CHECKPOINTS_DIR = CHECKPOINTS_PATH / model_name / 'checkpoints_train_v3'
CHECKPOINTS_DIR = CHECKPOINTS_PATH / model_name / 'checkpoints_train_v2'

CHECKPOINTS_DIR.mkdir(exist_ok=True, parents=True)
MODEL_TO_SAVE_TEMPLE = 'model-{epoch}-epoch.pt'

In [2]:
tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH)

model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)  # Замените на вашу базовую модель
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE)

saved_models = sorted(CHECKPOINTS_DIR.glob("*.pt"), key=lambda x: x.stat().st_mtime, reverse=True)
if saved_models:
    latest_model_path = saved_models[0] 
    print(f"Loading model weights from {latest_model_path}")
    
    # Загрузка весов
    model.load_state_dict(torch.load(str(latest_model_path), map_location=DEVICE))
    model.eval()  
else:
    print("No saved models found. Starting with random weights.")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading model weights from c:\users\eliza\graduate_work_rep\biomol-generator\model_training\model_epoch_save\t5-small\checkpoints_train_v2\model-14-epoch.pt


In [3]:
class ProteinSeqSmailesDataset(Dataset):
    def __init__(self, df, tokenizer, input_column, output_column, max_len):
        self.max_len = max_len
        self.df = df
        self.input_column = input_column
        self.output_column = output_column
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]

        inputs = self.tokenizer.encode_plus(
            row[self.input_column],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )

        labels = self.tokenizer.encode_plus(
            row[self.output_column],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
    
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels['input_ids'].flatten(),
        }

df_test = pd.read_parquet(TEST_DF_PATH)
test_dataset = ProteinSeqSmailesDataset(df_test, tokenizer, 'Target', 'Drug', MAX_LEN)

test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, generator=torch.manual_seed(SEED), num_workers=0, shuffle=True, pin_memory=True)

In [4]:
import pandas as pd
from tqdm import tqdm

def evaluate_and_save_results(model, data_loader, device, tokenizer):
    model.eval()
    
    sequences = []
    true_labels = []
    predicted_labels = []
    test_losses = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Test"):
  
            batch = {key: value.to(device) for key, value in batch.items()}
            labels = batch["labels"]
            input_ids = batch["input_ids"]
            
            del batch["labels"]

            outputs = model(**batch, labels=labels)
            loss = outputs.loss
            logits = outputs.logits  
            
            test_losses.append(loss.item())
            
            decoded_sequences = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
            
            decoded_true_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            
            sequences.extend(decoded_sequences)
            true_labels.extend(decoded_true_labels)
            predicted_labels.extend(decoded_preds)
    
    results_df = pd.DataFrame({
        "Sequence": sequences,
        "True_Label": true_labels,
        "Predicted_Label": predicted_labels
    })
    
    return np.mean(test_losses), results_df

test_loss, results_df = evaluate_and_save_results(model, test_loader, DEVICE, tokenizer)

print(f"Test Loss: {test_loss:.4f}")


results_df.to_csv(LOCAL_MODEL_OUTPUT_FILES_PATH / 't5_v2_test.csv', index=False)



Test:   0%|          | 0/125 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Test: 100%|██████████| 125/125 [12:01<00:00,  5.77s/it]

Test Loss: 0.0924





In [5]:
results_df


Unnamed: 0,Sequence,True_Label,Predicted_Label
0,MFIEAIVLALTALILYSVYSVKSFNTTRPTDPPVYPVTVPFLGHIV...,O=C(NCC(c1ccccc1)n1ccnc1)c1ccc(-c2ccc(Cl)cc2)cc1,=C(Oc(C1ccc(c1)c1cccc2c1cccc-c2ccccCl)cccc11
1,MDSSTGPGNTSDCSDPLAQASCSPAPGSWLNLSHVDGNQSDPCGLN...,CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)CC[C@H]...,CCC[CCNC@H12O1c(c(ccC)ccc[C@H](OC@@]](C)[[C@@]...
2,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...,C[C@]12O[C@H](C[C@]1(O)CO)n1c3ccccc3c3c4c(c5c6...,CcC@@12O[C@H](C[C@]1(O)CO)n1c3ccccc3c3c4c(c5c6...
3,MELENIVANSLLLKARQGGYGKKSGRSKKWKEILTLPPVSQCSELR...,CO[C@@H]1[C@H](N(C)C(=O)c2ccccc2)C[C@H]2O[C@]1...,CcC@@H]1[C@H](N(C)C(=O)c2ccccc2)C[C@H]2O[C@]1(...
4,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2ncnc(Oc3cccc(NC(=O)Nc4cc(-c5ccccc5)on4)...,Cc1cc2ncnc(Nc3ccc((NC(=O)Nc4cccCc5ccccccn4)c3)...
...,...,...,...
995,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,COc1nc2ccc(C(O)(c3cnnn3C)c3cnc(C)n3C)cc2c(C#N)...,Cc1cc2ccc(C(O)(c3cncn3C)cccnn(C)n3C)cc2c(ClN)c...
996,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,COCCNC(=O)[C@@H]1CCCN1C(=O)CC(c1ccccc1)c1ccccc1,Cc((=O)cC@@H](CCN2C(=O)[CCC2ccc((1)C1ccccc1
997,MEGTPAANWSVELDLGSGVPPGEEGNRTAGPPQRNEALARVEVAVL...,CCN1C(=O)CC2(CCCCC2)SSC[C@H](C(=O)N2CCC[C@H]2C...,c(CC(=O)N((CC(2)CC((C@@](Cc=O)N[CCCCCCC@H]2O(=...
998,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,COc1nc2ccc(C(O)(c3cnc(C)n3C)c3cnc(C)n3C)cc2c(C...,Cc1cc2ccc(C(O)(c3cnc(C)n3C)cccncnC)n3C)cc2c(Cl...
