# Load and Preprocess the BGM Dataset

In [53]:
import os
os.getcwd()

'/Graph-LLM/code'

In [2]:
import torch

# Availability of CUDA 
print("CUDA available:", torch.cuda.is_available())
print("Number of CUDA devices:", torch.cuda.device_count())

if torch.cuda.is_available():
    print("Current CUDA device:", torch.cuda.get_device_name(0))
else:
    print("No CUDA device available.")

CUDA available: True
Number of CUDA devices: 2
Current CUDA device: NVIDIA A100-SXM4-80GB


In [3]:
# !pip install -r requirements.txt

In [12]:
# Install dependencies
# !pip install transformers accelerate torch datasets

# Import necessary libraries
import json
import pathlib
import pickle
import transformers
import torch
import os
import copy
import gc
from tqdm import tqdm
from pathlib import Path
from accelerate import Accelerator
from transformers import default_data_collator, AutoTokenizer, AutoModelForCausalLM
from accelerate import DistributedDataParallelKwargs
from datasets import Dataset
import pandas as pd
import numpy as np

In [5]:
gene_df = pd.read_csv("../data/HCS/combined_graphs.csv")
gene_df['Type'] = gene_df['Type'].replace({'+': 1, '-': -1})

# Rename columns
gene_df.rename(columns={'Unnamed: 0': 'node_id', 'Type': 'label'}, inplace=True)

gene_df

Unnamed: 0,node_id,Gene1,Gene2,label
0,0,Gata1,Eklf,1
1,1,Fli1,Eklf,-1
2,2,Cebpa,Pu1,1
3,3,Pu1,Pu1,1
4,4,Gata1,Pu1,-1
...,...,...,...,...
895,895,Gfi1,cJun,-1
896,896,Gata2,Gata2,1
897,897,Pu1,Gata2,-1
898,898,Gata1,Gata2,-1


In [13]:
# Generate unique node_ids for each unique gene
unique_genes = pd.concat([gene_df['Gene1'], gene_df['Gene2']]).unique()
node_id_map = {gene: idx for idx, gene in enumerate(unique_genes)}

print("Edge Type:\n", gene_df['label'][:5])
print(f"Unique Genes:{unique_genes}\nNumber of Unique Genes: {len(unique_genes)}")

Edge Type:
 0    1
1   -1
2    1
3    1
4   -1
Name: label, dtype: int64
Unique Genes:['Gata1' 'Fli1' 'Cebpa' 'Pu1' 'Gata2' 'cJun' 'Gfi1' 'EgrNab' 'Fog1' 'Eklf'
 'Scl']
Number of Unique Genes: 11


In [14]:
# Convert to the required structure
df = gene_df[['node_id', 'Gene1', 'Gene2', 'label']]

# Convert the DataFrame to a Dataset object
gene_dataset = Dataset.from_pandas(df)

# Print the features of the dataset to verify the transformation
print(gene_dataset)

Dataset({
    features: ['node_id', 'Gene1', 'Gene2', 'label'],
    num_rows: 900
})


# Define and Load the Tokenizer 
+ daryl149/llama-2-7b-chat-hf

In [None]:
#!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.6.0+cu102.html

In [15]:
from transformers import LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM

# # Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('daryl149/llama-2-7b-chat-hf')
tokenizer.pad_token_id = 0
tokenizer.padding_side = 'left'

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [16]:
import argparse
import sys

def parse_args_llama():
    parser = argparse.ArgumentParser(description="GraphLLM")

    parser.add_argument("--project", type=str, default="project_GraphLLM")
    parser.add_argument("--exp_num", type=int, default=1)
    parser.add_argument("--model_name", type=str, default='LLaMA-7B-2')

    parser.add_argument("--dataset", type=str, default='mol')
    parser.add_argument("--lr", type=float, default=5e-5)
    parser.add_argument("--wd", type=float, default=0.1)

    parser.add_argument("--adapter_len", type=int, default=5)
    parser.add_argument("--adapter_dim", type=int, default=768)
    parser.add_argument("--adapter_n_heads", type=int, default=6)

    parser.add_argument("--n_decoder_layers", type=int, default=4)
    parser.add_argument("--n_encoder_layers", type=int, default=4)
    parser.add_argument("--n_mp_layers", type=int, default=4)

    # Model Training
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--grad_steps", type=int, default=2)

    # Learning Rate Scheduler
    parser.add_argument("--num_epochs", type=int, default=15)
    parser.add_argument("--warmup_epochs", type=float, default=1)

    # RRWP
    parser.add_argument("--rrwp", type=int, default=8)

    # Inference
    parser.add_argument("--eval_batch_size", type=int, default=32)

    # Jupyter 노트북에서 불필요한 인수 제거
    if 'ipykernel_launcher' in sys.argv[0]:
        sys.argv = sys.argv[:1]

    args = parser.parse_args()
    return args


# 인수 파싱
args = parse_args_llama()

# args 확인
print(args)


Namespace(adapter_dim=768, adapter_len=5, adapter_n_heads=6, batch_size=16, dataset='mol', eval_batch_size=32, exp_num=1, grad_steps=2, lr=5e-05, model_name='LLaMA-7B-2', n_decoder_layers=4, n_encoder_layers=4, n_mp_layers=4, num_epochs=15, project='project_GraphLLM', rrwp=8, warmup_epochs=1, wd=0.1)


# Load dataset, split and edge_index

In [17]:
import torch
from datasets import Dataset
import torch
def preprocess_function_gene_interaction(tokenizer, ignore_index=-100, max_length=32):
    def preprocess_function(examples):
        # Create prompts based on the gene interactions and label (-1 or 1)
        prompts = [
            f"Does Gene {gene1} interact positively or negatively with Gene {gene2}?"
            for gene1, gene2 in zip(examples['Gene1'], examples['Gene2'])
        ]
        
        # Convert label (-1 or 1) to completion strings ("positive interaction" or "negative interaction")
        completion = [
            f"{'positive' if label == 1 else 'negative'} interaction." 
            for label in examples['label']
        ]

        # Encode instruction
        instruction = f"\n\n###\n\n"
        instruction = tokenizer.encode(instruction, add_special_tokens=False)

        # Tokenize the prompts and completions
        model_inputs = tokenizer(prompts, add_special_tokens=False)
        labels = tokenizer(completion, add_special_tokens=False)

        batch_size = len(examples['Gene1'])

        for i in range(batch_size):
            # Add bos & eos token
            sample_input_ids = [tokenizer.bos_token_id] + model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]

            # Adjust length so that both input and label fit within max_length
            p_max_length = max_length - len(label_input_ids) - len(instruction)
            sample_input_ids = sample_input_ids[:p_max_length] + instruction

            model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
            labels["input_ids"][i] = [ignore_index] * len(sample_input_ids) + label_input_ids
            model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

        # Padding
        for i in range(batch_size):
            sample_input_ids = model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i]
            
            # Pad to max_length
            model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (max_length - len(sample_input_ids)) + sample_input_ids
            model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs["attention_mask"][i]
            labels["input_ids"][i] = [ignore_index] * (max_length - len(sample_input_ids)) + label_input_ids
            
            # Convert to torch tensors
            model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i])
            model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i])
            labels["input_ids"][i] = torch.tensor(labels["input_ids"][i])

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    return preprocess_function


In [20]:
# Assuming 'tokenizer' is already defined and 'gene_dataset' contains 'Gene1', 'Gene2', and 'label'
gene_dataset = gene_dataset.map(
    preprocess_function_gene_interaction(tokenizer=tokenizer, max_length=32),
    batched=True,
    batch_size=None,
    remove_columns=[i for i in gene_dataset.column_names if i not in ['node_id']],
    keep_in_memory=True,
    writer_batch_size=10000,
    num_proc=1,
).with_format("torch")

# Now, `preprocessed_dataset` contains the tokenized inputs and labels ready for model training.


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

In [21]:
gene_dataset

Dataset({
    features: ['node_id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 900
})

# Define Train, Validation, and Test dataset 

In [22]:
from datasets import Dataset, DatasetDict

# Split the dataset into 60% training, 20% validation, and 20% test
train_test = gene_dataset.train_test_split(test_size=0.40, seed=42)
val_test = train_test['test'].train_test_split(test_size=0.50, seed=42)

# Create a DatasetDict to manage the splits
split_datasets = DatasetDict({
    'train': train_test['train'],
    'val': val_test['train'],
    'test': val_test['test']
})

# Convert to torch format
split_datasets.set_format("torch")

# Now you have the train, validation, and test datasets
train_dataset = split_datasets['train']
val_dataset = split_datasets['val']
test_dataset = split_datasets['test']

# Print to check
print(train_dataset, val_dataset, test_dataset)


Dataset({
    features: ['node_id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 540
}) Dataset({
    features: ['node_id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 180
}) Dataset({
    features: ['node_id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 180
})


# Reduce the size of dataset

In [23]:
from datasets import Dataset, DatasetDict

# Shuffle the datasets and select the required number of samples
train_dataset = train_test['train'].shuffle(seed=42).select(range(120))
val_dataset = val_test['train'].shuffle(seed=42).select(range(40))
test_dataset = val_test['test'].shuffle(seed=42).select(range(40))

# Convert to torch format
train_dataset.set_format("torch")
val_dataset.set_format("torch")
test_dataset.set_format("torch")

# Print to check
print(train_dataset, val_dataset, test_dataset)


Dataset({
    features: ['node_id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 120
}) Dataset({
    features: ['node_id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 40
}) Dataset({
    features: ['node_id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 40
})


In [24]:
len(train_dataset['node_id']), train_dataset['node_id']

(120,
 tensor([812, 122,  31, 473, 522, 559, 690, 364, 481, 866, 836, 594, 526, 175,
         254, 509, 292, 392, 362, 191,  93, 423, 456,  62, 732,  68, 539, 228,
         642, 523,  41, 453, 563, 256,  72, 332, 333,  43, 284, 280, 730, 717,
         404, 177,  75, 899, 373, 757, 465, 675,  99, 491, 606, 499, 841, 347,
         262, 170, 414, 561, 466, 814, 187,  85, 827, 291, 645, 203, 403, 554,
         747, 653, 234, 888, 480, 446, 489, 791, 512, 178, 849, 560, 113, 339,
         134, 829,  19, 519, 867, 659, 524, 825, 189, 215, 468, 353, 873, 223,
         438, 104, 736, 824, 282,  13, 430, 483,  32, 497,  63, 571, 616, 802,
         195, 327, 821, 337, 343, 615,  95, 779]))

In [25]:
len(test_dataset['node_id']), test_dataset['node_id']

(40,
 tensor([121,  44, 172, 676, 636, 845, 199, 813,  64, 565, 478, 729, 799, 236,
         274, 880, 716, 592, 202, 107, 455, 881, 681, 159, 171, 248, 598, 626,
         138, 434,  71, 803, 166, 643, 350, 147, 445, 860, 703, 241]))

In [26]:
len(train_dataset['input_ids']), train_dataset['input_ids'], len(train_dataset['input_ids'][0])

(120,
 tensor([[    0,     0,     0,  ..., 14881, 29889,     2],
         [    0,     0,     0,  ..., 14881, 29889,     2],
         [    0,     0,     0,  ..., 14881, 29889,     2],
         ...,
         [    0,     0,     0,  ..., 14881, 29889,     2],
         [    0,     0,     0,  ..., 14881, 29889,     2],
         [    0,     0,     0,  ..., 14881, 29889,     2]]),
 32)

In [27]:
len(train_dataset['attention_mask']), train_dataset['attention_mask'], len(train_dataset['attention_mask'][0])

(120,
 tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]),
 32)

In [28]:
len(train_dataset['labels']), train_dataset['labels'], len(train_dataset['labels'][0])

(120,
 tensor([[ -100,  -100,  -100,  ..., 14881, 29889,     2],
         [ -100,  -100,  -100,  ..., 14881, 29889,     2],
         [ -100,  -100,  -100,  ..., 14881, 29889,     2],
         ...,
         [ -100,  -100,  -100,  ..., 14881, 29889,     2],
         [ -100,  -100,  -100,  ..., 14881, 29889,     2],
         [ -100,  -100,  -100,  ..., 14881, 29889,     2]]),
 32)

# Define and Initialize the Model

In [29]:
# Load the model

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# # Load the Hugging Face LLaMA model (daryl149/llama-2-7b-chat-hf)
# model_id = "daryl149/llama-2-7b-chat-hf"

# # Initialize the model with FP16 precision and auto device map (for GPU if available)
# if torch.cuda.is_available():
#     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
# else:
#     model = AutoModelForCausalLM.from_pretrained(model_id)

In [30]:
import copy
from typing import Optional, Tuple, List, Union
from dataclasses import dataclass, field
import math

import torch
import torch_geometric.utils
from torch import nn
import torch.nn.functional as F
from typing import List, Optional, Tuple, Union
from torch_geometric.nn.pool import global_max_pool, global_mean_pool


@dataclass
class ModelArgs:
    dim: int = 4096
    n_layers: int = 32
    n_heads: int = 32
    vocab_size: int = -1  # defined later by tokenizer
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    norm_eps: float = 1e-5

    max_batch_size: int = 32
    max_seq_len: int = 4096

    adapter_len: int = 0
    adapter_layer: int = 0
    adapter_dim: int = 512
    adapter_n_heads: int = 4


    num_hops: int = 2
    w_adapter: bool = True
    w_lora: bool = True
    lora_r: int = 16
    lora_alpha: int = 1
    lora_dropout: float = 0.05
    rrwp: int = 8


    n_decoder_layers: int = 2
    n_mp_layers: int = 2
    n_encoder_layers: int = 2

    # target_modules: Tuple[str] = ('q_proj', 'v_proj')     # Option
    fans_out: Tuple[int] = (50, 50, 50)

    # target_modules: Tuple[str] = ('q_proj', 'v_proj', 'k_proj')     # Option
    # target_modules: Tuple[str] = ('o_proj')     # Option
    target_modules: Tuple[str] = ('down_proj', 'up_proj', 'gate_proj')     # Option
    task_level: str = 'node'




In [31]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
import pert
from peft import LoraConfig, get_peft_model
# Load the LLaMA model and tokenizer
model_id = "daryl149/llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ModelArgs 객체를 먼저 생성한 후, 속성을 수동으로 설정
model_args = ModelArgs()

# ModelArgs 속성 수동 설정 (Note: Do not change hidden_size or num_attention_heads)
model_args.w_lora = False
model_args.w_adapter = True
model_args.adapter_layer = 8
model_args.adapter_dim = 4096  # match the pre-trained model hidden size (4096 for LLaMA 7B)
model_args.adapter_len = args.adapter_len
model_args.lora_alpha = 16
model_args.lora_r = 8
model_args.num_hops = 3
model_args.n_mp_layers = args.n_mp_layers
model_args.rrwp = args.rrwp
model_args.n_encoder_layers = args.n_encoder_layers
model_args.n_decoder_layers = args.n_decoder_layers
model_args.adapter_n_heads = 32  # Must match the pre-trained model attention heads (32 for LLaMA 7B)
model_args.task_level = "node_classification"

# Use the original configuration
config = LlamaConfig.from_pretrained(model_id)

# BFloat16 tensor
torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)

# Initialize the model with FP16 precision and auto device map (for GPU if available)
if torch.cuda.is_available():
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, config=config)
else:
    model = AutoModelForCausalLM.from_pretrained(model_id, config=config)


# LoRA 설정
lora_config = LoraConfig(
    r=model_args.lora_r,  # Rank of the low-rank matrices
    lora_alpha=model_args.lora_alpha,  # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # LoRA 적용할 모듈들
    lora_dropout=0.05,  # Dropout 적용
    bias="none"  # Bias 사용 여부 (none으로 설정)
)

# LoRA 적용
model = get_peft_model(model, lora_config)

# Trainable 파라미터 확인 (LoRA 적용된 파라미터만 학습 가능하도록 설정)
model.print_trainable_parameters()

# You can now proceed to add your adapter/LoRA layers on top of this model without changing its core architecture.


  _C._set_default_tensor_type(t)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [33]:
model

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [35]:
model.config

LlamaConfig {
  "_name_or_path": "daryl149/llama-2-7b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.43.4",
  "use_cache": true,
  "vocab_size": 32000
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Create DateLoader for train, validation, evaluation and test

In [36]:
from transformers import default_data_collator, AutoTokenizer, AutoModelForCausalLM
import torch
# 인수 파싱
args = parse_args_llama()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


generator = torch.Generator(device=device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, drop_last=True,
                                            shuffle=True, collate_fn=default_data_collator,
                                           generator=generator)

val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, drop_last=False,
                                          shuffle=False, collate_fn=default_data_collator,
                                         generator=generator)


test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, drop_last=False,
                                         shuffle=False, collate_fn=default_data_collator,
                                          generator=generator)



In [37]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import copy
from tqdm import tqdm
import gc

# Freeze all layers except the adapter and LoRA layers
for param in model.parameters():
    param.requires_grad = False

# Assuming adapter and LoRA layers are already added to the model, make those trainable
def set_trainable_params_new(model):
    param_adapter, param_lora = [], []
    for name, param in model.named_parameters():
        if 'adapter' in name:  # Custom logic for adapter layers
            param.requires_grad = True
            param_adapter.append(param)
        elif 'lora' in name:  # Custom logic for LoRA layers
            param.requires_grad = True
            param_lora.append(param)
    return param_adapter, param_lora

# Get the trainable parameters
param_adapter, param_lora = set_trainable_params_new(model)

# Step 4: Define optimizer for both adapter and LoRA layers
lr_group = {
    'adapter': 5e-5,  # Set your learning rate
    'lora': 5e-5,     # Set your learning rate for LoRA layers
}

wd_group = {
    'adapter': 0.01,  # Set weight decay
    'lora': 0.01,     # Set weight decay for LoRA layers
}

optimizer = torch.optim.AdamW(
    [
        {'params': param_adapter, 'lr': lr_group['adapter'], 'weight_decay': wd_group['adapter']},
        {'params': param_lora, 'lr': lr_group['lora'], 'weight_decay': wd_group['lora']},
    ],
    betas=(0.9, 0.95)
)

# Training setup
def adjust_learning_rate(param_group, LR, epoch, args):
    # Custom function for learning rate adjustment
    min_lr = 0.001 #5e-6
    if epoch < args['warmup_epochs']:
        lr = LR * epoch / args['warmup_epochs']
    else:
        lr = min_lr + (LR - min_lr) * 0.5 * (
                1.0 + torch.cos(torch.tensor(epoch * 3.1415 / args['num_epochs'])))
    param_group['lr'] = lr

In [38]:
for param in model.parameters():
    param.requires_grad = True

In [39]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")
model.to(device)
print(model.device)

Using device: cuda:0
cuda:0


# Model Training

In [40]:
from tqdm.notebook import tqdm


# Define arguments
args = {'num_epochs': 10, 'warmup_epochs': 1, 'grad_steps': 2}
lr_group = {'adapter': 1e-5, 'lora': 1e-5}  # Example learning rate settings

# Training loop with tqdm for epoch and batch progress
for epoch in tqdm(range(args['num_epochs']), desc="Epochs"):
    model.train()
    epoch_loss, accum_loss = 0.0, 0.0

    # Progress bar for each batch within an epoch
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{args['num_epochs']} Batches") as batch_progress:
        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()

            # Move batch to the device (ensure all tensors are on the same device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            if 'position_ids' in batch:
                position_ids = batch['position_ids'].to(device)

            # Forward pass and loss computation
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            # Adjust learning rates for adapter and LoRA layers
            adjust_learning_rate(optimizer.param_groups[0], lr_group['adapter'], step + epoch, args)
            adjust_learning_rate(optimizer.param_groups[1], lr_group['lora'], step + epoch, args)

            # Update model parameters
            optimizer.step()
            epoch_loss += loss.item()

            if (step + 1) % args['grad_steps'] == 0:
                accum_loss = 0.0  # Reset accumulated loss after each gradient step

            # Update the batch progress bar
            batch_progress.set_postfix(Loss=loss.item())
            batch_progress.update(1)
    
    print(f"Epoch {epoch + 1} finished with average loss: {epoch_loss / len(train_loader)}")

    # Validation loop
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
        
        print(f"Validation loss for epoch {epoch + 1}: {val_loss / len(val_loader)}")

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch 1 finished with average loss: 0.6278458298407107
Validation loss for epoch 1: 0.18639653827995062


Epoch 2/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 2 finished with average loss: 0.2301026154619952
Validation loss for epoch 2: 0.16073068361729384


Epoch 3/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 3 finished with average loss: 0.14857714967026064
Validation loss for epoch 3: 0.18001933582127094


Epoch 4/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 4 finished with average loss: 0.1400083626659883
Validation loss for epoch 4: 0.0936630752752535


Epoch 5/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 5 finished with average loss: 0.1168101852100032
Validation loss for epoch 5: 0.1415713634021813


Epoch 6/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 6 finished with average loss: 0.09536855762187316
Validation loss for epoch 6: 0.11126171628493467


Epoch 7/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 7 finished with average loss: 0.08546966973669139
Validation loss for epoch 7: 0.028947628637979506


Epoch 8/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 8 finished with average loss: 0.06439952158614991
Validation loss for epoch 8: 0.03667574356486512


Epoch 9/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 9 finished with average loss: 0.04170702987379779
Validation loss for epoch 9: 0.002915508874821171


Epoch 10/10 Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 10 finished with average loss: 0.029949112897619066
Validation loss for epoch 10: 0.0022432866297094735


In [41]:
import gc
gc.collect()
torch.cuda.empty_cache()

# Save the Model

In [55]:
# torch.save(model.state_dict(), 'new_trained_model/gene_trained_model_0.pth')

# Evaluate the Model

In [52]:
import torch
from tqdm.notebook import tqdm

# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

samples_seen = 0
eval_output = []
eval_loss = 0.0
correct_predictions = 0
total_predictions = 0
model.eval()

# Progress bar for test loader
progress_bar_test = tqdm(range(len(test_loader)), desc="Evaluating")

for step, batch in enumerate(test_loader):
    with torch.no_grad():
        # Move batch tensors to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Prepare model input
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # The loss is already computed in the model outputs when you pass labels
        loss = outputs.loss
        eval_loss += loss.item()

        # Get predicted class (assuming the model outputs logits)
        preds = torch.argmax(outputs.logits, dim=-1)

        # Update accuracy
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

        # Gather predictions for later analysis
        eval_output.extend(preds.cpu().numpy())

    # Update the progress bar for each batch
    progress_bar_test.update(1)

# Final evaluation metrics
accuracy = correct_predictions / total_predictions
average_loss = eval_loss / len(test_loader)

# Print results
print(f"Test Loss: {average_loss}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Finish progress bar
progress_bar_test.close()


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Test Loss: 0.001986566302366555
Test Accuracy: 85.00%
