In [1]:
import os

# Hugging Face cache (G drive)
os.environ["HF_HOME"] = "G:/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "G:/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "G:/hf_cache/datasets"

# Reduce memory pressure
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"


In [2]:
model_name = "EleutherAI/gpt-neo-1.3B"


In [3]:
from transformers import BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [5]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)


In [6]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # GPT-Neo attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)


In [7]:
model.print_trainable_parameters()


trainable params: 1,572,864 || all params: 1,317,148,672 || trainable%: 0.1194


In [8]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="data/general_pathology.jsonl"
)


In [9]:
def format_data(example):
    prompt = (
        f"### Instruction:\n{example['instruction']}\n\n"
        f"### Question:\n{example['input']}\n\n"
        f"### Answer:\n{example['output']}"
    )

    tokenized = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # ðŸ”‘ THIS LINE FIXES THE ERROR
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized


In [10]:
tokenized_data = dataset.map(
    format_data,
    remove_columns=dataset["train"].column_names
)


In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none"
)


In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
)

trainer.train()


Step,Training Loss
10,4.4626


TrainOutput(global_step=12, training_loss=4.443501710891724, metrics={'train_runtime': 40.9622, 'train_samples_per_second': 1.831, 'train_steps_per_second': 0.293, 'total_flos': 278790458572800.0, 'train_loss': 4.443501710891724, 'epoch': 3.0})

In [14]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

prompt = """You are a medical pathology assistant.
Answer ONLY pathology-related questions.
If the question is not related to pathology, say:
"I can only answer pathology-related questions."

Question:
What is crypto?

Answer:
"""



output = pipe(
    prompt,
    max_new_tokens=80,
    do_sample=False,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    eos_token_id=tokenizer.eos_token_id,
    return_full_text=False
)

print(output[0]["generated_text"])




Device set to use cuda:0


Crypto is a type of encryption that uses a key to encrypt data.

Cryptography is the art of encoding information in such a way that it can be decoded by a third party.
Cryptology is the study of cryptography.
The term "cryptology" is used to describe the study and use of cryptography in the field of computer security.
A cryptographic system is a system


In [15]:
save_dir = "./pathology_lora_adapters"


In [16]:
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./pathology_lora_adapters\\tokenizer_config.json',
 './pathology_lora_adapters\\special_tokens_map.json',
 './pathology_lora_adapters\\vocab.json',
 './pathology_lora_adapters\\merges.txt',
 './pathology_lora_adapters\\added_tokens.json',
 './pathology_lora_adapters\\tokenizer.json')