In [1]:
import torch

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [2]:
!pip install opencv-python



In [3]:
! pip install transformers peft datasets accelerate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>

### Fine-Tune the Model with LoRA

In [4]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `Itiswhatitis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Itisw

In [5]:
# Step 1: Install Required Libraries
# Run this in your terminal or notebook
# !pip install transformers peft datasets accelerate ftplib gzip shutil

# Step 2: Download 10% of PubMed Files
import ftplib
import os
import random

# --- Configuration ---
SUBSET_RATIO = 0.01  # 2% of total files (adjust between 0.01-1.0)
RANDOM_SEED = 42    # For reproducibility
# ---------------------

# FTP server details
ftp_server = "ftp.ncbi.nlm.nih.gov"
ftp_directory = "/pubmed/baseline/"
local_directory = "./pubmed_subset"

# Create local directory if it doesn't exist
os.makedirs(local_directory, exist_ok=True)

# Connect to the FTP server
ftp = ftplib.FTP(ftp_server)
ftp.login()  # Anonymous login

# Change to the PubMed directory
ftp.cwd(ftp_directory)

# Get list of XML files only
all_files = [f for f in ftp.nlst() if f.endswith(".xml.gz")]
print(f"Total files available: {len(all_files)}")

# Select random subset
random.seed(RANDOM_SEED)
subset_size = int(len(all_files) * SUBSET_RATIO)
selected_files = random.sample(all_files, subset_size)
print(f"Downloading {len(selected_files)} files ({SUBSET_RATIO*100}% subset)")

# Download only selected files
for file in selected_files:
    local_path = os.path.join(local_directory, file)
    with open(local_path, "wb") as f:
        ftp.retrbinary(f"RETR {file}", f.write)
    print(f"Downloaded {file}")

# Close the FTP connection
ftp.quit()

# Step 3: Extract Downloaded Files
import gzip
import shutil

print("Extracting subset files...")
for file in os.listdir(local_directory):
    if file.endswith(".gz"):
        gz_path = os.path.join(local_directory, file)
        xml_path = os.path.join(local_directory, file[:-3])  # Remove .gz extension

        with gzip.open(gz_path, "rb") as f_in:
            with open(xml_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(gz_path)  # Delete .gz after extraction

print(f"Extracted {len(os.listdir(local_directory))} XML files")

Total files available: 1274
Downloading 12 files (1.0% subset)
Downloaded pubmed25n0228.xml.gz
Downloaded pubmed25n0051.xml.gz
Downloaded pubmed25n0564.xml.gz
Downloaded pubmed25n0503.xml.gz
Downloaded pubmed25n0458.xml.gz
Downloaded pubmed25n0286.xml.gz
Downloaded pubmed25n0209.xml.gz
Downloaded pubmed25n1116.xml.gz
Downloaded pubmed25n0175.xml.gz
Downloaded pubmed25n1209.xml.gz
Downloaded pubmed25n0864.xml.gz
Downloaded pubmed25n0064.xml.gz
Extracting subset files...
Extracted 12 XML files


In [26]:
# Step 4: Load and Process Data
from datasets import Dataset
import xml.etree.ElementTree as ET

# Parse XML files and extract abstracts
def safe_parse(file_path):
    try:
        tree = ET.parse(file_path)
        return [
            {"article": article.findtext(".//AbstractText") or ""}
            for article in tree.findall(".//PubmedArticle")
        ]
    except ET.ParseError:
        return []

# Process only the subset files
articles = []
for file in os.listdir(local_directory):
    if file.endswith(".xml"):
        articles.extend(safe_parse(os.path.join(local_directory, file)))

# Create dataset from subset
dataset = Dataset.from_dict({"article": [a["article"] for a in articles if a["article"]]})

# Split into train/validation sets
dataset = dataset.train_test_split(test_size=0.1, seed=RANDOM_SEED)  # 90% train, 10% validation
print(f"Train size: {len(dataset['train'])}, Validation size: {len(dataset['test'])}")

Train size: 184963, Validation size: 20552


In [31]:
# Step 5: Tokenization (FIXED VERSION)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Configure padding explicitly
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

def tokenize_function(examples):
    # Tokenize inputs and create labels
    tokenized = tokenizer(
        examples["article"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    # Add labels for causal language modeling
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["article"])

# If tokenized_dataset is a DatasetDict, extract the 'train' split
if isinstance(tokenized_dataset, dict):
    tokenized_dataset = tokenized_dataset["train"]  # Extract the 'train' split

# Split into train/validation sets
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)  # 90% train, 10% validation
tokenized_train = split_dataset["train"]
tokenized_val = split_dataset["test"]

# Step 6: Fine-Tune the Model with LoRA (FIXED VERSION)
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Load the pre-trained model and move to GPU
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
model.to(device)  # Move model to GPU

# Add padding token to model config
model.config.pad_token_id = tokenizer.pad_token_id

# Configure LoRA with fewer parameters
lora_config = LoraConfig(
    r=2,  # Reduced rank (default: 8)
    lora_alpha=4,  # Reduced scaling factor (default: 32)
    target_modules=["q_proj"],  # Target layers to apply LoRA
    lora_dropout=0.05,  # Reduced dropout (default: 0.1)
    bias="none",  # Whether to add bias
    task_type="CAUSAL_LM"  # Task type (causal language modeling)
)

model = get_peft_model(model, lora_config)

# Print the number of trainable parameters
model.print_trainable_parameters()

# Define training arguments for faster training
training_args = TrainingArguments(
    output_dir="./fast-llama",
    per_device_train_batch_size=16,  # Max batch for T4
    num_train_epochs=1,              # Single epoch
    learning_rate=2e-5,
    fp16=True,
    save_strategy="steps",
    save_steps=200,
    logging_steps=10,
    report_to="none",
    max_steps=1000
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,  # Training data
    eval_dataset=tokenized_val,    # Validation data
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./llama-lora-pubmed-subset")
tokenizer.save_pretrained("./llama-lora-pubmed-subset")

Map:   0%|          | 0/184963 [00:00<?, ? examples/s]

Map:   0%|          | 0/20552 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 131,072 || all params: 1,235,945,472 || trainable%: 0.0106


Step,Training Loss
10,8.5319
20,8.4925
30,8.1618
40,8.0592
50,7.3036
60,6.9394
70,6.3548
80,5.5281
90,4.9515
100,4.119


('./llama-lora-pubmed-subset/tokenizer_config.json',
 './llama-lora-pubmed-subset/special_tokens_map.json',
 './llama-lora-pubmed-subset/tokenizer.json')

In [39]:
import textwrap

model = AutoModelForCausalLM.from_pretrained("./llama-lora-pubmed-subset")
tokenizer = AutoTokenizer.from_pretrained("./llama-lora-pubmed-subset")
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Generation function with formatting
def generate_formatted_text(prompt, max_length=400):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Format with wrapping and paragraph breaks
    wrapped_text = textwrap.fill(
        full_text,
        width=80,               # Characters per line
        subsequent_indent='    ', # Indent for wrapped lines
        replace_whitespace=False # Preserve existing newlines
    )
    return wrapped_text

# Generate and print formatted response
prompt = "Recent advances in cancer immunotherapy suggest"
result = generate_formatted_text(prompt)

print("\nGenerated Response:\n" + "="*40)
print(result)
print("="*40)


Generated Response:
Recent advances in cancer immunotherapy suggest that the most promising
    immunotherapy is based on the use of cancer vaccines. The success of cancer
    vaccines is related to the ability of the vaccine to induce specific CD8+ T
    cells that can eliminate the tumor cells. In addition, the efficacy of
    cancer vaccines is highly dependent on the ability of the vaccine to induce
    CD4+ T cells that can provide help to the CD8+ T cells to achieve tumor
    elimination. However, the lack of CD4+ T cells has been associated with a
    reduced efficacy of the vaccine. This is a major challenge because CD4+ T
    cells are important for the induction of protective immunity and their
    induction is often associated with severe side effects. Therefore, there is
    a great need for the development of safe and effective vaccines that can
    induce CD4+ T cells. Here, we describe the development of a novel vaccine
    for cancer immunotherapy based on the administ