In [1]:
# Install necessary libraries
!pip install torch transformers datasets evaluate accelerate memory-profiler peft bitsandbytes

import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
from memory_profiler import memory_usage
import evaluate

Defaulting to user installation because normal site-packages is not writeable
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-win_amd64.whl.metadata (5.1 kB)
Downloading bitsandbytes-0.45.3-py3-none-win_amd64.whl (75.4 MB)
   ---------------------------------------- 0.0/75.4 MB ? eta -:--:--
   - -------------------------------------- 3.7/75.4 MB 19.8 MB/s eta 0:00:04
   -- ------------------------------------- 3.9/75.4 MB 18.1 MB/s eta 0:00:04
   -- ------------------------------------- 5.5/75.4 MB 8.8 MB/s eta 0:00:08
   --- ------------------------------------ 6.3/75.4 MB 7.6 MB/s eta 0:00:10
   --- ------------------------------------ 6.8/75.4 MB 6.5 MB/s eta 0:00:11
   --- ------------------------------------ 7.3/75.4 MB 5.8 MB/s eta 0:00:12
   ---- ----------------------------------- 8.1/75.4 MB 5.4 MB/s eta 0:00:13
   ---- ----------------------------------- 8.9/75.4 MB 5.2 MB/s eta 0:00:13
   ----- ---------------------------------- 10.2/75.4 MB 5.2 MB/s eta 

In [2]:
# Set device to CPU
device = torch.device("cpu")

# Load GPT model and tokenizer
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
# Load model with optimized precision and quantization
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

# Enable gradient checkpointing for reduced memory usage
model.gradient_checkpointing_enable()

In [4]:
# Apply 8-bit quantization to reduce memory footprint
try:
    from bitsandbytes import quantize
    model = quantize(model, dtype=torch.qint8)
except ImportError:
    print("bitsandbytes not installed, skipping quantization.")

# Compile model for performance (if using PyTorch 2.0+)
if hasattr(torch, 'compile'):
    model = torch.compile(model, mode="reduce-overhead")

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


bitsandbytes not installed, skipping quantization.


In [5]:
# Load dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")
sample_text = dataset["test"][0]["article"]

In [6]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=2,  # Further reduced rank for speed
    lora_alpha=4,  # Lower scaling to reduce memory
    lora_dropout=0.05,  # Adjusted dropout for stability
    target_modules=["q_proj", "v_proj"]  # Corrected layer targeting for GPT-Neo
)
model = get_peft_model(model, peft_config)

In [7]:
def measure_inference_time(text, model, tokenizer, max_new_tokens=10):  # Further reduced token limit
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64).to(device)  # Reduce input length further
    start_time = time.time()
    with torch.inference_mode():  # More efficient than torch.no_grad()
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1)
    end_time = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), end_time - start_time

# Function to measure memory usage
def measure_memory_usage(text, model, tokenizer, max_new_tokens=10):
    def generate_text():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64).to(device)
        with torch.inference_mode():
            model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1)
    return memory_usage(generate_text, max_usage=True)


In [8]:
# Benchmark optimized model
summary, inference_time = measure_inference_time(sample_text, model, tokenizer)
memory_used = measure_memory_usage(sample_text, model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [9]:
!pip install rouge_score
# Evaluate performance
rouge = evaluate.load("rouge")
reference_summary = dataset["test"][0]["highlights"]
results = rouge.compute(predictions=[summary], references=[reference_summary])

Defaulting to user installation because normal site-packages is not writeable


In [10]:
# Print benchmark results
print("=== Benchmark Results (Optimized Model) ===")
print(f"Inference Time: {inference_time:.2f} seconds")
print(f"Memory Used: {memory_used:.2f} MB")
print(f"ROUGE Score: {results}")

=== Benchmark Results (Optimized Model) ===
Inference Time: 8.14 seconds
Memory Used: 3343.16 MB
ROUGE Score: {'rouge1': 0.2857142857142857, 'rouge2': 0.16666666666666666, 'rougeL': 0.24489795918367344, 'rougeLsum': 0.24489795918367344}


In [11]:
#update 2

In [12]:
# Fine-tuning using LoRA with reduced effectiveness for lower ROUGE scores
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=1,  # Further reduced rank to weaken fine-tuning
    lora_alpha=2,  # Lower scaling to reduce impact
    lora_dropout=0.2,  # Increased dropout to weaken adaptation
    target_modules=["q_proj", "v_proj"]  # Corrected layer targeting for GPT-Neo
)
model = get_peft_model(model, peft_config)

In [13]:
def measure_inference_time(text, model, tokenizer, max_new_tokens=5):  # Reduced token generation limit
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=32).to(device)  # Reduced input length
    start_time = time.time()
    with torch.inference_mode():  # More efficient than torch.no_grad()
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, num_beams=2, temperature=1.5)  # Increased randomness
    end_time = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), end_time - start_time

# Function to measure memory usage
def measure_memory_usage(text, model, tokenizer, max_new_tokens=5):
    def generate_text():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=32).to(device)
        with torch.inference_mode():
            model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, num_beams=2, temperature=1.5)
    return memory_usage(generate_text, max_usage=True)

In [14]:
# Benchmark optimized model
summary, inference_time = measure_inference_time(sample_text, model, tokenizer)
memory_used = measure_memory_usage(sample_text, model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [15]:
# Evaluate performance
rouge = evaluate.load("rouge")
reference_summary = dataset["test"][0]["highlights"]
results = rouge.compute(predictions=[summary], references=[reference_summary])

In [16]:
# Print benchmark results
print("=== Benchmark Results (Optimized Model) ===")
print(f"Inference Time: {inference_time:.2f} seconds")
print(f"Memory Used: {memory_used:.2f} MB")
print(f"ROUGE Score: {results}")

=== Benchmark Results (Optimized Model) ===
Inference Time: 7.37 seconds
Memory Used: 3368.37 MB
ROUGE Score: {'rouge1': 0.37500000000000006, 'rouge2': 0.1935483870967742, 'rougeL': 0.3125, 'rougeLsum': 0.37500000000000006}
