In [1]:
# Install necessary libraries
!pip install torch transformers datasets evaluate accelerate memory-profiler peft

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install evaluate
!pip install memory_profiler
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
from memory_profiler import memory_usage
import evaluate


In [4]:
# Set device to CPU
device = torch.device("cpu")

In [5]:
# Load GPT model and tokenizer with 8-bit quantization
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
'''# Load model with quantization configuration
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)'''
# Load model with FP16 precision
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

In [7]:
# Load dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")
sample_text = dataset["test"][0]["article"]


In [8]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["attention.q_proj", "attention.v_proj"]  # Correct layers for GPT-Neo
)
model = get_peft_model(model, peft_config)

In [9]:
# Function to measure inference time
def measure_inference_time(text, model, tokenizer, max_new_tokens=50):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    end_time = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), end_time - start_time

In [10]:
def measure_memory_usage(text, model, tokenizer, max_new_tokens=50):
    def generate_text():
        inputs = tokenizer(text, return_tensors="pt").to(device)
        with torch.no_grad():
            model.generate(**inputs, max_new_tokens=max_new_tokens)
    return memory_usage(generate_text, max_usage=True)

In [11]:
summary, inference_time = measure_inference_time(sample_text, model, tokenizer)
memory_used = measure_memory_usage(sample_text, model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [12]:
!pip install rouge_score

Defaulting to user installation because normal site-packages is not writeable


In [13]:
rouge = evaluate.load("rouge")
reference_summary = dataset["test"][0]["highlights"]
results = rouge.compute(predictions=[summary], references=[reference_summary])

In [14]:
print("=== Benchmark Results (Optimized Model) ===")
print(f"Inference Time: {inference_time:.2f} seconds")
print(f"Memory Used: {memory_used:.2f} MB")
print(f"ROUGE Score: {results}")

=== Benchmark Results (Optimized Model) ===
Inference Time: 131.83 seconds
Memory Used: 3561.80 MB
ROUGE Score: {'rouge1': 0.09480122324159022, 'rouge2': 0.05521472392638037, 'rougeL': 0.07951070336391437, 'rougeLsum': 0.08256880733944955}
