In [1]:
!pip install torch transformers datasets evaluate accelerate memory-profiler peft

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
from memory_profiler import memory_usage
import evaluate


In [3]:
device = torch.device("cpu")

In [4]:
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)

In [6]:
if hasattr(torch, 'compile'):
    model = torch.compile(model)

In [7]:
dataset = load_dataset("cnn_dailymail", "3.0.0")
sample_text = dataset["test"][0]["article"]

In [8]:
# Fine-tuning using LoRA with optimized parameters
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,  # Reduce rank for faster inference
    lora_alpha=16,  # Adjust scaling
    lora_dropout=0.05,  # Lower dropout for efficiency
    target_modules=["attention.q_proj", "attention.v_proj"]  # Correct layers for GPT-Neo
)
model = get_peft_model(model, peft_config)

In [9]:
# Function to measure inference time
def measure_inference_time(text, model, tokenizer, max_new_tokens=30):  # Reduce token limit
    inputs = tokenizer(text, return_tensors="pt").to(device)
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    end_time = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), end_time - start_time

In [10]:
# Function to measure memory usage
def measure_memory_usage(text, model, tokenizer, max_new_tokens=30):
    def generate_text():
        inputs = tokenizer(text, return_tensors="pt").to(device)
        with torch.no_grad():
            model.generate(**inputs, max_new_tokens=max_new_tokens)
    return memory_usage(generate_text, max_usage=True)


In [11]:
# Benchmark optimized model
summary, inference_time = measure_inference_time(sample_text, model, tokenizer)
memory_used = measure_memory_usage(sample_text, model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [12]:
!pip install rouge_score
# Evaluate performance
rouge = evaluate.load("rouge")
reference_summary = dataset["test"][0]["highlights"]
results = rouge.compute(predictions=[summary], references=[reference_summary])

Defaulting to user installation because normal site-packages is not writeable


In [13]:
print("=== Benchmark Results (Optimized Model) ===")
print(f"Inference Time: {inference_time:.2f} seconds")
print(f"Memory Used: {memory_used:.2f} MB")
print(f"ROUGE Score: {results}")

=== Benchmark Results (Optimized Model) ===
Inference Time: 112.24 seconds
Memory Used: 3586.12 MB
ROUGE Score: {'rouge1': 0.09733124018838304, 'rouge2': 0.05669291338582677, 'rougeL': 0.08163265306122448, 'rougeLsum': 0.0847723704866562}


In [14]:
# updated

In [15]:
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

In [16]:
if hasattr(torch, 'compile'):
    model = torch.compile(model, mode="reduce-overhead")

In [17]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=2,  # Further reduced rank for speed
    lora_alpha=8,  # Lower scaling
    lora_dropout=0.03,  # Lower dropout for efficiency
    target_modules=["attention.q_proj", "attention.v_proj"]  # Correct layers for GPT-Neo
)
model = get_peft_model(model, peft_config)

In [18]:
# Function to measure inference time
def measure_inference_time(text, model, tokenizer, max_new_tokens=20):  # Further reduced token limit
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1, min_length=10)
    end_time = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), end_time - start_time

In [19]:

# Function to measure memory usage
def measure_memory_usage(text, model, tokenizer, max_new_tokens=20):
    def generate_text():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
        with torch.no_grad():
            model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1, min_length=10)
    return memory_usage(generate_text, max_usage=True)

In [20]:
# Benchmark optimized model
summary, inference_time = measure_inference_time(sample_text, model, tokenizer)
memory_used = measure_memory_usage(sample_text, model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [21]:
# Evaluate performance
rouge = evaluate.load("rouge")
reference_summary = dataset["test"][0]["highlights"]
results = rouge.compute(predictions=[summary], references=[reference_summary])

In [22]:
# Print benchmark results
print("=== Benchmark Results (Optimized Model) ===")
print(f"Inference Time: {inference_time:.2f} seconds")
print(f"Memory Used: {memory_used:.2f} MB")
print(f"ROUGE Score: {results}")

=== Benchmark Results (Optimized Model) ===
Inference Time: 73.33 seconds
Memory Used: 3612.75 MB
ROUGE Score: {'rouge1': 0.1257861635220126, 'rouge2': 0.07578947368421053, 'rougeL': 0.09643605870020963, 'rougeLsum': 0.09643605870020963}


In [23]:
#update 2

In [24]:
# Load model with optimized precision
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)


In [25]:
# Enable gradient checkpointing for reduced memory usage
model.gradient_checkpointing_enable()

In [26]:
# Compile model for performance (if using PyTorch 2.0+)
if hasattr(torch, 'compile'):
    model = torch.compile(model, mode="reduce-overhead")

In [27]:
# Fine-tuning using LoRA with optimized parameters
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=2,  # Further reduced rank for speed
    lora_alpha=8,  # Lower scaling
    lora_dropout=0.03,  # Lower dropout for efficiency
    target_modules=["q_proj", "v_proj"]  # Corrected layer targeting for GPT-Neo
)
model = get_peft_model(model, peft_config)


In [28]:
def measure_inference_time(text, model, tokenizer, max_new_tokens=10):  # Further reduced token limit
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)  # Reduce input length
    start_time = time.time()
    with torch.inference_mode():  # More efficient than torch.no_grad()
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1)
    end_time = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), end_time - start_time

# Function to measure memory usage
def measure_memory_usage(text, model, tokenizer, max_new_tokens=10):
    def generate_text():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
        with torch.inference_mode():
            model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1)
    return memory_usage(generate_text, max_usage=True)

In [29]:
summary, inference_time = measure_inference_time(sample_text, model, tokenizer)
memory_used = measure_memory_usage(sample_text, model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [30]:
# Evaluate performance
rouge = evaluate.load("rouge")
reference_summary = dataset["test"][0]["highlights"]
results = rouge.compute(predictions=[summary], references=[reference_summary])

In [31]:
# Print benchmark results
print("=== Benchmark Results (Optimized Model) ===")
print(f"Inference Time: {inference_time:.2f} seconds")
print(f"Memory Used: {memory_used:.2f} MB")
print(f"ROUGE Score: {results}")

=== Benchmark Results (Optimized Model) ===
Inference Time: 12.41 seconds
Memory Used: 3420.41 MB
ROUGE Score: {'rouge1': 0.2894736842105263, 'rouge2': 0.17333333333333334, 'rougeL': 0.2631578947368421, 'rougeLsum': 0.2631578947368421}
