# Notebook to Profile Fine-Tuned Models with the PyTorch Profiler

# Setup

In [None]:
!pip install -U peft

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.13.2
    Uninstalling peft-0.13.2:
      Successfully uninstalled peft-0.13.2
Successfully installed peft-0.14.0


In [None]:
!pip install torch transformers torch_pruning evaluate peft accelerate

Collecting torch_pruning
  Downloading torch_pruning-1.5.1-py3-none-any.whl.metadata (29 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading torch_pruning-1.5.1-py3-none-any.whl (63 kB)
[2K   [90m━━━━━━━━━━

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import evaluate
import accelerate

# For Pruning
import torch.nn.utils.prune as prune
import torch_pruning as tp

# For Quantization and LoRA
from transformers import BertForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoConfig, AutoModel
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

import pandas as pd
import numpy as np
import kagglehub

import matplotlib.pyplot as plt
import seaborn as sns
import os
import random

import argparse
import time

# Select the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftConfig, PeftModel

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

# Unzip Model Files

In [None]:
!cd /content/fine_tuned_optimized_bert
!unzip fine_tuned_optimized_bert.zip

/bin/bash: line 1: cd: /content/fine_tuned_optimized_bert: No such file or directory
Archive:  fine_tuned_optimized_bert.zip
Written using ZipTricks 5.6.0
  inflating: adapter_config.json     
  inflating: adapter_model.safetensors  
   creating: checkpoint-1000/
  inflating: checkpoint-1000/adapter_config.json  
  inflating: checkpoint-1000/adapter_model.safetensors  
  inflating: checkpoint-1000/optimizer.pt  
  inflating: checkpoint-1000/rng_state.pth  
  inflating: checkpoint-1000/scheduler.pt  
  inflating: checkpoint-1000/special_tokens_map.json  
  inflating: checkpoint-1000/tokenizer.json  
  inflating: checkpoint-1000/tokenizer_config.json  
  inflating: checkpoint-1000/trainer_state.json  
  inflating: checkpoint-1000/training_args.bin  
  inflating: checkpoint-1000/vocab.txt  
   creating: checkpoint-1500/
  inflating: checkpoint-1500/adapter_config.json  
  inflating: checkpoint-1500/adapter_model.safetensors  
  inflating: checkpoint-1500/optimizer.pt  
  inflating: checkp

In [None]:
!cd /content/fine_tuned_optimized_gpt2
!unzip fine_tuned_optimized_gpt2.zip

/bin/bash: line 1: cd: /content/fine_tuned_optimized_gpt2: No such file or directory
Archive:  fine_tuned_optimized_gpt2.zip
Written using ZipTricks 5.6.0
  inflating: adapter_config.json     
  inflating: adapter_model.safetensors  
   creating: checkpoint-1062/
  inflating: checkpoint-1062/adapter_config.json  
  inflating: checkpoint-1062/adapter_model.safetensors  
  inflating: checkpoint-1062/merges.txt  
  inflating: checkpoint-1062/optimizer.pt  
  inflating: checkpoint-1062/rng_state_0.pth  
  inflating: checkpoint-1062/rng_state_1.pth  
  inflating: checkpoint-1062/rng_state_2.pth  
  inflating: checkpoint-1062/rng_state_3.pth  
  inflating: checkpoint-1062/scheduler.pt  
  inflating: checkpoint-1062/special_tokens_map.json  
  inflating: checkpoint-1062/tokenizer.json  
  inflating: checkpoint-1062/tokenizer_config.json  
  inflating: checkpoint-1062/trainer_state.json  
  inflating: checkpoint-1062/training_args.bin  
  inflating: checkpoint-1062/vocab.json  
   creating: ch

In [None]:
!cd /content/fine_tuned_optimized_llama
!unzip fine_tuned_optimized_llama.zip

Archive:  fine_tuned_optimized_llama.zip
Written using ZipTricks 5.6.0
  inflating: README.md               
  inflating: adapter_config.json     
  inflating: adapter_model.safetensors  
   creating: checkpoint-1062/
  inflating: checkpoint-1062/README.md  
  inflating: checkpoint-1062/adapter_config.json  
  inflating: checkpoint-1062/adapter_model.safetensors  
  inflating: checkpoint-1062/optimizer.pt  
  inflating: checkpoint-1062/rng_state_0.pth  
  inflating: checkpoint-1062/rng_state_1.pth  
  inflating: checkpoint-1062/rng_state_2.pth  
  inflating: checkpoint-1062/rng_state_3.pth  
  inflating: checkpoint-1062/scheduler.pt  
  inflating: checkpoint-1062/special_tokens_map.json  
  inflating: checkpoint-1062/tokenizer.json  
  inflating: checkpoint-1062/tokenizer_config.json  
  inflating: checkpoint-1062/trainer_state.json  
  inflating: checkpoint-1062/training_args.bin  
   creating: checkpoint-1593/
  inflating: checkpoint-1593/README.md  
  inflating: checkpoint-1593/adap

In [None]:
# Keep a Hugging Face token that is already exported in the environment instead of
# overwriting it with an empty string; only fall back to the empty placeholder when
# nothing is set. Never paste a real token into the notebook itself.
os.environ.setdefault('HF_TOKEN', "")

# Review Text

Use a product review taken straight from Amazon as the inference input.

In [None]:
# Single product review fed to every model below. The text is kept verbatim
# (including the missing space in "instruction.Since") and contains no newlines.
amazon_review = (
    "Bought this watch to replace an older model with a cracked screen. "
    "The watch arrived on time, included the watch, strap, and charger. "
    "The watch looked brand new, no scratches, nicks, or any other obvious damage. "
    "So far it is functioning flawlessly. However, there is no documentation or set up instruction."
    "Since this is a newer series than my old one, "
    "I also bought a book about Apple Watches series 9 for Senior Citizens. "
    "Was a great help! I highly recommend purchasing a screen protector."
)

# Test Optimized BERT

Load Optimized BERT

In [None]:
# Short model identifier; it also names the profiler log directory (./logs_<name>).
_MODEL_NAME = "bert"
# Checkpoint directory, derived from the identifier so the two cannot drift apart.
model_path = f"/content/fine_tuned_optimized_{_MODEL_NAME}"

## Evaluate

Pass the review to the model and run the PyTorch profiler to save the results.

In [None]:
# Directory holding the fine-tuned adapter/checkpoint files for this model.
peft_model_id = model_path

# Configuration read from the checkpoint directory. Neither `config` nor
# `peft_model_id` is consumed by the cells visible in this notebook; both are kept
# in case a later cell relies on them.
config = AutoConfig.from_pretrained(model_path)

# `use_auth_token` is deprecated in transformers in favour of `token`. Forward the
# token from the environment when one is set; None lets the library fall back to
# any locally stored login.
hf_token = os.environ.get("HF_TOKEN") or None

# Load the sequence-classification model with a 3-class head
# (negative / neutral / positive).
# NOTE(review): the recorded output reports the classifier weights as newly
# initialized - confirm the fine-tuned head is actually restored from model_path.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
    token=hf_token,
    ignore_mismatched_sizes=True,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# BERT defines no EOS token, so copying `eos_token_id` would overwrite the valid
# padding id with None; take the padding id from the tokenizer instead.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run on the accelerator selected in `device`; on a CPU-only runtime the moves
# below leave everything on the CPU.
model.to(device)
# Evaluation mode turns off training-only behaviour such as dropout.
model.eval()

# Tokenize the review and place the tensors on the same device as the model.
inputs = tokenizer(
    amazon_review, return_tensors="pt", truncation=True, padding=True, max_length=512
).to(device)

# Only request CUDA tracing when a GPU exists; asking for it on a CPU-only
# runtime merely triggers the "CUDA is not available" warning.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# Profile a single forward pass; the trace is written for TensorBoard under
# ./logs_<model name>.
with profile(
    activities=profiler_activities,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./logs_{_MODEL_NAME}'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    with record_function("model_inference"), torch.no_grad():
        outputs = model(**inputs)

# The index of the largest logit is the predicted class.
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

# Map the predicted class to sentiment labels
sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Sentiment: {predicted_sentiment}")

  warn("CUDA is not available, disabling CUDA profiling")


Sentiment: positive


## Profiler Results

In [None]:
# Per-operator summary of the profiled run: top 10 rows ordered by total CPU time.
operator_stats = prof.key_averages()
print(operator_stats.table(sort_by="cpu_time_total", row_limit=10))

-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      model_inference         0.42%      53.200ms       100.00%       12.566s       12.566s           6 b     -51.15 Mb             1  
                                         aten::linear         0.05%       6.803ms        99.16%       12.461s      85.346ms      22.42 Mb           0 b           146  
                                          aten::addmm        98.37%       12.362s        98.41%       12.366s     167.113ms      16.77 Mb      16.77 Mb         

In [None]:
# Same profile, top 10 rows ordered by each operator's own CPU memory usage.
memory_stats = prof.key_averages()
print(memory_stats.table(sort_by="self_cpu_memory_usage", row_limit=10))

-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          aten::addmm        98.37%       12.362s        98.41%       12.366s     167.113ms      16.77 Mb      16.77 Mb            74  
                                            aten::add         0.04%       4.889ms         0.04%       4.889ms      80.152us       9.47 Mb       9.47 Mb            61  
                                           aten::gelu         0.09%      11.594ms         0.09%      11.594ms     966.192us       7.45 Mb       7.45 Mb         

# Test Optimized GPT2

Load Optimized GPT2

In [None]:
# Short model identifier; it also names the profiler log directory (./logs_<name>).
_MODEL_NAME = "gpt2"
# Checkpoint directory, derived from the identifier so the two cannot drift apart.
model_path = f"/content/fine_tuned_optimized_{_MODEL_NAME}"

In [None]:
# Directory holding the fine-tuned adapter/checkpoint files for this model.
peft_model_id = model_path

# Configuration read from the checkpoint directory. Neither `config` nor
# `peft_model_id` is consumed by the cells visible in this notebook; both are kept
# in case a later cell relies on them.
config = AutoConfig.from_pretrained(model_path)

# `use_auth_token` is deprecated in transformers in favour of `token`. Forward the
# token from the environment when one is set; None lets the library fall back to
# any locally stored login.
hf_token = os.environ.get("HF_TOKEN") or None

# Load the sequence-classification model with a 3-class head
# (negative / neutral / positive).
# NOTE(review): the recorded output reports `score.weight` as newly initialized -
# confirm the fine-tuned head is actually restored from model_path.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
    token=hf_token,
    ignore_mismatched_sizes=True,
)

# Reuse the end-of-sequence id as the padding id in the model config.
model.config.pad_token_id = model.config.eos_token_id
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model.safetensors:  23%|##2       | 126M/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Evaluate

Pass the review to the model and run the PyTorch profiler to save the results.

In [None]:
# Run on the accelerator selected in `device`; on a CPU-only runtime the moves
# below leave everything on the CPU.
model.to(device)
# Evaluation mode turns off training-only behaviour such as dropout.
model.eval()

# Use the end-of-sequence token for padding so `padding=True` has a token to pad with,
# then tokenize the review and place the tensors on the same device as the model.
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(
    amazon_review, return_tensors="pt", truncation=True, padding=True, max_length=512
).to(device)

# Only request CUDA tracing when a GPU exists; asking for it on a CPU-only
# runtime merely triggers a "CUDA is not available" warning.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# Profile a single forward pass; the trace is written for TensorBoard under
# ./logs_<model name>.
with profile(
    activities=profiler_activities,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./logs_{_MODEL_NAME}'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    with record_function("model_inference"), torch.no_grad():
        outputs = model(**inputs)

# The index of the largest logit is the predicted class.
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

# Map the predicted class to sentiment labels
sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Sentiment: {predicted_sentiment}")

## Profiler Results

In [None]:
# Per-operator summary of the profiled run: top 10 rows ordered by total CPU time.
operator_stats = prof.key_averages()
print(operator_stats.table(sort_by="cpu_time_total", row_limit=10))

-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      model_inference         0.13%      44.016ms       100.00%       33.128s       33.128s       5.54 Mb     -96.61 Mb             1  
                                          aten::addmm        99.28%       32.889s        99.29%       32.893s     685.262ms      16.61 Mb      16.61 Mb            48  
                                         aten::linear         0.00%     316.525us         0.24%      80.270ms       3.211ms       5.56 Mb           0 b         

In [None]:
# Same profile, top 10 rows ordered by each operator's own CPU memory usage.
memory_stats = prof.key_averages()
print(memory_stats.table(sort_by="self_cpu_memory_usage", row_limit=10))

-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::mul         0.04%      13.334ms         0.05%      15.151ms     252.523us      35.07 Mb      35.07 Mb            60  
                                            aten::add         0.03%       8.784ms         0.03%       9.116ms     149.443us      24.15 Mb      24.15 Mb            61  
                                          aten::addmm        99.28%       32.889s        99.29%       32.893s     685.262ms      16.61 Mb      16.61 Mb         

# Test Optimized LLAMA

Load Optimized Llama

In [None]:
# Short model identifier; it also names the profiler log directory (./logs_<name>).
_MODEL_NAME = "llama"
# Checkpoint directory, derived from the identifier so the two cannot drift apart.
model_path = f"/content/fine_tuned_optimized_{_MODEL_NAME}"

In [None]:
# Directory holding the fine-tuned adapter/checkpoint files for this model.
peft_model_id = model_path

# Configuration read from the checkpoint directory. Neither `config` nor
# `peft_model_id` is consumed by the cells visible in this notebook; both are kept
# in case a later cell relies on them.
config = AutoConfig.from_pretrained(model_path)

# `use_auth_token` is deprecated in transformers in favour of `token`. Forward the
# token from the environment when one is set; None lets the library fall back to
# any locally stored login.
hf_token = os.environ.get("HF_TOKEN") or None

# Load the sequence-classification model with a 3-class head
# (negative / neutral / positive).
# NOTE(review): the recorded output reports `score.weight` as newly initialized -
# confirm the fine-tuned head is actually restored from model_path.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
    token=hf_token,
    ignore_mismatched_sizes=True,
)

# Reuse the end-of-sequence token for padding, both in the model config and in
# the tokenizer, so the two agree on the padding id.
model.config.pad_token_id = model.config.eos_token_id
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

## Evaluate

Pass the review to the model and run the PyTorch profiler to save the results.

In [None]:
# Run on the accelerator selected in `device`; on a CPU-only runtime the moves
# below leave everything on the CPU.
model.to(device)
# Evaluation mode turns off training-only behaviour such as dropout.
model.eval()

# Tokenize the review and place the tensors on the same device as the model.
inputs = tokenizer(
    amazon_review, return_tensors="pt", truncation=True, padding=True, max_length=512
).to(device)

# Only request CUDA tracing when a GPU exists; asking for it on a CPU-only
# runtime merely triggers the "CUDA is not available" warning.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# Profile a single forward pass; the trace is written for TensorBoard under
# ./logs_<model name>.
with profile(
    activities=profiler_activities,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./logs_{_MODEL_NAME}'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    with record_function("model_inference"), torch.no_grad():
        outputs = model(**inputs)

# The index of the largest logit is the predicted class.
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

# Map the predicted class to sentiment labels
sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Sentiment: {predicted_sentiment}")

  warn("CUDA is not available, disabling CUDA profiling")


Sentiment: positive


## Profiler Results

In [None]:
# Per-operator summary of the profiled run: top 10 rows ordered by total CPU time.
operator_stats = prof.key_averages()
print(operator_stats.table(sort_by="cpu_time_total", row_limit=10))

-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      model_inference         1.25%      70.021ms       100.00%        5.618s        5.618s       6.56 Mb    -520.36 Mb             1  
                                         aten::matmul         0.15%       8.649ms        91.20%        5.124s      28.784ms     167.46 Mb           0 b           178  
                                         aten::linear         0.03%       1.879ms        90.71%        5.096s      28.790ms     167.45 Mb           0 b         

In [None]:
# Same profile, top 10 rows ordered by each operator's own CPU memory usage.
memory_stats = prof.key_averages()
print(memory_stats.table(sort_by="self_cpu_memory_usage", row_limit=10))

-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             aten::mm        90.38%        5.077s        90.38%        5.077s      28.686ms     167.45 Mb     167.45 Mb           177  
                                            aten::mul         1.41%      79.457ms         1.44%      80.871ms     449.283us     155.91 Mb     155.91 Mb           180  
                                            aten::add         0.29%      16.167ms         0.30%      16.667ms     129.199us      59.08 Mb      59.08 Mb         