In [None]:
# Step 1: Environment Setup

# In this example, we use Conda for managing the environment and dependencies.
# Based on a few experiments, Conda seems to be the optimal choice for this project due to its comprehensive 
# package management and environment isolation capabilities. It allows us to manage dependencies and avoid 
# conflicts efficiently. However, you are not restricted to using Conda. Feel free to use any environment 
# management tool that you are comfortable with, such as virtualenv, pipenv, or others.

# Note that dependencies and library versions may change after this notebook was written.
# By the time you run it, some packages may have newer versions or different compatibility
# requirements. Always check for the latest versions and known compatibility issues, either by
# reviewing the package documentation or by pinning version constraints in your package
# installation commands.

# Performance may vary depending on internet speed, system specifications, and other factors.
# Cloud provisioning and model loading times can differ across systems. High-speed internet can significantly
# reduce the time for downloading and installing packages, while a system with higher specifications (such as
# more CPU cores, RAM, and faster storage) can speed up the environment setup and model training processes.
# Ensure your system meets the recommended requirements for optimal performance. Additionally, cloud 
# provisioning can introduce variability in performance due to differences in network latency, server load, 
# and resource availability at different times.

# These shell commands will download and install Miniconda, and set up the environment.

# Create a directory for Miniconda
!mkdir -p ~/miniconda3

# Download the Intel-compatible Miniconda installer
!curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ~/miniconda3/miniconda.sh

# Install Miniconda quietly
!bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3

# Clean up the installer
!rm -rf ~/miniconda3/miniconda.sh

# Initialize Miniconda for bash and zsh shells
!~/miniconda3/bin/conda init bash
!~/miniconda3/bin/conda init zsh

# After running this cell, restart the kernel to apply changes.

In [None]:
# Step 2: Set Up Conda Environment

# These commands set up a Conda environment specifically for this project.
# Using a separate environment helps manage dependencies and avoid conflicts with other projects.

# Create a new Conda environment named 'qai_hub' with Python 3.8
!~/miniconda3/bin/conda create -n qai_hub python=3.8 -y

# Activate the new environment
# Note: The following command won't actually change the environment in the notebook.
# You'll need to manually activate the environment in your terminal or JupyterLab.
!source ~/miniconda3/bin/activate qai_hub

# Install the Qualcomm AI Hub Python client and other necessary libraries
# This step might take a few minutes depending on your internet speed and system performance
!~/miniconda3/bin/conda run -n qai_hub pip install qai-hub onnx onnxruntime transformers

# Install the development version of transformers
# We use the development version to ensure compatibility with the Phi-3 model.
!~/miniconda3/bin/conda run -n qai_hub pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers

In [None]:
# Step 3: Configure Qualcomm AI Hub

import os
from getpass import getpass

import qai_hub as hub

# Configure the Qualcomm AI Hub with your API token.
# This allows us to interact with Qualcomm's cloud services for model deployment and profiling.
#
# The token is a credential, so it is never written into the notebook itself:
#   1. if the QAI_HUB_API_TOKEN environment variable is set, its value is used;
#   2. otherwise you are prompted for the token (the input is not echoed or saved).
api_token = os.environ.get("QAI_HUB_API_TOKEN") or getpass("Qualcomm AI Hub API token: ")

# NOTE(review): confirm that the installed qai_hub client exposes `configure`; the
# `qai-hub configure --api_token ...` command-line tool is the other way to store the token.
hub.configure(api_token=api_token)

In [None]:
# Step 4: Load the Model

# We use the Microsoft Phi-3-mini-128k-instruct-onnx model for this experiment.
# This model is chosen for its balance between performance and resource requirements.
# Ensure compatibility with the development version of transformers.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Single source of truth for the checkpoint, so the tokenizer, model and pipeline cannot drift.
# NOTE(review): this repository hosts ONNX exports — confirm that AutoModelForCausalLM can
# load it; if not, point PHI3_MODEL_ID at the corresponding PyTorch checkpoint.
PHI3_MODEL_ID = "microsoft/Phi-3-mini-128k-instruct-onnx"

# Load model and tokenizer with trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(PHI3_MODEL_ID, trust_remote_code=True)

# Example usage: Generate text based on a user prompt
messages = [{"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

# Generate text
outputs = model.generate(inputs, max_new_tokens=32)
text = tokenizer.batch_decode(outputs)[0]
print(text)

# Use a pipeline as a high-level helper for text generation tasks.
# It is built from the model and tokenizer loaded above; passing the checkpoint name instead
# would load a second full copy of the weights into memory.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Step 5 (Optional): Convert Model to ONNX (If using a different model)

# Only needed when your model is not already distributed in ONNX format.
# The cell below is a template showing how a PyTorch model is exported to ONNX.

import torch
import torch.onnx
from transformers import AutoModel

# Replace 'your_model_here' with the model you are using
model_name = 'your_model_here'

# Load the model and switch it to evaluation mode before exporting
model = AutoModel.from_pretrained(model_name)
model.eval()

# Dummy input used to trace the model during export.
# Adjust the shape to whatever your model actually expects.
example_input = torch.rand(1, 3, 224, 224)

# Write the traced model, including its parameters, to "your_model.onnx"
torch.onnx.export(
    model,
    example_input,
    "your_model.onnx",
    export_params=True,
    opset_version=11,
)

print("Model converted to ONNX format successfully.")

In [None]:
# Step 6: Initial Quantization

# Quantization lowers the numeric precision used to store a model's parameters.
# Here we apply Dynamic Post-Training Quantization (PTQ): no retraining is required, the
# model gets smaller, and inference typically gets faster. That matters on edge devices
# such as smartphones and IoT hardware, where compute and power budgets are limited.

# With dynamic PTQ the weights are quantized ahead of time, while activations are quantized
# on the fly during inference — a trade-off between accuracy and performance that suits
# resource-constrained, on-device deployment.

from onnxruntime.quantization import quantize_dynamic, QuantType

# Input (original) and output (quantized) ONNX model locations
model_path = "path_to_your_model/phi-3-mini-128k-instruct-onnx"
quantized_model_path = "path_to_your_model/phi-3-mini-128k-instruct-onnx-quantized.onnx"

# Quantize the model.
# QuantType.QInt8 stores weights as 8-bit integers instead of 32-bit floats, which shrinks
# the model by approximately 75% (e.g. a 100MB model becomes roughly 25MB).
# NOTE(review): very large models may additionally need `use_external_data_format=True`
# — confirm against the ONNX Runtime quantization docs.
quantize_dynamic(
    model_input=model_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QInt8,
)

print("Model quantized successfully.")

In [None]:
# Step 7: Neural Network Graph Capture

# Looking at the network graph shows how the model is structured and how data flows
# through it. That view is useful for debugging, for spotting optimization opportunities,
# and for checking that the quantized model still looks the way you expect.

# All imports for this cell, grouped up front.
# Netron is a separate package and must be installed before running this cell.
import netron
import onnx
import torch
from torch.onnx import export

# Load the quantized ONNX model
onnx_model = onnx.load(quantized_model_path)

# Open the quantized model file in the Netron viewer
netron.start(quantized_model_path)

print("Neural network graph captured and visualized successfully.")

In [None]:
# Step 8: On-Device Compilation

# Compiling for a specific target device lets the model be optimized for that device's
# architecture and use its hardware acceleration features, such as the Neural Processing
# Unit (NPU).

# Device the model is compiled for
target_device = hub.Device("Samsung Galaxy S24 Ultra")

# Submit the quantized model for compilation to a QNN context binary.
# The input spec declares a single input named "image" with shape (1, 3, 224, 224).
compile_job = hub.submit_compile_job(
    model=quantized_model_path,
    device=target_device,
    input_specs={"image": (1, 3, 224, 224)},
    options="--target_runtime qnn_context_binary",
)

# Report the job status as seen right after submission
compile_status = compile_job.get_status()
print(f"Compile job status: {compile_status}")

# Retrieve the compiled model produced by the job
compiled_model = compile_job.get_target_model()

In [None]:
# Step 9: Hardware Acceleration

# Hardware acceleration means running the model on the target device's specialized
# hardware — the NPU, the GPU, or other accelerators — so that it executes efficiently and
# uses the device's full potential.

# For instance, NPUs can execute certain operations 10-100 times faster than CPUs due to
# their parallel processing capabilities, which can significantly speed up inference times.

# Device the compiled model is profiled on
target_device = hub.Device("Samsung Galaxy S24 Ultra")

# Submit a profile job that runs the compiled model on that device
profile_job = hub.submit_profile_job(model=compiled_model, device=target_device)

# Report the job status as seen right after submission
profile_status = profile_job.get_status()
print(f"Profile job status: {profile_status}")

# Fetch and display the profiling results
profile_results = profile_job.download_profile()
print(f"Profile results: {profile_results}")

In [None]:
# Step 10: Second Round of Quantization

# Run a second quantization pass through Qualcomm AI Hub, aimed at the target device.
# The intent is to tune the model further for that device's hardware — for example the
# capabilities of its Neural Processing Unit (NPU) — for better performance and efficiency.

# NOTE(review): confirm this call against the Qualcomm AI Hub API reference. The quantize
# job may require calibration data and an uncompiled ONNX model as input, and may not
# accept a `device` argument.
target_device = hub.Device("Samsung Galaxy S24 Ultra")
second_quantization_job = hub.submit_quantize_job(
    model=compiled_model,
    device=target_device,
)

# Report the job status as seen right after submission
second_quantization_status = second_quantization_job.get_status()
print(f"Second quantization job status: {second_quantization_status}")

# Retrieve the further quantized model produced by the job
second_quantized_model = second_quantization_job.get_target_model()

In [None]:
# Step 11: Evaluate and Compare Performance

# Evaluate and compare the performance of both quantized models by running them on the
# target device with the same sample input and collecting a profile for each.

target_device = hub.Device("Samsung Galaxy S24 Ultra")

# Inputs are keyed by the model's input name. The compile step declares a single input
# called "image", so the same name is used here instead of a placeholder.
# The AI Hub client expects each input as a list of NumPy arrays, so the sample tensor is
# converted first.
# NOTE(review): assumes `example_input` is the CPU torch tensor created in Step 5 — confirm.
sample_inputs = {"image": [example_input.numpy()]}

# Submit inference jobs for both models
inference_job_initial = hub.submit_inference_job(
    model=compiled_model,
    device=target_device,
    inputs=sample_inputs,
)
inference_job_second = hub.submit_inference_job(
    model=second_quantized_model,
    device=target_device,
    inputs=sample_inputs,
)

# Get inference results
inference_results_initial = inference_job_initial.download_output_data()
inference_results_second = inference_job_second.download_output_data()

# Profiles come from profile jobs. The quantize job from Step 10 does not produce one, so
# the second model gets its own profile job instead of calling download_profile() on the
# quantize job.
profile_job_second = hub.submit_profile_job(model=second_quantized_model, device=target_device)

# Evaluate and compare performance metrics
performance_initial = profile_job.download_profile()
performance_second = profile_job_second.download_profile()

print(f"Initial Quantized Model Performance: {performance_initial}")
print(f"Second Quantized Model Performance: {performance_second}")