In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import time
import threading
import torch.nn.utils.prune as prune
import onnx
from torch.nn.utils import prune
import torch.nn as nn
import os

In [None]:
# Restrict the CUDA devices visible to this process to device indices 0 and 1.
# NOTE(review): presumably this has to run before CUDA is first initialised
# in this kernel to have any effect - confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

In [None]:
# Log in to the Hugging Face Hub with an authorised token to access the model.
# The token is read from the HF_TOKEN environment variable rather than being
# pasted into the notebook, where it would be saved and shared with the file.
# When HF_TOKEN is not set, `login` receives None and asks for the token
# interactively instead.
login(token=os.environ.get("HF_TOKEN"))

#### Creating a function to load the pretrained Mistral model from Hugging Face using the `AutoTokenizer` and `AutoModelForCausalLM` classes provided by the transformers library.

In [None]:
def load_model(model_path):
    """Load a pretrained causal language model together with its tokenizer.

    Args:
        model_path: Identifier or path handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The tokenizer is fetched first, then the model weights.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForCausalLM.from_pretrained(model_path)
    return loaded_model, loaded_tokenizer

#### Creating a function to optimize the model to better fit it within the limitations of this notebook

In [None]:
def optimize_model(model, dtype=torch.float16):
    """Cast the model to a lower-precision dtype and move it to the GPU.

    The requested ``dtype`` is always honoured; previously anything other than
    ``torch.float16`` (for example ``torch.bfloat16``) was silently ignored.

    Args:
        model: Object exposing ``to(dtype=...)`` and ``cuda()``, such as a
            ``torch.nn.Module``.
        dtype: Target dtype for the model's floating-point parameters and
            buffers. Pass ``None`` to leave the dtype untouched.

    Returns:
        The converted model after ``cuda()`` has been called on it.
    """
    # Cast before the transfer so the smaller tensors are what gets moved.
    if dtype is not None:
        model = model.to(dtype=dtype)
    # Moving the model to the GPU
    return model.cuda()

In [None]:
def prune_model_stepwise(model, prune_ratio=0.2, step_size=0.0001):
    """
    Prune every ``nn.Linear`` layer of the model by L1 magnitude, in steps.

    Each step raises the target sparsity by ``step_size`` until ``prune_ratio``
    is reached. The amount handed to ``prune.l1_unstructured`` is cumulative:
    the mask is made permanent with ``prune.remove`` after every step, so the
    weights zeroed so far are the smallest-magnitude entries of the layer, and
    a constant per-step amount would only select those same entries again.

    Args:
        model: The model to be pruned (its layers are modified in place).
        prune_ratio: Total fraction of each layer's weights to zero, in [0, 1].
        step_size: Sparsity added in each step; must be positive.
    Returns:
        The pruned model (the same object that was passed in).
    Raises:
        ValueError: If ``prune_ratio`` is outside [0, 1] or ``step_size`` is
            not positive.
    """
    if not 0 <= prune_ratio <= 1:
        raise ValueError(f"prune_ratio must be in [0, 1], got {prune_ratio}")
    if step_size <= 0:
        raise ValueError(f"step_size must be positive, got {step_size}")
    if prune_ratio == 0:
        return model

    # Get a list of all prunable layers
    prunable_layers = [module for module in model.modules() if isinstance(module, nn.Linear)]

    # round() instead of int(): 0.2 / 0.0001 may land just below 2000 in
    # floating point and truncation would silently drop a step.
    steps = max(1, round(prune_ratio / step_size))

    for step in range(steps):
        print(f"Step {step + 1}/{steps}")
        # Cumulative sparsity target; the last step lands exactly on prune_ratio
        if step == steps - 1:
            target = prune_ratio
        else:
            target = min(prune_ratio, (step + 1) * step_size)

        for module in prunable_layers:
            # Apply pruning to the layer and bake the mask into the weights
            prune.l1_unstructured(module, 'weight', amount=target)
            prune.remove(module, 'weight')
            torch.cuda.empty_cache()  # Clear GPU cache after each layer

    return model

In [None]:
def prune_layer(module, amount):
    """Prune one module's ``weight`` by L1 magnitude and make it permanent.

    If pruning raises a ``RuntimeError`` the error is printed, the CUDA cache
    is cleared and the pruning is attempted one more time. A second failure is
    printed as well and the module is left as it was.

    Args:
        module: Module owning a ``weight`` parameter.
        amount: Amount forwarded to ``prune.l1_unstructured``.

    Returns:
        ``True`` if the module was pruned, ``False`` if both attempts failed.
    """
    for _attempt in range(2):
        try:
            prune.l1_unstructured(module, 'weight', amount=amount)
            prune.remove(module, 'weight')
            return True
        except RuntimeError as e:
            print(f"Error during pruning: {e}")
        finally:
            # Clear GPU cache after a success and before any retry
            torch.cuda.empty_cache()
    return False

def prune_model_stepwise_with_offloading(model, prune_ratio=0.2, step_size=0.1):
    """
    Prune the model in steps, keeping only one layer on the GPU at a time.

    The whole model is offloaded to the CPU once. Each ``nn.Linear`` layer is
    then moved to the GPU on its own, pruned with ``prune_layer`` and moved
    back. Layers keep the dtype they already have. When CUDA is not available
    everything is pruned on the CPU and the model stays there.

    The amount passed to ``prune_layer`` is the cumulative sparsity target of
    the current step. ``prune_layer`` makes each mask permanent, so the
    weights zeroed so far are the smallest-magnitude ones and a constant
    per-step amount would just select them again.

    Args:
        model: The model to be pruned (its layers are modified in place).
        prune_ratio: Total fraction of each layer's weights to zero, in [0, 1].
        step_size: Sparsity added in each step; must be positive.
    Returns:
        The pruned model (the same object), on the GPU when CUDA is available.
    Raises:
        ValueError: If ``prune_ratio`` is outside [0, 1] or ``step_size`` is
            not positive.
    """
    if not 0 <= prune_ratio <= 1:
        raise ValueError(f"prune_ratio must be in [0, 1], got {prune_ratio}")
    if step_size <= 0:
        raise ValueError(f"step_size must be positive, got {step_size}")

    # Set PyTorch memory management configuration
    # NOTE(review): presumably only honoured when set before the CUDA caching
    # allocator is first used in this process - confirm.
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32'

    use_cuda = torch.cuda.is_available()

    # Get a list of all prunable layers
    prunable_layers = [module for module in model.modules() if isinstance(module, nn.Linear)]

    # Offload the entire model once. Calling .cpu() on every "other" module
    # for each layer would also hit the root and parent modules and drag the
    # layer being pruned back to the CPU.
    model.cpu()
    torch.cuda.empty_cache()  # Clear GPU cache

    # Calculate the number of steps (round() guards against float truncation)
    steps = max(1, round(prune_ratio / step_size)) if prune_ratio > 0 else 0

    for step in range(steps):
        print(f"Step {step + 1}/{steps}")
        # Cumulative sparsity target; the last step lands exactly on prune_ratio
        if step == steps - 1:
            target = prune_ratio
        else:
            target = min(prune_ratio, (step + 1) * step_size)

        for module in prunable_layers:
            # Skip layers that still carry an un-removed pruning mask
            if hasattr(module, 'weight_orig'):
                continue

            if use_cuda:
                module.cuda()  # Only this layer occupies GPU memory

            prune_layer(module, target)

            if use_cuda:
                module.cpu()
                torch.cuda.empty_cache()  # Clear GPU cache

    if use_cuda:
        # Move the entire model back to GPU
        model.cuda()
        torch.cuda.empty_cache()  # Clear GPU cache
    return model

#### Creating the function 'infer', which takes the user's prompt and returns the text generated by the model

In [None]:
# 'infer' generates a response to a given prompt
def infer(model, tokenizer, prompt, max_length=128):
    """Generate text for ``prompt`` and return it decoded.

    The tokenized inputs are moved to the device holding the model's first
    parameter, so the function follows the model wherever it was placed.
    A model without parameters falls back to ``"cuda"``.

    Args:
        model: Model exposing ``parameters()`` and ``generate``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        prompt: Input text.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else "cuda"

    # Tokenizing the input prompt and placing it next to the model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generating a response from the model without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)

    # Decoding the generated tokens back to readable text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

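#### Creating 'infer2', a variant of 'infer' that runs generation under float16 autocast, followed by the helper functions used to evaluate the performance metrics of the model.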

In [None]:
# 'infer' function generates responses to a given prompt
def infer2(model, tokenizer, prompt, max_length=128):
    
    # Tokenizing the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    # Generating a response from the model
    with torch.no_grad():
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            outputs = model.generate(**inputs, max_length=max_length)
        
    # Decoding the generated tokens back to the readable text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

In [None]:
def async_infer(model, tokenizer, prompt, max_length=128):
    """Run ``infer2`` with a newly created CUDA stream as the current stream.

    The call itself is synchronous: it returns only once ``infer2`` has
    produced its decoded response.

    Args:
        model: Forwarded to ``infer2``.
        tokenizer: Forwarded to ``infer2``.
        prompt: Forwarded to ``infer2``.
        max_length: Forwarded to ``infer2``.

    Returns:
        Whatever ``infer2`` returns for the given arguments.
    """
    side_stream = torch.cuda.Stream()
    with torch.cuda.stream(side_stream):
        return infer2(model, tokenizer, prompt, max_length)

In [None]:
def benchmark(model, tokenizer, prompt, num_runs=1, concurrency=1, max_length=128):
    """Time concurrent generation and report latency and token throughput.

    Every run sends ``concurrency`` copies of ``prompt`` through
    ``concurrent_infer``. Throughput is derived from the tokens actually
    produced rather than from a fixed per-response constant. Each response is
    re-encoded with the tokenizer and the prompt's token count is subtracted,
    so the figure is an approximation of the newly generated tokens.

    Args:
        model: Model forwarded to ``concurrent_infer``.
        tokenizer: Tokenizer forwarded to ``concurrent_infer``; its ``encode``
            method is also used to count tokens.
        prompt: Prompt text used for every request.
        num_runs: Number of timed runs; must be at least 1.
        concurrency: Number of simultaneous requests per run; at least 1.
        max_length: Forwarded to ``concurrent_infer``.

    Returns:
        ``(avg_time, throughput)``: mean wall-clock seconds per run, and
        generated tokens per second over all runs.

    Raises:
        ValueError: If ``num_runs`` or ``concurrency`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")
    if concurrency < 1:
        raise ValueError(f"concurrency must be at least 1, got {concurrency}")

    prompt_tokens = len(tokenizer.encode(prompt, add_special_tokens=False))
    batch_prompts = [prompt] * concurrency

    times = []
    generated_tokens = 0
    for _ in range(num_runs):
        start_time = time.perf_counter()
        responses = concurrent_infer(model, tokenizer, batch_prompts, max_length)
        times.append(time.perf_counter() - start_time)

        # Token counting happens outside the timed region
        for response in responses:
            if response is None:  # a request that produced no output
                continue
            response_tokens = len(tokenizer.encode(response, add_special_tokens=False))
            generated_tokens += max(0, response_tokens - prompt_tokens)

    total_time = sum(times)
    avg_time = total_time / num_runs
    throughput = generated_tokens / total_time if total_time > 0 else float("inf")

    return avg_time, throughput

In [None]:
def concurrent_infer(model, tokenizer, prompts, max_length=128):
    """Run ``infer2`` for every prompt, one thread per prompt.

    All threads are started, then joined. If any worker raised, the first
    recorded exception is re-raised in the calling thread after every thread
    has finished, instead of leaving a silent ``None`` in the results.

    Args:
        model: Forwarded to ``infer2``.
        tokenizer: Forwarded to ``infer2``.
        prompts: Sequence of prompt strings.
        max_length: Forwarded to ``infer2``.

    Returns:
        A list of responses in the same order as ``prompts``.
    """
    results = [None] * len(prompts)
    errors = []

    def worker(prompt, idx):
        try:
            results[idx] = infer2(model, tokenizer, prompt, max_length)
        except Exception as exc:  # handed over to the calling thread below
            errors.append(exc)

    threads = [
        threading.Thread(target=worker, args=(prompt, i))
        for i, prompt in enumerate(prompts)
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    if errors:
        raise errors[0]
    return results

In [None]:
# Defining the model path (the identifier handed to from_pretrained in load_model)
model_path = "mistralai/Mistral-7B-Instruct-v0.3"

# Loading the model and its tokenizer
model, tokenizer = load_model(model_path)

# Optimizing the model: request float16 and move it to the GPU via optimize_model
model_2 = optimize_model(model, dtype=torch.float16)

In [None]:
# User input: read interactively; the same prompt is reused by the inference
# and benchmark cells further down
prompt = input("Enter your prompt: ")

In [None]:
# Getting the response from the unpruned model (model_2) with infer2, which
# uses its default max_length of 128 here
response = infer2(model_2, tokenizer, prompt)

print("Model response: ", response)

In [None]:
# Prune with the function's defaults (prune_ratio=0.2, step_size=0.1).
# The function returns the model object it was given, so model_3 and model_2
# refer to the same, now pruned, model.
model_3 = prune_model_stepwise_with_offloading(model_2)

In [None]:
# Benchmark the pruned model's performance with the prompt entered earlier,
# using benchmark's defaults (num_runs=1, concurrency=1)
avg_time, throughput = benchmark(model_3, tokenizer, prompt)

# avg_time is in seconds per run
print("Average inference time: ", avg_time)
print(f'Throughput: {throughput} tokens/sec')

In [None]:
# Shell escape: show the current GPU status as reported by nvidia-smi
!nvidia-smi
