In [None]:
# Notebook outline:
# 1. Install prerequisites
# 2. Get the model configuration file (llm_config.py)
# 3. Check the available models
# 4. Select a model for compression
# 5. Compress the model into the 3 variants (FP16, INT8, INT4)
# 6. Select a model for evaluation
# 7. Evaluate the selected model for every compression variant
# 8. Save the results in a CSV file (one per compression variant)

import os
import platform

# NOTE(review): presumably turns off Git's clone-protection check so that the pip install of
# optimum-intel from its Git URL below is not blocked -- confirm this is still required.
os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false"

# Packages added for this notebook (ROUGE scoring, dropdown widgets, pyngrok).
%pip install rouge-score 
%pip install ipywidgets
%pip install pyngrok
 
# Base installation: upgrade pip, remove any installed optimum / optimum-intel, then install
# pre-release OpenVINO wheels (nightly index), optimum-intel from Git, NNCF and the
# PyTorch (CPU wheel index) / Hugging Face packages listed below.
%pip install -Uq pip
%pip uninstall -q -y optimum optimum-intel
%pip install --pre -Uq "openvino>=2024.2.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\
"git+https://github.com/huggingface/optimum-intel.git"\
"nncf==2.14.1"\
"torch>=2.1"\
"datasets" \
"accelerate" \
"gradio>=4.19" \
"huggingface-hub>=0.26.5" \
 "einops" "transformers>=4.43.1" "transformers_stream_generator" "tiktoken" "bitsandbytes"

# On macOS, keep NumPy below 2.0.
if platform.system() == "Darwin":
    %pip install -q "numpy<2.0.0"

In [None]:
import os
from pathlib import Path
import requests
import shutil

# Fetch the model configuration module (llm_config.py) into the working directory.
# Preference order: symlink to the shared copy, copy of the shared copy, download.

# NOTE(review): raw.githubusercontent.com paths normally look like
# <owner>/<repo>/<ref>/<path>; this URL has no <ref> (branch/tag) segment, so the
# download probably returns 404 -- confirm the intended branch and fix the URL.
LLM_CONFIG_URL = "https://raw.githubusercontent.com/GodreignElgin/llm-comparision/llm_config.py"

config_shared_path = Path("../../utils/llm_config.py")
config_dst_path = Path("llm_config.py")


def _download_llm_config(dst_path, url=LLM_CONFIG_URL, timeout=30):
    """Download llm_config.py from ``url`` and write it to ``dst_path``.

    The destination is only opened after the HTTP status has been checked, so a
    failed request (e.g. a 404 page) never overwrites an existing file.

    Raises:
        requests.RequestException: on connection errors, timeouts or HTTP error codes.
    """
    response = requests.get(url=url, timeout=timeout)
    response.raise_for_status()
    with open(dst_path, "w", encoding="utf-8") as f:
        f.write(response.text)


if not config_dst_path.exists():
    if config_shared_path.exists():
        try:
            os.symlink(config_shared_path, config_dst_path)
        except (OSError, NotImplementedError):
            # Symlinks can be unavailable (e.g. missing privileges); fall back to a copy.
            shutil.copy(config_shared_path, config_dst_path)
    else:
        # Nothing local to fall back on: let a download failure surface as an error.
        _download_llm_config(config_dst_path)
elif not os.path.islink(config_dst_path):
    # A regular file (not a symlink to the shared copy) may be stale, so refresh it.
    print("LLM config will be updated")
    if config_shared_path.exists():
        shutil.copy(config_shared_path, config_dst_path)
    else:
        try:
            _download_llm_config(config_dst_path)
        except requests.RequestException as exc:
            # Refreshing is best-effort: keep the working local copy if the download fails.
            print(f"Could not refresh llm_config.py ({exc}); keeping the existing copy")

In [None]:
from llm_config import SUPPORTED_LLM_MODELS
import ipywidgets as widgets

In [2]:
# Report the currently available RAM before starting the evaluation.
import psutil

available_gb = psutil.virtual_memory().available / 1e9
print(f"Available Memory: {available_gb:.2f} GB")

Available Memory: 0.70 GB


In [None]:
# List every model defined in SUPPORTED_LLM_MODELS and pick the one to compress.
# The name selected in this dropdown is read by the compression cell via `models.value`.
# The option list gets its own name so that `models` only ever refers to the dropdown widget.
model_names = list(SUPPORTED_LLM_MODELS)

models = widgets.Dropdown(
    options=model_names,
    value=model_names[0],
    description="Models Available: ",
    disabled=False,
)

models

qwen2.5-0.5b-instruct


In [None]:
# Export the selected model in three weight formats: FP16, INT8 and INT4.
# Any other format is not covered here and needs its own export code.
from pathlib import Path
from IPython.display import Markdown, display

model_name = models.value
model_configuration = SUPPORTED_LLM_MODELS[model_name]
print(f"Processing model: {model_name}")

pt_model_id = model_configuration["model_id"]

# One output directory per weight format, all inside a folder named after the model.
fp16_model_dir, int8_model_dir, int4_model_dir = (
    Path(model_name) / weight_format for weight_format in ("FP16", "INT8", "INT4")
)

def convert_to_fp16():
    if not (fp16_model_dir / "openvino_model.xml").exists():
        remote_code = model_configuration.get("remote_code", False)
        export_command = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format fp16 {str(fp16_model_dir)}"
        if remote_code:
            export_command += " --trust-remote-code"
        display(Markdown(f"**Export command:** {export_command}"))
        ! $export_command

def convert_to_int8():
    if not (int8_model_dir / "openvino_model.xml").exists():
        int8_model_dir.mkdir(parents=True, exist_ok=True)
        remote_code = model_configuration.get("remote_code", False)
        export_command = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int8 {str(int8_model_dir)}"
        if remote_code:
            export_command += " --trust-remote-code"
        display(Markdown(f"**Export command:** {export_command}"))
        ! $export_command

def convert_to_int4():
    compression_configs = {
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
        "zephyr-7b-beta": {"sym": True, "group_size": 64, "ratio": 0.6},
        "mistral-7b": {"sym": True, "group_size": 64, "ratio": 0.6},
        "minicpm-2b-dpo": {"sym": True, "group_size": 64, "ratio": 0.6},
        "gemma-2b-it": {"sym": True, "group_size": 64, "ratio": 0.6},
        "notus-7b-v1": {"sym": True, "group_size": 64, "ratio": 0.6},
        "neural-chat-7b-v3-1": {"sym": True, "group_size": 64, "ratio": 0.6},
        "llama-2-chat-7b": {"sym": True, "group_size": 128, "ratio": 0.8},
        "llama-3-8b-instruct": {"sym": True, "group_size": 128, "ratio": 0.8},
        "llama-3.1-8b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "gemma-7b-it": {"sym": True, "group_size": 128, "ratio": 0.8},
        "chatglm2-6b": {"sym": True, "group_size": 128, "ratio": 0.72},
        "qwen-7b-chat": {"sym": True, "group_size": 128, "ratio": 0.6},
        "red-pajama-3b-chat": {"sym": False, "group_size": 128, "ratio": 0.5},
        "qwen2.5-7b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "qwen2.5-3b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "qwen2.5-14b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "qwen2.5-1.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
    }
    model_compression_params = compression_configs.get(model_name, compression_configs["default"])

    if not (int4_model_dir / "openvino_model.xml").exists():
        remote_code = model_configuration.get("remote_code", False)
        export_command = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
        export_command += f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
        if model_compression_params["sym"]:
            export_command += " --sym"
        export_command += f" {str(int4_model_dir)}"
        if remote_code:
            export_command += " --trust-remote-code"
        display(Markdown(f"**Export command:** {export_command}"))
        ! $export_command
# Run the three exports in order; each one skips itself if its output already exists.
for export_variant in (convert_to_fp16, convert_to_int8, convert_to_int4):
    export_variant()
print(f"Finished processing {model_name}\n")


# Evaluation Pipeline 

This pipeline evaluates the model selected in the dropdown below (one of the entries in `SUPPORTED_LLM_MODELS`) on 11 evaluation metrics: latency, throughput, ROUGE-1, ROUGE-2, ROUGE-L, BLEU, chrF, unique n-grams, entropy, repeated n-grams and coherence (cosine similarity). The evaluation results are stored in a CSV file for future reference.

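Each run of the pipeline evaluates only the specified compression variant; the last cell runs it once for each of FP16, INT8 and INT4 and writes one CSV file per variant (`evaluation_results_<variant>.csv`).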

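**Steps performed by the cells below:**

1. Choose the model to evaluate from the dropdown.
2. `evaluate_model` tokenizes the sample prompt, generates up to 128 new tokens and decodes the output; the elapsed time gives the latency and the throughput.
3. The generated text is scored against the reference text with ROUGE-1, ROUGE-2, ROUGE-L, BLEU and chrF. Unique n-grams, entropy and repeated n-grams are computed from the generated text, and coherence is the cosine similarity between the averaged embeddings of the prompt and of the output (0 if it cannot be computed).
4. `evaluate_models_and_save` loads the OpenVINO model and its tokenizer from `<model name>/<compression variant>/`, runs `evaluate_model`, then deletes the model and tokenizer and runs the garbage collector to free memory.
5. The result row is appended to `evaluation_results_<compression variant>.csv`.
6. The last cell repeats steps 4-5 for FP16, INT8 and INT4.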

In [5]:
# List every model defined in SUPPORTED_LLM_MODELS and pick the one to evaluate.
# The selected name is read by the evaluation cell via `models_eval.value`.
# The option list deliberately uses its own name: rebinding `models` here would replace the
# compression dropdown with a plain list and break `models.value` if that cell is re-run.
eval_model_names = list(SUPPORTED_LLM_MODELS)

print(eval_model_names)

models_eval = widgets.Dropdown(
    options=eval_model_names,
    value=eval_model_names[0],
    description="Models Available: ",
    disabled=False,
)

models_eval

['qwen2.5-0.5b-instruct', 'tiny-llama-1b-chat', 'DeepSeek-R1-Distill-Qwen-1.5B', 'DeepSeek-R1-Distill-Qwen-7B', 'DeepSeek-R1-Distill-Llama-8B', 'llama-3.2-1b-instruct', 'llama-3.2-3b-instruct', 'qwen2.5-1.5b-instruct', 'gemma-2b-it', 'gemma-2-2b-it', 'red-pajama-3b-chat', 'qwen2.5-3b-instruct', 'minicpm3-4b', 'qwen2.5-7b-instruct', 'gemma-7b-it', 'gemma-2-9b-it', 'llama-2-chat-7b', 'llama-3-8b-instruct', 'llama-3.1-8b-instruct', 'mistral-7b-instruct', 'zephyr-7b-beta', 'notus-7b-v1', 'neural-chat-7b-v3-3', 'phi-3-mini-instruct', 'phi-3.5-mini-instruct', 'phi-4-mini-instruct', 'phi-4', 'qwen2.5-14b-instruct']


Dropdown(description='Models Available: ', options=('qwen2.5-0.5b-instruct', 'tiny-llama-1b-chat', 'DeepSeek-R…

In [None]:
import time
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.chrf_score import corpus_chrf
from sklearn.metrics.pairwise import cosine_similarity
def evaluate_model(model, tokenizer, input_text, reference_texts, compression_dir=None):
    """Generate a completion for ``input_text`` and score it against ``reference_texts``.

    Args:
        model: Model exposing ``generate`` and ``config.name_or_path``.
        tokenizer: Tokenizer matching ``model``; called with ``return_tensors="pt"``.
        input_text: Prompt string fed to the model.
        reference_texts: Single reference string the generated text is compared with.
        compression_dir: Optional compression label (e.g. ``"FP16"``) appended to the
            model name in the ``"Model"`` field. Passed explicitly so the function no
            longer depends on a module-level variable of the same name.

    Returns:
        dict: One row of metrics keyed by column name (latency, throughput, ROUGE-1/2/L,
        BLEU, chrF, unique n-grams, entropy, repeated n-grams, coherence).
    """
    # The timed span covers tokenization, generation and decoding.
    start_time = time.time()

    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids=input_ids, max_new_tokens=128)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    latency = (time.time() - start_time) * 1000

    # For causal LMs ``generate`` returns the prompt followed by the new tokens, so the
    # prompt length is subtracted to count only the tokens that were actually generated.
    # NOTE(review): assumes a decoder-only model, as loaded elsewhere in this notebook.
    prompt_length = input_ids.shape[-1]
    num_new_tokens = max(len(output_ids[0]) - prompt_length, 0)
    throughput = num_new_tokens / (latency / 1000) if latency > 0 else 0.0

    # ROUGE-1 / ROUGE-2 / ROUGE-L between the reference and the generated text.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = scorer.score(reference_texts, generated_text)

    # Sentence-level BLEU on whitespace tokens, smoothed so short texts do not score 0.
    smoothing = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_texts.split()], generated_text.split(), smoothing_function=smoothing)

    # Character n-gram F-score.
    chrf_score = corpus_chrf([[reference_texts]], [[generated_text]])

    # Diversity statistics on whitespace tokens of the generated text.
    tokens = generated_text.split()
    token_count = len(tokens)
    adjacent_pairs = list(zip(tokens, tokens[1:]))
    unique_ngrams = len(set(adjacent_pairs)) / token_count if token_count > 1 else 0
    if token_count:
        token_probs = np.unique(tokens, return_counts=True)[1] / token_count
        entropy = -np.sum(token_probs * np.log2(token_probs))
        repeated_ngrams = sum(1 for left, right in adjacent_pairs if left == right) / token_count
    else:
        # Empty generation: avoid dividing by zero and report neutral values.
        entropy = 0.0
        repeated_ngrams = 0.0

    # Coherence: cosine similarity between the averaged prompt and output embeddings.
    # Best-effort: a failure here is reported and scored as 0.
    try:
        ref_emb = model.get_input_embeddings()(input_ids).detach().numpy()
        gen_emb = model.get_input_embeddings()(output_ids).detach().numpy()
        coherence = cosine_similarity(ref_emb.mean(axis=1), gen_emb.mean(axis=1))[0][0]
    except Exception as exc:
        print(f"Coherence could not be computed ({exc}); reporting 0")
        coherence = 0

    model_label = model.config.name_or_path
    if compression_dir is not None:
        model_label = f"{model_label}_{compression_dir}"

    return {
        "Model": model_label,
        "Latency (ms)": latency,
        "Throughput (tokens/sec)": throughput,
        "ROUGE-1": rouge_scores["rouge1"].fmeasure,
        "ROUGE-2": rouge_scores["rouge2"].fmeasure,
        "ROUGE-L": rouge_scores["rougeL"].fmeasure,
        "BLEU Score": bleu_score,
        "CHRF Score": chrf_score,
        "Unique n-grams": unique_ngrams,
        "Entropy": entropy,
        "Repeated n-grams (%)": repeated_ngrams * 100,
        "Coherence (Cosine Similarity)": coherence,
    }

In [None]:
import gc
import pandas as pd
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams

def evaluate_models_and_save(SUPPORTED_LLM_MODELS, compression_dir, model_name=None):
    """Evaluate one exported model variant and append its metrics to a CSV file.

    Args:
        SUPPORTED_LLM_MODELS: Mapping of model names to their configuration; used to
            validate ``model_name``.
        compression_dir: Name of the variant sub-directory (e.g. ``"FP16"``); the model
            is loaded from ``<model_name>/<compression_dir>`` and results go to
            ``evaluation_results_<compression_dir>.csv``.
        model_name: Model to evaluate. Defaults to the current selection of the
            ``models_eval`` dropdown, so existing two-argument calls keep working.

    Returns:
        None. Progress and errors are reported with ``print``.

    Raises:
        KeyError: if ``model_name`` is not a key of ``SUPPORTED_LLM_MODELS``.
    """
    output_csv = f"evaluation_results_{compression_dir}.csv"

    if model_name is None:
        model_name = models_eval.value
    if model_name not in SUPPORTED_LLM_MODELS:
        raise KeyError(model_name)
    print(f"Processing model: {model_name}")

    # Only the requested variant is loaded, and only if it was exported to OpenVINO format.
    model_dir = Path(model_name) / compression_dir
    if not (model_dir / "openvino_model.xml").exists():
        print(f"Skipping {model_name}: no OpenVINO model found in {model_dir}")
        return

    print(f"Loading model from {model_dir}")
    ov_config = {
        hints.performance_mode(): hints.PerformanceMode.THROUGHPUT,
        streams.num(): "AUTO",
        props.cache_dir(): "ov_cache"
    }

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        model = OVModelForCausalLM.from_pretrained(
            model_dir,
            device="CPU",  # change to "GPU" to run on a GPU device
            ov_config=ov_config,
            config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
            trust_remote_code=True
        )
    except Exception as e:
        print(f"Error loading {model_name}: {e}")
        return

    # Single sample prompt and its reference answer.
    input_text = "2 + 2 ="
    reference_texts = "2 + 2 = 4"

    all_results = []
    try:
        results = evaluate_model(model, tokenizer, input_text, reference_texts)
        results["Model"] = model_name
        all_results.append(results)
    except Exception as e:
        print(f"Error during evaluation of {model_name}: {e}")

    # Drop the model and tokenizer and collect garbage before touching the CSV,
    # so memory is released even when the evaluation failed.
    del model, tokenizer
    gc.collect()

    # Nothing to record: do not create an empty CSV or rewrite an existing one.
    if not all_results:
        print(f"No results to save for {model_name} ({compression_dir})")
        return

    # Append to the existing file (if any) instead of overwriting earlier runs.
    df = pd.DataFrame(all_results)
    if Path(output_csv).exists():
        df = pd.concat([pd.read_csv(output_csv), df], ignore_index=True)

    df.to_csv(output_csv, index=False)
    print(f"Results appended to {output_csv}")

compression_dirs = ["FP16", "INT8", "INT4"]
# Evaluate the selected model once per compression variant.
# Each entry must match the name of a variant sub-directory, i.e. the exported files are
# expected under <model name>/<compression_dir>/ (for example <model name>/FP16/).
# NOTE(review): the loop variable below becomes a module-level name; check for code that
# reads `compression_dir` as a global before renaming it.
for compression_dir in compression_dirs:
    evaluate_models_and_save(SUPPORTED_LLM_MODELS, compression_dir)