## Import libraries

In [None]:
import os
import requests

pip_install(
    "-q",
    "--extra-index-url",
    "https://download.pytorch.org/whl/cpu",
    "llama-index",
    "faiss-cpu",
    "pymupdf",
    "langchain",
    "llama-index-readers-file",
    "llama-index-vector-stores-faiss",
    "llama-index-llms-langchain",
    "llama-index-llms-huggingface>=0.3.0,<0.3.4",
    "llama-index-embeddings-huggingface>=0.3.0",
)
pip_install("-q", "git+https://github.com/huggingface/optimum-intel.git", "git+https://github.com/openvinotoolkit/nncf.git", "datasets", "accelerate", "gradio")
pip_install("--pre", "-U", "openvino>=2024.2", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly")
pip_install("--pre", "-U", "openvino-tokenizers[transformers]>=2024.2", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly")
pip_install("-q", "--no-deps", "llama-index-llms-openvino>=0.3.1", "llama-index-embeddings-openvino>=0.2.1", "llama-index-postprocessor-openvino-rerank>=0.2.0")

In [2]:
from pathlib import Path
import shutil
import io
config_shared_path = Path("../../utils/llm_config.py")
config_dst_path = Path("llm_config.py")

if not config_dst_path.exists():
    if config_shared_path.exists():
        try:
            os.symlink(config_shared_path, config_dst_path)
        except Exception:
            shutil.copy(config_shared_path, config_dst_path)
    else:
        r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py")
        with open("llm_config.py", "w", encoding="utf-8") as f:
            f.write(r.text)
elif not os.path.islink(config_dst_path):
    print("LLM config will be updated")
    if config_shared_path.exists():
        shutil.copy(config_shared_path, config_dst_path)
    else:
        r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py")
        with open("llm_config.py", "w", encoding="utf-8") as f:
            f.write(r.text)


LLM config will be updated


In [35]:
from pathlib import Path
import ipywidgets as widgets
from notebook_utils import device_widget, optimize_bge_embedding
from IPython.display import Markdown, display

prepare_int4_model = widgets.Checkbox(
    value=True,
    description="Prepare INT4 model",
    disabled=False,
)
prepare_int8_model = widgets.Checkbox(
    value=False,
    description="Prepare INT8 model",
    disabled=False,
)
prepare_fp16_model = widgets.Checkbox(
    value=False,
    description="Prepare FP16 model",
    disabled=False,
)

display(prepare_int4_model)
display(prepare_int8_model)
display(prepare_fp16_model)

Checkbox(value=True, description='Prepare INT4 model')

Checkbox(value=False, description='Prepare INT8 model')

Checkbox(value=False, description='Prepare FP16 model')

In [36]:
enable_awq = widgets.Checkbox(
    value=False,
    description="Enable AWQ",
    disabled=not prepare_int4_model.value,
)
display(enable_awq)

Checkbox(value=False, description='Enable AWQ')

## Download model weights

In [6]:
from pathlib import Path

model_path = "meta-llama/Llama-3.2-1B-Instruct"
fp16_model_dir = Path("Llama-3.2-1B-Instruct") / "FP16"
int8_model_dir = Path("Llama-3.2-1B-Instruct") / "INT8_compressed_weights"
int4_model_dir = Path("Llama-3.2-1B-Instruct") / "INT4_compressed_weights"


def convert_to_fp16():
    if (fp16_model_dir / "openvino_model.xml").exists():
        return
    export_command = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16 {}".format(
        model_path, str(fp16_model_dir)
    )
    display(Markdown("**Export command:**"))
    display(Markdown(f"`{export_command}`"))
    ! $export_command


def convert_to_int8():
    if (int8_model_dir / "openvino_model.xml").exists():
        return
    int8_model_dir.mkdir(parents=True, exist_ok=True)
    export_command = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8 {}".format(
        model_path, str(int8_model_dir)
    )
    display(Markdown("**Export command:**"))
    display(Markdown(f"`{export_command}`"))
    ! $export_command


def convert_to_int4():
    # Zephyr-specific compression configuration
    compression_config = {
        "sym": True,
        "group_size": 64,
        "ratio": 0.6
    }
    
    if (int4_model_dir / "openvino_model.xml").exists():
        return
    
    export_command = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(model_path)
    export_command += " --group-size {} --ratio {} --sym".format(
        compression_config["group_size"], 
        compression_config["ratio"]
    )
    
    if enable_awq.value:
        export_command += " --awq --dataset wikitext2 --num-samples 128"
    
    export_command += " " + str(int4_model_dir)
    display(Markdown("**Export command:**"))
    display(Markdown(f"`{export_command}`"))
    ! $export_command


if prepare_fp16_model.value:
    convert_to_fp16()
if prepare_int8_model.value:
    convert_to_int8()
if prepare_int4_model.value:
    convert_to_int4()

In [7]:
from pathlib import Path

fp16_model_dir = Path("Llama-3.2-1B-Instruct") / "FP16"
int8_model_dir = Path("Llama-3.2-1B-Instruct") / "INT8_compressed_weights"
int4_model_dir = Path("Llama-3.2-1B-Instruct") / "INT4_compressed_weights"

## Compress model weights

In [8]:
fp16_weights = fp16_model_dir / "openvino_model.bin"
int8_weights = int8_model_dir / "openvino_model.bin"
int4_weights = int4_model_dir / "openvino_model.bin"

if fp16_weights.exists():
    print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB")
for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]):
    if compressed_weights.exists():
        print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB")
    if compressed_weights.exists() and fp16_weights.exists():
        print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}")


Size of model with INT4 compressed weights is 920.42 MB


In [37]:
from llm_config import (
    SUPPORTED_EMBEDDING_MODELS,
    SUPPORTED_RERANK_MODELS,
    SUPPORTED_LLM_MODELS,
)

model_languages = list(SUPPORTED_LLM_MODELS)

model_language = widgets.Dropdown(
    options=model_languages,
    value=model_languages[0],
    description="Model Language:",
    disabled=False,
)

model_language

Dropdown(description='Model Language:', options=('English', 'Chinese', 'Japanese'), value='English')

## Set embedding model configuration

In [38]:
embedding_model_id = list(SUPPORTED_EMBEDDING_MODELS[model_language.value])

embedding_model_id = widgets.Dropdown(
    options=embedding_model_id,
    value=embedding_model_id[0],
    description="Embedding Model:",
    disabled=False,
)

embedding_model_id
embedding_model_configuration = SUPPORTED_EMBEDDING_MODELS[model_language.value][embedding_model_id.value]
print(f"Selected {embedding_model_id.value} model")

Dropdown(description='Embedding Model:', options=('bge-small-en-v1.5', 'bge-large-en-v1.5', 'bge-m3'), value='…

In [40]:
export_command_base = "optimum-cli export openvino --model {} --task feature-extraction".format(embedding_model_configuration["model_id"])
export_command = export_command_base + " " + str(embedding_model_id.value)

if not Path(embedding_model_id.value).exists():
    ! $export_command

## Set rerank model configuration

In [41]:
rerank_model_id = list(SUPPORTED_RERANK_MODELS)

rerank_model_id = widgets.Dropdown(
    options=rerank_model_id,
    value=rerank_model_id[0],
    description="Rerank Model:",
    disabled=False,
)

rerank_model_id
rerank_model_configuration = SUPPORTED_RERANK_MODELS[rerank_model_id.value]
print(f"Selected {rerank_model_id.value} model")

Selected bge-reranker-v2-m3 model


In [43]:
export_command_base = "optimum-cli export openvino --model {} --task text-classification".format(rerank_model_configuration["model_id"])
export_command = export_command_base + " " + str(rerank_model_id.value)

if not Path(rerank_model_id.value).exists():
    ! $export_command
embedding_device = device_widget()
embedding_device
print(f"Embedding model will be loaded to {embedding_device.value} device for text embedding")

Embedding model will be loaded to AUTO device for text embedding


In [45]:
USING_NPU = embedding_device.value == "NPU"

npu_embedding_dir = embedding_model_id.value + "-npu"
npu_embedding_path = Path(npu_embedding_dir) / "openvino_model.xml"

if USING_NPU and not Path(npu_embedding_dir).exists():
    shutil.copytree(embedding_model_id.value, npu_embedding_dir)
    optimize_bge_embedding(Path(embedding_model_id.value) / "openvino_model.xml", npu_embedding_path)
rerank_device = device_widget()
rerank_device
print(f"Rerenk model will be loaded to {rerank_device.value} device for text reranking")
llm_device = device_widget("CPU", exclude=["NPU"])
llm_device
print(f"LLM model will be loaded to {llm_device.value} device for response generation")

Rerenk model will be loaded to AUTO device for text reranking
LLM model will be loaded to CPU device for response generation


In [46]:
from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding

embedding_model_name = npu_embedding_dir if USING_NPU else embedding_model_id.value
batch_size = 1 if USING_NPU else 4

embedding = OpenVINOEmbedding(
    model_id_or_path=embedding_model_name, embed_batch_size=batch_size, device=embedding_device.value, model_kwargs={"compile": False}
)
if USING_NPU:
    embedding._model.reshape(1, 512)
embedding._model.compile()

embeddings = embedding.get_text_embedding("Hello World!")
print(len(embeddings))
print(embeddings[:5])

384
[-0.003010430606082082, -0.011976574547588825, 0.04138164594769478, -0.03790159523487091, 0.0242850873619318]


In [47]:
available_models = []
if int4_model_dir.exists():
    available_models.append("INT4")
if int8_model_dir.exists():
    available_models.append("INT8")
if fp16_model_dir.exists():
    available_models.append("FP16")

model_to_run = widgets.Dropdown(
    options=available_models,
    value=available_models[0],
    description="Model to run:",
    disabled=False,
)

model_to_run

Dropdown(description='Model to run:', options=('INT4',), value='INT4')

In [48]:
llm_model_ids = [model_id for model_id, model_config in SUPPORTED_LLM_MODELS[model_language.value].items() if model_config.get("rag_prompt_template")]

llm_model_id = widgets.Dropdown(
    options= llm_model_ids,
    value=llm_model_ids[-1],
    description="Model:",
    disabled=False,
)

llm_model_id

Dropdown(description='Model:', index=19, options=('tiny-llama-1b-chat', 'llama-3.2-1b-instruct', 'llama-3.2-3b…

In [49]:
llm_model_id

Dropdown(description='Model:', index=1, options=('tiny-llama-1b-chat', 'llama-3.2-1b-instruct', 'llama-3.2-3b-…

In [29]:
llm_model_configuration = SUPPORTED_LLM_MODELS[model_language.value][llm_model_id.value]

In [50]:
from llama_index.llms.openvino import OpenVINOLLM

import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams


if model_to_run.value == "INT4":
    model_dir = int4_model_dir
elif model_to_run.value == "INT8":
    model_dir = int8_model_dir
else:
    model_dir = fp16_model_dir
print(f"Loading model from {model_dir}")

ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}

stop_tokens = llm_model_configuration.get("stop_tokens")
completion_to_prompt = llm_model_configuration.get("completion_to_prompt")

if "GPU" in llm_device.value and "qwen2-7b-instruct" in llm_model_id.value:
    ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO"

if llm_model_id.value == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device.value in ["GPU", "AUTO"]:
    ov_config["INFERENCE_PRECISION_HINT"] = "f32"

llm = OpenVINOLLM(
    model_id_or_path=str(model_dir),
    context_window=3900,
    max_new_tokens=2,
    model_kwargs={"ov_config": ov_config, "trust_remote_code": True},
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    completion_to_prompt=completion_to_prompt,
    device_map=llm_device.value,
)


Loading model from Llama-3.2-1B-Instruct/INT4_compressed_weights


In [51]:
stop_tokens = llm_model_configuration.get("stop_tokens")

In [52]:
from llama_index.postprocessor.openvino_rerank import OpenVINORerank

reranker = OpenVINORerank(model_id_or_path=rerank_model_id.value, device=rerank_device.value, top_n=2)

In [53]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings
from llama_index.readers.file import PyMuPDFReader
from llama_index.vector_stores.faiss import FaissVectorStore
from transformers import StoppingCriteria, StoppingCriteriaList
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser
import faiss
import torch
import gradio as gr
import requests
from pathlib import Path

# Initialize global variables
query_engine = None
index = None
TEXT_SPLITERS = {
    "SentenceSplitter": SentenceSplitter,
    "RecursiveCharacter": RecursiveCharacterTextSplitter,
}

def default_partial_text_processor(partial_text: str, new_text: str):
    """
    helper for updating partially generated answer, used by default

    Params:
      partial_text: text buffer for storing previosly generated text
      new_text: text update for the current step
    Returns:
      updated text string

    """
    partial_text += new_text
    return partial_text


text_processor = llm_model_configuration.get("partial_text_processor", default_partial_text_processor)

class StopOnTokens(StoppingCriteria):
    def __init__(self, token_ids):
        self.token_ids = token_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_id in self.token_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False
    
if llm._tokenizer.pad_token is None:
    llm._tokenizer.pad_token = llm._tokenizer.eos_token

# Setup the models and settings
if stop_tokens is not None:
    if isinstance(stop_tokens[0], str):
        stop_tokens = llm._tokenizer.convert_tokens_to_ids(stop_tokens)
    stop_tokens = [StopOnTokens(stop_tokens)]

# Configure embedding dimensions
d = embedding._model.request.outputs[0].get_partial_shape()[2].get_length()
Settings.embed_model = embedding

# Configure LLM settings
llm.max_new_tokens = 2048
if stop_tokens is not None:
    llm._stopping_criteria = StoppingCriteriaList(stop_tokens)
Settings.llm = llm

def create_vectordb(doc, spliter_name, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, run_rerank):
    """
    Initialize a vector database from user uploaded document
    """
    global query_engine
    global index

    if vector_rerank_top_n > vector_search_top_k:
        gr.Warning("Search top k must >= Rerank top n")

    # Load and process the uploaded document
    loader = PyMuPDFReader()
    for document in doc:
        documents = loader.load(file_path=document.name)
    
    # Configure text splitter
    spliter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    if spliter_name == "RecursiveCharacter":
        spliter = LangchainNodeParser(spliter)
    
    # Initialize FAISS vector store
    faiss_index = faiss.IndexFlatL2(d)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Create the index from documents
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        transformations=[spliter],
    )
    
    # Setup query engine
    if run_rerank:
        reranker.top_n = vector_rerank_top_n
        query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k, node_postprocessors=[reranker])
    else:
        query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k)

    return "Vector database is Ready"

def update_retriever(vector_search_top_k, vector_rerank_top_n, run_rerank):
    """
    Update retriever settings
    """
    global query_engine
    global index

    if vector_rerank_top_n > vector_search_top_k:
        gr.Warning("Search top k must >= Rerank top n")

    if run_rerank:
        reranker.top_n = vector_rerank_top_n
        query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k, node_postprocessors=[reranker])
    else:
        query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k)

def bot(history, temperature, top_p, top_k, repetition_penalty, do_rag):
    """
    Chatbot callback function
    """
    llm.generate_kwargs = dict(
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )

    partial_text = ""
    if do_rag:
        streaming_response = query_engine.query(history[-1][0])
        for new_text in streaming_response.response_gen:
            partial_text = text_processor(partial_text, new_text)
            history[-1][1] = partial_text
            yield history
    else:
        streaming_response = llm.stream_complete(history[-1][0])
        for new_text in streaming_response:
            partial_text = text_processor(partial_text, new_text.delta)
            history[-1][1] = partial_text
            yield history

def request_cancel():
    llm._model.request.cancel()

if not Path("gradio_helper.py").exists():
    r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-rag-langchain/gradio_helper.py")
    open("gradio_helper.py", "w").write(r.text)


from gradio_helper import make_demo

# Create and launch the demo
demo = make_demo(
    load_doc_fn=create_vectordb,
    run_fn=bot,
    update_retriever_fn=update_retriever,
    model_name=llm_model_id.value,
    language=model_language.value,
)

try:
    demo.queue().launch(share=True)
except Exception:
    demo.queue().launch(share=True)



Running on local URL:  http://127.0.0.1:7861


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on public URL: https://dd68ddfbc7409155d4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
