In [1]:
import torch
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate

selected_model = "mistralai/Mistral-7B-Instruct-v0.2"

# SYSTEM_PROMPT = """You are an AI assistant that answers questions in a friendly manner, based on the given source documents. Here are some rules you always follow:
# - Generate human readable output, avoid creating output with gibberish text.
# - Generate only the requested output, don't include any other language before or after the requested output.
# - Never say thank you, that you are happy to help, that you are an AI agent, etc. Just answer directly.
# - Generate professional language typically used in business documents in North America.
# - Never generate offensive or foul language.
# """

# query_wrapper_prompt = PromptTemplate(
#     "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n"
#     "Context information is below.\\n"
#     "---------------------\\n"
#     "{context_str}\\n"
#     "---------------------\\n"
#     "Given the context information and not prior knowledge, "
#     "answer the query. Please be brief, concise, and complete.\\n"
#     "If the context information does not contain an answer to the query, "
#     "respond with \"No information\"."
#     "Query: {query_str}\\n"
#     "Answer: "
# )

llm = HuggingFaceLLM(
    context_window=30000,
    max_new_tokens=2048,
    # query_wrapper_prompt=query_wrapper_prompt,
    model_name=selected_model,
    tokenizer_name=selected_model,
    device_map="mps",
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": False, "trust_remote_code":True},
)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
from huggingface_hub import login
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/xujiantong/.cache/huggingface/token
Login successful


In [3]:
from llama_index.embeddings import HuggingFaceEmbedding
# Replace 'your-huggingface-model' with the actual model name from Hugging Face
embed_model = HuggingFaceEmbedding(model_name="jinaai/jina-embeddings-v2-base-en")

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-en and are newly initialized: ['encoder.layer.11.output.LayerNorm.bias', 'encoder.layer.10.output.dense.bias', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.8.intermediate.dense.bias', 'encoder.layer.6.output.dense.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.6.output.LayerNorm.weight', 'encoder.layer.1.output.dense.weight', 'encoder.layer.8.output.dense.bias', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.6.intermediate.dense.bias', 'encoder.layer.7.output.LayerNorm.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.7.intermediate.dense.bias', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.5.output.LayerNorm

In [4]:
import time 
import pypdf 
import pandas as pd 
from llama_index.evaluation import ( 
    RelevancyEvaluator, 
    FaithfulnessEvaluator, 
) 

from llama_index import (
    SimpleDirectoryReader, 
    VectorStoreIndex, 
    ServiceContext,
    set_global_service_context
)

documents = SimpleDirectoryReader("../data/").load_data()

In [5]:
service_context = ServiceContext.from_defaults(
    llm=llm, 
    embed_model=embed_model
)
set_global_service_context(service_context)
index = VectorStoreIndex.from_documents(  
    documents=documents
)

In [6]:
query_engine = index.as_query_engine()
response = query_engine.query("What is Multimodal Agents?")
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Multimodal agents refer to artificial intelligence systems that can process and analyze data from multiple sources or modalities, such as text, images, and videos, to understand and interact with complex environments. These agents can utilize large language models (LLMs) and vision language models (VLMs) to analyze text data, including chat logs, player feedback, and narrative content, as well as image and video data from gaming sessions. They can help identify patterns of player behavior, preferences, and interactions, facilitate the development of intelligent agents within games, and assist in the creation and enhancement of immersive gaming environments through scene synthesis.


In [None]:
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import get_response_synthesizer

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=2,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    service_context=service_context,
    text_qa_template=query_wrapper_prompt,
    response_mode="compact",
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [None]:
import time

response = query_engine.query("What is Multimodal Agents?")

start_time = time.time()

token_count = 0
for token in response.response_gen:
    print(token, end="")
    token_count += 1

time_elapsed = time.time() - start_time
tokens_per_second = token_count / time_elapsed

print(f"\n\nStreamed output at {tokens_per_second} tokens/s")

In [None]:
from llama_index.prompt import PromptTemplate

# Define the prompt template string
qa_prompt_tmpl_str = """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: \
"""

# Define a function to format the context
def format_context_fn(**kwargs):
    # format context with bullet points
    context_list = kwargs["context_str"].split("\n\n")
    fmtted_context = "\n\n".join([f"- {c}" for c in context_list])
    return fmtted_context

# Create a PromptTemplate instance
prompt_tmpl = PromptTemplate(
    qa_prompt_tmpl_str, function_mappings={"context_str": format_context_fn}
)

# Define the context string
context_str = """\
In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.
Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases.
Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models.
"""

# Use the PromptTemplate instance to format the context and query
fmt_prompt = prompt_tmpl.format(
    context_str=context_str, query_str="How many params does llama 2 have"
)

# Print the formatted prompt
print(fmt_prompt)