In [None]:
# Uncomment to install the dependencies into the running kernel's environment:
# %pip install llama-index modelscope

In [1]:
import os
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.legacy.embeddings import HuggingFaceEmbedding
from llama_index.legacy.llms import (CustomLLM, CompletionResponse, CompletionResponseGen, LLMMetadata)
from llama_index.legacy.embeddings import HuggingFaceEmbedding

from typing import Any
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel

from modelscope import snapshot_download

# Restrict this process to GPU 0 when CUDA enumerates devices.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [2]:
# Resolve the m3e-base snapshot through ModelScope, then wrap that path
# in a HuggingFaceEmbedding for use as the embedding model.
emb_path = snapshot_download('jieshenai/m3e-base')
embedding_model = HuggingFaceEmbedding(model_name=emb_path)

  from .autonotebook import tqdm as notebook_tqdm




In [7]:
# Name reported through the LLM metadata, and the ModelScope snapshot
# location that the tokenizer and model are loaded from.
model_name = "chatglm3-6b"
model_path = snapshot_download('ZhipuAI/chatglm3-6b')

# trust_remote_code=True is passed to both loaders
# (presumably because the checkpoint ships its own modelling code -- confirm).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the model, cast to half precision, move it to the GPU and switch
# to evaluation mode in a single expression.
model = (
    AutoModel.from_pretrained(model_path, trust_remote_code=True)
    .half()
    .cuda()
    .eval()
)

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00,  1.07it/s]


In [14]:
# Limits reported to llama_index through the custom LLM's metadata.
context_window: int = 2048  # context window size
num_output: int = 256  # number of output tokens


class ChatGML(CustomLLM):
    """llama_index LLM adapter around the notebook-level chat ``model``.

    The class keeps no state of its own: each call reads the module-level
    ``model``, ``tokenizer``, ``model_name``, ``context_window`` and
    ``num_output`` defined in earlier cells.
    """

    @property
    def metadata(self) -> LLMMetadata:
        """Return the LLM metadata built from the notebook-level settings."""
        settings = dict(
            context_window=context_window,
            num_output=num_output,
            model_name=model_name,
        )
        return LLMMetadata(**settings)

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Send ``prompt`` to the model as a single chat turn and wrap the reply.

        Extra keyword arguments are accepted but not forwarded to the model.
        """
        # model.chat yields a pair; every call starts from an empty history
        # and the second element (presumably the updated history) is dropped.
        reply, _unused = model.chat(tokenizer, prompt, history=[])
        return CompletionResponse(text=reply)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Streaming is not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError()

# Adapter instance that is later passed to llama_index as its LLM.
llm_model = ChatGML()

In [10]:
from llama_index.core import SimpleDirectoryReader

# Read the files found under the relative 'data' directory.
documents = SimpleDirectoryReader(input_dir='data').load_data()

In [None]:
from llama_index.core import ServiceContext

In [15]:
# Bundle the custom LLM and the local embedding model into one context object.
# NOTE(review): ServiceContext appears to be deprecated in recent llama-index
# releases in favour of `Settings` -- confirm before upgrading.
service_context = ServiceContext.from_defaults(
    embed_model=embedding_model,
    llm=llm_model,
)

  service_context = ServiceContext.from_defaults(llm=llm_model, embed_model=embedding_model)


In [16]:
# Bare expression: shows the assembled context's repr as the cell output.
service_context

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=2048, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=HuggingFaceEmbedding(model_name='models/m3e-base', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7efc9753d2d0>, tokenizer_name='models/m3e-base', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7efc9753d2d0>, id_func=<function default_id_func at 0x7efd8533f2e0>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')], llama_logger=<l

In [18]:
# Build a vector index over the loaded documents, using the LLM and
# embedding model carried by service_context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

# Obtain a query engine from the index with its default settings.
query_engine = index.as_query_engine()

# Ask one question about the indexed documents and print the answer.
response = query_engine.query("少女感激不已，送给小风一件神奇的礼物是什么？")
print(response)

少女送给小风的神奇礼物是一把能够召唤风的力量的魔法扇。
