In [None]:
!pip install torch langchain sentence-transformers llama-index llama-cpp-python

## Конфигурационные параметры

In [None]:
import os

# Filesystem locations. Each one can be overridden with an environment variable
# so the notebook is not tied to a single machine; the defaults are the
# original hard-coded values.
data_dir = os.environ.get('RAG_DATA_DIR', '/home/brarrow/repos/memes/data')  # documents to index
weights_dir = os.environ.get('RAG_WEIGHTS_DIR', '/models')  # directory holding the LLM weights
llm_weights_filename = os.environ.get('RAG_LLM_WEIGHTS', 'openchat_3.5.Q4_K_M.gguf')

# Generation settings handed to the LLM wrapper.
temp = 0.0  # sampling temperature
max_new_tokens = 2190
n_ctx = 8192  # the LLM wrapper receives context_window = n_ctx - 200

# Indexing / retrieval settings.
node_ctx = 1024  # chunk_size used when building the service context
similarity_topk = 1  # similarity_top_k used for retrieval

In [None]:
# System instruction (in Russian): the assistant answers user questions about
# the files in the database using the passages supplied as context, and must
# replace profanity with asterisks. The trailing space is part of the value.
DEFAULT_INSTRUCTION = (
    'Ты умный ассистент, отвечающий на вопросы пользователя о файлах, содержащихся в базе данных. '
    'Полезные для ответа части текста из файлов будут поданы в контексте. '
    'Используй их, чтобы дать точный и полный ответ на вопрос пользователя. '
    'Не матерись. Все матерные слова заменяй на звездочки. '
)
# Bare expression: shows the assembled instruction as the cell output.
DEFAULT_INSTRUCTION

## Обвязка для используемой LLM

In [None]:
from typing import Any, List, Optional, Sequence
from llama_cpp.llama_chat_format import (ChatFormatterResponse, _format_chatml,
                                         _get_system_message, _map_roles,
                                         llama_types)
from llama_cpp.llama_types import (ChatCompletionRequestAssistantMessage,
                                   ChatCompletionRequestFunctionMessage,
                                   ChatCompletionRequestSystemMessage,
                                   ChatCompletionRequestToolMessage,
                                   ChatCompletionRequestUserMessage)
from llama_index import PromptTemplate
from llama_index.llms.types import ChatMessage, MessageRole

# Lookup table: llama_index chat role -> llama_cpp request-message type used to
# rebuild the message. Roles missing from this table are rejected by
# messages_to_prompt.
LLAMA_INDEX_TO_CPP_MAPPING = dict(
    (
        (MessageRole.SYSTEM, ChatCompletionRequestSystemMessage),
        (MessageRole.USER, ChatCompletionRequestUserMessage),
        (MessageRole.ASSISTANT, ChatCompletionRequestAssistantMessage),
        (MessageRole.FUNCTION, ChatCompletionRequestFunctionMessage),
        (MessageRole.TOOL, ChatCompletionRequestToolMessage),
    )
)

def format_openchat(
    messages: List[llama_types.ChatCompletionRequestMessage],
    **kwargs: Any,
) -> ChatFormatterResponse:
    """Build an OpenChat-style prompt from llama_cpp chat messages.

    The system message is extracted with ``_get_system_message``; user and
    assistant turns are prefixed with the "GPT4 Correct ..." role strings and
    an open assistant turn (content ``None``) is appended so the model
    continues from there. Assembly is delegated to llama_cpp's
    ``_format_chatml`` with ``<|end_of_turn|>`` as separator.

    Args:
        messages: Conversation in llama_cpp request-message form.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        A ``ChatFormatterResponse`` carrying the prompt, with the separator
        token as the stop sequence.
    """
    end_of_turn = "<|end_of_turn|>"
    role_prefixes = {
        "user": "GPT4 Correct User: ",
        "assistant": "GPT4 Correct Assistant: ",
    }

    # The template adds nothing around the system message; it is kept as an
    # explicit formatting step.
    system_text = "{system_message}".format(
        system_message=_get_system_message(messages)
    )

    turns = _map_roles(messages, role_prefixes)
    turns.append((role_prefixes["assistant"], None))  # open turn for the reply

    return ChatFormatterResponse(
        prompt=_format_chatml(system_text, turns, end_of_turn),
        stop=end_of_turn,
    )

def messages_to_prompt(
    messages: Sequence[ChatMessage], system_prompt: Optional[str] = None
) -> str:
    """Render llama_index chat messages as a single OpenChat prompt string.

    Each message is converted to the llama_cpp request-message type registered
    for its role in ``LLAMA_INDEX_TO_CPP_MAPPING`` and the result is formatted
    by ``format_openchat``.

    Args:
        messages: Chat history to render.
        system_prompt: Optional system instruction. It is inserted as the
            first message only when it is non-empty and ``messages`` contains
            no system message of its own; with the default ``None`` the
            messages are rendered as given.

    Returns:
        The ``prompt`` string produced by ``format_openchat``.

    Raises:
        ValueError: If a message has a role that is not present in
            ``LLAMA_INDEX_TO_CPP_MAPPING``.
    """
    llama_cpp_messages = []
    has_system_message = False
    for message in messages:
        message_class = LLAMA_INDEX_TO_CPP_MAPPING.get(message.role)
        if message_class is None:
            raise ValueError(f"Invalid message role: {message.role}")

        if message.role == MessageRole.SYSTEM:
            has_system_message = True
        llama_cpp_messages.append(
            message_class(role=message.role.value, content=message.content)
        )

    # Honour an explicitly supplied system prompt instead of dropping it, but
    # never override a system message that is already part of the history.
    if system_prompt and not has_system_message:
        llama_cpp_messages.insert(
            0,
            ChatCompletionRequestSystemMessage(
                role=MessageRole.SYSTEM.value, content=system_prompt
            ),
        )

    return format_openchat(llama_cpp_messages).prompt

def completion_to_prompt(completion: str, system_prompt: Optional[str] = None) -> str:
    """Prefix a completion request with the system instruction.

    Args:
        completion: Text to send to the model; surrounding whitespace is
            stripped.
        system_prompt: System instruction to use. When ``None`` or empty,
            ``DEFAULT_INSTRUCTION`` is used instead. Surrounding whitespace
            is stripped.

    Returns:
        The stripped instruction and the stripped completion, joined by the
        ``<|end_of_turn|>`` token.
    """
    instruction = system_prompt if system_prompt else DEFAULT_INSTRUCTION
    return "<|end_of_turn|>".join((instruction.strip(), completion.strip()))

# Template that wraps a raw query ({query_str}) as a single OpenChat turn:
# user prefix + query + end-of-turn token, followed by the assistant prefix so
# the model continues with its answer.
query_wrapper_prompt = PromptTemplate(
    "GPT4 Correct User: {query_str}<|end_of_turn|>GPT4 Correct Assistant: "
)

## Загрузка LLM и модели эмбеддингов

In [None]:
import os

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.llms import LlamaCPP
from llama_index import ServiceContext

In [None]:
# Embedding model used for indexing and retrieval; it is placed on the first
# CUDA device.
embed_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs=dict(device="cuda:0"),
)

In [None]:
# Load the model weights through llama.cpp and plug in the OpenChat prompt
# builders defined in this notebook.
llm = LlamaCPP(
    model_path=os.path.join(weights_dir, llm_weights_filename),
    # Prompt construction callbacks.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # Generation settings.
    temperature=temp,
    max_new_tokens=max_new_tokens,
    # NOTE(review): the 200-token reduction is presumably headroom for the
    # prompt wrapper — confirm the intended margin.
    context_window=n_ctx - 200,
    generate_kwargs=dict(),
    # NOTE(review): presumably offloads (up to) 100 layers to the GPU — confirm.
    model_kwargs=dict(n_gpu_layers=100),
    verbose=False,
)

In [None]:
# Bundle the LLM, the embedding model, the chunk size and the query wrapper
# into one service context shared by the index.
service_context = ServiceContext.from_defaults(
    chunk_size=node_ctx,
    query_wrapper_prompt=query_wrapper_prompt,
    embed_model=embed_model,
    llm=llm,
)


## Создание индекса

In [None]:
from llama_index.readers import SimpleDirectoryReader
from llama_index.memory import ChatMemoryBuffer
from llama_index import VectorStoreIndex

# Load the documents found under data_dir.
loader = SimpleDirectoryReader(input_dir=data_dir)
docs = loader.load_data()

# Build the vector index over the loaded documents, showing progress.
index = VectorStoreIndex.from_documents(
    docs,
    show_progress=True,
    service_context=service_context,
)


## Создание ретривера, памяти и чат-движка

In [None]:
# Standalone retriever over the index, limited to `similarity_topk` results.
retriever = index.as_retriever(similarity_top_k=similarity_topk)

# Conversation history buffer, capped at 4500 tokens.
memory = ChatMemoryBuffer.from_defaults(token_limit=4500)

# Context-mode chat engine: retrieved passages are supplied to the LLM together
# with the system instruction and the chat history.
chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    system_prompt=DEFAULT_INSTRUCTION,
    # The engine builds its own retriever and does not use `retriever` above,
    # so the configured top-k has to be forwarded here; otherwise
    # `similarity_topk` has no effect on the chat answers.
    similarity_top_k=similarity_topk,
)

## Ответы на вопросы по тексту

In [None]:
# Ask the chat engine a question and print the answer with a line break after
# every period.
resp_obj = chat_engine.chat("Что ты знаешь о ...?")
print(".\n".join(str(resp_obj.response).split(".")))