Referensi
URL 1: https://docs.llamaindex.ai/en/stable/
URL 2: https://python.langchain.com/v0.1/docs/get_started/introduction

In [None]:
import os
from typing import Sequence, Optional
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
    set_global_service_context
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms import ChatMessage


In [None]:
# GGUF weights fetched from Hugging Face (Q6_K quantization, roughly 6 GB).
model_url = (
    "https://huggingface.co/bartowski/llama-3-neural-chat-v1-8b-GGUF"
    "/resolve/main/"
    "llama-3-neural-chat-v1-8b-Q6_K.gguf"
)

In [None]:
def messages_to_prompt(
    messages: Sequence[ChatMessage],
    system_prompt: Optional[str] = None
) -> str:
    """Render chat messages as a ``<|role|>`` / ``</s>`` delimited prompt.

    Each message becomes ``<|role|>\\n<content></s>\\n`` and the prompt always
    ends with an open ``<|assistant|>\\n`` turn for the model to complete.

    Args:
        messages: Messages to render, in conversation order. Each item must
            expose ``role`` and ``content`` attributes.
        system_prompt: Optional system text. It is emitted as a leading
            ``<|system|>`` turn only when ``messages`` contains no system
            message of its own; ``None`` (the default) adds nothing.

    Returns:
        The fully formatted prompt string.
    """
    turns = []
    has_system_message = False
    for message in messages:
        # Enum roles are rendered by their value: formatting a str-mixin Enum
        # member directly can yield "ClassName.MEMBER" on newer Python
        # versions instead of the plain role name.
        role = str(getattr(message.role, "value", message.role))
        if role == "system":
            has_system_message = True
        # A message without content must not inject the literal text "None".
        content = "" if message.content is None else message.content
        turns.append(f"<|{role}|>\n{content}</s>\n")

    if system_prompt is not None and not has_system_message:
        turns.insert(0, f"<|system|>\n{system_prompt}</s>\n")

    turns.append("<|assistant|>\n")
    return "".join(turns)

def completion_to_prompt(completion):
    """Wrap a bare completion as one user turn after an empty system turn.

    The returned prompt ends with an open ``<|assistant|>`` turn.
    """
    system_turn = "<|system|>\n</s>\n"
    user_turn = f"<|user|>\n{completion}</s>\n"
    assistant_open = "<|assistant|>\n"
    return system_turn + user_turn + assistant_open

In [None]:
llm = LlamaCPP(
    # Model source: the GGUF file is fetched from `model_url`; no
    # pre-downloaded local path is supplied.
    model_url=model_url,
    model_path=None,
    # Generation settings: near-deterministic sampling, at most 256 new
    # tokens per answer, 4096-token context window.
    temperature=0.1,
    max_new_tokens=256,
    context_window=4096,
    # Prompt formatting callbacks defined earlier in this notebook.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # Extra keyword arguments: `generate_kwargs` go to __call__(),
    # `model_kwargs` go to __init__(). n_gpu_layers must be at least 1 for
    # the GPU to be used.
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

In [None]:
# Sentence splitter: chunks of at most 1024 tokens, with 250 tokens of
# overlap between consecutive chunks.
text_splitter = SentenceSplitter(chunk_overlap=250, chunk_size=1024)

# Load the source document from disk.
raw_text = SimpleDirectoryReader(
    input_files=["dataset/Data1.txt"],
).load_data()

# Split the loaded document(s) into chunks/nodes.
chunks = text_splitter.get_nodes_from_documents(documents=raw_text)

In [None]:
# Choose the embedding device at runtime: use the GPU when CUDA is available,
# otherwise fall back to the CPU. (Note that `"cuda" or "cpu"` is not a
# fallback: that expression always evaluates to "cuda".)
import torch

embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": embedding_device},
)

# Register the LLM and the embedding model as the global defaults.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding)
set_global_service_context(service_context)

In [None]:
# Create the vector store index from the nodes produced by `text_splitter`
# (`chunks`), so the configured chunk size and overlap are the ones actually
# indexed. Indexing `raw_text` directly would re-split the documents with the
# default settings and leave `chunks` unused.
index = VectorStoreIndex(nodes=chunks, embed_model=embedding)

In [None]:
# Give the index a stable id, then write its storage context to disk so it
# can be reloaded later under that id.
index.set_index_id(index_id="vector_index")
index.storage_context.persist(persist_dir="./database/data1")

In [None]:
# Reopen the persisted storage directory and load the index that was saved
# under the id "vector_index".
storage_context = StorageContext.from_defaults(persist_dir="database/data1")
index = load_index_from_storage(
    storage_context=storage_context,
    index_id="vector_index",
)

In [None]:
# Alternative query engine (disabled): single best match, MMR retrieval.
# Kept as comments rather than a bare string literal so the cell produces no
# output, and with the keyword spelled `similarity_top_k`.
# query_engine = index.as_query_engine(
#     llm=llm,
#     similarity_top_k=1,
#     streaming=True,
#     vector_store_query_mode="mmr",
#     vector_store_kwargs={"mmr_threshold": 0.2},
# )

In [None]:
# Build the streaming query engine used below. Retrieval runs in the "mmr"
# vector-store query mode with an MMR threshold of 0.2.
query_engine = index.as_query_engine(
    llm=llm,
    streaming=True,
    vector_store_query_mode="mmr",
    vector_store_kwargs={"mmr_threshold": 0.2},
)

In [None]:
# Alternative query engine (disabled): filter matches by similarity score.
# Kept as comments rather than a bare string literal so the cell produces no
# output.
# NOTE(review): the keyword below is misspelled ("similiarity") and does not
# look like an option as_query_engine acts on. A score cutoff is normally set
# with a SimilarityPostprocessor(similarity_cutoff=0.8) passed through
# `node_postprocessors` -- confirm against the LlamaIndex docs before enabling.
# query_engine = index.as_query_engine(llm=llm, similiarity_threshold=0.8, streaming=True,)

In [None]:
# Example query (Indonesian): "In which season did Messi score the most goals?"
response = query_engine.query(
    "pada musim berapa messi mencetak banyak goal?"
)

In [None]:
# Print the streamed answer to the first query.
response.print_response_stream()

Kueri kedua (response2) bersifat opsional.

In [None]:
# Optional second query (Indonesian): "In which season did Mbappe score the most goals?"
response2 = query_engine.query(
    "pada musim berapa mbappe mencetak banyak goal?"
)

In [None]:
# Print the streamed answer to the optional second query.
response2.print_response_stream()