In [None]:
import torch
from unsloth import FastLanguageModel

# Loader settings handed to FastLanguageModel.from_pretrained.
max_seq_length = 2048  # Free to change; per Unsloth, RoPE scaling is handled internally.
dtype = None           # None = auto-detect (float16 on Tesla T4/V100, bfloat16 on Ampere+).
load_in_4bit = True    # 4-bit quantization to lower memory use; may be set to False.


In [None]:
# Load the model that was used for training (stored at ../lora_model) together
# with its tokenizer, using the loader settings defined in the config cell.
pretrained_options = dict(
    model_name="../lora_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_options)

# Prepare the loaded model for generation via Unsloth's for_inference hook.
FastLanguageModel.for_inference(model)

In [None]:
from llama_index.llms.huggingface import HuggingFaceLLM

# Wrap the already-loaded model/tokenizer pair so LlamaIndex can call it as an LLM.
llm = HuggingFaceLLM(
    # NOTE(review): the model was loaded with max_seq_length = 2048 while the
    # window advertised here is 4096 — confirm the model really accepts 4096.
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only influences sampling, so passing it
    # together with do_sample=False had no effect on the output and merely
    # triggered a transformers warning; it has been removed.
    generate_kwargs={"do_sample": False},
    device_map="auto",
    # NOTE(review): hard-coded stop-token ids are tokenizer-specific — confirm
    # they correspond to end/stop tokens in this tokenizer's vocabulary.
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch.float16},
    model=model,
    tokenizer=tokenizer,
)

In [None]:
from datasets import load_dataset

# Fetch the "train" split of the patient-information dataset used as the RAG corpus.
document = load_dataset(
    "xDAN-datasets/medical_meadow_wikidoc_patient_information_6k",
    split="train",
)
document  # last expression: show the dataset summary as the cell output

In [None]:
import os

# Make sure the target directory exists before exporting; on a fresh checkout
# ./dataset may be missing and the write would otherwise fail.
os.makedirs("./dataset", exist_ok=True)

# Export the dataset as CSV so the directory reader can ingest it later.
document.to_csv("./dataset/rag_data.csv")

In [5]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Load the contents of ./dataset into LlamaIndex documents.
dataset_reader = SimpleDirectoryReader("./dataset")
documents = dataset_reader.load_data()

In [6]:
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# `documents` is already loaded from ./dataset by the preceding cell, so the
# duplicate SimpleDirectoryReader(...).load_data() call was dropped here.

# Embedding model used to vectorize document chunks and queries.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [7]:
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter

# Register global LlamaIndex defaults: the wrapped LLM, the embedding model,
# and 1024-sized chunks produced by a sentence splitter.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024
Settings.transformations = [SentenceSplitter(chunk_size=1024)]

In [None]:
# Build a vector index over the loaded documents and get its default query engine.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Baseline answer (no query transform), rendered in bold.
query_str = "What causes Alstrom syndrome?"
response = query_engine.query(query_str)
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

In [14]:
# Route the same question through a HyDE query transform layered on top of the
# baseline query engine; include_original=True is passed to the transform.
hyde = HyDEQueryTransform(include_original=True)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)

# The result is stored but not displayed in this cell.
response = hyde_query_engine.query(query_str)

In [29]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever

# Build the index with the embedding model and transformations passed explicitly.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    transformations=Settings.transformations,
)

# Assemble a query engine by hand: a retriever requesting the 2 most similar
# nodes, paired with the default response synthesizer.
vector_retriever = VectorIndexRetriever(index=index, similarity_top_k=2)
response_synthesizer = get_response_synthesizer()
vector_query_engine = RetrieverQueryEngine(
    retriever=vector_retriever,
    response_synthesizer=response_synthesizer,
)

In [None]:
query_str = "What causes Alstrom syndrome?"

# Apply the HyDE transform on top of the hand-built retriever query engine.
hyde = HyDEQueryTransform(include_original=True)
hyde_query_engine = TransformQueryEngine(vector_query_engine, hyde)

# Run the query and render the answer in bold.
response = hyde_query_engine.query(query_str)
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))