## Libraries

In [1]:
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
#import llama_index.core
#llama_index.core.set_global_handler("simple")

Loading LLM

In [4]:

#from llama_index.core import set_global_tokenizer
#from transformers import AutoTokenizer

#set_global_tokenizer(
    #AutoTokenizer.from_pretrained("mistralai/mistral-7b-instruct-v0.2").encode
#)

In [5]:

llm = LlamaCPP(
    model_path="model\stablelm-zephyr-3b.Q5_K_M.gguf",
    temperature=0.2,
    max_new_tokens=1024,
    context_window=4096,
    #generate_kwargs={"stop":['<|endoftext|>']},
    model_kwargs={"n_gpu_layers": -1},  # if compiled to use GPU
    verbose=True,
)

llama_model_loader: loaded meta data with 21 key-value pairs and 356 tensors from model\stablelm-zephyr-3b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = stablelm
llama_model_loader: - kv   1:                               general.name str              = source
llama_model_loader: - kv   2:                    stablelm.context_length u32              = 4096
llama_model_loader: - kv   3:                  stablelm.embedding_length u32              = 2560
llama_model_loader: - kv   4:                       stablelm.block_count u32              = 32
llama_model_loader: - kv   5:               stablelm.feed_forward_length u32              = 6912
llama_model_loader: - kv   6:              stablelm.rope.dimension_count u32              = 20
llama_model_loader: - kv   7:              stablelm.attention.head_count u3

In [6]:
llm.complete("What is the Capital of Malaysia?")


llama_print_timings:        load time =    2020.07 ms
llama_print_timings:      sample time =     118.06 ms /   358 runs   (    0.33 ms per token,  3032.25 tokens per second)
llama_print_timings: prompt eval time =    2020.00 ms /     8 tokens (  252.50 ms per token,     3.96 tokens per second)
llama_print_timings:        eval time =    7361.64 ms /   357 runs   (   20.62 ms per token,    48.49 tokens per second)
llama_print_timings:       total time =   11280.91 ms /   365 tokens


CompletionResponse(text='\nThe capital city of Malaysia is Kuala Lumpur (also known as KL). It is located in the central region of Peninsular Malaysia, on the east bank of the Selangor River.\n\nKuala Lumpur serves as both the political and administrative center of the country, housing numerous government institutions such as the Prime Minister\'s office, the Parliament building, and the Sultan\'s palace. It is also home to several international organizations with their regional headquarters located in the city, including the International Monetary Fund (IMF) and the World Bank (WB).\n\nThe name "Kuala Lumpur" itself translates to "Fire Dragon Portage" in Malay, while there are different stories about how it got its nickname. One popular legend suggests that the name was given by a Chinese trader who saw a fire dragon during an early morning ritual performed by local natives. Another story claims that the name was derived from the Malay words for "muddy portage," reflecting the city\'s

In [7]:
Settings.llm = llm

## Creating Ingestion Pipelne

Loading Local Embedding Model

In [8]:
embed_model = HuggingFaceEmbedding(
    model_name="Snowflake/snowflake-arctic-embed-s"
)
Settings.embed_model = embed_model

You try to use a model that was created with version 2.7.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Loading Documents

In [10]:
documents = SimpleDirectoryReader(input_dir='papers').load_data()

In [11]:
import chromadb
from llama_index.core import Document
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import StorageContext

# save/load to disk
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("CEM5011")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)


In [12]:
from llama_index.core.node_parser import SentenceSplitter

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=1000, chunk_overlap=200),
    ],
    vector_store=vector_store,
)


In [13]:
pipeline.run(documents=documents)

[TextNode(id_='a5d6f0e5-cd71-4973-ab30-360008dc52a2', embedding=None, metadata={'page_label': '1', 'file_name': '2022071815345624_MJMHS_1300.pdf', 'file_path': 'e:\\Github Repositories\\RAGTests\\papers\\2022071815345624_MJMHS_1300.pdf', 'file_type': 'application/pdf', 'file_size': 133091, 'creation_date': '2024-05-10', 'last_modified_date': '2024-05-05'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d482d03d-45c5-41d1-8dbb-1198133c7988', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': '2022071815345624_MJMHS_1300.pdf', 'file_path': 'e:\\Github Repositories\\RAGTests\\papers\\2022071815345624_MJMHS_1300.pdf', 'file_type': 'application/pdf', 'file_size': 133091, 'creation_d

In [14]:
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

rerank = FlagEmbeddingReranker(model="mixedbread-ai/mxbai-embed-large-v1", top_n=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mixedbread-ai/mxbai-embed-large-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, 
)

## Retrieval

In [16]:
query_engine = index.as_query_engine(streaming=True,similarity_top_k=7)

Setting Prompts


In [21]:
qa_prompt_template_str = """<|user|>Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query based on the context given ONLY else you will be penalized. Provide your answer in an easy to read and understand format.
Further elaborate your answer by finding examples or information within the context if possible.
Query: {query_str}
Answer:
<|endoftext|>
<|assistant|>
"""

qa_prompt_template = PromptTemplate(qa_prompt_template_str)
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template":qa_prompt_template}
)

In [23]:
from IPython.display import Markdown, display
response = query_engine.query("Examples of ML techniques to combat COVID-19")
display(Markdown(f"<b>{response.print_response_stream()}</b>"))

Llama.generate: prefix-match hit

llama_print_timings:        load time =    2020.07 ms
llama_print_timings:      sample time =     122.46 ms /   372 runs   (    0.33 ms per token,  3037.63 tokens per second)
llama_print_timings: prompt eval time =    6137.80 ms /  3300 tokens (    1.86 ms per token,   537.65 tokens per second)
llama_print_timings:        eval time =   13805.35 ms /   371 runs   (   37.21 ms per token,    26.87 tokens per second)
llama_print_timings:       total time =   22087.10 ms /  3671 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    2020.07 ms
llama_print_timings:      sample time =     122.09 ms /   372 runs   (    0.33 ms per token,  3046.98 tokens per second)
llama_print_timings: prompt eval time =    5342.26 ms /  3336 tokens (    1.60 ms per token,   624.45 tokens per second)
llama_print_timings:        eval time =   13829.32 ms /   371 runs   (   37.28 ms per token,    26.83 tokens per second)
llama_print_timings:       to


Examples of machine learning (ML) techniques used to combat COVID-19 include:
1. Deep learning and alternative learning strategies for retrospective real-world clinical data, such as the study by Chen et al. (2019). This technique has been applied in predicting disease progression, diagnosing and treating patients with COVID-19.
2. Computational epidemiology (CE) techniques, which use ML algorithms to analyze large datasets and predict the spread of the virus, track disease trajectories, and assess risk factors for severe cases or infection. For instance, a study by Chowdhury et al. (2020) compared dynamic interventions across 16 countries during the COVID-19 pandemic using a multivariate prediction modeling approach.
3. Natural language processing (NLP) techniques, which involve using ML algorithms to analyze large volumes of unstructured data from social media, news articles, and other sources to understand public sentiment and behavior related to the pandemic. An example is the stu


llama_print_timings:        load time =    2020.07 ms
llama_print_timings:      sample time =     127.42 ms /   372 runs   (    0.34 ms per token,  2919.55 tokens per second)
llama_print_timings: prompt eval time =    2431.75 ms /  1602 tokens (    1.52 ms per token,   658.79 tokens per second)
llama_print_timings:        eval time =   11194.66 ms /   371 runs   (   30.17 ms per token,    33.14 tokens per second)
llama_print_timings:       total time =   16169.92 ms /  1973 tokens


<b>None</b>

Evaluation

In [None]:
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# download and install dependencies for benchmark dataset
rag_dataset, documents = download_llama_dataset(
    "OriginOfCovid19Dataset", "./data"
)