In [21]:
from llama_index.llms.ollama import Ollama
from llama_index.core import SimpleDirectoryReader
from  llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import VectorStoreIndex

In [22]:
#model is loaded from ollama model phi3.5
llm = Ollama(model="phi3.5", request_timeout=600)

In [25]:
#setting up the embedding model from huggingface
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-large-en-v1.5"
)

In [24]:
#loading the data
documents = SimpleDirectoryReader("./data").load_data()
print(documents)

[Document(id_='7d5baed2-d74f-4a07-a3eb-6bd433e0277c', embedding=None, metadata={'page_label': '1', 'file_name': 'cheatsheet-unsupervised-learning.pdf', 'file_path': 'c:\\Users\\ziade\\Desktop\\eeeeee\\data\\cheatsheet-unsupervised-learning.pdf', 'file_type': 'application/pdf', 'file_size': 456557, 'creation_date': '2024-09-11', 'last_modified_date': '2024-09-11'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='CS 229 – Machine Learning https://stanford.edu/~shervine\nVIP Cheatsheet: Unsupervised Learning\nAfshineAmidi and Shervine Amidi\nSeptember 9, 2018\nIntroduction to Unsupervised Learning\nÌMotivation – The goal of unsupervised learning is to ﬁnd hidden patterns in unlabeled data\n{x(1),...,x(m)}.\nÌJensen’s inequality – Letfbe a convex function an

In [26]:
#splitting the document into nodes
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=Settings.embed_model
)

# also baseline splitter
base_splitter = SentenceSplitter(chunk_size=512)

In [27]:
#affecting nodes variable to the splitted document
nodes = splitter.get_nodes_from_documents(documents)

In [28]:
print(nodes[1].get_content())

ÌJensen’s inequality – Letfbe a convex function and Xa random variable. We have the
following inequality:
E[f(X)]⩾f(E[X])
Expectation-Maximization
ÌLatent variables – Latent variables are hidden/unobserved variables that make estimation
problems diﬃcult, and are often denoted z. Here are the most common settings where there are
latent variables:
Setting Latent variable zx|z Comments
Mixture of kGaussians Multinomial (φ)N(µj,Σj)µj∈Rn,φ∈Rk
Factor analysis N(0,I)N(µ+ Λz,ψ)µj∈Rn
ÌAlgorithm – The Expectation-Maximization (EM) algorithm gives an eﬃcient method at
estimating the parameter θthrough maximum likelihood estimation by repeatedly constructing
a lower-bound on the likelihood (E-step) and optimizing that lower bound (M-step) as follows:
•E-step: Evaluate the posterior probability Qi(z(i))that each data point x(i)came from
a particular cluster z(i)as follows:
Qi(z(i)) =P(z(i)|x(i);θ)
•M-step: Use the posterior probabilities Qi(z(i))as cluster speciﬁc weights on data points
x(i)to sepa

In [29]:
#indexing nodes using vector store index
index = VectorStoreIndex(nodes)

In [30]:
#stored the indexed nodes in the storage context for later use to
# avoid reindexing
index.storage_context.persist(persist_dir=".\emb_storage")

'\nreloading sample code\nfrom llama_index.core import StorageContext, load_index_from_storage\n\n# rebuild storage context\nstorage_context = StorageContext.from_defaults(persist_dir="<persist_dir>")\n\n# load index\nindex = load_index_from_storage(storage_context)\n'

In [32]:
#used to override the default retriever which is from OPENAI
Settings.llm = llm

In [33]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)
# configure response synthesizer
response_synthesizer = get_response_synthesizer()

In [47]:
# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.6)],
)

In [48]:
response = query_engine.query("what are the types of hierarchichal clustering")
print(response)