In [1]:
import langchain
import numpy as np
from langchain_together import Together
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter


# PDF Loader

In [2]:
# PyPDFLoader lives in langchain_community (already used elsewhere in this notebook);
# the `langchain.document_loaders` path is a deprecated shim.
from langchain_community.document_loaders import PyPDFLoader

# One loader per lecture PDF. Paths are relative to the notebook's working directory.
loaders = [
    PyPDFLoader("MachineLearning-Lecture01.pdf"),
    PyPDFLoader("MachineLearning-Lecture02.pdf"),
    PyPDFLoader("MachineLearning-Lecture03.pdf"),
]

# Flatten the per-file results into a single list of Documents.
# NOTE(review): later cells rebind `docs` to search results, so re-run this cell
# before re-running the splitter.
docs = [document for loader in loaders for document in loader.load()]

In [3]:
# Import from langchain_text_splitters, the same package the imports cell at the top
# already uses, instead of the deprecated `langchain.text_splitter` shim.
from langchain_text_splitters import CharacterTextSplitter

# Chunking parameters: target chunk size of 1000 with an overlap of 100 between
# neighbouring chunks so context is not lost at the boundaries.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# Split every loaded document into chunks; metadata is carried over per chunk.
splits = text_splitter.split_documents(docs)

# Embeddings

In [31]:
import os

from langchain_together.embeddings import TogetherEmbeddings

# Alternative local backend, kept for reference:
# from langchain_community.embeddings import OllamaEmbeddings
# embeddings = OllamaEmbeddings()

# SECURITY: never hard-code credentials in a notebook. The key is read from the
# TOGETHER_API_KEY environment variable; a missing variable raises KeyError right
# here instead of failing later with an opaque authentication error.
# Any key that was previously committed in this cell must be revoked and rotated.
embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-8k-retrieval",
    together_api_key=os.environ["TOGETHER_API_KEY"],
)

In [75]:
# Three probe sentences: the first two are near-paraphrases, the third is unrelated.
text1, text2, text3 = (
    "dogs are good",
    "dogs are better",
    "israel is a country",
)

In [78]:
from scipy.spatial.distance import cosine

# Embed the three probe sentences, in order, with the configured embedding model.
emb_1, emb_2, emb_3 = (
    embeddings.embed_query(sentence) for sentence in (text1, text2, text3)
)

# scipy's `cosine` is a distance, so similarity is its complement.
cosine_similarity_1 = 1 - cosine(emb_1, emb_2)  # text1 vs. text2
cosine_similarity_2 = 1 - cosine(emb_1, emb_3)  # text1 vs. text3

In [81]:
# Display the similarity of text1 vs. text3 (computed from emb_1 and emb_3 in the previous cell).
cosine_similarity_2

0.28266973618505165

# VectorStore

FAISS Vector Store

In [58]:
from langchain_community.vectorstores import FAISS

In [54]:
# Embed every chunk in `splits` with `embeddings` and build a FAISS store from them.
vectordb = FAISS.from_documents(splits, embeddings)

In [57]:
# Directory (relative to the working directory) where the FAISS index is persisted.
folder_path = "docs/faiss/"
vectordb.save_local(folder_path)

In [8]:
# Reload the FAISS index saved above.
#
# Queries must be embedded with the same model that built the index, so the existing
# `embeddings` object is reused here. Rebinding `embeddings` to OllamaEmbeddings (as
# this cell used to do) would not match the index built with `embeddings` above and
# would silently change the model used by every later cell.
# from langchain_community.embeddings import OllamaEmbeddings
# embeddings = OllamaEmbeddings()
loaded_vectordb = FAISS.load_local(
    folder_path=folder_path,
    # `embeddings` is a required argument; omitting it raises a TypeError.
    embeddings=embeddings,
    # The saved store includes a pickle file, and loading one can execute arbitrary
    # code. Enable this only for index files you created yourself.
    allow_dangerous_deserialization=True,
)

Chroma Vector Store

In [73]:
from langchain_community.vectorstores import Chroma
# On-disk location of the Chroma collection used by the rest of the notebook.
# Previously used locations: 'docs/chroma/file_n', 'docs/chroma/vector-kaggle'.
persist_directory = "docs/chroma/CSLectures"

In [74]:
# vectordb = Chroma.from_documents(
#     documents=splits,
#     embedding=embeddings,
#     persist_directory=persist_directory
# )
# vectordb.persist()

In [75]:
# Open the collection already persisted at `persist_directory` rather than rebuilding it.
# `embeddings` is used to embed queries against that collection.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

In [76]:
# Sanity check: number of entries in the underlying collection.
# NOTE(review): `_collection` is a private attribute and may change between library versions.
print(vectordb._collection.count())

56


# Similarity_Search + Max_Marginal_Relevance_Search
``Similarity-search`` prioritizes finding the closest documents

``MMR-search`` focuses on retrieving a set of documents that are both relevant and diverse, covering different aspects of the query

**Similarity Search**: Use this when you simply want the most similar documents to the query, regardless of their redundancy. It's faster and easier to implement.

**Maximal Marginal Relevance (MMR) Search**: Use this when you need a diverse set of documents that cover different aspects of the query. This is particularly helpful for tasks like document summarization or generating informative responses.

In [9]:
# Query string reused by the retrieval cells that follow.
question = "what is Convolutional Neural Networks"

In [10]:
# Plain nearest-neighbour retrieval: the three chunks most similar to the query.
docs = vectordb.similarity_search(query=question, k=3)

In [169]:
# Restrict retrieval to a single lecture through a metadata filter.
# The filter value must equal the stored `source` metadata exactly. The metadata
# printed later in this notebook shows full paths such as
# '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf', so the placeholder
# "file_1.pdf" used before would match no document.
# NOTE(review): update this path if the collection is rebuilt from a different location.
docs = vectordb.similarity_search(
    question,
    k=2,
    filter={"source": "/kaggle/input/cslectures/MachineLearning-Lecture01.pdf"},
)

In [1]:
# docs[0]
# docs[1]
# docs[2]

In [45]:
# MMR retrieval: three chunks chosen for relevance to the query and diversity among themselves.
docs = vectordb.max_marginal_relevance_search(query=question, k=3)

In [7]:
# docs[0]
# docs[1]
# docs[2]

In [9]:
# docs[0].metadata
# docs[1].metadata
# docs[2].metadata

In [23]:
from langchain_together import Together
from langchain_openai import ChatOpenAI

import os

# SECURITY: the API key is read from the TOGETHER_API_KEY environment variable and is
# never written into the notebook. Any key previously committed here must be revoked.

# Completion-style client, kept for reference:
# llm = Together(
#     model="META-LLAMA/LLAMA-2-7B-CHAT-HF",
#     together_api_key=os.environ["TOGETHER_API_KEY"],
# )

# Chat model used by every retriever and chain below. ChatOpenAI is pointed at the
# Together endpoint through `base_url`.
model = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ["TOGETHER_API_KEY"],
    model="META-LLAMA/LLAMA-3-8B-CHAT-HF",
)

# Retriever Types

https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/

### Self-Query Retriever

In [117]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [118]:
# Metadata schema shown to the self-query LLM so it can translate a question into a
# metadata filter. The `source` description must list the values actually stored in
# the collection (full file paths); the earlier placeholders "file_1.pdf",
# "file_2.pdf" and "file_3.pdf" would lead the model to filters that match nothing.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description=(
            "Path of the machine learning lecture PDF the chunk was taken from; one of "
            "`/kaggle/input/cslectures/MachineLearning-Lecture01.pdf`, "
            "`/kaggle/input/cslectures/MachineLearning-Lecture02.pdf`, "
            "`/kaggle/input/cslectures/MachineLearning-Lecture03.pdf`"
        ),
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page number within the document",
        type="integer",
    ),
]

In [119]:
# Short description of what the stored documents contain. It is handed to the
# retriever together with the metadata schema defined in the previous cell.
document_content_description = "Lectures on machine learning"

# Earlier experiment used: OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=model,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [120]:
# Retrievers are Runnables: `invoke` is the supported entry point, while
# `get_relevant_documents` is deprecated. Both return the same list of Documents.
docs = self_query_retriever.invoke(question)

In [121]:
# Number of documents returned by the self-query retriever.
len(docs)

4

In [58]:
# Show the metadata dict of each retrieved chunk, one per line.
for doc in docs:
    print(doc.metadata)

{'page': 16, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf'}
{'page': 1, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf'}
{'page': 9, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf'}
{'page': 5, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture03.pdf'}


In [31]:
# Inspect the first retrieved document.
docs[0]

Document(page_content='Clustering is a method of unsupervised learning that involves grouping a set of objects in such a \nway that objects in the same group (cluster) are more similar to each other than to those in other \ngroups.  \n• k-Means Clustering : Partitions the data into k clusters by minimizing the variance \nwithin each cluster.  \n• Hierarchical Clustering : Builds a hierarchy of clusters either through a bottom -up \n(agglomerative) or top -down (divisive) approach.  \n• DBSCAN (Density -Based Spatial Clustering of Applications with Noise) : Groups \ntogether points that are close to each other based on a distance measurement, and marks \npoints that are in low -density regions as outliers.  \nDimensionality Reduction  \nDimensionality reduction is the process of reducing the number of random variables under \nconsideration by obtaining a set of principal variables.  \n• Principal Component Analysis (PCA) : Projects the data into a lower -dimensional space \nby maximizin

### Compression

In [39]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [40]:
question = "what is supervised learning?"


def pretty_print_docs(docs):
    """Print each document's text under a numbered header, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Returns:
        None. The formatted text is written to standard output.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [41]:
# Build the document compressor from the chat model.
compressor = LLMChainExtractor.from_llm(llm=model)

In [42]:
# Wrap an MMR retriever over the vector store with the compressor defined above.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [43]:
# `invoke` replaces the deprecated `get_relevant_documents`; the result is the same
# list of Documents, printed here with the helper defined above.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)



Document 1:

NO_OUTPUT. There is no relevant part of the context that is related to the question "what is supervised learning?". The context appears to be about administrative announcements, grading assignments, and linear regression, but does not mention supervised learning.
----------------------------------------------------------------------------------------------------
Document 2:

Here are the extracted relevant parts:

So in supervised learning, this is what we 're going to do. We're given a training set, and we're going to feed our training set, compri sing our M training example, so 47 training examples, into a learning algorithm. Okay, and our algorithm then has output function that is by tradition, and for hist orical reasons, is usually de noted lower case alphabet H, and is called a hypothesis.


In [44]:
# Show the metadata dict of each compressed document, one per line.
for doc in compressed_docs:
    print(doc.metadata)

{'page': 0, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture02.pdf'}
{'page': 3, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture02.pdf'}


# Legacy Chains ~ retrievers

### - RetrievalQA
### - ConversationalRetrievalChain

https://python.langchain.com/v0.1/docs/modules/chains/

In [1]:
from langchain.chains import RetrievalQA
import chainlit as cl
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
# Conversation buffer for the conversational chain: stored under the "chat_history"
# key and returned as message objects rather than one string.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

2024-07-01 17:23:37 - Created default config file at D:\Work\NLP\Langchain\.chainlit\config.toml
2024-07-01 17:23:37 - Created default translation directory at D:\Work\NLP\Langchain\.chainlit\translations
2024-07-01 17:23:37 - Created default translation file at D:\Work\NLP\Langchain\.chainlit\translations\en-US.json


##### ConversationalRetrievalChain

In [192]:
from langchain.chains import ConversationalRetrievalChain
# Conversational QA over the lecture store: MMR retrieval, with `memory` passed in
# to hold the chat history.
qa = ConversationalRetrievalChain.from_llm(
    llm=model,
    retriever=vectordb.as_retriever(search_type="mmr"),
    memory=memory,
    # Alternative retrievers:
    #   retriever=self_query_retriever
    #   retriever=compression_retriever
    # Other options that were tried:
    #   return_source_documents=True
    #   chain_type="map_reduce"
    #   chain_type="refine"
)

In [194]:
# Calling a chain directly (`qa({...})`) is deprecated; `invoke` takes the same input
# dict and returns the same output dict.
qa.invoke({"question": "who is the lecturer?"})

{'question': 'who is the lecturer?',
 'chat_history': [HumanMessage(content='who is the lecturer?'),
  AIMessage(content='The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.')],
 'answer': 'The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.'}

In [10]:
# qa({"question": "what does he teach"})

In [11]:
# qa({"question": "what is supervised learning"})

In [12]:
# for i in memory:
#     for j in i:
#         print(j)

##### RetrievalQA

In [167]:
from langchain.prompts import PromptTemplate

# Prompt text for the RetrievalQA chain built in the next cell. `{context}` and
# `{question}` are the template's input placeholders. The text is whitespace-sensitive,
# so edit it with care.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
# Build the PromptTemplate object from the raw template string.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [168]:
# Single-turn QA chain: MMR retrieval from the vector store, answered with the custom
# prompt, returning the source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    retriever=vectordb.as_retriever(search_type="mmr"),
    return_source_documents=True,
    # Alternative retrievers:
    #   retriever=self_query_retriever
    #   retriever=compression_retriever
    # Alternative chain types:
    #   chain_type="map_reduce"
    #   chain_type="refine"
)

In [13]:
# qa_chain({"query": "who are the TAs"})