In [1]:
import os

import langchain
import numpy as np
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_together import Together


# PDF Loader

In [2]:
from langchain.document_loaders import PyPDFLoader

# for multiple pdf files
# One loader per lecture transcript; add a path here to index another PDF.
loaders = [
    PyPDFLoader(pdf_path)
    for pdf_path in (
        "MachineLearning-Lecture01.pdf",
        "MachineLearning-Lecture02.pdf",
        "MachineLearning-Lecture03.pdf",
    )
]

# Collect the documents produced by every loader into a single flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [3]:
from langchain.text_splitter import CharacterTextSplitter
# Split the loaded documents into overlapping chunks for embedding.
#
# RecursiveCharacterTextSplitter is used instead of CharacterTextSplitter:
# the latter only cuts on one separator ("\n\n" by default), so text that
# contains no blank lines is never split and chunks can end up far larger than
# chunk_size. The recursive splitter falls back to progressively finer
# separators until each chunk fits.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # maximum characters per chunk
    chunk_overlap=100,  # characters shared between neighbouring chunks
)
splits = text_splitter.split_documents(docs)

# Embeddings

In [31]:
# from langchain_community.embeddings import OllamaEmbeddings
# embeddings = OllamaEmbeddings()

from langchain_together.embeddings import TogetherEmbeddings
# Embedding model served by Together.
# The API key is read from the TOGETHER_API_KEY environment variable so that no
# secret is stored in the notebook; a KeyError here means the variable is unset.
# NOTE: the key that used to be hard-coded in this cell has been exposed and
# should be revoked.
embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-8k-retrieval",
    together_api_key=os.environ["TOGETHER_API_KEY"],
)

In [75]:
# Sample sentences for the embedding sanity check: the first two are close in
# meaning, the third is unrelated.
text1, text2, text3 = (
    "dogs are good",
    "dogs are better",
    "israel is a country",
)

In [78]:
from scipy.spatial.distance import cosine

# Embed the three sample sentences, in order.
emb_1, emb_2, emb_3 = (
    embeddings.embed_query(sentence) for sentence in (text1, text2, text3)
)

# scipy's `cosine` is a distance, so similarity is one minus that value.
cosine_similarity_1 = 1 - cosine(emb_1, emb_2)  # text1 vs text2
cosine_similarity_2 = 1 - cosine(emb_1, emb_3)  # text1 vs text3

In [81]:
cosine_similarity_2

0.28266973618505165

# VectorStore

FAISS Vector Store

In [58]:
from langchain_community.vectorstores import FAISS

In [54]:
# Embed every chunk in `splits` with `embeddings` and build a FAISS store from them.
vectordb = FAISS.from_documents(splits, embeddings)

In [57]:
# Write the FAISS store to disk under `folder_path`.
folder_path = "docs/faiss/"
vectordb.save_local(folder_path)

In [8]:
from langchain_community.embeddings import OllamaEmbeddings
# Reload the FAISS index saved under `folder_path`.
#
# * `load_local` needs the embeddings object as well as the folder; it is used
#   to embed queries against the stored vectors.
# * The index must be reloaded with the same embedding model that built it, so
#   the existing `embeddings` object is reused. Rebinding `embeddings` to a
#   different model here would make query vectors incompatible with the stored
#   ones and would silently change what later cells receive.
# * SECURITY: the saved docstore is a pickle file. `allow_dangerous_deserialization`
#   must only be enabled for index files you created yourself.
loaded_vectordb = FAISS.load_local(
    folder_path=folder_path,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

Chroma Vector Store

In [73]:
from langchain_community.vectorstores import Chroma
# Other persist directories, presumably from earlier experiments (left commented out):
# persist_directory = 'docs/chroma/file_n'
# persist_directory = 'docs/chroma/vector-kaggle'
# Directory of the persisted Chroma collection that the following cells open.
persist_directory = 'docs/chroma/CSLectures'

In [74]:
# vectordb = Chroma.from_documents(
#     documents=splits,
#     embedding=embeddings,
#     persist_directory=persist_directory
# )
# vectordb.persist()

In [75]:
# Importing from local save
# Open the Chroma collection already persisted in `persist_directory`
# (nothing is re-embedded here; `embeddings` is used to embed queries).
# NOTE(review): make sure `embeddings` is the same model that built this collection.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

In [76]:
print(vectordb._collection.count())

56


# Similarity_Search + Max_Marginal_Relevance_Search
``Similarity-search`` prioritizes finding the documents closest to the query.

``MMR-search`` focuses on retrieving a set of documents that are both relevant and diverse, covering different aspects of the query.

**Similarity Search**: Use this when you simply want the most similar documents to the query, regardless of their redundancy. It's faster and easier to implement.

**Maximal Marginal Relevance (MMR) Search**: Use this when you need a diverse set of documents that cover different aspects of the query. This is particularly helpful for tasks like document summarization or generating informative responses.

In [9]:
question = "what is Convolutional Neural Networks"

In [10]:
docs = vectordb.similarity_search(question,k=3)

In [169]:
# specifying which file to retrieve info from
# Restrict the search to chunks whose `source` metadata equals the given value.
# NOTE(review): the metadata printed elsewhere in this notebook shows sources
# such as '/kaggle/input/cslectures/MachineLearning-Lecture02.pdf', so
# "file_1.pdf" presumably belongs to a different collection — confirm the value
# against `docs[i].metadata` before relying on the result.
docs = vectordb.similarity_search(
    question,
    k=2,
    filter={"source":"file_1.pdf"}
)

In [11]:
docs[0]

Document(page_content='3. Expand clusters from core points.  \n• Strengths : Can find arbitrarily shaped clusters, robust to noise and outliers.  \n• Weaknesses : Not suitable for datasets with varying densities, sensitive to parameter \nselection.  \nDimensionality Reduction  \nDimensionality reduction involves reducing the number of random variables under \nconsideration, making the data easier to visualize and often improving algorithm performance.  \nPrincipal Component Analysis (PCA)  \n• Concept : PCA transforms the data to a new coordinate system where the greatest \nvariance lies on the first axis, the second greatest variance on the second axis, and so on.  \n• Algorithm : \n1. Standardize the data.  \n2. Calculate the covariance matrix.  \n3. Calculate the eigenvalues and eigenvectors of the covariance matrix.  \n4. Sort eigenvalues and eigenvectors.  \n5. Select the top k eigenvectors to form a new feature space.  \n• Strengths : Reduces complexity, improves computational ef

In [12]:
docs[1]

Document(page_content='• Strengths : Capable of generating high -quality synthetic data, useful for image \ngeneration, style transfer, and data augmentation.  \n• Weaknesses : Difficult to train, prone to mode collapse where the generator produces \nlimited varieties of outputs.  \nAutoencoders  \n• Concept : Autoencoders are neural networks used to learn efficient codings of input data, \ntypically for the purposes of dimensionality reduction or feature learning.  \n• Architecture : \no Encoder : Compresses the input data into a latent -space representation.  \no Decoder : Reconstructs the input data from the latent representation.  \no Variational Autoencoders (VAEs) : A type of autoencoder that provides a \nprobabilistic manner for describing an observation in latent space.  \n• Strengths : Useful for noise reduction, data compression, and feature extraction.  \n• Weaknesses : Reconstructed data might not be perfect, requires careful tuning of \narchitecture and hyperparameters.  \

In [53]:
docs[2]

Document(page_content='Clustering  \nClustering is a type of unsupervised learning that involves grouping a set of objects in such a \nway that objects in the same group (or cluster) are more similar to each other than to those in \nother groups.  \nk-Means Clustering  \n• Concept : k-Means is one of the simplest and most popular clustering algorithms. It \npartitions the dataset into k clusters, where each data point belongs to the cluster with the \nnearest mean.  \n• Algorithm : \n1. Initialize k cluster centroids randomly.  \n2. Assign each data point to the nearest centroid.  \n3. Recalculate the centroids as the mean of all points assigned to each cluster.  \n4. Repeat steps 2 and 3 until convergence (i.e., centroids no longer change \nsignificantly).  \n• Strengths : Simple and fast for small to medium -sized datasets.  \n• Weaknesses : Sensitive to initial centroid positions, may converge to a local minimum, \nnot suitable for non -globular clusters or clusters of different siz

In [45]:
docs = vectordb.max_marginal_relevance_search(question,k=3)

In [46]:
question

'what is supervised learning?'

In [47]:
docs[0]

Document(page_content="MachineLearning-Lecture02  \nInstructor (Andrew Ng) :All right, good morning, welcom e back. So before we jump \ninto today's material, I just have one admini strative announcement, which is graders. So I \nguess sometime next week, we'll hand out the fi rst homework assignment for this class.  \nIs this loud enough, by the way? Can people in  the back hear me? No. Can you please \nturn up the mic a bit louder? Is this bette r? Is this okay? This is okay? Great.  \nSo sometime next week, we'll hand out the firs t problem sets and it'll be two weeks after \nthat, and the way we grade homework problems in this class is by some combination of \nTAs and graders, where graders are usually me mbers – students currently in the class.  \nSo in maybe about a week or so, I'll email the class to solicit applica tions for those of you \nthat might be interested in becoming graders fo r this class, and ther e's usually sort of a \nfun thing to do. So four times this quarter, 

In [48]:
docs[1]

Document(page_content="So in supervised learning, this is what we 're going to do. We're given a training set, and \nwe're going to feed our training set, compri sing our M training example, so 47 training \nexamples, into a learning algorithm. Okay, and our algorithm then has output function \nthat is by tradition, and for hist orical reasons, is usually de noted lower case alphabet H, \nand is called a hypothesis. Don't worry too mu ch about whether the term hypothesis has a \ndeep meaning. It's more a term that's used for historical reasons.  \nAnd the hypothesis's job is to take this i nput. There's some new [inaudible]. What the \nhypothesis does is it takes this  input, a new living area in s quare feet saying and output \nestimates the price of this house. So the hypothesis H maps from inputs X to outputs Y. \nSo in order to design a learning algorithm, the first thing we have to decide is how we \nwant to represent the hypothesis, right.  \nAnd just for this purposes of th is l

In [49]:
docs[2]

Document(page_content='classes teach. And this is something I\'m rea lly convinced is a huge deal, and so by the \nend of this class, I hope all of you will be master carpenters. I hope all of you will be \nreally good at applying these learning algor ithms and getting them to work amazingly \nwell in many problems. Okay?  \nLet\'s see. So [inaudible] the board. After lear ning theory, there\'s a nother class of learning \nalgorithms that I then want to teach you a bout, and that\'s unsupervised learning. So you \nrecall, right, a little ea rlier I drew an example like this , right, where you have a couple of \nfeatures, a couple of input vari ables and sort of malignant tumors and benign tumors or \nwhatever. And that was an example of a s upervised learning problem because the data \nyou have gives you the right answer for each of your patients. The data tells you this \npatient has a malignant tumor;  this patient has a benign tumor. So it had the right \nanswers, and you wanted the

In [50]:
docs[0].metadata

{'page': 0, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture02.pdf'}

In [51]:
docs[1].metadata

{'page': 3, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture02.pdf'}

In [52]:
docs[2].metadata

{'page': 15,
 'source': '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf'}

In [23]:
from langchain_together import Together
from langchain_openai import ChatOpenAI

# Alternative: Together's own completion wrapper (kept for reference).
# llm = Together(
#     model="META-LLAMA/LLAMA-2-7B-CHAT-HF",
#     together_api_key=os.environ["TOGETHER_API_KEY"],
# )

# Chat model used by every retriever/chain below. ChatOpenAI is pointed at
# Together's endpoint through `base_url`.
# The API key comes from the TOGETHER_API_KEY environment variable so that no
# secret is stored in the notebook; a KeyError here means the variable is unset.
# NOTE: the key that used to be hard-coded in this cell has been exposed and
# should be revoked.
model = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ["TOGETHER_API_KEY"],
    model="META-LLAMA/LLAMA-3-8B-CHAT-HF",
)

# Retriever Types

https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/

### Self-Query Retriever

In [117]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [118]:
# Metadata fields (name, description, type) handed to the self-query retriever
# so the LLM can turn parts of a question into metadata filters.
# NOTE(review): the `source` description names file_1.pdf / file_2.pdf /
# file_3.pdf, whereas the metadata printed in this notebook shows
# MachineLearning-Lecture0N.pdf paths — confirm the description matches the
# collection actually loaded.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="lectures on machine learning based on three pdf files: file_1.pdf, file_2.pdf, file_3.pdf",  
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page number within the document",
        type="integer",
    ),
]

In [119]:
# One-line natural-language summary of what the stored documents contain;
# it is passed to the retriever together with the metadata field descriptions.
document_content_description = "Lectures on machine learning"

# llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)
# Build the self-query retriever on top of the chat model and the vector store.
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=model,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [120]:
# `invoke` is the current retriever entry point; `get_relevant_documents` is
# deprecated and emits a warning. The result is still a list of documents.
docs = self_query_retriever.invoke(question)

In [121]:
len(docs)

4

In [58]:
for d in docs:
    print(d.metadata)

{'page': 16, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf'}
{'page': 1, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf'}
{'page': 9, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture01.pdf'}
{'page': 5, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture03.pdf'}


In [31]:
docs[0]

Document(page_content='Clustering is a method of unsupervised learning that involves grouping a set of objects in such a \nway that objects in the same group (cluster) are more similar to each other than to those in other \ngroups.  \n• k-Means Clustering : Partitions the data into k clusters by minimizing the variance \nwithin each cluster.  \n• Hierarchical Clustering : Builds a hierarchy of clusters either through a bottom -up \n(agglomerative) or top -down (divisive) approach.  \n• DBSCAN (Density -Based Spatial Clustering of Applications with Noise) : Groups \ntogether points that are close to each other based on a distance measurement, and marks \npoints that are in low -density regions as outliers.  \nDimensionality Reduction  \nDimensionality reduction is the process of reducing the number of random variables under \nconsideration by obtaining a set of principal variables.  \n• Principal Component Analysis (PCA) : Projects the data into a lower -dimensional space \nby maximizin

### Compression

In [39]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [40]:
question = "what is supervised learning?"
def pretty_print_docs(docs):
    """Print the text of each document, numbered from 1.

    Each entry is rendered as ``Document <n>:`` followed by a blank line and
    the document's ``page_content``; consecutive entries are separated by a
    line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [41]:
compressor = LLMChainExtractor.from_llm(model)

In [42]:
# Retrieve with MMR from the vector store, then pass the retrieved documents
# through `compressor` before they are returned.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [43]:
# `invoke` is the current retriever entry point; `get_relevant_documents` is
# deprecated and emits a warning. The result is still a list of documents.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)



Document 1:

NO_OUTPUT. There is no relevant part of the context that is related to the question "what is supervised learning?". The context appears to be about administrative announcements, grading assignments, and linear regression, but does not mention supervised learning.
----------------------------------------------------------------------------------------------------
Document 2:

Here are the extracted relevant parts:

So in supervised learning, this is what we 're going to do. We're given a training set, and we're going to feed our training set, compri sing our M training example, so 47 training examples, into a learning algorithm. Okay, and our algorithm then has output function that is by tradition, and for hist orical reasons, is usually de noted lower case alphabet H, and is called a hypothesis.


In [44]:
for d in compressed_docs:
    print(d.metadata)

{'page': 0, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture02.pdf'}
{'page': 3, 'source': '/kaggle/input/cslectures/MachineLearning-Lecture02.pdf'}


# Legacy Chains ~ retrievers

### - RetrievalQA
### - ConversationalRetrievalChain

https://python.langchain.com/v0.1/docs/modules/chains/

In [1]:
from langchain.chains import RetrievalQA
import chainlit as cl
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
# Conversation memory shared with the conversational chain.
# - return_messages=True keeps the history as message objects rather than a
#   single formatted string.
# - memory_key is the name under which the history is exposed to the chain.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

2024-07-01 17:23:37 - Created default config file at D:\Work\NLP\Langchain\.chainlit\config.toml
2024-07-01 17:23:37 - Created default translation directory at D:\Work\NLP\Langchain\.chainlit\translations
2024-07-01 17:23:37 - Created default translation file at D:\Work\NLP\Langchain\.chainlit\translations\en-US.json


ConversationalRetrievalChain

In [192]:
from langchain.chains import ConversationalRetrievalChain
# Conversational RAG chain: answers from documents fetched by an MMR retriever
# and records each exchange in `memory`.
qa = ConversationalRetrievalChain.from_llm(
    llm=model,
    retriever=vectordb.as_retriever(search_type="mmr"),
    # Alternative retrievers:
    # retriever=self_query_retriever,
    # retriever=compression_retriever,
    memory=memory,
    # Other options (disabled):
    # return_source_documents=True,
    # chain_type="map_reduce"
    # chain_type="refine"
)

In [194]:
# Calling a chain directly is deprecated; `invoke` returns the same result dict.
qa.invoke({"question": "who is the lecturer?"})

{'question': 'who is the lecturer?',
 'chat_history': [HumanMessage(content='who is the lecturer?'),
  AIMessage(content='The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.')],
 'answer': 'The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.'}

In [195]:
# Follow-up question: "he" is resolved through the chat history held in memory.
# Calling a chain directly is deprecated; `invoke` returns the same result dict.
qa.invoke({"question": "what does he teach"})

{'question': 'what does he teach',
 'chat_history': [HumanMessage(content='who is the lecturer?'),
  AIMessage(content='The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.'),
  HumanMessage(content='what does he teach'),
  AIMessage(content="Andrew Ng is a well-known artificial intelligence (AI) researcher and educator. He teaches various topics related to AI, machine learning, and data science. Specifically, he has taught courses on:\n\n1. Machine Learning: Ng has taught machine learning courses at Stanford University, Coursera, and edX. His courses cover topics such as linear regression, logistic regression, neural networks, and deep learning.\n2. Deep Learning: Ng has also taught deep learning courses, focusing on topics like convolutional neural networks, recurrent neural networks, and generative a

In [196]:
# Calling a chain directly is deprecated; `invoke` returns the same result dict.
qa.invoke({"question": "what is supervised learning"})

{'question': 'what is supervised learning',
 'chat_history': [HumanMessage(content='who is the lecturer?'),
  AIMessage(content='The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.'),
  HumanMessage(content='what does he teach'),
  AIMessage(content="Andrew Ng is a well-known artificial intelligence (AI) researcher and educator. He teaches various topics related to AI, machine learning, and data science. Specifically, he has taught courses on:\n\n1. Machine Learning: Ng has taught machine learning courses at Stanford University, Coursera, and edX. His courses cover topics such as linear regression, logistic regression, neural networks, and deep learning.\n2. Deep Learning: Ng has also taught deep learning courses, focusing on topics like convolutional neural networks, recurrent neural networks, and gen

In [197]:
# Dump the memory object field by field. Iterating `memory` appears to yield
# (field name, value) pairs, so the inner loop prints the name and then the
# value — e.g. "chat_memory" followed by the recorded conversation.
for field in memory:
    for part in field:
        print(part)

chat_memory
Human: who is the lecturer?
AI: The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.
Human: what does he teach
AI: Andrew Ng is a well-known artificial intelligence (AI) researcher and educator. He teaches various topics related to AI, machine learning, and data science. Specifically, he has taught courses on:

1. Machine Learning: Ng has taught machine learning courses at Stanford University, Coursera, and edX. His courses cover topics such as linear regression, logistic regression, neural networks, and deep learning.
2. Deep Learning: Ng has also taught deep learning courses, focusing on topics like convolutional neural networks, recurrent neural networks, and generative adversarial networks.
3. Artificial Intelligence: Ng has taught AI courses, covering topics such as natural language pro

In [201]:
memory

ConversationBufferMemory(chat_memory=ChatMessageHistory(messages=[HumanMessage(content='who is the lecturer?'), AIMessage(content='The lecturer in this context is Andrew Ng, a well-known artificial intelligence researcher and entrepreneur. At the time of this recording, he was an assistant professor at Stanford University and was teaching a class on machine learning.'), HumanMessage(content='what does he teach'), AIMessage(content="Andrew Ng is a well-known artificial intelligence (AI) researcher and educator. He teaches various topics related to AI, machine learning, and data science. Specifically, he has taught courses on:\n\n1. Machine Learning: Ng has taught machine learning courses at Stanford University, Coursera, and edX. His courses cover topics such as linear regression, logistic regression, neural networks, and deep learning.\n2. Deep Learning: Ng has also taught deep learning courses, focusing on topics like convolutional neural networks, recurrent neural networks, and gener

RetrievalQA

In [167]:
from langchain.prompts import PromptTemplate

# Prompt for the RetrievalQA chain. It has two placeholders:
#   {context}  - filled with the retrieved document text
#   {question} - filled with the user's query
# The wording (three-sentence limit, fixed closing phrase) is part of the
# prompt sent to the model, so edits here change the answers.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
# Build the prompt object from the template string above.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [168]:
# Run chain
# Single-turn QA chain: retrieve with MMR, answer with the custom prompt, and
# return the supporting documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    retriever=vectordb.as_retriever(search_type="mmr"),
    # Alternative retrievers:
    # retriever=self_query_retriever,
    # retriever=compression_retriever,
    return_source_documents=True,
    # Other chain types (disabled):
    # chain_type="map_reduce"
    # chain_type="refine"
)

In [171]:
# Calling a chain directly is deprecated; `invoke` returns the same result dict
# (query, result and source_documents).
qa_chain.invoke({"query": "who are the TAs"})

{'query': 'who are the TAs',
 'result': 'According to the context, TAs (Teaching Assistants) are mentioned as people who will help prove some of the facts about the trace operator and derivatives in the discussion section or who can verify the proofs of all of these. Thanks for asking!',
 'source_documents': [Document(page_content="And so if you – and this is a parameter of the algorithm that's often set by hand. If you \nchoose alpha to be too small than your steep est descent algorithm will take very tiny \nsteps and take a long time to converge. If alpha  is too large then the steepest descent may \nactually end up overshooting the minimum, if  you're taking too a ggressive a step.  \nYeah?  \nStudent: [Inaudible].  \nInstructor (Andrew Ng) :Say that again?  \nStudent: Isn't there a one over two missing somewhere?  \nInstructor (Andrew Ng) :Is there a one-half missing?  \nStudent: I was [inaudible].  \nInstructor (Andrew Ng) :Thanks. I do make lots of errors  in that. Any questions 