# LangChain: Q&A over Documents

An example might be a tool that would allow you to query a product catalog for items of interest.

In [None]:
#pip install --upgrade langchain

In [164]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [166]:
# Display the value that the previous cell stored in `_` (the load_dotenv result).
_

True

Note: LLMs do not always produce the same results. When executing the code in your notebook, you may get slightly different answers than those in the video.

In [165]:
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain.document_loaders import CSVLoader, TextLoader
from langchain_community.vectorstores import DocArrayInMemorySearch, Chroma
from IPython.display import display, Markdown
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings, FastEmbedEmbeddings
from langchain_community import embeddings 

In [167]:
import pandas as pd

# Read the full catalogue, then draw a fixed 1000-row random sample
# (random_state pins the draw so that re-runs pick the same rows).
file = 'data_base.csv'
df = pd.read_csv(filepath_or_buffer=file, encoding='utf-8')
data = df.sample(n=1000, random_state=42)

# Save the sample without its index column and preview the first rows.
data.to_csv(path_or_buf='sample_db.csv', index=False)
data.head()

Unnamed: 0,part,fournisseur,description,marque,prix,quantity
41575,FSP:GD5S60Z00DESV6,macle,SP 5Y OS 9X5 4H RT,fujitsu,1584.0,50
265881,TS-464-8G,Ingram,TS-464-8G 4BAY 8GBDDR4 2X2.5GBE,qnap,630.74,1
11669,D-DDR4-4GB-007,convena,ProXtend 4GB DDR4 PC4-21300 2666MHz,proxtend,22.07,3
231168,46372,PCA,Fibre optique Duplex LC / LC OM3 3m,lindy,7.05,1
256239,21.15.3941,Secomp,"Cordon ROLINE Data Center SLIM, UTP Cat6A/Cl.E...",roline,0.86,749


In [153]:
# Summary statistics of the description length in characters; every value is
# passed through str() first so that non-string cells can be measured as well.
(
    data['description']
    .map(str)
    .map(len)
    .describe()
)

count    1000.000000
mean       59.882000
std        58.696246
min         1.000000
25%        29.000000
50%        38.500000
75%        63.000000
max       255.000000
Name: description, dtype: float64

In [154]:
# Convert the sampled rows to plain text: one "col: value col: value ..."
# entry per row, entries joined with newlines, then written to sample_db.txt.
def _format_row(row, columns):
    """Render one row as space-separated ``col: value`` pairs.

    Each value is converted with str(), stripped of surrounding spaces and then
    of leading/trailing newlines. The cleaning is done outside the f-string
    because a backslash inside an f-string expression is a SyntaxError before
    Python 3.12.
    """
    parts = []
    for col in columns:
        value = str(row[col]).strip(' ').strip('\n')
        parts.append(f"{col}: {value}")
    return ' '.join(parts)


columns = data.columns
doc = '\n'.join(_format_row(row, columns) for _idx, row in data.iterrows())

# The context manager closes the file even if the write fails.
# NOTE(review): no explicit encoding is passed, so the platform default is used;
# the cell that loads this file also passes none — confirm the two agree.
with open('sample_db.txt', 'w') as f:
    f.write(doc)

In [112]:
import numpy as np

# Character count of every '\n'-separated line of the text export.
lengths = [len(line) for line in doc.split('\n')]

# Print mean, min, max and median, in that order.
for summarise in (np.mean, np.min, np.max, np.median):
    print(summarise(lengths))

151.5556
79
367
134.0


In [168]:
# load txt file
# NOTE: this rebinds `file`, which held the CSV path 'data_base.csv' until now.
file= 'sample_db.txt'
loader = TextLoader(file_path=file)

In [169]:
# Embedding model accessed through the Hugging Face Inference API.
# The token is read from the environment (load_dotenv at the top of the notebook
# loads the local .env file) rather than being written into the notebook; a
# missing variable fails fast with a KeyError naming it.
# Any token that was ever committed in this cell should be revoked.
mbd_model = HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGINGFACEHUB_API_TOKEN'],
    model_name='intfloat/multilingual-e5-small',
)

In [159]:
from langchain.indexes import VectorstoreIndexCreator

In [170]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
# Recursive splitter configured with chunk_size=100 and chunk_overlap=20
# (presumably measured in characters — TODO confirm against the library default).
# NOTE(review): the line-length statistics printed earlier report a median of 134,
# so most exported rows look longer than one chunk — confirm splitting rows is intended.
r_splitter= RecursiveCharacterTextSplitter(chunk_size= 100, chunk_overlap= 20)

In [None]:
# Build the index in one step from the text loader: r_splitter is used as the
# text splitter, mbd_model as the embedding, and Chroma as the vector store,
# which receives persist_directory='test_chroma'.
index = VectorstoreIndexCreator(
    embedding=mbd_model,
    text_splitter=r_splitter,
    vectorstore_cls=Chroma,
    vectorstore_kwargs=dict(persist_directory='test_chroma'),
).from_loaders([loader])

**Note**:
- The notebook uses `langchain==0.0.179` and `openai==0.27.7`
- For these library versions, `VectorstoreIndexCreator` uses `text-davinci-003` as the base model, which has been deprecated since 1 January 2024.
- The replacement model, `gpt-3.5-turbo-instruct`, will be used instead for the `query`.
- The `response` format might be different than the video because of this replacement model.

In [105]:
# Groq-hosted chat model used to answer queries against the index.
# The API key is read from the environment (load_dotenv at the top of the
# notebook loads the local .env file) rather than being written into the
# notebook; a missing variable fails fast with a KeyError naming it.
# Any key that was ever committed in this cell should be revoked.
llm_replacement_model = ChatGroq(
    temperature=1,
    model='llama3-8b-8192',
    groq_api_key=os.environ['GROQ_API_KEY'],
)


In [108]:
# Ask the index a question (French: "list the 10 most frequent brands") using
# the Groq chat model, then render the answer as Markdown.
query = "listez 10 marques les plus frequentes"
response = index.query(query, llm=llm_replacement_model)
display(Markdown(response))

Here are the descriptions:

1. KNIPEX Couteau pour Câble
2. KNIPEX Pince à préhension frontale
3. One pour All Grundig Télécommande

Here is finished

## Step By Step

In [None]:
from langchain.document_loaders import CSVLoader
# `file` was rebound to 'sample_db.txt' earlier in the notebook, so passing it
# here would make the CSV loader parse the plain-text export. Point the loader
# explicitly at the sampled CSV that the sampling cell writes.
loader = CSVLoader(file_path='sample_db.csv')

In [None]:
# Run the loader and keep what it returns as `docs`.
docs = loader.load()

In [None]:
# Inspect the first loaded entry.
docs[0]

In [None]:
from langchain.embeddings import OpenAIEmbeddings
# NOTE(review): this rebinds `embeddings`, which the import cell bound to the
# `langchain_community.embeddings` module; from here on the name refers to this
# OpenAIEmbeddings instance. Presumably an OpenAI API key must be available in
# the environment — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed one sample sentence so the result can be inspected below.
embed = embeddings.embed_query("Hi my name is Harrison")

In [None]:
# Number of elements in the embedding.
print(len(embed))

In [None]:
# First five elements of the embedding.
print(embed[:5])

In [None]:
# Build a DocArrayInMemorySearch store from `docs`, using `embeddings`
# to embed them.
db = DocArrayInMemorySearch.from_documents(
    docs, 
    embeddings
)

In [None]:
# Natural-language query used for the similarity search below.
query = "Please suggest a shirt with sunblocking"

In [None]:
# NOTE: rebinds `docs` from the loaded documents to the search results for `query`.
docs = db.similarity_search(query)

In [None]:
# How many results the search returned.
len(docs)

In [None]:
# Inspect the first search result.
docs[0]

In [None]:
# Obtain a retriever from the vector store; it is passed to RetrievalQA below.
retriever = db.as_retriever()

In [None]:
# Chat model for the step-by-step section, with temperature 0.0.
# `ChatOpenAI` is never imported and `llm_model` is never defined in this
# notebook, so the ChatGroq class imported at the top is used instead, with the
# same model name as the earlier Groq cell. The API key is read from the
# environment rather than hardcoded.
llm = ChatGroq(
    temperature=0.0,
    model='llama3-8b-8192',
    groq_api_key=os.environ['GROQ_API_KEY'],
)

In [None]:
# Concatenate the text of every search result, with no separator, into one string.
qdocs = "".join(hit.page_content for hit in docs)


In [None]:
# Send the concatenated document text followed by a question to the LLM as a
# single prompt. The trailing backslash continues the string literal onto the
# next line without inserting a newline.
response = llm.call_as_llm(f"{qdocs} Question: Please list all your \
shirts with sun protection in a table in markdown and summarize each one.") 


In [None]:
# Render the model's answer as Markdown.
display(Markdown(response))

In [None]:
# RetrievalQA chain of type "stuff" that combines the retriever with the LLM;
# verbose=True is passed through to the chain.
qa_stuff = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    verbose=True,
)

In [None]:
# Question for the RetrievalQA chain; the trailing backslash continues the
# string literal onto the next line without inserting a newline.
query =  "Please list all your shirts with sun protection in a table \
in markdown and summarize each one."

In [None]:
# Run the RetrievalQA chain on the query.
response = qa_stuff.run(query)

In [None]:
# Render the chain's answer as Markdown.
display(Markdown(response))

In [None]:
# Query through `index` with the same question and LLM.
# NOTE(review): at this point `index` is still the object built earlier in the
# notebook; the cell below rebinds it — confirm the intended cell order.
response = index.query(query, llm=llm)

In [None]:
# Rebuild `index` from the current `loader`, this time with
# DocArrayInMemorySearch as the vector store and `embeddings` as the embedding.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings,
).from_loaders([loader])

Reminder: Download your notebook to your local computer to save your work.