# load the data from website

In [1]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt")
data = loader.load()

In [2]:
data[0].page_content[:100]
text = data[0].page_content
text

"1. Code of Conduct\n\nOur Code of Conduct establishes the core values and ethical standards that all members of our organization must adhere to. We are committed to fostering a workplace characterized by integrity, respect, and accountability.\n\nIntegrity: We commit to the highest ethical standards by being honest and transparent in all our dealings, whether with colleagues, clients, or the community. We protect sensitive information and avoid conflicts of interest.\n\nRespect: We value diversity and every individual's contribution. Discrimination, harassment, or any form of disrespect is not tolerated. We promote an inclusive environment where differences are respected, and everyone is treated with dignity.\n\nAccountability: We are responsible for our actions and decisions, complying with all relevant laws and regulations. We aim for continuous improvement and report any breaches of this code, supporting investigations into such matters.\n\nSafety: We prioritize the safety of our e

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=30,
    length_function=len )

text_chunks = splitter.split_text(text)
text_chunks[0]

'1. Code of Conduct'

# Lets Use PreTrained_Embedding model by HuggingFace library

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings
model_name = 'sentence-transformers/all-mpnet-base-v2'
embedding_model = HuggingFaceEmbeddings(model_name = model_name)
embedding_model

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

# Now lets Create the embeddings for each chunks

In [5]:
chunk_embeddings = embedding_model.embed_documents(text_chunks)


In [6]:
len(chunk_embeddings)

52

# Now store these chunks_embeddings into Vectordatabase using ChromaDB

In [7]:
idx = []
for i in range(len(text_chunks)):
    idx.append(str(i))
len(idx)

52

In [8]:
from langchain.vectorstores import Chroma
chroma_db = Chroma.from_texts(text_chunks, embedding_model, ids = idx)

In [9]:
query = "Smoking policy"
(chroma_db.similarity_search(query,k = 5))

[Document(page_content='This policy promotes the safe and responsible use of digital communication tools in line with our values and legal obligations. Employees must understand and comply with this policy. Regular reviews'),
 Document(page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical standards. Employees are expected to understand and follow these guidelines. The policy is regularly'),
 Document(page_content='Consequences: Violations of this policy may lead to disciplinary action, including potential termination.'),
 Document(page_content='3. Internet and Email Policy'),
 Document(page_content='4. Mobile Phone Policy\n\nOur Mobile Phone Policy defines standards for responsible use of mobile devices within the organization to ensure alignment with company values and legal requirements.')]

In [10]:
retriever = chroma_db.as_retriever(search_kwargs = ({'k':2}))
retriever.invoke("Email policy")

[Document(page_content='3. Internet and Email Policy'),
 Document(page_content='Confidentiality: Use email for confidential information, trade secrets, and sensitive customer data only with encryption. Be careful when discussing company matters on public platforms or social')]

In [11]:
# print some text_chunks from database
print(chroma_db._collection.get('2')['documents'])

['characterized by integrity, respect, and accountability.']


In [12]:
# print the length of database
chroma_db._collection.count()

52

In [13]:
query = "what is your company policies"
similar_query = chroma_db.similarity_search(query, k=1)
similar_query[0].page_content

'This policy encourages the responsible use of mobile devices in line with legal and ethical standards. Employees are expected to understand and follow these guidelines. The policy is regularly'

In [14]:
# Add data to existing database
Q1 = "we want to add some data into chrome database"
from langchain_core.documents import Document
new_chunk = Document(
    page_content = Q1,
    metadata = {
        'source':'ibm.com',
        'page' : 1
    }
)
new_chunk = [new_chunk]
print(chroma_db._collection.get('542'))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


In [15]:
chroma_db.add_documents(
    documents = new_chunk,
    ids = ['542']
)

['542']

In [16]:
print(chroma_db._collection.get('542'))

{'ids': ['542'], 'embeddings': None, 'documents': ['we want to add some data into chrome database'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'page': 1, 'source': 'ibm.com'}]}


In [17]:
update_chunk = Document(
    page_content = 'We just updated the 542th index content here',
    metadata={
        'source':'ibm.com',
        'page':1
    }
)

In [18]:
chroma_db.update_document(
    '542',
    update_chunk
)

In [19]:
print(chroma_db._collection.get('542'))

{'ids': ['542'], 'embeddings': None, 'documents': ['We just updated the 542th index content here'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'ibm.com', 'page': 1}]}


In [20]:
# if you want to delete any document or chunk(text)
chroma_db._collection.delete('542')

In [21]:
print(chroma_db._collection.get('542'))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


In [26]:
chroma_db.persist() # save the model to the same directory
chroma_db = Chroma(persist_directory="Chroma_DB", embedding_function = embedding_model)

# Now use a different DataBase like FAISS

In [20]:
from langchain.vectorstores import FAISS
faiss_db = FAISS.from_texts(text_chunks, embedding_model, ids = idx)

In [21]:
#print some stored data from FAISS_DB

In [22]:
query = "what is your company policies"
faiss_db.similarity_search(query, k= 2)

[Document(page_content='child care, to be able to get back to work.'),
 Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.')]

In [23]:
# faiss_db?

In [24]:
faiss_db.similarity_search(query, k=2) # simple search the closest content from data 

[Document(page_content='child care, to be able to get back to work.'),
 Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.')]

In [25]:
embedded_query = embedding_model.embed_query(query)
faiss_db.similarity_search_by_vector(embedded_query, k =2)   # it will use only embedded_vector of your query then search

[Document(page_content='child care, to be able to get back to work.'),
 Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.')]

In [26]:
faiss_db.similarity_search_with_score(query, k =2)   # search the content and score too

[(Document(page_content='child care, to be able to get back to work.'),
  1.3609143),
 (Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.'),
  1.3922057)]

In [27]:
# Now count the total vector stored in database
faiss_db.index.ntotal

271

# Add data into FAISS_DATABASE

In [28]:
faiss_db.add_texts(["new doc"], metadatas=[{"id": "new"}])

['4117cad1-bf30-4c82-b3a2-0b89a70eb1c0']

In [29]:
faiss_db.index.ntotal   # after adding one text it becomes 272

272

In [30]:
# Save your database in directory
faiss_db.save_local("FAISS_DB")

In [31]:
# load your database for embedding new query for Gen_AI applications
faiss_db = faiss_db.load_local("FAISS_DB", embedding_model, allow_dangerous_deserialization=True)

In [32]:
faiss_db

<langchain_community.vectorstores.faiss.FAISS at 0x2653f0bf860>

In [33]:
retriever = faiss_db.as_retriever(search_type = 'mmr')  # Maximum Marginal Relevance Retrieval
docs = retriever.invoke(Q1)
docs

[Document(page_content='And on testing, we have made hundreds of millions of tests available for you to order for free.'),
 Document(page_content='But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n\nDanielle says Heath was a fighter to the very end. \n\nHe didn’t know how to stop fighting, and neither did she.'),
 Document(page_content='Intel’s CEO, Pat Gelsinger, who is here tonight, told me they are ready to increase their investment from  \n$20 billion to $100 billion.'),
 Document(page_content='We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.')]

In [34]:
faiss_db.as_retriever(search_type = 'mmr').invoke("email policy")

[Document(page_content='And I’m taking robust action to make sure the pain of our sanctions  is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers.'),
 Document(page_content='It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children.'),
 Document(page_content='We got more than 130 countries to agree on a global minimum tax rate so companies can’t get out of paying their taxes at home by shipping jobs and factories overseas.'),
 Document(page_content='And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon?')]

In [35]:
faiss_db.similarity_search("email policy")

[Document(page_content='And I’m taking robust action to make sure the pain of our sanctions  is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers.'),
 Document(page_content='It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children.'),
 Document(page_content='As I have made crystal clear the United States and our Allies will defend every inch of territory of NATO countries with the full force of our collective power.'),
 Document(page_content='And I will keep doing everything in my power to crack down on gun trafficking and ghost guns you can buy online and make at home—they have no serial numbers and can’t be traced.')]

In [36]:
# So we can see that MMR is more effective and efficient than normal similar_search for searching the most similar or relevant content for the same query 