## Vector Database

In [1]:
import os
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings  # or any other embeddings
from dotenv import load_dotenv
# Read the local .env file into the process environment before any client is
# built (presumably this is where the Google API key lives — confirm).
load_dotenv()

# Single embedding client reused by every cell below; the model id is named
# here so it is easy to find and change.
EMBEDDING_MODEL = "models/embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain.schema import Document
# Sample corpus: one short Document per figure from Pakistan's founding history.
# The five names (doc, doc_1 ... doc_4) are bound in a single unpacking so the
# constructor call is written only once.
doc, doc_1, doc_2, doc_3, doc_4 = [
    Document(page_content=passage)
    for passage in (
        "The founder of Pakistan is Muhammad Ali Jinnah, a visionary leader and statesman who played a pivotal role in the creation of the country.",
        "Liaquat Ali Khan, the first Prime Minister of Pakistan, was a close associate of Jinnah and contributed significantly to the nation's early development.",
        "Fatima Jinnah, the sister of Muhammad Ali Jinnah, was a prominent figure in Pakistan's independence movement and an advocate for women's rights.",
        "Allama Iqbal, a philosopher and poet, inspired the idea of a separate homeland for Muslims in the Indian subcontinent.",
        "Sir Syed Ahmed Khan, an educationist and reformer, laid the groundwork for the intellectual and cultural awakening of Muslims in South Asia.",
    )
]



In [3]:
# Gather the individual Document objects into the list the indexing cell consumes.
docs = [doc, doc_1, doc_2, doc_3, doc_4]

In [5]:
# Plain-text view of the corpus, kept in case a later cell wants the raw strings.
# The comprehension variable is deliberately not called `doc`, so it cannot be
# confused with the global `doc` defined earlier in the notebook.
texts = [document.page_content for document in docs]

# Build the FAISS index directly from the Document objects. `from_documents`
# embeds each page_content exactly as `from_texts` would, but it also carries
# every Document's metadata into the index instead of discarding it.
vectordb = FAISS.from_documents(docs, embeddings)

In [7]:
# Persist the index to a folder on disk so it can be reloaded without re-embedding.
vectordb.save_local(folder_path="faiss_local_index")

In [10]:
# Reload the index from the folder written by `save_local`.
# `allow_dangerous_deserialization=True` opts in to unpickling the saved
# docstore. That is acceptable here only because this notebook wrote the
# folder itself; never enable it for index files from an untrusted source.
loaded_db = FAISS.load_local(
    folder_path="faiss_local_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

# `similarity_search` embeds the query string and returns the `k` stored
# documents closest to it, most similar first.
results = loaded_db.similarity_search(query="who is jinnah", k=3)

# Show the full list of matching Document objects (id, metadata, page_content).
print(results)

[Document(id='39a32930-d0f9-46c0-b491-210ba3ef7c96', metadata={}, page_content='The founder of Pakistan is Muhammad Ali Jinnah, a visionary leader and statesman who played a pivotal role in the creation of the country.'), Document(id='d32008c8-7ba7-43c9-9fd2-ef0432135eda', metadata={}, page_content="Fatima Jinnah, the sister of Muhammad Ali Jinnah, was a prominent figure in Pakistan's independence movement and an advocate for women's rights."), Document(id='6820e9cb-d8c1-4ae9-9890-1da8cb73ea77', metadata={}, page_content="Liaquat Ali Khan, the first Prime Minister of Pakistan, was a close associate of Jinnah and contributed significantly to the nation's early development.")]


In [13]:
# Fetch the top search hit again, this time by the id the index assigned to it.
# Ids are generated when the index is built, so a pasted literal goes stale as
# soon as the index is rebuilt; reading the id from `results` keeps this cell
# valid after Restart & Run All.
loaded_db.get_by_ids([results[0].id])

[Document(id='39a32930-d0f9-46c0-b491-210ba3ef7c96', metadata={}, page_content='The founder of Pakistan is Muhammad Ali Jinnah, a visionary leader and statesman who played a pivotal role in the creation of the country.')]

## K Parameter ki Explanation

`k` parameter similarity search mai results ki quantity control karta hai:

- Agar `k=3` hai to top 3 most similar documents return honge
- Agar `k=1` hai to sirf single most similar document milega
- Jitna bara `k` hoga, utne zyada results milenge
- For example: `k=5` means top 5 most matching documents milenge

In [11]:
# Run the same query at two values of k to show that k caps the number of hits.
jinnah_query = "who is jinnah"

print("With k=1 (single result):")
results_1 = loaded_db.similarity_search(query=jinnah_query, k=1)
print(f"Number of results: {len(results_1)}\n")

print("With k=3 (three results):")
results_3 = loaded_db.similarity_search(query=jinnah_query, k=3)
print(f"Number of results: {len(results_3)}")

With k=1 (single result):
Number of results: 1

With k=3 (three results):
Number of results: 3


## Checking Generated Embeddings

Here's how to inspect the embeddings generated for our documents:

In [15]:
# Embed the text of the first document on its own and look at the raw vector.
# NOTE(review): this goes through `embed_query`, which may apply query-specific
# settings, so the vector is not guaranteed to match the one stored in the
# index for this document — confirm against the embeddings class documentation.
first_text = docs[0].page_content
single_embedding = embeddings.embed_query(first_text)

# Vector size plus a short preview of its leading components.
print(f"Embedding length: {len(single_embedding)}")
print(f"First few values: {single_embedding[:5]}")

# Batch-embed the first two documents in a single call.
first_two_texts = [document.page_content for document in docs[:2]]
multiple_embeddings = embeddings.embed_documents(first_two_texts)

print(f"\nNumber of document embeddings: {len(multiple_embeddings)}")
print(f"Each embedding length: {len(multiple_embeddings[0])}")

Embedding length: 768
First few values: [0.03253893181681633, -0.05382358282804489, 0.0062140412628650665, 0.0022662095725536346, 0.04724659398198128]

Number of document embeddings: 2
Each embedding length: 768
