# Setting environment variables to log traces with Langsmith:

In [None]:
import os          # Imports Python's built-in "os module" for interacting with the operating system. (e.g environment variables)
from dotenv import load_dotenv          # Imports the "load_dotenv function" from the "dotenv module" to load environment variables from the .env file. 
import requests          # Imports the "requests library" to make HTTP requests (used to verify the Langsmith API connection)


load_dotenv(          # Loads the variables defined in the .env file into this process's environment (os.environ).
    dotenv_path = ".env",          # Path to the .env file, relative to the current working directory.
    override = True          # Values from the .env file replace any variables that are already set.
)

# load_dotenv() has already placed the values in os.environ, so copying them back with os.environ[X] = os.getenv(X) is unnecessary,
# and it raises an unhelpful "TypeError: str expected, not NoneType" when a variable is missing. Validate them explicitly instead.
required_env_vars = ("LANGSMITH_TRACING", "LANGSMITH_ENDPOINT", "LANGSMITH_API_KEY", "LANGSMITH_PROJECT")          # The variables this script expects the .env file (or the shell) to provide.
missing_env_vars = [each_name for each_name in required_env_vars if os.getenv(each_name) is None]          # Collects every required variable that is not set at all.
if missing_env_vars:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(missing_env_vars)}")          # Fails early with a message that names exactly what is missing.

api_key = os.environ["LANGSMITH_API_KEY"]          # Read once into a variable; nesting double quotes inside a double-quoted f-string is a SyntaxError before Python 3.12.
headers = {          # HTTP headers sent with the verification request.
    "Authorization": f"Bearer {api_key}"          # Sends the API key as a Bearer token. NOTE(review): LangSmith's REST docs appear to use an "x-api-key" header - confirm that this form is accepted.
}
response = requests.get(          # Makes a GET request to the Langsmith API to verify the connection.
    "https://api.smith.langchain.com",          # The URL of Langsmith's API endpoint.
    headers=headers,          # Passes the authentication header.
    timeout=10,          # Without a timeout, requests waits indefinitely if the server never answers.
)

print(response.status_code)          # Prints the HTTP status code of the response (200 means the request succeeded).
try:
    print(response.json())          # Prints the decoded JSON body of the response.
except ValueError:          # response.json() raises a ValueError subclass when the body is not valid JSON (e.g. an HTML error page).
    print(response.text)          # Falls back to printing the raw body so the cell still shows what the server returned.

#                               INDEXING

# Loading the PDF document:

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader          # Imports the "PyMuPDFLoader" class from LangChain's document loaders. This loader specializes in extracting text and metadata from PDF files using the PyMuPDF library.
import pprint          # Imports the "pprint module" for pretty-printing data structures, making them easier to read in the console.


file_path = r"C:\Users\user\Downloads\HANNY ABUBAKAR CV.pdf"          # Location of the PDF to load. The "r" prefix makes this a raw string, so the backslashes are literal characters and not escape sequences.
loader = PyMuPDFLoader(file_path)          # Builds a loader for that PDF. Optionally pass mode="single" to treat the whole PDF as one document, or mode="page" to treat each page as a separate document.
loaded_doc = loader.load()          # Parses the PDF and extracts its text, returning a list of Document objects.

print(f"This document has {len(loaded_doc)} pages.")          # Reports how many Document objects were loaded.
pprint.pp(loaded_doc)          # Pretty-prints the loaded Document objects for inspection.


# Splitting the loaded PDF document into chunks:


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter          # Imports the "RecursiveCharacterTextSplitter" class from Langchain's text splitters. This class attempts to keep larger units (e.g., paragraphs or sentences) intact while keeping the text within a specified character limit. 


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)          # Splitter configuration: each chunk holds at most 1000 characters, and consecutive chunks may share up to 200 characters.
all_chunks = text_splitter.split_documents(loaded_doc)          # Splits the loaded PDF document into chunks; each chunk is a Document object.

print(f"This document has been split into {len(all_chunks)} chunks.")          # Reports how many chunks were produced.
for each_chunk in all_chunks:
    print("", each_chunk.page_content, "", "-----" * 500, sep="\n")          # One call per chunk: blank line, the chunk's text, blank line, then a long dashed separator.

# Embedding the chunks as vectors:


In [None]:
from langchain_ollama import OllamaEmbeddings          # Imports the "OllamaEmbeddings" class from Langchain's Ollama module, in order to embed the chunks as vectors.


embedding_model = OllamaEmbeddings(model="nomic-embed-text")          # Creates the Ollama embedding model that turns text into vectors.

all_chunks_texts = [each_chunk.page_content for each_chunk in all_chunks]          # Builds a list of strings, one per chunk, holding each chunk's text content.
all_chunks_vectors = embedding_model.embed_documents(all_chunks_texts)          # Embeds every chunk's text, producing one numerical vector per chunk for use in similarity search.

for each_chunk_vector in all_chunks_vectors:
    print(each_chunk_vector, "", "-----" * 25000, "", sep="\n")          # One call per vector: the vector, blank line, a very long dashed separator, blank line.

# Creating a Vector Store and Storing the vectors:

In [None]:
import faiss          # Imports the "faiss library", which is used for similarity search and clustering of dense vectors.
from langchain_community.docstore.in_memory import InMemoryDocstore          # Imports the "InMemoryDocstore" class from Langchain's community module, which provides an in-memory non-persistent document store for text/metadata storage.
from langchain_community.vectorstores import FAISS          # Imports the "FAISS" class from Langchain's community vector stores, which provides an interface to work with FAISS.


embedding_dimension = len(embedding_model.embed_query("Hello World"))          # Embeds a throwaway query only to measure the length of the vectors the model produces.
index = faiss.IndexFlatL2(embedding_dimension)          # Creates an "IndexFlatL2" FAISS index of that dimensionality; it stores vectors and compares them by L2 (Euclidean) distance.

vector_store = FAISS(          # Wraps the raw FAISS index in LangChain's FAISS vector store.
    index=index,          # The FAISS index that stores and searches the vectors.
    embedding_function=embedding_model,          # The embedding model the store uses to generate embeddings.
    docstore=InMemoryDocstore(),          # A fresh in-memory (non-persistent) store for the texts and metadata.
    index_to_docstore_id={},          # Starts empty; maps each index position to its document store ID.
)

# FAISS.add_texts() always re-embeds the texts it receives and silently ignores a "vectors" keyword,
# so the embeddings computed earlier were never used. add_embeddings() takes (text, vector) pairs and stores the given vectors as they are.
vector_store.add_embeddings(          # Adds the texts, their precomputed vectors, and metadata to the vector store.
    text_embeddings = list(zip(all_chunks_texts, all_chunks_vectors)),          # Pairs each chunk's text with its vector (this is what actually skips re-embedding).
    metadatas = [{"Source": f"chunk_{i}"} for i in range(len(all_chunks_texts))],          # Adds metadata to each chunk (for traceability).
)

vector_store.save_local(folder_path="Vector_Store")          # Writes the vector store to the local directory named "Vector_Store".

new_vector_store = FAISS.load_local(          # Builds a new FAISS vector store from what was just saved.
    folder_path="Vector_Store",          # The directory the store was saved to.
    embeddings=embedding_model,          # The embedding model the reloaded store uses to generate embeddings.
    allow_dangerous_deserialization=True,          # Opts in to deserializing the saved data, which is potentially unsafe. Acceptable here because this script wrote the files itself; do not use it on files from untrusted sources.
)

query = "What was Hanny's education history?"          # The search query used to retrieve relevant chunks.

chunk_retriever = new_vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 2})          # Turns the vector store into a LangChain retriever that uses "mmr" (Maximum Marginal Relevance) search and returns 2 chunks per query.
all_retrieved_chunks = chunk_retriever.invoke(query)          # Runs the retrieval, resulting in a list of Document objects.

context = "\n\n".join(          # Joins the text of the retrieved chunks into a single string, separated by double newlines.
    each_retrieved_chunk.page_content for each_retrieved_chunk in all_retrieved_chunks
)

print(context)          # Displays the assembled context.