# Setting environment variables to log traces with Langsmith:

In [5]:
import os          # Imports Python's built-in "os module" for interacting with the operating system. (e.g environment variables)
from dotenv import load_dotenv          # Imports the "load_dotenv function" from the "dotenv module" to load environment variables from the .env file. 
import requests          # Imports the "requests library" to make HTTP requests (used to verify the Langsmith API connection)


load_dotenv(          # Loads environment variables from the .env file.
    dotenv_path = ".env",           #Specifies the path to the .env file, which contains environment variables. The default is .env in the current directory.
    override = True           # Allows the loaded environment variables in the .env file to override any existing environment variables. 
)

os.environ["LANGSMITH_TRACING"] = os.getenv("LANGSMITH_TRACING")          # Retrieves the value of the loaded "LANGSMITH_TRACING" variable from the .env file and sets it in Python's runtime environment. This ensures Langsmith can access the value.
os.environ["LANGSMITH_ENDPOINT"] = os.getenv("LANGSMITH_ENDPOINT")          # Retrieves the value of the loaded "LANGSMITH_ENDPOINT" variable from the .env file and sets it in Python's runtime environment. This ensures Langsmith can access the value.
os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")          # Retrieves the value of the loaded "LANGSMITH_API_KEY" variable from the .env file and sets it in Python's runtime environment. This ensures Langsmith can access the value.
os.environ["LANGSMITH_PROJECT"] = os.getenv("LANGSMITH_PROJECT")          # Retrieves the value of the loaded "LANGSMITH_PROJECT" variable from the .env file and sets it in Python's runtime environment. This ensures Langsmith can access the value.

headers = {          # Creates a dictionary to store HTTP headers for the request. This particular dictionary is for the "Authorization header" which is required to aunthenticate the request to the Langsmith API.
    "Authorization": f"Bearer {os.getenv("LANGSMITH_API_KEY")}"          # Sets the "Authorization header" with the value of the "LANGSMITH_API_KEY" environment variable. This is used to authenticate the request to the Langsmith API.
}
response = requests.get(          # Makes a GET request to Langsmith's API endpoint to verify the connection.
    "https://api.smith.langchain.com",          # The URL of Langsmith's API endpoint.
    headers=headers          # Passes the dictionary containing the "Authorization header" to authenticate the request.
)

print(response.status_code)          # Prints the HTTP status code of the response. A status code of 200 indicates a successful connection to the Langsmith API.
print(response.json())          # Prints the JSON response from the Langsmith API, which typically contains information about the API connection or any relevant data returned by the request.

404
{'detail': 'Not Found'}


# Loading the PDF document:

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader          # Imports the "PyMuPDFLoader" class from LangChain's document loaders. This loader specializes in extracting text and metadata from PDF files using the PyMuPDF library.
import pprint          # Imports the "pprint module" for pretty-printing data structures, making them easier to read in the console.

file_path = r"C:\Users\user\Downloads\HANNY ABUBAKAR CV.pdf"          # Specifies the path to the PDF file that wiill be loaded. The "r" prefix ensures that the backslashes are treated as "literal characters" and not as escape sequences. 
loader = PyMuPDFLoader(          # Initializes the "PyMuPDFLoader" with the specified PDF file path. 
    file_path,          # The path of the file to be loaded.
    # mode="single"          # Specfies the mode in which the document will be loaded. The "single" mode means the entire document will be treated as one, the "page" mode means that each page will be treated as a separate document. 
)          
loaded_doc = loader.load()          # Executes the PDF parsing and text extraction process, returning a list of Document objects.     

print(f"This document has {len(loaded_doc)} pages.")
pprint.pp(loaded_doc)


# Splitting the loaded PDF document into chunks:


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter          # Imports the "RecursiveCharacterTextSplitter" class from Langchain's text splitters. This class attempts to keep larger units (e.g., paragraphs or sentences) intact while keeping the text within a specified character limit. 

text_splitter = RecursiveCharacterTextSplitter(          # Initializes the "RecursiveCharacterTextSplitter" with specific paramaters on how to split the text.
    chunk_size = 1000,          # Defines the maximum number of characters in each chunk. (the text will be split into chunks that are at most 1000 characters long).
    chunk_overlap = 200,          # Defines the maximum number of characters that can overlap between consecutive chunks. 
)
all_chunks = text_splitter.split_documents(loaded_doc)          # Splits the loaded PDF document into chunks. Each chunk will be a Document object.

print(f"This document has been split into {len(all_chunks)} chunks.")          

for each_chunk in all_chunks:          
    print("")
    print(each_chunk.page_content)
    print("")
    print("-----" * 500)

# Embedding the chunks as vectors:


In [None]:
from langchain_ollama import OllamaEmbeddings          # Imports the "OllamaEmbeddings" class from Langchain's Ollama module, in order to embed the chunks as vectors.

embedding_model = OllamaEmbeddings(model = "nomic-embed-text")          # Initializes the "OllamaEmbeddings" model.

all_chunks_content = [each_chunk.page_content for each_chunk in all_chunks]          # Extracts the content of each chunk from all the chunks using a "list comprehension", and saves it to a list. This creates a list of strings, where each string is the content of a chunk. 
all_chunks_vectors = embedding_model.embed_documents(all_chunks_content)          # Embeds the content of all chunks as vectors using the "OllamaEmbeddings" model. This converts the text into numerical representations (vectors) that can be used for similarity search.

for each_chunk_vector in all_chunks_vectors:          
    print(each_chunk_vector)

[[-0.061645195, 0.038094077, -0.15381801, 0.0052035083, 0.068959616, -0.017385483, 0.017884964, -0.002994838, -0.05155784, -0.023526795, -0.015203721, 0.004151577, 0.13696715, -0.013687797, 0.026754525, -0.009060146, 0.0029857813, -0.04095461, -0.05205735, 0.016666247, -0.02088724, -0.07014071, 0.02848272, -0.0188459, 0.12100627, 0.004003539, 0.028187908, -0.022688996, 0.011822035, 0.0036211056, 0.0106382845, -0.025919037, -0.021019794, -0.045618013, -0.03302269, -0.04961473, 0.037031543, 0.015419727, 0.021745887, 0.01986392, 0.025736753, -0.010055359, 0.0016466082, -0.046166494, 0.05386348, 0.0028140051, 0.037638582, -0.016201919, 0.088717066, -0.027181655, -0.00018230361, 0.010251403, 0.016071038, -0.022727685, 0.073042266, -0.003095908, -0.041297305, 0.03551893, 0.011169256, -0.023614885, 0.120924056, 0.074159116, -0.030721355, 0.08929984, 0.011805209, -0.0048949555, -0.012929123, 0.051845398, -0.027698144, -0.03400192, 0.025753602, -0.0015683455, 0.022020029, 0.034147386, -0.034301