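- `dotenv` is used to load the OpenAI and LangChain API keys from a `.env` file into environment variables, so they are available throughout the notebook.
- The LangChain API key is only used to trace what happens behind the scenes through LangSmith; otherwise it is not mandatory.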

In [None]:
import os
import warnings

from dotenv import load_dotenv

# Load key=value pairs from a local .env file into os.environ.
# No manual re-assignment of os.environ entries is needed afterwards:
# load_dotenv() already exports them for the whole process.
load_dotenv()

# The OpenAI key is required by the embedding and chat cells below; surface a
# missing key here instead of failing later with a less obvious error.
# LANGCHAIN_API_KEY is optional (only needed for LangSmith tracing).
if not os.getenv("OPENAI_API_KEY"):
    warnings.warn(
        "OPENAI_API_KEY is not set; add it to your .env file or environment "
        "before running the OpenAI cells."
    )

Data Ingestion

- Web scraping is done using `WebBaseLoader`. Output type -> `list[Document]`.

In [3]:
import os

# Identify this client to the sites being scraped. langchain_community warns
# ("USER_AGENT environment variable not set") when the variable is missing, so
# set a default before the loader module is imported. An existing value is kept.
os.environ.setdefault("USER_AGENT", "langchain-rag-notebook/0.1")

from langchain_community.document_loaders import WebBaseLoader

# Fetch the page and parse it into LangChain Document objects.
loader = WebBaseLoader(web_path="https://en.wikipedia.org/wiki/LangChain")
docs = loader.load()
docs

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://en.wikipedia.org/wiki/LangChain', 'title': 'LangChain - Wikipedia', 'language': 'en'}, page_content='\n\n\n\nLangChain - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1\nHistory\n\n

Text Splitting

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into small overlapping chunks: each chunk holds at
# most 200 characters and shares up to 50 characters with its neighbour.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
splitted_docs = splitter.split_documents(docs)
splitted_docs

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/LangChain', 'title': 'LangChain - Wikipedia', 'language': 'en'}, page_content='LangChain - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation'),
 Document(metadata={'source': 'https://en.wikipedia.org/wiki/LangChain', 'title': 'LangChain - Wikipedia', 'language': 'en'}, page_content='Main menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute'),
 Document(metadata={'source': 'https://en.wikipedia.org/wiki/LangChain', 'title': 'LangChain - Wikipedia', 'language': 'en'}, page_content='Contribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n

In [5]:
# Number of chunks produced by the splitter.
len(splitted_docs)

129

Initializing OpenAI Embeddings model

In [6]:
from langchain_openai import OpenAIEmbeddings

# Client for the OpenAI embeddings endpoint; the API key is read from the
# OPENAI_API_KEY environment variable.
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")
embeddings_model

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001DABDB8E270>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001DABDC0A510>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

Storing the Embeddings in the FAISS database

In [None]:
# FAISS lives in langchain_community; the old `langchain.vectorstores` import
# path is deprecated.
from langchain_community.vectorstores import FAISS

# Embed every chunk with the OpenAI embeddings model and index the vectors in an
# in-memory FAISS store.
# NOTE: this calls the OpenAI API, so it fails on an account with no credit balance.
vectorDB = FAISS.from_documents(splitted_docs, embeddings_model)
vectorDB

The vector store is converted into a retriever: an interface over the stored context vectors that takes a query
and returns the most relevant document chunks.

In [None]:
# Bind the retriever to its own name instead of overwriting `vectorDB`.
# Rebinding `vectorDB` would replace the FAISS store with a retriever, which has
# no `similarity_search` method, and would make this cell fail when run twice.
retriever = vectorDB.as_retriever()
retriever

In [None]:
query = "Why Langchain is used?"

# `similarity_search` is a vector-store method, so `vectorDB` must still be the
# FAISS store (not a retriever) when this cell runs.
response = vectorDB.similarity_search(query)
response


Initializing the ChatOpenAI module

In [None]:
# NOTE: calls made with this model need an OpenAI account that has a credit
# balance; without one they will not work.

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")

Creating the Prompt template

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# Adjacent string literals are concatenated verbatim, so each section needs its
# own trailing newline. Without them the prompt runs together as
# "...usercontext: {context}question: {question}Answer: ".
prompt = ChatPromptTemplate.from_template(
    "Answer to the questions asked by the user\n"
    "context: {context}\n"
    "question: {question}\n"
    "Answer: "
)

# The chain "stuffs" the supplied documents into the {context} placeholder,
# sends the filled-in prompt to the LLM and returns the parsed answer.
document_chain = create_stuff_documents_chain(llm, prompt)

Input & Output

- The output is the model's answer as plain text, because the chain built by `create_stuff_documents_chain`
  already handles the parsing of the LLM response internally.

In [None]:
# The chain reads its documents from the "context" key, which matches the
# {context} placeholder in the prompt. Passing them under any other key
# (e.g. "input") makes the chain fail on the missing "context" variable.
input_data = {
    "context": splitted_docs,
    "question": "What is LangChain and how is it useful?",
}

Output = document_chain.invoke(input_data)
Output