## Simple Gen AI Web Scraper using LangChain

In [None]:
import os
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file (python-dotenv) so that the
# os.getenv(...) lookups later in the notebook can find the API keys.
load_dotenv()

In [None]:
# Make sure the OpenAI / LangSmith settings are present in the process
# environment, then switch LangSmith tracing on.
#
# os.environ only accepts str values, so writing os.getenv(...) straight back
# raises "TypeError: str expected, not NoneType" whenever a variable is absent
# (for example when the .env file is missing an entry). Missing variables are
# therefore skipped and reported instead of crashing the cell.
import warnings

for envName in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT"):
    envValue = os.getenv(envName)
    if envValue is None:
        warnings.warn(
            f"Environment variable {envName} is not set; "
            "add it to your .env file or shell environment."
        )
        continue
    os.environ[envName] = envValue

# Tracing flag is a literal string, so it can always be set.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

## Data Ingestion -- Scrape Website Data

In [None]:
from langchain_community.document_loaders           import WebBaseLoader
from langchain_text_splitters                       import RecursiveCharacterTextSplitter
from langchain_openai                               import OpenAIEmbeddings
from langchain_community.vectorstores               import FAISS
from langchain.chains.combine_documents             import create_stuff_documents_chain
from langchain_core.prompts                         import ChatPromptTemplate
from langchain_openai                               import ChatOpenAI
from langchain_core.documents                       import Document
from langchain.chains                               import create_retrieval_chain

In [None]:
# Page whose content will be scraped and indexed.
url = "https://docs.smith.langchain.com/administration/tutorials/manage_spend"

# Configure a web loader for that single page.
loader = WebBaseLoader(web_path=url)
# Bare expression: lets the notebook display the configured loader.
loader

In [None]:
# Run the loader configured for `url` and keep the resulting documents.
loadedDocument = loader.load()
# Bare expression: lets the notebook display what was loaded.
loadedDocument

In [None]:
# Splitter used to break the scraped page into overlapping chunks:
# chunk_size caps each chunk at 1000 and consecutive chunks share 200
# (character counts, assuming the splitter's default length function).
textSplitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

In [None]:
# Split the loaded page into chunks using the splitter configured above.
documents = textSplitter.split_documents(loadedDocument)
# Bare expression: lets the notebook display the resulting chunks.
documents

In [None]:
# OpenAI embedding client with default settings; presumably authenticates via
# the OPENAI_API_KEY environment variable set earlier in the notebook.
embeddings = OpenAIEmbeddings()

In [None]:
# Build a FAISS vector store from the document chunks, embedding them with the
# OpenAI embedding client.
vectorStoreDB = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)

## Query and Retrieval

In [None]:
# Chat model used by the document and retrieval chains below.
llmModel = ChatOpenAI(model = "gpt-4o")

In [None]:
# Sanity-check the vector store directly, without involving the LLM.
query = "LangSmith has two usage limits: total traces and extended"

# Look up the chunks most similar to the query and print the first match.
result = vectorStoreDB.similarity_search(query=query)
print(f'Result:     {result[0].page_content}')

In [None]:
# Prompt for the stuff-documents chain.
#   {context} - filled with the text of the supplied/retrieved documents.
#   {input}   - filled with the user's question.
# The {input} placeholder is required: without it the template asks the model
# to "answer the following question" but never actually shows it the question
# that the chains pass under the "input" key.
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context: 
    <context>
    {context}
    </context>
    Question: {input}
    """
)

In [None]:
# Chain that formats the given documents into the prompt and sends the result
# to the chat model.
documentChain = create_stuff_documents_chain(
    llm=llmModel,
    prompt=prompt,
)

# Bare expression: lets the notebook display the assembled chain.
documentChain

In [None]:
# Try the document chain on a single hand-written Document, bypassing the
# vector store entirely. The call is left as a bare expression so the notebook
# displays the chain's output.
documentChain.invoke(
    {
        "input": "Langsmith has two usage limits: total traces and extended",
        "context": [
            Document(
                page_content="These correspond to the two metrics we've been tracking on our usage graph. We can use these in tandem to have granular control over spend.",
            ),
        ],
    }
)

In [None]:
# Expose the FAISS store through the retriever interface.
retriever = vectorStoreDB.as_retriever()

# Combine retrieval with the document chain: documents come from `retriever`
# and are handed to `documentChain`.
retreivalChain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=documentChain,
)
# Bare expression: lets the notebook display the assembled chain.
retreivalChain

## Response

In [None]:
# Run the full retrieval chain for one question; the result is indexed by
# 'answer' and 'context' in the cells that follow.
response = retreivalChain.invoke(
    {"input": "Langsmith has two usage limits: total traces and extended"}
)

# Bare expression: lets the notebook display the whole response.
response

In [None]:
# Display only the 'answer' entry of the retrieval chain's response.
response['answer']

In [None]:
# Display the 'context' entry of the response (presumably the retrieved
# documents the answer was based on).
response['context']