In [2]:
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
website_url='https://docs.pydantic.dev'
url = website_url+'/sitemap.xml'
urlss=[]
flag=1
while(flag):
    try:
        reqs = requests.get(url)
        soup = BeautifulSoup(reqs.text, 'xml')
        urls = [url.text for url in soup.find_all('loc') if website_url in url.text]
        urlss+=urls
    except requests.exceptions.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        exit()
    except Exception as e:
        print(f"Error parsing sitemap: {e}")
        exit()
    for urll in urlss:
        if urll.endswith('xml'):
            url=urll
            urlss.pop(0) 
            break
        else:
            flag=0
print(f"Found {len(urlss)} URLs from the sitemap to start crawling.")

Found 90 URLs from the sitemap to start crawling.


In [5]:
def get_html(url):

    headers_to_split_on = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
        ("h4", "Header 4"),
    ]

    html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # for local file use html_splitter.split_text_from_file(<path_to_file>)
    html_header_splits = html_splitter.split_text_from_url(url)

    chunk_size = 500
    chunk_overlap = 30
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Split
    splits = text_splitter.split_documents(html_header_splits)

    return splits

In [6]:
emb={}
for url in urlss:
    splits=get_html(url)
    for i in range(len(splits)):
        values = list(splits[i].metadata.values())
        result_string = ", ".join(values)
        emb[i]=result_string+" "+splits[i].page_content

In [14]:
strings = []
for i in range(len(emb)):
    string=Document(
                page_content=emb[i],
            )
    strings.append(string)

In [15]:
strings[0:5]

[Document(metadata={}, page_content=' Skip to content  \nFastUI  \nTypeScript Components  \nInitializing search  \npydantic/FastUI  \nIntroduction Guide API Documentation  \nFastUI  \npydantic/FastUI  \nIntroduction Guide API Documentation API Documentation  \nPython Components TypeScript Components  \nTypeScript Components¶'),
 Document(metadata={}, page_content='TypeScript Components¶ 🚧 Work in Progress  \nThis page is a work in progress.'),
 Document(metadata={}, page_content=' Made with Material for MkDocs'),
 Document(metadata={}, page_content=' name title required error locked description display_mode class_name options multiple initial vanilla placeholder autocomplete type  \nname title required error locked description display_mode class_name search_url multiple initial debounce placeholder type  \nsubmit_url initial method display_mode submit_on_change submit_trigger loading footer class_name model type  \npage page_size total page_query_param class_name type  \ndata columns d

In [16]:
embeddings = OpenAIEmbeddings(check_embedding_ctx_length=False,  openai_api_key="lm-studio", base_url="http://localhost:8080/v1",model="text-embedding-nomic-embed-text-v1.5")


In [11]:
vectorstore = Chroma.from_documents(documents=strings, 
                                    embedding=embeddings)

In [12]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
docs = retriever.get_relevant_documents("What is the main idea of the text?")

In [14]:
docs[0].page_content

"Pydantic Types, pydantic.types ¶, constr ¶ A wrapper around str that allows for additional constraints.  \nfrom pydantic import BaseModel, constr class Foo(BaseModel): bar: constr(strip_whitespace=True, to_upper=True) foo = Foo(bar=' hello ') print(foo) #> bar='HELLO'  \nParameters:  \nWhether to remove leading and trailing whitespace.  \nWhether to turn all characters to uppercase.  \nWhether to turn all characters to lowercase.  \nWhether to validate the string in strict mode.  \nThe minimum length of the string."

In [15]:
# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

In [16]:
llm = ChatOpenAI(base_url="http://127.0.0.1:8080/v1",model="llama-3.2-1b-instruct", api_key="LM")

In [17]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
# Run
rag_chain.invoke("How to use pydantic?")

"To use Pydantic, you can follow these general steps:\n\n1. Import the necessary modules:\n   ```python\nfrom pydantic import BaseModel, EmailStr, NonNegativeInt, Optional, validator\n```\n2. Define your models using classes that inherit from `BaseModel` (or `BaseDataclass` in some cases).\n3. Use validation decorators and methods to specify how to validate data.\n4. Create instances of these models with the required attributes.\n5. You can also use other Pydantic features like JSON schema, data types, and more.\n\nHere is a simple example:\n\n```python\nfrom pydantic import BaseModel\n\nclass User(BaseModel):\n    id: int = NonNegativeInt()\n    name: str\n    age: Optional[int]\n    email: EmailStr\n```\n\nIn this example:\n- `id` is a non-negative integer with optional validation.\n- `name` and `email` are strings with validation using `EmailStr`.\n- `age` can be either an integer or None.\n\nYou can create instances of the model like this:\n\n```python\nuser = User(id=1, name='John