In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAI
from bs4 import BeautifulSoup
import requests

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the LLM
# further down comes from — confirm.
load_dotenv()

True

In [2]:
# Embedding model handed to the vector store when the documents are indexed.
embeddings = HuggingFaceEmbeddings(
    model_name='all-MiniLM-L6-v2',
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Accumulator for every chunk (user guide and API reference) to be embedded.
flat_documents = []

# Root of the stable scikit-learn documentation; scraped relative links are
# appended to it to build the page URLs.
base_url = 'https://scikit-learn.org/stable/'

## User guide

In [4]:
# Collect the link of every user-guide page listed in the sidebar, then
# download the HTML of each page.

user_guide_response = requests.get(
    'https://scikit-learn.org/stable/user_guide.html',
    timeout=30,  # seconds; without a timeout a stalled connection hangs the kernel
)
user_guide_response.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "guessed parser" warning).
user_guide = BeautifulSoup(user_guide_response.text, 'html.parser')
links = []


links_navbar = user_guide.find(class_='nav bd-sidenav')
if links_navbar is None:
    raise RuntimeError("Sidebar 'nav bd-sidenav' not found on the user guide page")
all_li = links_navbar.find_all('li')

for li in all_li:
    details = li.find('details')
    if details:
        # Entry with children: take the links of the nested list.
        ul = details.find('ul')
        anchors = ul.find_all('a') if ul else []
    else:
        # Leaf entry: take its own link, if it has one.
        anchor = li.find('a')
        anchors = [anchor] if anchor else []

    for a in anchors:
        href = a.get('href')
        if href:
            links.append(href)

# find_all('li') also yields the nested <li> elements, so every nested link is
# seen twice (once through its parent's <details>, once on its own).
# Drop the duplicates while keeping the original order.
links = list(dict.fromkeys(links))


responses = []
for link in links:
    page = requests.get(base_url + link, timeout=30)
    if not page.ok:
        # Skip error pages instead of indexing their HTML as documentation.
        print(f'Skipping {link}: HTTP {page.status_code}')
        continue
    responses.append(page.text)

In [None]:
# Scrape each page: split the downloaded HTML on its headings so that every
# chunk carries the titles it sits under as metadata.

headers = [
    ('h1', 'Page'),
    ('h2', 'Section'),
    ('h3', 'Sub Section'),
    # Distinct key for h4: sharing 'Sub Section' with h3 made the h4 title
    # overwrite the h3 title in the chunk metadata.
    ('h4', 'Sub Sub Section'),
]

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers,
)

# One list of chunks per downloaded page.
documents = [splitter.split_text(r) for r in responses]

# Flatten the per-page lists. Chunks without header metadata or with 50
# characters or fewer are dropped (broken chunks and bare page titles).
# NOTE: this appends to the shared flat_documents list, so re-running the cell
# adds the same chunks again.
flat_documents.extend(
    chunk
    for html_split in documents
    for chunk in html_split
    if chunk.metadata and len(chunk.page_content) > 50
)

## API

In [6]:
# Collect the link of every object listed in the API index table, then
# download the HTML of each reference page.

api_response = requests.get(
    'https://scikit-learn.org/stable/api/index.html',
    timeout=30,  # seconds; without a timeout a stalled connection hangs the kernel
)
api_response.raise_for_status()

# Explicit parser: keeps the result independent of which parsers are installed.
api = BeautifulSoup(api_response.text, 'html.parser')
links = []


tbody = api.find('tbody')
if tbody is None:
    raise RuntimeError('No <tbody> found on the API index page')
all_tr = tbody.find_all('tr')

for tr in all_tr:
    td = tr.find('td')
    anchor = td.find('a') if td else None
    href = anchor.get('href') if anchor else None
    if not href:
        continue  # row without a usable link

    # The hrefs are relative to the api/ directory ('../...'); drop that prefix
    # so the link can be appended to base_url. Only strip it when it is
    # actually there instead of blindly cutting three characters.
    link = href[3:] if href.startswith('../') else href
    # The '#fragment' part is never sent to the server; removing it lets the
    # de-duplication below recognise links to the same page.
    links.append(link.split('#', 1)[0])

# Fetch each page once, keeping the original order.
links = list(dict.fromkeys(links))


responses = []
for link in links:
    page = requests.get(base_url + link, timeout=30)
    if not page.ok:
        # Skip error pages instead of indexing their HTML as documentation.
        print(f'Skipping {link}: HTTP {page.status_code}')
        continue
    responses.append(page.text)

In [7]:
# Split on blank lines (runs of newlines possibly separated by whitespace),
# packing the pieces into chunks of at most 1000 characters with no overlap.
splitter = RecursiveCharacterTextSplitter(
    separators=[r'\n\s*\n+'],
    is_separator_regex=True,
    chunk_overlap=0,
    chunk_size=1000
    )

split_of_pages = []

for r in responses:
    # Explicit parser: keeps the result independent of which parsers are installed.
    soup = BeautifulSoup(r, 'html.parser')
    content = soup.find('section')
    if content is None:
        continue  # nothing to index on this page

    title = content.find('h1')
    signature = content.find('dt')
    summary = content.find('dd')
    # Some pages do not have these elements; skip them explicitly rather than
    # relying on an AttributeError (a missing <h1> used to crash the loop).
    if title is None or signature is None or summary is None:
        continue

    # The heading text ends with the '#' permalink marker; remove it only when
    # present so a heading without it does not lose its last character.
    page_name = title.text
    if page_name.endswith('#'):
        page_name = page_name[:-1]

    parameters = signature.text.replace('[source]#', '')
    description = summary.text

    page_content = f'{page_name}\n{parameters}\n{description}'

    split_of_pages.extend(
        Document(page_content=chunk, metadata={"page": page_name})
        for chunk in splitter.split_text(page_content)
    )

In [8]:
# Add the API reference chunks to the list of documents to embed.
flat_documents.extend(split_of_pages)

## Make embeddings

In [9]:
# Embed every collected chunk and store the vectors in a Chroma collection.
db = Chroma.from_documents(
    documents=flat_documents,
    embedding=embeddings,
)

In [10]:
# Retrieve up to 5 chunks, filtered by a relevance-score threshold of 0.2
# (thresholds above 0.2 did not seem to work).
retriever = db.as_retriever(
    search_kwargs={'k': 5, 'score_threshold': 0.2},
    search_type='similarity_score_threshold',
)

In [None]:
prompt = ChatPromptTemplate.from_template("""
Answer the question based on the context provided below, including the source of the information at the end of your response.
      
Context:                                   
{context}

User input:
{question}
""")

# NOTE(review): `OpenAI` is the completion-style LLM wrapper while
# "gpt-4o-mini" is a chat model (the captured answer even starts with a stray
# "Human:" line) — consider the chat wrapper plus a string output parser; confirm.
llm = OpenAI(model="gpt-4o-mini")


def format_docs(docs):
    """Render retrieved documents as readable text for the prompt.

    Each document contributes its text followed by a ``Source:`` line holding
    its metadata, so the model can cite where the information came from.

    Args:
        docs: Documents returned by the retriever.

    Returns:
        A single string with the documents separated by blank lines.
    """
    return "\n\n".join(
        f"{doc.page_content}\nSource: {doc.metadata}" for doc in docs
    )


rag_chain = (
    {
        # Without the formatting step the list of Document objects would be
        # interpolated into the prompt as a Python repr (escaped newlines,
        # constructor noise).
        "context": retriever | RunnableLambda(format_docs),
        # Pass the user's question through unchanged.
        "question": RunnableLambda(lambda x: x),
    }
    | prompt
    | llm
)

response = rag_chain.invoke("How to use linear regression in scikit learn")

In [16]:
# print() is used so the newlines in the answer are rendered; a bare expression
# would display the escaped repr (assumes response is a plain string — confirm).
print(response)

Human:
To use linear regression in scikit-learn, you can follow these steps:

1. Import the necessary libraries:
   ```python
   import numpy as np
   from sklearn.linear_model import LinearRegression
   ```

2. Prepare your dataset (features and target). For example, you can create a synthetic dataset using `make_regression`:
   ```python
   from sklearn.datasets import make_regression
   X, y = make_regression(n_samples=100, n_features=1, noise=0.1, random_state=42)
   ```

3. Create a Linear Regression model:
   ```python
   model = LinearRegression()
   ```

4. Fit the model to your data:
   ```python
   model.fit(X, y)
   ```

5. Make predictions:
   ```python
   predictions = model.predict(X)
   ```

6. Evaluate the model (optional):
   ```python
   score = model.score(X, y)
   print(f"Model Score: {score}")
   ```

This is a basic example of how to use linear regression in scikit-learn. You can adjust the parameters and the dataset according to your specific needs.

(Source: sci