# Scrape

In [42]:
import logging
import sys
from dotenv import load_dotenv
from playwright.async_api import async_playwright
from urllib.parse import urljoin, urlparse
from unstructured.partition.html import partition_html
from langchain_core.documents import Document

# Load settings from a .env file (if present) into the environment.
load_dotenv()

# CSS selectors whose matching elements are removed from every page before
# its text is extracted.
selectors_to_ignore = [
    # embedded frames and document metadata
    'iframe', 'head',
    # advertising
    '.ad', '.ads', '.advertisement', '.banner',
    # navigation and page chrome
    'nav', 'footer', 'header',
    '.navbar', '.menu', '.nav', '.footer', '.header',
    # forms and interactive controls
    'form', 'button', 'input', 'select', 'textarea',
    '.form', '.button', '.btn',
    # widgets and social sharing
    '.widget', '.social', '.share', '.tweet', '.like',
    # scripts, styles and linked resources
    'script', 'noscript', 'style', 'link',
    # user comments and discussions
    '.comments', '.comment', '.reply', '.discussion',
    '.forum', '.thread',
]

# Root logging writes bare messages to stdout. The DEBUG level applies to the
# root logger, so messages from imported libraries are shown as well.
logging.basicConfig(level=logging.DEBUG, format="%(message)s", stream=sys.stdout)
logger = logging.getLogger("webscapper.py")

# Crawl state shared by the scraping functions.
visited_urls = set()   # URLs already handed to the scraper
internal_urls = set()  # pending (url, depth) pairs still to be visited
documents = []         # one Document per scraped page

def save_func(url, content):
    """Append one scraped page to the module-level ``documents`` list.

    The page text becomes the ``page_content`` of a ``Document`` whose
    metadata records the originating URL under the ``"source"`` key.
    """
    page_document = Document(page_content=content, metadata={"source": url})
    documents.append(page_document)

def clean_content(source):
    """Return the text extracted from an HTML string.

    ``source`` is parsed with ``partition_html``; the string form of each
    resulting element is joined with a blank line between elements.
    """
    parsed_elements = partition_html(text=source)
    return "\n\n".join(str(element) for element in parsed_elements)

def is_internal_url(base_url, test_url):
    """Report whether ``test_url`` points at the same site as ``base_url``.

    Two URLs belong to the same site when their network locations (the
    ``netloc`` part: host plus any port or userinfo) are equal. Host names
    are case-insensitive, so the comparison ignores case.

    Args:
        base_url: Reference URL of the site being crawled.
        test_url: Absolute URL to classify.

    Returns:
        True if both URLs share the same network location, else False.
        URLs without a network location (e.g. ``mailto:`` links) never match
        a base URL that has one.
    """
    base_netloc = urlparse(base_url).netloc.lower()
    test_netloc = urlparse(test_url).netloc.lower()
    return base_netloc == test_netloc


async def scrape_page(page, url, process_content_func, current_depth, max_depth):
    """Scrape one page, pass its cleaned text to a callback and queue its links.

    Args:
        page: Playwright page used to load ``url``.
        url: Address of the page to scrape.
        process_content_func: Callable invoked as ``func(url, text)`` with the
            cleaned text of the page.
        current_depth: Crawl depth of ``url``.
        max_depth: Deepest level that is scraped; deeper pages are skipped.

    Side effects:
        Same-site links found on the page are added to the module-level
        ``internal_urls`` set as ``(url, depth)`` pairs.
    """
    if current_depth > max_depth:
        return

    logger.info(f"Scraping {url} at depth {current_depth}")
    await page.goto(url)
    await page.wait_for_load_state("networkidle")

    # The selector list is handed over as an evaluate() argument instead of
    # being interpolated into the script text, so it is serialised properly
    # rather than relying on Python's list repr being valid JavaScript.
    remove_ignored_script = """(selectors) => {
        for (const selector of selectors) {
            document.querySelectorAll(selector).forEach(element => element.remove());
        }
    }"""
    await page.evaluate(remove_ignored_script, selectors_to_ignore)

    content = clean_content(await page.content())
    process_content_func(url, content)

    next_depth = current_depth + 1
    if next_depth > max_depth:
        # Links from this page would be rejected by the depth guard above,
        # so there is no point in queueing them.
        return

    # Links are collected after the ignored elements were removed, so only
    # links from the remaining page body are followed.
    for link in await page.get_by_role('link').all():
        href = await link.get_attribute('href')
        if not href:
            continue
        # Drop the fragment: "page#a" and "page#b" are the same document and
        # must not be queued as separate pages.
        full_url = urlparse(urljoin(url, href))._replace(fragment="").geturl()
        if is_internal_url(url, full_url):
            internal_urls.add((full_url, next_depth))

async def process_url(playwright, url, process_content_func, current_depth, max_depth):
    """Scrape ``url`` in a freshly launched headless Chromium browser.

    Args:
        playwright: Active Playwright instance used to launch the browser.
        url: Address of the page to scrape.
        process_content_func: Callback forwarded to ``scrape_page``.
        current_depth: Crawl depth of ``url``.
        max_depth: Maximum crawl depth, forwarded to ``scrape_page``.

    Errors raised while opening or scraping the page are logged and
    swallowed so that one failing page does not abort the crawl.
    """
    browser = await playwright.chromium.launch(headless=True)
    try:
        # The page is opened inside the try block so that the browser is
        # closed even when new_page() itself fails.
        page = await browser.new_page()
        await scrape_page(page, url, process_content_func, current_depth, max_depth)
    except Exception as ex:
        logger.error(f"Error scraping {url}: {str(ex)}")
    finally:
        await browser.close()

async def main(url, max_depth=2, max_pages=100, concurrency=5):
    """Crawl a site starting at ``url`` and collect its pages.

    Pending pages are taken from the module-level ``internal_urls`` set,
    scraped through ``process_url`` with ``save_func`` as the content
    callback, and recorded in ``visited_urls``.

    Args:
        url: Start URL, queued at depth 0.
        max_depth: Deepest link level that is scraped.
        max_pages: Maximum number of pages to process.
        concurrency: Currently unused; pages are processed one at a time.
    """
    internal_urls.add((url, 0))
    async with async_playwright() as playwright:
        page_count = 0
        while internal_urls and page_count < max_pages:
            # set.pop() returns an arbitrary entry, so crawl order is undefined.
            next_url, depth = internal_urls.pop()
            # Already-visited and too-deep entries are dropped without
            # launching a browser or consuming the page budget; too-deep
            # entries are not marked visited so the same URL can still be
            # scraped if it is also queued at an allowed depth.
            if next_url in visited_urls or depth > max_depth:
                continue
            visited_urls.add(next_url)
            await process_url(playwright, next_url, save_func, depth, max_depth)
            page_count += 1

# Top-level ``await`` only works in an async-aware session such as a notebook;
# a plain script would need ``asyncio.run(main(...))`` instead.
# ``max_pages=1`` limits this run to a single processed page.
await main("https://lespotevry.fr", max_pages=1)

Scraping https://lespotevry.fr at depth 0
Not narrative. Text does not contain a verb:

Aujourd'hui, votre centre est ouvert jusqu'à 19:00
Not narrative. Text does not contain a verb:

Aujourd'hui votre hypermarché est ouvert jusqu'à 19:00
Not narrative. Text does not contain a verb:

Vos restaurants sont ouverts jusqu'à 23:00
Sentence does not exceed 3 word tokens, it will not count toward sentence count.
cest quoi 
Sentence does not exceed 3 word tokens, it will not count toward sentence count.
cest quoi 
Not narrative. Text does not contain a verb:

c'est quoi ?
Sentence does not exceed 5 word tokens, it will not count toward sentence count.
cest quoi 
Not narrative. Text does not contain a verb:

Pour tous nos bons plans, c'est par ici !
Sentence does not exceed 3 word tokens, it will not count toward sentence count.
Facebook
Not narrative. Text exceeds cap ratio 0.5:

Facebook
Sentence does not exceed 5 word tokens, it will not count toward sentence count.
Facebook
Sentence does n

# Split

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the scraped pages into overlapping chunks (chunk_size=1000,
# chunk_overlap=200). ``documents`` is rebound to the list of chunks, so
# re-running this cell splits the chunks again.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
documents = text_splitter.split_documents(documents)


# Save


In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Embed the chunks with OpenAI embeddings and store them in a Chroma database
# under ../db/chroma/HS-01. ``documents`` must already exist in the session;
# running this cell in a fresh kernel raises NameError.
embedding_model = OpenAIEmbeddings()
db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory="../db/chroma/HS-01",
)
db.persist()

NameError: name 'documents' is not defined