In [7]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

def extract_html_from_url(url):
    """Render ``url`` in headless Chrome and return its text with link targets inlined.

    The page is loaded with Selenium, the resulting DOM is serialized, every
    ``<footer>`` and ``<nav>`` element is dropped, and each anchor that has an
    ``href`` gets its text rewritten to ``"<text> (<href>)"``.

    Args:
        url: Address of the page to load.

    Returns:
        A single string made of all remaining text fragments of the page,
        whitespace-stripped and joined with single spaces.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # NOTE(review): implicitly_wait only sets the timeout used by element
        # lookups; no lookup happens here, so this does not pause for
        # late-rendered content. Confirm whether an explicit wait is needed.
        driver.implicitly_wait(2)
        response_content = driver.execute_script("return document.documentElement.outerHTML;")
    finally:
        # Always shut the browser down, even when navigation or the script
        # fails; otherwise every failed call leaves a Chrome process behind.
        driver.quit()

    # Parse the serialized DOM with BeautifulSoup
    soup = BeautifulSoup(response_content, "html.parser")

    # Drop page chrome that carries no listing content
    excluded_tagNames = ["footer", "nav"]
    for tag_name in excluded_tagNames:
        for unwanted_tag in soup.find_all(tag_name):
            unwanted_tag.extract()

    # Keep link targets visible in the text output by appending the href
    # to the anchor's own text
    for a_tag in soup.find_all("a"):
        href = a_tag.get("href")
        if href:
            a_tag.string = f"{a_tag.get_text()} ({href})"

    return ' '.join(soup.stripped_strings)  # Text content with preserved hrefs



In [5]:
from pydantic import BaseModel, Field
from typing import Optional
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv()

class Restaurant(BaseModel):
    """One restaurant entry; the element type of ``RestaurantScrapper.Restaurants``.

    The ``description`` of every field is part of the model's schema. Fields
    typed ``Optional[...]`` default to ``None`` so that an entry lacking that
    piece of information still validates.
    """

    position: int = Field(description="Position")
    name: str = Field(description="Name of the restaurant")
    ranking: float = Field(description="Score/Ranking of the restaurant by the users")
    # Explicit default=None: an Optional annotation alone does not make a
    # field optional under pydantic v2, and is harmless under v1.
    opinions: Optional[int] = Field(default=None, description="Number of the opinions given to the restaurant")
    price: Optional[str] = Field(default=None, description="Average range of price for the restaurant")
    type: str = Field(description="Type of restaurante: japanese, mediterranean, etc.")
    address: str = Field(description="Address of the restaurant")
    phone_number: Optional[str] = Field(default=None, description="Phone number of the restaurant")
    time_table: str = Field(description="Timetable of the restaurant")
    
class RestaurantScrapper(BaseModel):
    """Top-level extraction result: the list of restaurants found in the text."""

    # The first positional argument of Field is the *default value*, not the
    # description, so the text must be passed as ``description=``. An empty
    # list default keeps no-argument construction working with a value of the
    # declared type.
    Restaurants: list[Restaurant] = Field(
        default_factory=list,
        description="List of all the restaurants listed in the text",
    )


# Chat model with temperature 0 and a parser that validates the model's reply
# against the RestaurantScrapper schema.
llm = ChatOpenAI(temperature=0)
output_parser = PydanticOutputParser(pydantic_object = RestaurantScrapper)

# Prompt text. It only asks for what the RestaurantScrapper schema can hold:
# the earlier request to also return a "next page link" had no matching field
# in the schema and contradicted the format instructions.
prompt_template = """
You are an expert making web scraping and analyzing HTML raw code.
If there is no explicit information don't make any assumption.
Extract all objects that match the instructions from the following html
{html_text}
Provide them in a list.
Please, carefully follow the following instructions
{format_instructions}
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["html_text"],
    # The bound method itself is passed (not its result); the prompt template
    # is expected to call it when the prompt is formatted.
    partial_variables={"format_instructions": output_parser.get_format_instructions}
)

# Pipeline: fill the prompt -> query the chat model -> parse into RestaurantScrapper
chain = prompt | llm | output_parser


In [8]:
# Scrape the rendered search page, then let the chain turn its text into
# structured Restaurant records.
# NOTE(review): the recorded output of this cell lists 'Restaurant A' and
# 'Restaurant B' with generic addresses; verify that the scraped text really
# contains the listings before trusting the result.
url = "https://www.google.com/maps/search/restaurantes/@40.4439646,-3.6728576,16.03z?hl=es&entry=ttu"
html_text_parsed = extract_html_from_url(url)
restaurants = chain.invoke({"html_text": html_text_parsed})
restaurants.Restaurants

[Restaurant(position=1, name='Restaurant A', ranking=4.5, opinions=200, price='$$$', type='Mediterranean', address='123 Main St, City, Country', phone_number='123-456-7890', time_table='Mon-Sun: 11am-10pm'),
 Restaurant(position=2, name='Restaurant B', ranking=4.2, opinions=150, price='$$', type='Japanese', address='456 Elm St, City, Country', phone_number='987-654-3210', time_table='Tue-Sat: 12pm-9pm')]