In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
import dotenv
import langchain_huggingface

# Load environment variables from the project-level .env file (presumably where
# HF_TOKEN lives -- confirm). override=True lets values from the file replace
# variables that are already set in the process environment.
_ = dotenv.load_dotenv(override=True, dotenv_path="../.env")

# Hugging Face endpoint for the DeepSeek model. os.environ[...] raises KeyError
# when HF_TOKEN is not defined, so a missing token fails right here.
llm_endpoint = langchain_huggingface.HuggingFaceEndpoint(
    repo_id="deepseek-ai/DeepSeek-V3.2",
    task="text-generation",
    provider="auto",
    temperature=0.1,
    huggingfacehub_api_token=os.environ["HF_TOKEN"],
)

# Chat-model wrapper around the endpoint; later cells use `llm` for structured
# output and for the tool-calling agent.
llm = langchain_huggingface.ChatHuggingFace(llm=llm_endpoint)

In [None]:
import pydantic

class GraphState(pydantic.BaseModel):
    """State passed between the nodes of the scraping graph.

    Only ``url`` has to be supplied by the caller. ``raw_text`` is written by
    ``scrape_page`` and ``country_info`` by ``extract_country``; both are
    ``None`` until the corresponding node has run.
    """

    url: str
    raw_text: str | None = pydantic.Field(default=None)
    country_info: dict | None = pydantic.Field(default=None)

# Target schema for the structured LLM output requested in extract_country.
# Documented with comments rather than a class docstring so the schema that is
# generated from this model stays exactly as it was.
class CountryInfo(pydantic.BaseModel):
    country_name: str = pydantic.Field(
        description="The name of the country",
    )
    # A holiday name and a holiday date were sketched as additional fields but
    # are not part of this model; HolidayInfo (defined in a later cell)
    # carries them.

In [None]:
from langchain_community import document_loaders, document_transformers
from langchain_core import prompts

def scrape_page(state: "GraphState | dict") -> dict:
    """Graph node: load the page at the state's URL and keep the text of its ``<li>`` tags.

    Args:
        state: Current graph state. Inside the compiled graph this is a
            ``GraphState`` instance, because the graph is built with
            ``state_schema=GraphState``; a plain ``dict`` with a ``"url"`` key
            is accepted as well so the function can still be called directly.

    Returns:
        A dict holding every field of the incoming state plus ``"raw_text"``,
        set to the page content of the first transformed document.

    Raises:
        KeyError: If ``state`` is a dict without a ``"url"`` key.
        IndexError: If the transformer returns no documents.
    """
    # A pydantic model supports attribute access but neither ``state["url"]``
    # nor ``**state``, so normalise both accepted state shapes up front.
    if isinstance(state, dict):
        url = state["url"]
        current = dict(state)
    else:
        url = state.url
        current = state.model_dump()

    loader = document_loaders.AsyncChromiumLoader(urls=[url])
    docs = loader.load()

    # Reduce the loaded HTML to the text found inside <li> elements.
    transformer = document_transformers.BeautifulSoupTransformer()
    cleaned_docs = transformer.transform_documents(
        documents=docs,
        tags_to_extract=["li"],
    )

    return {
        **current,
        "raw_text": cleaned_docs[0].page_content,
    }

def extract_country(state: "GraphState | dict") -> dict:
    """Graph node: ask the LLM for structured country information about the scraped text.

    Args:
        state: Current graph state. Inside the compiled graph this is a
            ``GraphState`` instance, because the graph is built with
            ``state_schema=GraphState``; a plain ``dict`` with a ``"raw_text"``
            key is accepted as well so the function can still be called
            directly.

    Returns:
        A dict holding every field of the incoming state plus
        ``"country_info"``, the ``model_dump()`` of the structured result.

    Raises:
        KeyError: If ``state`` is a dict without a ``"raw_text"`` key.
        ValueError: If ``raw_text`` is ``None`` (nothing has been scraped yet).
    """
    # A pydantic model supports attribute access but neither
    # ``state["raw_text"]`` nor ``**state``, so normalise both shapes up front.
    if isinstance(state, dict):
        raw_text = state["raw_text"]
        current = dict(state)
    else:
        raw_text = state.raw_text
        current = state.model_dump()

    # Without this check a missing text would be formatted into the prompt as
    # the literal string "None" and sent to the model.
    if raw_text is None:
        raise ValueError("extract_country requires raw_text; run scrape_page first")

    prompt = prompts.ChatPromptTemplate.from_messages(messages=[
        ("system", "Extract structured country information from the text."),
        ("human", "{text}")
    ])
    structured_llm = llm.with_structured_output(CountryInfo)
    result = structured_llm.invoke(prompt.format_messages(text=raw_text))

    return {
        **current,
        "country_info": result.model_dump(),
    }

In [None]:
from langgraph import graph

# Linear pipeline: START -> scrape_page -> extract_country -> END,
# with GraphState as the schema of the shared state.
graph_builder = graph.StateGraph(state_schema=GraphState)

# Register the nodes first ...
graph_builder.add_node(node="scrape_page", action=scrape_page)
graph_builder.add_node(node="extract_country", action=extract_country)

# ... then connect them. The edge from START marks scrape_page as the entry point.
graph_builder.add_edge(start_key=graph.START, end_key="scrape_page")
graph_builder.add_edge(start_key="scrape_page", end_key="extract_country")
graph_builder.add_edge(start_key="extract_country", end_key=graph.END)

app = graph_builder.compile()

# Cell output: show the compiled graph.
app

In [None]:
# Run the compiled graph once; only the URL is supplied, the remaining state
# fields are filled in by the nodes.
result = app.invoke({"url": "https://www.timeanddate.com/holidays/?allcountries"})

# Cell output: the structured data stored by extract_country.
result["country_info"]

In [None]:
from langchain import tools
from langchain_community import document_loaders, document_transformers
import pydantic

@tools.tool("get_holiday")
def get_holiday(url: str) -> str:
    """
    Scrapes a URL and returns a holiday.
    """
    # NOTE(review): the docstring above is kept verbatim because the tool
    # decorator presumably exposes it as the tool description -- confirm
    # before rewording it. What is actually returned is the page content of
    # the first document after it has been reduced to its <li> elements.
    page_docs = document_loaders.AsyncChromiumLoader(urls=[url]).load()

    li_docs = document_transformers.BeautifulSoupTransformer().transform_documents(
        documents=page_docs,
        tags_to_extract=["li"],
    )

    first_doc = li_docs[0]
    return first_doc.page_content

class HolidayInfo(pydantic.BaseModel):
    """
    Structured holiday information.
    """
    # Used as the agent's response_format in a later cell. None of the fields
    # has a default, so all three are required.
    country_name: str = pydantic.Field(
        description="The name of the country",
    )
    holiday: str = pydantic.Field(
        description="The name of the holiday",
    )
    date: str = pydantic.Field(
        description="The date of the holiday",
    )

In [None]:
from langchain_core import prompts
from langchain import agents
# from langgraph import prebuilt

# Page the agent is asked to visit, and the conversation that starts the run.
# The messages are passed to the agent directly; no prompt template or chain
# is involved.
url = "https://www.timeanddate.com/holidays/?allcountries"
messages = [
    ("system", "You're a helpful research assistant. Use tools to scrape web data."),
    ("human", f"Go to {url} and extract the country name from the website"),
]

# Agent with a single scraping tool whose final answer is requested in the
# HolidayInfo shape. debug=True is passed through to create_agent.
# NOTE(review): the human message only asks for the country name, while
# HolidayInfo also requires `holiday` and `date` -- confirm this is intended.
agent = agents.create_agent(
    tools=[get_holiday],
    model=llm,
    response_format=HolidayInfo,
    debug=True,
)

response = agent.invoke({"messages": messages})

# Cell output: the structured part of the agent's response.
response["structured_response"]

In [None]:
import nest_asyncio; nest_asyncio.apply()
import asyncio
from playwright import async_api
import scrapy

async def main(url: str = "https://www.timeanddate.com/holidays/?allcountries") -> None:
    """Open ``url`` in headless Chromium and print the page's HTML.

    Args:
        url: Address to navigate to. Defaults to the page this notebook has
            been scraping, so calling ``main()`` without arguments behaves as
            before.

    Returns:
        None. The HTML is written to stdout.
    """
    async with async_api.async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            # No explicit wait for a selector: the content is read as soon as
            # goto() returns.
            print(await page.content())
        finally:
            # Close the browser even when navigation or content retrieval fails.
            await browser.close()


# asyncio.run() is called from notebook code here; the nest_asyncio.apply()
# at the top of this cell is presumably what allows that while the notebook's
# own event loop is running -- confirm.
asyncio.run(main())