# Playground

## Import libraries

In [3]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from dotenv import load_dotenv
from urllib.parse import urlparse
from agent import clean_html_for_llm, parse_repertoires_from_page, extract_links_from_page, identify_page_and_get_repertoire_links, get_repertoire_links

# Load variables from a local .env file into os.environ so the cells below can read
# TOGETHER_API_KEY / GOOGLE_API_KEY from it. The trailing semicolon suppresses the
# cell's output repr of the call's return value.
load_dotenv();

## Test API connection

In [14]:
# Keep an already-configured Together AI key; only fall back to an interactive
# prompt when the variable is missing or empty.
os.environ["TOGETHER_API_KEY"] = os.environ.get("TOGETHER_API_KEY") or getpass.getpass(
    "Enter API key for Together AI: "
)

# Notes on the Together-hosted models tried so far:
#   meta-llama/Llama-3.3-70B-Instruct-Turbo-Free  -> input tokens + `max_new_tokens` must be <= 8193
#   Qwen/Qwen2.5-Coder-32B-Instruct               -> input tokens + `max_new_tokens` must be <= 32769
#   meta-llama/Llama-3.2-3B-Instruct-Turbo        -> gives extra text
#   meta-llama/Llama-4-Scout-17B-16E-Instruct
model = init_chat_model(
    model="Qwen/Qwen2.5-Coder-32B-Instruct",
    model_provider="together",
)


  model = init_chat_model("Qwen/Qwen2.5-Coder-32B-Instruct", model_provider="together")


In [3]:
# Minimal round trip against the chat model: one system instruction, one user turn.
messages = [
    SystemMessage(content="Translate the following from English into Italian"),
    HumanMessage(content="hi!"),
]

# Left as the bare last expression so the notebook displays the returned message.
model.invoke(messages)

AIMessage(content='Ciao!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 22, 'total_tokens': 26, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'Qwen/Qwen2.5-Coder-32B-Instruct', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-ea974058-f35f-4ad7-92ff-a01aae5b51d6-0', usage_metadata={'input_tokens': 22, 'output_tokens': 4, 'total_tokens': 26, 'input_token_details': {}, 'output_token_details': {}})

In [31]:
from google import genai

# Mirror the Together AI cell: ask for the key interactively instead of failing
# with a bare KeyError when GOOGLE_API_KEY is not set (e.g. on a fresh kernel
# without a .env file).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

# `client` is reused by later cells (it is passed to parse_repertoires_from_page).
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# Smoke test: a short prompt to confirm that the key and the model name work.
response = client.models.generate_content(
    model="gemini-2.0-flash", contents="Explain how AI works in a few words"
)
print(response.text)

AI works by learning patterns from data to make predictions or decisions.



In [14]:
# Single-site dry run of the helpers imported from `agent`, using one theatre homepage.
url = "https://teatrdramatyczny.pl/"
content = clean_html_for_llm(url)  # note: the helper is given the URL, not raw HTML
links = extract_links_from_page(content, url)
# NOTE(review): the variable name reads like a boolean, while the helper's name suggests
# it also returns repertoire links — confirm the actual return value in agent.py.
is_repertoire_page = identify_page_and_get_repertoire_links(content, links, model)
# Disabled call kept from an earlier experiment. NOTE(review): it passes `model`, whereas
# the parse_repertoires_from_page defined later in this notebook expects a genai client.
# repertoires = parse_repertoires_from_page(content, model)

In [30]:
from agent import extract_pure_json

def parse_repertoires_from_page(html_content: str, client, model_name: str = "gemini-2.0-flash") -> list[dict]:
    """Ask a Gemini model to extract theatre performances from an HTML snippet.

    The instructions and the HTML are sent as one text prompt; the text of the
    reply is then passed through ``extract_pure_json``.

    Args:
        html_content: HTML to scan (the notebook passes the output of
            ``clean_html_for_llm``).
        client: Object exposing ``models.generate_content(model=..., contents=...)``;
            the notebook passes a ``google.genai.Client``.
        model_name: Model identifier forwarded to ``generate_content``. Defaults to
            the model that was previously hard-coded.

    Returns:
        Whatever ``extract_pure_json`` produces from the reply text.
        NOTE(review): the annotation says ``list[dict]``, but the notebook writes the
        result straight to a text file, which suggests a JSON string — confirm
        against ``agent.extract_pure_json``.
    """
    # The example object must itself be valid JSON (no trailing comma after the last
    # key); otherwise the prompt contradicts its own "ONLY valid JSON" rule.
    system_message = """
        I will provide you with an HTML snippet containing information about theater performances. Extract all performances, including their titles, dates, and times, and return the result as a JSON array with the following format:
        ```
        [
        {
            "title": "Performance name",
            "date": "YYYY-MM-DD",
            "time": "HH:MM",
            "status": "Performance status",
            "place": "Performance place"
        }
        ]
        ```
        Important rules:
        - Return ONLY valid JSON (no extra text or markdown).
        - If you are reaching the token limit or need to stop, DO NOT cut off in the middle of a JSON object. 
        Finish the current object fully and stop after a comma (`,`) or after the closing bracket (`]`) if at the end.
        - When continuing later, start exactly where you left off, starting with the next JSON object or the closing bracket.
    """
    # Instructions first, then the page content, in a single prompt string.
    prompt = f"""
        {system_message}

        {html_content}
    """
    response = client.models.generate_content(model=model_name, contents=prompt)
    return extract_pure_json(response.text)

In [29]:
def get_theatre_name(url):
    """Derive a short theatre identifier from a site URL.

    Takes the network location of ``url``, drops a leading ``www.`` and removes
    the final dot-separated label (for example the ``.pl`` suffix).
    """
    www_prefix = "www."
    host = urlparse(url).netloc
    host = host[len(www_prefix):] if host.startswith(www_prefix) else host
    # Only the last label is cut off; a host without any dot is returned unchanged.
    name_without_suffix, *_ = host.rsplit(".", maxsplit=1)
    return name_without_suffix

In [32]:
# Minimum confidence the first suggested link must carry before it is followed;
# below this the theatre's homepage itself is parsed.
MIN_LINK_CONFIDENCE = 0.7
# Folder that receives one <theatre name>.json file per site.
OUTPUT_DIR = "temp"

theatres = ["https://teatrdramatyczny.pl/", "https://www.teatr2strefa.pl", "https://teatrstudio.pl"]

# Create the output folder up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for theatre_url in theatres:
    theater_name = get_theatre_name(theatre_url)
    print(f"Parsing theatre {theater_name}, url {theatre_url}")
    content = clean_html_for_llm(theatre_url)
    links = extract_links_from_page(content, theatre_url)
    repertoire_links = get_repertoire_links(links, model)

    # Sites for which no candidate link came back are skipped silently, as before.
    if not repertoire_links:
        continue

    # Only the first candidate is considered.
    first_url = repertoire_links[0]
    if first_url["confidence"] >= MIN_LINK_CONFIDENCE:
        url = first_url["url"]
    else:
        url = theatre_url
    print(f"Repertoire link {url}")

    # The homepage was already fetched above; download again only for a different page.
    if url != theatre_url:
        content = clean_html_for_llm(url)

    performances = parse_repertoires_from_page(content, client)
    # Explicit UTF-8: the scraped text is Polish and must not depend on the platform's
    # default encoding. Assumes `performances` is text (file.write requires a str) —
    # TODO confirm against extract_pure_json.
    with open(f"{OUTPUT_DIR}/{theater_name}.json", "w", encoding="utf-8") as file:
        file.write(performances)


Parsing theatre teatrdramatyczny, url https://teatrdramatyczny.pl/
Repertoire link https://teatrdramatyczny.pl/repertuar
Parsing theatre teatr2strefa, url https://www.teatr2strefa.pl
Repertoire link https://www.teatr2strefa.pl
Parsing theatre teatrstudio, url https://teatrstudio.pl
Repertoire link https://teatrstudio.pl/pl/repertuar
