# Playground

## Import libraries

In [None]:
import getpass
import os
from langchain.chat_models import init_chat_model
from dotenv import load_dotenv
from agent import clean_html_for_llm, extract_links_from_page, get_repertoire_links, extract_pure_json
from geminy_agent import parse_repertoires_from_page
from google import genai
import subprocess

load_dotenv();

In [5]:
if not os.environ.get("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass.getpass("Enter API key for Together AI: ")

# meta-llama/Llama-3.3-70B-Instruct-Turbo-Free   inputs` tokens + `max_new_tokens` must be <= 8193
# Qwen/Qwen2.5-Coder-32B-Instruct                inputs` tokens + `max_new_tokens` must be <= 32769
# meta-llama/Llama-3.2-3B-Instruct-Turbo         gives extra text
# meta-llama/Llama-4-Scout-17B-16E-Instruct
# meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 give a try
model = init_chat_model("Qwen/Qwen2.5-Coder-32B-Instruct", model_provider="together")
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])


  model = init_chat_model("Qwen/Qwen2.5-Coder-32B-Instruct", model_provider="together")


In [2]:
url = "https://garnizonsztuki.org.pl/repertuar/"
html_content = subprocess.check_output(["curl", "-sL", url])
html_content

b'<!DOCTYPE html> <!--[if IE 7]><html class="ie ie7" dir="ltr" lang="pl-PL"\n prefix="og: https://ogp.me/ns#" > <![endif]--> <!--[if IE 8]><html class="ie ie8" dir="ltr" lang="pl-PL"\n prefix="og: https://ogp.me/ns#" > <![endif]--> <!--[if !(IE 7) & !(IE 8)]><!--><html lang="pl-PL"> <!--<![endif]--><head>  <script async src="https://www.googletagmanager.com/gtag/js?id=UA-171679420-1"></script> <script>window.dataLayer = window.dataLayer || [];\n    function gtag(){dataLayer.push(arguments);}\n    gtag(\'js\', new Date());\n\n    gtag(\'config\', \'UA-171679420-1\');\n    gtag(\'config\', \'AW-574896801\');</script> <meta charset="UTF-8" /><meta name="viewport" content="width=device-width" /><meta name="description" content="GARNIZON SZTUKI-teatr pozytywnych emocji."><meta name="format-detection" content="telephone=no"><title>Repertuar - Garnizon Sztuki</title><link rel="profile" href="https://gmpg.org/xfn/11" /><link rel="pingback" href="https://garnizonsztuki.org.pl/xmlrpc.php" /> <!-

In [None]:
url = "https://garnizonsztuki.org.pl/repertuar/"
content = clean_html_for_llm(url)
content

Error: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.

In [7]:
with open(f"temp/garnizonsztuki.html", "w") as file:
    file.write(content)

In [6]:
performances = parse_repertoires_from_page(content, client)
performances

[]

In [5]:
system_message = """
        I will provide you with an HTML snippet containing information about theater performances. Extract all performances, including their titles, dates, and times, and return the result as a JSON array with the following format:
        ```
        [
        {
            "title": "Performance name",
            "date": "YYYY-MM-DD",
            "time": "HH:MM",
            "status": "Performance status",
            "place": "Performance place",
        }
        ]
        ```
        Important rules:
        - Return ONLY valid JSON (no extra text or markdown).
        - If you are reaching the token limit or need to stop, DO NOT cut off in the middle of a JSON object. 
        Finish the current object fully and stop after a comma (`,`) or after the closing bracket (`]`) if at the end.
        - When continuing later, start exactly where you left off, starting with the next JSON object or the closing bracket.
"""
messages = F"""
        {system_message}

        {content}
    """
response = client.models.generate_content(
         model="gemini-2.0-flash", contents=messages
    )

In [9]:
response.text

'```json\n[]\n```'

In [15]:
pure_json = extract_pure_json(response.text)

In [16]:
pure_json

'[]'

In [16]:
theatre_url = "https://nowyteatr.org/pl"
links = extract_links_from_page(content, theatre_url)

In [19]:
repertoire_links = get_repertoire_links(links, model)

In [11]:
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

In [12]:
performances = parse_repertoires_from_page(content, client)

In [14]:
content

'<body>\n<aside>\n<div>\n<button>\n<span>Zamknij</span>\n\n</button>\n<div>\n<p>\nNowy Teatr korzysta z plików cookies. Zapoznaj się z naszą <a href="/pl/polityka-prywatnosci">Polityką plików cookies</a> w której informujemy o celu używanych przez nas cookies. Jeśli klikniesz Akceptuj - zgadzasz się na instalację plików cookies w Twoim urządzeniu w celu dostosowania naszych reklam do Twoich potrzeb.\n</p>\n</div>\n<div>\n<button>Akceptuj<span>Akceptuj cookies</span></button>\n</div>\n</div>\n</aside>\n<div>\n<header>\n<div>\n<div>\n<div>\n<button>\n\n<span>Szukaj</span>\n</button>\n</div>\n<div>\n<nav aria-label="Główna">\n<div>\n<div data-gradient-options=\'{"image": "/public/upload/misc/jpg/5d1373855a1bf___1568930328.jpg", "shift": 80, "destroy": false, "breakpoints": ["desktop"]}\'></div>\n</div>\n\n<ul>\n<li><a href="/pl/kalendarz"><span>Kalendarz</span></a></li><li><a href="/pl/program"><span>Program</span></a><ul><li><a href="/pl/program/teatr">Teatr</a></li><li><a href="/pl/prog

In [13]:
performances

'[]'

In [15]:
with open(f"temp/nowyteatr.html", "w") as file:
    file.write(content)

In [None]:
theatres = ["https://teatrdramatyczny.pl/", "https://www.teatr2strefa.pl", "https://teatrstudio.pl", "https://teatrpolski.waw.pl/", "https://www.wspolczesny.pl/"]
theatres = ["https://www.wspolczesny.pl/"]

for theatre_url in theatres:
    theater_name = get_theatre_name(theatre_url)
    print(f"Parsing theatre {theater_name}, url {theatre_url}")
    content = clean_html_for_llm(theatre_url)
    links = extract_links_from_page(content, theatre_url)
    repertoire_links = get_repertoire_links(links, model)

    if len(repertoire_links) > 0:
        first_url = repertoire_links[0]
        if first_url["confidence"] >= 0.7:
            url = first_url['url']
        else:
            url = theatre_url
        print(f"Repertoire link {url}")
        content = clean_html_for_llm(url)   
        performances = parse_repertoires_from_page(content, client)
        with open(f"temp/{theater_name}.json", "w") as file:
            file.write(performances)


Parsing theatre wspolczesny, url https://www.wspolczesny.pl/




Repertoire link https://www.wspolczesny.pl/repertuar/biezacy


