# Playground

## Import libraries

In [1]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
import requests
from dotenv import load_dotenv

from agent import clean_html_for_llm, parse_repertoires_from_page, extract_links_from_page, identify_page_and_get_repertoire_links, get_repertoire_links


# Load variables from a local .env file (if present) so later cells can read
# them via os.environ. The trailing ';' keeps the call's return value from
# being echoed as the cell output.
load_dotenv();

## Test API connection

In [2]:
# Notes on the Together AI models tried so far
# (limit applies to input tokens + max_new_tokens):
#   meta-llama/Llama-3.3-70B-Instruct-Turbo-Free   must be <= 8193
#   Qwen/Qwen2.5-Coder-32B-Instruct                must be <= 32769
#   meta-llama/Llama-3.2-3B-Instruct-Turbo         gives extra text

# Ask for the key interactively only when the environment (including anything
# loaded from .env) has not already supplied a non-empty value.
if not os.getenv("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass.getpass(prompt="Enter API key for Together AI: ")

# Chat model shared by every later cell in this notebook.
model = init_chat_model("Qwen/Qwen2.5-Coder-32B-Instruct", model_provider="together")

In [36]:
# Minimal round-trip to confirm the API key and model work: one system
# instruction plus a one-word user message.
messages = [
    SystemMessage("Translate the following from English into Italian"),
    HumanMessage("hi!")
]

# Bare last expression so the notebook displays the returned message object.
model.invoke(messages)

AIMessage(content='Ciao!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 22, 'total_tokens': 26, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'Qwen/Qwen2.5-Coder-32B-Instruct', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-1dd40201-e1bd-41ad-ba47-08d4275bafdf-0', usage_metadata={'input_tokens': 22, 'output_tokens': 4, 'total_tokens': 26, 'input_token_details': {}, 'output_token_details': {}})

In [None]:
# Clean the repertoire page for the LLM, cache the result on disk, and report
# how much smaller the cleaned markup is compared with the raw response body.
url = "https://teatrdramatyczny.pl/whats-on"
cleaned_html = clean_html_for_llm(url)

# open(..., "w") does not create missing directories, so make sure temp/
# exists; otherwise this cell fails on a fresh checkout.
os.makedirs("temp", exist_ok=True)
with open("temp/cleaned_repertoire.html", "w", encoding="utf-8") as file:
    file.write(cleaned_html)
print("Cleaned HTML content saved to 'cleaned_repertoire.html'")

# Raw page fetched here only as the size baseline (clean_html_for_llm
# presumably downloads the page on its own — TODO confirm). The timeout keeps
# the cell from hanging indefinitely if the server stops responding.
original_size = len(requests.get(url, timeout=30).text)
cleaned_size = len(cleaned_html)
# An empty response body would otherwise raise ZeroDivisionError.
reduction = (1 - cleaned_size / original_size) * 100 if original_size else 0.0
print(f"Original size: {original_size} characters")
print(f"Cleaned size: {cleaned_size} characters")
print(f"Size reduction: {reduction:.2f}%")

Cleaned HTML content saved to 'cleaned_repertoire.html'
Original size: 403735 characters
Cleaned size: 46764 characters
Size reduction: 88.42%


## Parse HTML with LLM

In [None]:
# Feed the cached, cleaned HTML to the LLM parser and persist its raw reply.
with open("temp/cleaned_repertoire.html", "r", encoding="utf-8") as file:
    html_content = file.read()

full_response = parse_repertoires_from_page(html_content, model)

# Write with an explicit UTF-8 encoding, matching how the input was read.
# Without it the platform default is used, which can corrupt or reject
# non-ASCII characters (the source page is Polish) on non-UTF-8 locales.
with open("temp/llm_response.json", "w", encoding="utf-8") as file:
    file.write(full_response)

⚠️ Detected incomplete JSON... requesting continuation (attempt 1)


## Search for the repertoire page

In [14]:
# Start from the theatre's home page: clean it, collect its links, and ask the
# model whether this page is itself the repertoire page and which links to follow.
url = "https://teatrdramatyczny.pl/"
content = clean_html_for_llm(url)
links = extract_links_from_page(content, url)
# NOTE(review): despite the boolean-sounding name, the same call later in this
# notebook displays a dict with 'is_repertoire_page' and 'recommended_links'
# keys — presumably the same shape here; confirm against agent.py.
is_repertoire_page = identify_page_and_get_repertoire_links(content, links, model)
# Disabled follow-up step: parsing performances straight from this page.
# repertoires = parse_repertoires_from_page(content, model)

In [3]:
# End-to-end run: from the home page, let the model pick repertoire links,
# follow the first one, parse its performances and save the raw reply.
url = "https://teatrdramatyczny.pl/"
content = clean_html_for_llm(url)
links = extract_links_from_page(content, url)
repertoire_links = get_repertoire_links(links, model)

# Truthiness check skips the follow-up both for an empty list and for None,
# where len(...) would raise.
if repertoire_links:
    # Only the first returned link is followed.
    url = repertoire_links[0]['url']
    content = clean_html_for_llm(url)
    performances = parse_repertoires_from_page(content, model)
    # open(..., "w") does not create missing directories.
    os.makedirs("temp", exist_ok=True)
    # Explicit UTF-8 so non-ASCII titles are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open("temp/performances.json", "w", encoding="utf-8") as file:
        file.write(performances)

⚠️ Detected incomplete JSON... requesting continuation (attempt 1)


In [16]:
# Same identification step as for the home page, but pointed directly at the
# "what's on" URL to check that the model recognises it as a repertoire page.
url = "https://teatrdramatyczny.pl/whats-on"
content = clean_html_for_llm(url)
links = extract_links_from_page(content, url)
is_repertoire_page = identify_page_and_get_repertoire_links(content, links, model)
# Bare last expression displays the result: a dict with an 'is_repertoire_page'
# flag and a 'recommended_links' list of {'url', 'confidence'} entries.
is_repertoire_page

{'is_repertoire_page': True,
 'recommended_links': [{'url': 'https://teatrdramatyczny.pl/whats-on',
   'confidence': 0.95}]}