# Playground

## Import libraries

In [9]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup, Comment
import re
import json

load_dotenv();

## Test API connection

In [2]:
# Ask for the Together AI key interactively only when neither the shell
# environment nor the loaded .env file provided a non-empty value.
if not os.getenv("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass.getpass(prompt="Enter API key for Together AI: ")

# Notes on the Together AI models tried so far:
#   meta-llama/Llama-3.3-70B-Instruct-Turbo-Free  input tokens + max_new_tokens must be <= 8193
#   Qwen/Qwen2.5-Coder-32B-Instruct               input tokens + max_new_tokens must be <= 32769
#   meta-llama/Llama-3.2-3B-Instruct-Turbo        gives extra text
model = init_chat_model(
    model="Qwen/Qwen2.5-Coder-32B-Instruct",
    model_provider="together",
)

In [5]:
# Smoke test for the API connection: a one-line translation request.
messages = [
    SystemMessage(content="Translate the following from English into Italian"),
    HumanMessage(content="hi!"),
]

# Kept as the cell's final expression so the notebook displays the reply.
model.invoke(messages)

AIMessage(content='Ciao!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 22, 'total_tokens': 26, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'Qwen/Qwen2.5-Coder-32B-Instruct', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-f1a4f673-40b8-4718-b1fc-49c64cec2cb9-0', usage_metadata={'input_tokens': 22, 'output_tokens': 4, 'total_tokens': 26, 'input_token_details': {}, 'output_token_details': {}})

## Python HTML request

In [3]:

def clean_html_for_llm(url, timeout=30):
    """Download ``url`` and return a reduced HTML string for use in an LLM prompt.

    The page is parsed with BeautifulSoup and stripped of content that carries
    no visible information: comments, scripts/styles/embedded media, elements
    hidden via inline style or a well-known "hidden" CSS class, ``<meta>`` and
    ``<link>`` tags in the head, ``data-*``/``class``/``id``/``style``
    attributes, redundant whitespace and empty elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The cleaned markup of ``<body>`` (or of the whole document when there
        is no body). If the server answers with a status other than 200, the
        string ``"Failed to retrieve content: <status>"`` is returned instead,
        so callers should check for that prefix.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection problems or when ``timeout`` expires.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return f"Failed to retrieve content: {response.status_code}"

    soup = BeautifulSoup(response.text, "html.parser")

    # HTML comments are never rendered.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Non-textual or executable content.
    for element in soup.find_all(['script', 'style', 'noscript', 'iframe', 'svg', 'canvas']):
        element.extract()

    # Elements hidden through an inline style. Whitespace is removed before
    # matching so "display:none", "display: none" and "display : none" are
    # all recognised.
    hidden_style_rules = ("display:none", "visibility:hidden")
    for element in soup.find_all(style=True):
        compact_style = re.sub(r'\s+', '', element.get('style').lower())
        if any(rule in compact_style for rule in hidden_style_rules):
            element.extract()

    # Elements hidden through a utility class. Compare whole class tokens:
    # a substring test would also delete visible elements whose classes merely
    # contain one of these words (e.g. "overflow-hidden").
    hidden_class_names = {"hidden", "d-none", "hide", "invisible", "visually-hidden"}
    for element in soup.find_all(class_=True):
        class_tokens = element.get('class') or []
        if isinstance(class_tokens, str):
            class_tokens = class_tokens.split()
        if any(token.lower() in hidden_class_names for token in class_tokens):
            element.extract()

    # Only matters when the document has no <body> and is serialised whole.
    if soup.head:
        for tag in soup.head.find_all(['meta', 'link']):
            tag.extract()

    root = soup.body if soup.body else soup

    # Drop presentational/bookkeeping attributes on the parsed tree rather
    # than with regexes on the serialised markup, so text content that happens
    # to look like an attribute is left alone and quoting style is irrelevant.
    for tag in [root, *root.find_all(True)]:
        for attr_name in list(tag.attrs):
            if attr_name.startswith('data-') or attr_name in ('class', 'id', 'style'):
                del tag.attrs[attr_name]

    content_str = str(root)
    # Collapse blank lines, trim each line, and squeeze runs of spaces.
    content_str = re.sub(r'\n\s*\n', '\n', content_str)
    content_str = re.sub(r'^\s+|\s+$', '', content_str, flags=re.MULTILINE)
    content_str = re.sub(r' {2,}', ' ', content_str)
    # Removing an empty element can leave its parent empty, so repeat a few
    # times to clear shallow nests of empty wrappers.
    for _ in range(3):
        content_str = re.sub(r'<([a-z0-9]+)>\s*</\1>', '', content_str, flags=re.IGNORECASE)
    return content_str

url = "https://teatrdramatyczny.pl/whats-on"
cleaned_html = clean_html_for_llm(url)

# clean_html_for_llm reports a non-200 response by returning a message string
# instead of raising. Stop here so the message is not saved as if it were the
# cleaned page and later sent to the LLM.
if cleaned_html.startswith("Failed to retrieve content:"):
    raise RuntimeError(cleaned_html)

with open("cleaned_repertoire.html", "w", encoding="utf-8") as file:
    file.write(cleaned_html)
print("Cleaned HTML content saved to 'cleaned_repertoire.html'")

# The page is downloaded a second time purely to measure its raw size; the
# timeout keeps the cell from hanging if the server stops responding.
original_size = len(requests.get(url, timeout=30).text)
cleaned_size = len(cleaned_html)
# Guard against an empty response so the ratio cannot divide by zero.
reduction = (1 - cleaned_size / original_size) * 100 if original_size else 0.0
print(f"Original size: {original_size} characters")
print(f"Cleaned size: {cleaned_size} characters")
print(f"Size reduction: {reduction:.2f}%")


Cleaned HTML content saved to 'cleaned_repertoire.html'
Original size: 427992 characters
Cleaned size: 50105 characters
Size reduction: 88.29%


## Parse HTML with LLM

In [10]:
# Load the cleaned page saved by the previous section.
with open("cleaned_repertoire.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# Extraction instructions for the model. The example must itself be valid
# JSON (no trailing comma after the last key), because the prompt asks the
# model to return valid JSON and models tend to imitate the example verbatim.
system_message = """
I will provide you with an HTML snippet containing information about theater performances. Extract all performances, including their titles, dates, and times, and return the result as a JSON array with the following format:
```
[
  {
    "title": "Performance name",
    "date": "YYYY-MM-DD",
    "time": "HH:MM",
    "status": "Performance status",
    "place": "Performance place"
  }
]
```
Do not include any extra text in the response—only return valid JSON.
"""
def is_json_complete(text: str) -> bool:
    """Tell whether ``text`` parses as one complete JSON document.

    Args:
        text: Candidate JSON, e.g. the model output accumulated so far.

    Returns:
        ``True`` when ``json.loads`` accepts the text, ``False`` when it
        raises ``json.JSONDecodeError`` (truncated or otherwise malformed).
    """
    parsed_ok = True
    try:
        json.loads(text)
    except json.JSONDecodeError:
        parsed_ok = False
    return parsed_ok

# Start a fresh conversation: the extraction instructions plus the cleaned page.
messages = [
    SystemMessage(system_message),
    HumanMessage(html_content)
]

full_response = ""
max_loops = 5  # upper bound on model calls, so a never-valid reply cannot loop forever
loop_count = 0

# The reply can be cut off before the JSON is finished. Keep asking the model
# to continue and concatenate the pieces until the whole text parses.
while loop_count < max_loops:
    response = model.invoke(messages)
    full_response += response.content

    if is_json_complete(full_response):
        break

    print(f"⚠️ Detected incomplete JSON... requesting continuation (attempt {loop_count + 1})")

    # Feed the partial answer back so the model knows where it stopped.
    messages.append(AIMessage(response.content))
    messages.append(HumanMessage("Please continue from where you left off. Only continue the JSON."))
    loop_count += 1
else:
    # Reached only when every attempt was used without a successful parse.
    print(f"⚠️ JSON still incomplete after {max_loops} attempts; saving partial response")

# Explicit UTF-8, matching the other file I/O in this notebook: the response
# contains non-ASCII text and the platform default encoding may not handle it.
with open("llm_response.txt", "w", encoding="utf-8") as file:
    file.write(full_response)

⚠️ Detected incomplete JSON... requesting continuation (attempt 1)
