In [55]:
import time

In [None]:
from src.api.llm_base_client import LLMBaseClient

In [86]:
# System prompt for the bet-extraction assistant; BettingScraper hands it to LLMBaseClient.
# Literal JSON braces are doubled ({{ and }}) -- presumably because the client renders this
# string as a str.format-style template; confirm against LLMBaseClient.
# The example response is kept as valid JSON (after un-escaping the braces) so the model
# is never shown a malformed reference answer.
PROMPT_TEMPLATE = """
You are an AI Assistant that assists in scraping web pages. In particular you find and extract sporting bets from betting sites. You are provided the page content from some webpage.

Always follow the following instructions and never deviate from them:
- Always follow the instructions and only respond with information extracted from provided sources. Never invent information not present in any source material provided to you.
- Carefully read the provided page content and analyze its contents.
- Find all mentioned sporting bets and extract the required information in JSON format as a list of dictionaries in the following format:
    [{{
        "name": <name of the bet as string>,
        "odds": <dictionary containing the odds of each possible event. The event name as string key, the odds as floating point value.>
    }}, ...]
- Respond only with the JSON content and do not add any explanations or elaborations to your response.
- If you cannot find any bets simply respond with an empty list and no additional words.

An example response could look as follows:
[{{
    "name": "Football game Team A vs Team B",
    "odds": {{"Team A": 2.5,
              "Draw": 44,
              "Team B": 1.5}}
}},
{{
    "name": "Hockey game Team C vs Team D",
    "odds": {{"Team C": 3.5,
              "Draw": 2,
              "Team D": 0.5}}
}}]
"""

In [87]:
class BettingScraper(LLMBaseClient):
    """LLM client specialised for extracting sporting bets from page content.

    The only customisation visible here is that the base client is constructed
    with ``PROMPT_TEMPLATE``; everything else (including ``invoke``) is inherited
    from ``LLMBaseClient``, whose implementation is not part of this notebook.
    """

    def __init__(self) -> None:
        """Initialise the base client with the bet-extraction prompt template."""
        super().__init__(PROMPT_TEMPLATE)

In [88]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_page_content(url):
    """
    Fetch the rendered page source of a URL with a Selenium-driven Firefox browser.

    The page is loaded, the function waits for ``document.readyState`` to become
    ``'complete'``, polls the page source until it stops changing (or a time budget
    runs out), scrolls to the bottom a few times to trigger lazy loading, and then
    returns the final page source. The browser is always shut down afterwards,
    including when an error occurs part-way through.

    Args:
        url (str): The URL of the webpage to fetch.

    Returns:
        str | None: The page source as a string, or ``None`` if the WebDriver could
        not be started or the page could not be fetched. Errors are printed, not
        raised.
    """
    # Tuning knobs for the "has the DOM stopped changing?" poll.
    required_stable_reads = 3      # consecutive identical snapshots needed
    poll_interval_s = 0.1          # pause between snapshots
    stabilize_timeout_s = 10.0     # give up waiting after this long and carry on

    driver = None
    try:
        # Set up the Selenium Firefox WebDriver (visible window; headless is disabled)
        options = webdriver.FirefoxOptions()
        #options.add_argument("--headless")  # Hide browser
        options.set_preference("browser.download.folderList", 2)
        options.set_preference("browser.download.manager.showWhenStarting", False)
        options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip")
        # NOTE(review): the three arguments below look like Chromium-style switches;
        # Firefox may not honour them -- confirm, and consider the Firefox equivalents.
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36')
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--disable-blink-features=AutomationControlled')
        driver = webdriver.Firefox(options=options)

        # Navigate to the URL
        driver.get(url)

        # Wait for the initial page load and JavaScript to finish rendering
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script('return document.readyState') == 'complete'
        )

        # Wait until the page source is unchanged for a few consecutive reads.
        # The deadline matters: on pages whose DOM keeps changing the counter is
        # reset on every read, so without a bound this loop would never finish.
        previous_content = ""
        stable_count = 0
        deadline = time.monotonic() + stabilize_timeout_s
        while time.monotonic() < deadline:
            current_content = driver.page_source
            if current_content == previous_content:
                stable_count += 1
            else:
                stable_count = 0
            previous_content = current_content
            if stable_count >= required_stable_reads:
                break
            time.sleep(poll_interval_s)  # Wait briefly before checking again

        # Scroll to the bottom to load all dynamic content
        for _ in range(3):  # Scroll down three times
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)  # Allow time for content to load

        # Get the page source; the browser is closed in the ``finally`` clause
        return driver.page_source
    except Exception as e:
        print(f"An error occurred while fetching the URL using Selenium: {e}")
        return None
    finally:
        # Always release the browser process, even when fetching failed.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                # A failed shutdown must not discard content that was already fetched.
                print(f"An error occurred while closing the Selenium driver: {quit_error}")

In [89]:
# Fetch the rendered page source of the betting site; fetch_page_content returns
# None (after printing the error) if the page could not be loaded.
content = fetch_page_content("https://www.bahigo92.com/de/sportwetten/")

In [68]:
# Show only the beginning of the fetched page source: echoing the whole HTML document
# floods the notebook output. Falls through to None/"" when nothing was fetched.
content[:1000] if content else content



In [90]:
# Create the LLM client that is configured with the bet-extraction prompt.
scraper = BettingScraper()

In [91]:
# fetch_page_content returns None when the page could not be loaded. Fail fast here
# instead of interpolating the literal text "None" into the prompt and asking the
# model to extract bets from it.
if content is None:
    raise RuntimeError("Page content could not be fetched; there is nothing to extract bets from.")
result = scraper.invoke(f"This is the page content from which you should extract the bets in JSON format:  {content}")

In [93]:
import json

# Parse the model's reply as JSON and display it; json.loads raises
# json.JSONDecodeError if the reply is not valid JSON.
json.loads(result)

{'bets': [{'name': 'England (W) - USA (W)',
   'odds': {'England (W)': 14.0, 'USA (W)': 1.05, 'Draw': 11.0}},
  {'name': 'West Ham United - Arsenal',
   'odds': {'West Ham United': 1.4, 'Arsenal': 3.5, 'Draw': 13.0}},
  {'name': 'Borussia Dortmund - Bayern Munich',
   'odds': {'Borussia Dortmund': 1.4, 'Bayern Munich': 3.5, 'Draw': 13.0}},
  {'name': 'Espanyol - Celta de Vigo',
   'odds': {'Espanyol': 1.02, 'Celta de Vigo': 11.0, 'Draw': 75.0}},
  {'name': 'Cadiz CF - Deportivo La Coruña',
   'odds': {'Cadiz CF': 2.9, 'Deportivo La Coruña': 1.6, 'Draw': 9.0}},
  {'name': 'Racing Santander - CD Mirandes',
   'odds': {'Racing Santander': 2.7, 'CD Mirandes': 1.6, 'Draw': 14.0}},
  {'name': 'AFC Wimbledon - Dag & Red FC',
   'odds': {'AFC Wimbledon': 1.55, 'Dag & Red FC': 4.0, 'Draw': 5.0}},
  {'name': 'FC Zürich - Grasshopper',
   'odds': {'FC Zürich': 2.25, 'Grasshopper': 3.2, 'Draw': 3.3}},
  {'name': 'Bologna FC - FC Venedig',
   'odds': {'Bologna FC': 1.8, 'FC Venedig': 3.4, 'Draw': 4

