In [1]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
from urllib.parse import urljoin, urlparse
import re

class PinnaclePageSaver:
    def __init__(self, save_directory: str = "pinnacle_page_data") -> None:
        """Prepare the output folders and start the browser session.

        Args:
            save_directory: Root folder under which every captured file is
                written. Stored on the instance as ``save_dir``.
        """
        self.save_dir = save_directory
        # Folder tree is created first, then the Chrome session is started.
        self.setup_directories()
        self.setup_driver()
        
    def setup_directories(self):
        """Create directory structure for saving page assets"""
        directories = [
            self.save_dir,
            os.path.join(self.save_dir, "css"),
            os.path.join(self.save_dir, "js"), 
            os.path.join(self.save_dir, "images")
        ]
        
        for directory in directories:
            os.makedirs(directory, exist_ok=True)
            
    def setup_driver(self):
        """Start the Chrome session used for capturing and store it on ``self.driver``."""
        launch_flags = (
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-blink-features=AutomationControlled",
            # Pinnacle-specific user agent
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        )

        opts = Options()
        for flag in launch_flags:
            opts.add_argument(flag)
        opts.add_experimental_option("excludeSwitches", ["enable-automation"])
        opts.add_experimental_option('useAutomationExtension', False)

        browser = webdriver.Chrome(options=opts)
        self.driver = browser
        # Runs once against the document that is open right after start-up.
        browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    def save_complete_page(self, url):
        """Save the complete Pinnacle page with all assets"""
        print(f"Loading Pinnacle page: {url}")
        
        try:
            # Load the page
            self.driver.get(url)
            
            # Wait for Pinnacle-specific elements to load
            print("Waiting for Pinnacle odds to load...")
            WebDriverWait(self.driver, 20).until(
                EC.any_of(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".bet-price")),
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".odds-button")),
                    EC.presence_of_element_located((By.CSS_SELECTOR, "[class*='market']")),
                    EC.presence_of_element_located((By.CSS_SELECTOR, "[class*='participant']")),
                    EC.presence_of_element_located((By.CSS_SELECTOR, "[data-cy*='price']")),
                    EC.presence_of_element_located((By.CSS_SELECTOR, "button[class*='price']"))
                )
            )
            
            # Give extra time for Pinnacle's dynamic odds loading
            print("Waiting for dynamic odds updates...")
            time.sleep(6)
            
            print("Pinnacle page loaded successfully!")
            
            # Save HTML
            self.save_html()
            
            # Save CSS files
            self.save_css_files()
            
            # Save page metadata
            self.save_page_metadata(url)
            
            # Create a simplified offline version
            self.create_offline_version()
            
            print(f"✅ Pinnacle page saved successfully to: {self.save_dir}")
            
        except Exception as e:
            print(f"❌ Error saving Pinnacle page: {e}")
    
    def save_html(self):
        """Save the current HTML content"""
        print("Saving Pinnacle HTML...")
        
        html_content = self.driver.page_source
        
        # Save original HTML
        with open(os.path.join(self.save_dir, "original.html"), "w", encoding="utf-8") as f:
            f.write(html_content)
            
        # Save cleaned HTML
        cleaned_html = self.clean_html_for_offline(html_content)
        with open(os.path.join(self.save_dir, "cleaned.html"), "w", encoding="utf-8") as f:
            f.write(cleaned_html)
            
        print("✅ Pinnacle HTML saved")
    
    def save_css_files(self):
        """Extract and save CSS files"""
        print("Saving Pinnacle CSS files...")
        
        try:
            css_links = self.driver.find_elements(By.CSS_SELECTOR, "link[rel='stylesheet']")
            
            for i, link in enumerate(css_links):
                try:
                    href = link.get_attribute("href")
                    if href and "pinnacle" in href:  # Only save Pinnacle CSS
                        response = requests.get(href, timeout=10)
                        if response.status_code == 200:
                            filename = f"pinnacle_styles_{i}.css"
                            with open(os.path.join(self.save_dir, "css", filename), "w", encoding="utf-8") as f:
                                f.write(response.text)
                            print(f"  ✅ Saved: {filename}")
                except Exception as e:
                    print(f"  ❌ Failed to save CSS: {e}")
            
            # Save inline styles
            inline_styles = self.driver.find_elements(By.TAG_NAME, "style")
            for i, style in enumerate(inline_styles):
                style_content = style.get_attribute("innerHTML")
                if style_content:
                    with open(os.path.join(self.save_dir, "css", f"inline_{i}.css"), "w", encoding="utf-8") as f:
                        f.write(style_content)
                        
        except Exception as e:
            print(f"❌ Error saving CSS: {e}")
    
    def save_page_metadata(self, url):
        """Save Pinnacle-specific page metadata"""
        print("Saving Pinnacle page metadata...")
        
        try:
            metadata = {
                "url": url,
                "site": "Pinnacle",
                "title": self.driver.title,
                "timestamp": time.strftime('%Y-%m-%d %H:%M:%S'),
                "page_structure": self.analyze_pinnacle_structure(),
                "odds_elements": self.find_pinnacle_odds_elements(),
                "useful_selectors": self.find_pinnacle_selectors()
            }
            
            with open(os.path.join(self.save_dir, "metadata.json"), "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2)
                
            print("✅ Pinnacle metadata saved")
            
        except Exception as e:
            print(f"❌ Error saving metadata: {e}")
    
    def analyze_pinnacle_structure(self):
        """Analyze Pinnacle-specific page structure"""
        try:
            structure = {
                "total_elements": len(self.driver.find_elements(By.XPATH, "//*")),
                "data_cy_elements": len(self.driver.find_elements(By.CSS_SELECTOR, "[data-cy]")),
                "bet_price_elements": len(self.driver.find_elements(By.CSS_SELECTOR, ".bet-price, [class*='price']")),
                "odds_buttons": len(self.driver.find_elements(By.CSS_SELECTOR, ".odds-button, button[class*='odds']")),
                "market_elements": len(self.driver.find_elements(By.CSS_SELECTOR, "[class*='market']")),
                "participant_elements": len(self.driver.find_elements(By.CSS_SELECTOR, "[class*='participant'], [class*='team']"))
            }
            return structure
        except Exception as e:
            return {"error": str(e)}
    
    def find_pinnacle_odds_elements(self):
        """Find Pinnacle odds-related elements"""
        odds_info = []
        
        try:
            # Pinnacle-specific selectors
            pinnacle_selectors = [
                ".bet-price",
                ".odds-button", 
                "[data-cy*='price']",
                "[data-cy*='odds']",
                "[class*='market']",
                "[class*='participant']",
                "button[class*='price']",
                "button[class*='odds']",
                "[class*='decimal']"
            ]
            
            for selector in pinnacle_selectors:
                elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    odds_info.append({
                        "selector": selector,
                        "count": len(elements),
                        "sample_texts": [elem.text[:50] for elem in elements[:3] if elem.text.strip()],
                        "sample_attributes": [
                            {
                                "data-cy": elem.get_attribute("data-cy"),
                                "class": elem.get_attribute("class"),
                                "role": elem.get_attribute("role")
                            } for elem in elements[:2]
                        ]
                    })
            
            return odds_info
            
        except Exception as e:
            return [{"error": str(e)}]
    
    def find_pinnacle_selectors(self):
        """Find useful selectors for Pinnacle scraping"""
        selectors = {}
        
        try:
            # Look for game/match info
            game_elements = self.driver.find_elements(By.CSS_SELECTOR, 
                "h1, h2, [class*='event'], [class*='match'], [data-cy*='event']")
            if game_elements:
                selectors["game_info"] = {
                    "selector": "h1, h2, [class*='event'], [class*='match']",
                    "count": len(game_elements),
                    "samples": [elem.text[:50] for elem in game_elements[:3] if elem.text.strip()]
                }
            
            # Look for moneyline/match winner
            moneyline_elements = self.driver.find_elements(By.CSS_SELECTOR, 
                "[data-cy*='moneyline'], [data-cy*='match-winner'], [class*='match-winner']")
            if moneyline_elements:
                selectors["moneyline"] = {
                    "selector": "[data-cy*='moneyline'], [data-cy*='match-winner']",
                    "count": len(moneyline_elements),
                    "samples": [elem.text[:50] for elem in moneyline_elements[:3] if elem.text.strip()]
                }
            
            # Look for decimal odds patterns (Pinnacle uses decimal)
            all_text = self.driver.find_element(By.TAG_NAME, "body").text
            decimal_odds_matches = re.findall(r'\b\d+\.\d{2,3}\b', all_text)
            if decimal_odds_matches:
                # Filter for likely odds (between 1.01 and 50.00)
                likely_odds = [odds for odds in decimal_odds_matches 
                             if 1.01 <= float(odds) <= 50.0]
                if likely_odds:
                    selectors["decimal_odds_found"] = {
                        "pattern": r'\\b\\d+\\.\\d{2,3}\\b',
                        "samples": likely_odds[:10]
                    }
            
            return selectors
            
        except Exception as e:
            return {"error": str(e)}
    
    def clean_html_for_offline(self, html_content):
        """Clean HTML for offline testing"""
        # Remove external scripts
        html_content = re.sub(r'<script[^>]+src="[^"]*"[^>]*></script>', '', html_content)
        
        # Remove external CSS
        html_content = re.sub(r'<link[^>]+rel="stylesheet"[^>]*>', '', html_content)
        
        # Remove problematic inline scripts
        html_content = re.sub(r'<script[^>]*>(.*?)</script>', '', html_content, flags=re.DOTALL)
        
        # Add note
        note = """
        <!-- 
        This is a cleaned version of the Pinnacle page for scraping tests.
        External scripts and stylesheets have been removed.
        Look for data-cy attributes, .bet-price, and decimal odds patterns.
        -->
        """
        
        html_content = html_content.replace('<html', note + '\n<html')
        return html_content
    
    def create_offline_version(self):
        """Create a simplified Pinnacle test page"""
        print("Creating Pinnacle offline test version...")
        
        offline_html = f"""
<!DOCTYPE html>
<html>
<head>
    <title>Pinnacle Test Page - Padres vs Brewers</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }}
        .event-header {{ background: #003d82; color: white; padding: 20px; margin-bottom: 20px; }}
        .market-container {{ background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
        .bet-price {{ 
            background: #e8f4fd; border: 1px solid #ccc; padding: 12px 20px; 
            margin: 5px; border-radius: 4px; cursor: pointer; display: inline-block;
            font-weight: bold; color: #003d82;
        }}
        .bet-price:hover {{ background: #d0e8f7; }}
        .participant-row {{ display: flex; justify-content: space-between; align-items: center; margin: 15px 0; }}
        .participant-name {{ font-weight: bold; font-size: 16px; }}
        .odds-display {{ font-size: 18px; font-weight: bold; color: #003d82; }}
    </style>
</head>
<body>
    <div class="event-header">
        <h1 data-cy="event-title">San Diego Padres vs Milwaukee Brewers</h1>
        <p>MLB - Match Winner</p>
    </div>
    
    <div class="market-container" data-cy="match-winner-market">
        <h2>Match Winner</h2>
        
        <div class="participant-row" data-cy="padres-participant">
            <span class="participant-name">San Diego Padres</span>
            <div class="bet-price" data-cy="padres-price">
                <span class="odds-display">2.050</span>
            </div>
        </div>
        
        <div class="participant-row" data-cy="brewers-participant">
            <span class="participant-name">Milwaukee Brewers</span>
            <div class="bet-price" data-cy="brewers-price">
                <span class="odds-display">1.884</span>
            </div>
        </div>
    </div>
    
    <!-- Additional test elements -->
    <div class="odds-container" data-cy="all-odds">
        <button class="odds-button" data-cy="padres-odds">2.050</button>
        <button class="odds-button" data-cy="brewers-odds">1.884</button>
    </div>
    
</body>
</html>
        """
        
        with open(os.path.join(self.save_dir, "test_page.html"), "w", encoding="utf-8") as f:
            f.write(offline_html)
            
        print("✅ Pinnacle offline test page created")
    
    def close(self):
        """Close the browser"""
        if hasattr(self, 'driver'):
            self.driver.quit()
    
    def __enter__(self) -> "PinnaclePageSaver":
        """Enter a ``with`` block; the saver itself is the bound value."""
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Leave a ``with`` block by closing the browser.

        Returns None, so an exception raised inside the block is not suppressed.
        """
        self.close()

# Usage
def main():
    """Capture one hard-coded Pinnacle event page and print a file overview."""
    # Pinnacle URL (corrected)
    target_url = "https://www.pinnacle.com/en/baseball/mlb/san-diego-padres-vs-milwaukee-brewers/1610489833/#all"

    # Printed line by line after the capture attempt, inside the saver's context.
    summary_lines = (
        "\n" + "="*50,
        "PINNACLE FILES SAVED:",
        "="*50,
        "📁 pinnacle_page_data/",
        "   📄 original.html      - Full Pinnacle page source",
        "   📄 cleaned.html       - Cleaned for testing",
        "   📄 test_page.html     - Simplified test version",
        "   📄 metadata.json      - Pinnacle page analysis",
        "   📁 css/               - Pinnacle stylesheets",
        "\n💡 Key Pinnacle selectors to look for:",
        "   - [data-cy*='price']",
        "   - .bet-price",
        "   - [data-cy*='participant']",
        "   - Decimal odds pattern: \\d+\\.\\d{2,3}",
    )

    with PinnaclePageSaver() as saver:
        saver.save_complete_page(target_url)

        for line in summary_lines:
            print(line)

# Run the capture only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Loading Pinnacle page: https://www.pinnacle.com/en/baseball/mlb/san-diego-padres-vs-milwaukee-brewers/1610489833/#all
Waiting for Pinnacle odds to load...
Waiting for dynamic odds updates...
Pinnacle page loaded successfully!
Saving Pinnacle HTML...
✅ Pinnacle HTML saved
Saving Pinnacle CSS files...
  ✅ Saved: pinnacle_styles_0.css
  ✅ Saved: pinnacle_styles_2.css
  ✅ Saved: pinnacle_styles_3.css
  ✅ Saved: pinnacle_styles_4.css
  ✅ Saved: pinnacle_styles_5.css
  ✅ Saved: pinnacle_styles_6.css
  ✅ Saved: pinnacle_styles_7.css
  ✅ Saved: pinnacle_styles_8.css
  ✅ Saved: pinnacle_styles_9.css
  ✅ Saved: pinnacle_styles_10.css
  ✅ Saved: pinnacle_styles_11.css
  ✅ Saved: pinnacle_styles_12.css
  ✅ Saved: pinnacle_styles_13.css
  ✅ Saved: pinnacle_styles_14.css
  ✅ Saved: pinnacle_styles_15.css
Saving Pinnacle page metadata...
✅ Pinnacle metadata saved
Creating Pinnacle offline test version...
✅ Pinnacle offline test page created
✅ Pinnacle page saved successfully to: pinnacle_page_data

P