In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
import time
from urllib.parse import urljoin, urlparse
import logging

# Configure the root logger to emit INFO-level (and higher) messages, then
# create the module-level logger used by the scraping helpers in this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [3]:
# Site endpoints: the ladder page lives directly under the site root.
BASE_URL = "https://www.newrecruit.eu"
LADDER_URL = BASE_URL + "/ladder"

# Pause (in seconds) applied before every request - be respectful to the server.
REQUEST_DELAY = 1  # seconds between requests

# Browser-like request headers, sent to avoid being blocked.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Game system identifier (may need to be adjusted based on actual dropdown value)
TOW_GAME_SYSTEM = "Warhammer The Old World"

# Echo the effective configuration, one setting per line.
print(
    f"Base URL: {BASE_URL}",
    f"Ladder URL: {LADDER_URL}",
    f"Target Game System: {TOW_GAME_SYSTEM}",
    sep="\n",
)


Base URL: https://www.newrecruit.eu
Ladder URL: https://www.newrecruit.eu/ladder
Target Game System: Warhammer The Old World


In [4]:
def make_request(url, session=None, timeout=30):
    """Fetch ``url`` with a GET request, pausing first for rate limiting.

    Args:
        url: Address to fetch.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created for this call and closed afterwards.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        failed or the server answered with an HTTP error status (the error is
        logged, not raised).
    """
    # Remember whether the session is ours so that caller-owned sessions are
    # never closed behind the caller's back.
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        time.sleep(REQUEST_DELAY)  # Rate limiting
        response = session.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here as well.
        logger.error(f"Error fetching {url}: {e}")
        return None
    finally:
        # The body was read eagerly (no streaming), so the returned response
        # remains usable after the temporary session is closed.
        if owns_session:
            session.close()

def safe_extract_text(element, default=""):
    """Return the stripped text of ``element``, or ``default`` when ``element`` is falsy (e.g. ``None``)."""
    if not element:
        return default
    return element.get_text(strip=True)

def extract_id_from_url(url, param_name='id'):
    """Return the first value of query parameter ``param_name`` in ``url``.

    Args:
        url: URL whose query string is inspected.
        param_name: Name of the query parameter to read (``'id'`` by default).

    Returns:
        The first value of the parameter as a string, or ``None`` when the
        parameter is absent or ``url`` cannot be parsed.
    """
    # Imported locally because parse_qs is not part of the notebook's import cell.
    from urllib.parse import urlparse, parse_qs

    try:
        query_params = parse_qs(urlparse(url).query)
        return query_params.get(param_name, [None])[0]
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs (ValueError) and non-string inputs are treated as
        # "no id". Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

# Confirm that the utility functions above were defined (no tests are run here)
print("Utility functions defined successfully!")


Utility functions defined successfully!


In [5]:
def get_player_ids_from_ladder(session=None):
    """
    Collect player profile links from the New Recruit ladder page.

    Fetches ``LADDER_URL`` once and scans the returned HTML for anchors with
    class ``blue`` whose ``href`` contains ``/app/Profile?id=``, e.g.
    ``<a class="blue" href="/app/Profile?id=65c1ed233771bd78230a7539">Arvid_dc</a>``.

    NOTE(review): only the HTML of the initial response is inspected; no
    game-system dropdown selection happens here, so whether the returned
    players are specific to "Warhammer The Old World" depends on what the
    server sends by default - confirm.

    Args:
        session: Optional ``requests.Session`` handed to ``make_request``.

    Returns:
        A list of dicts with keys ``player_id``, ``player_name`` and
        ``profile_url``. The list is empty when the page could not be fetched
        or contained no matching links.
    """
    logger.info("Fetching player IDs from ladder...")

    response = make_request(LADDER_URL, session)
    if not response:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look for player profile links
    player_links = soup.find_all('a', {'class': 'blue', 'href': re.compile(r'/app/Profile\?id=')})

    player_data = []
    for link in player_links:
        # urljoin copes with relative and absolute hrefs alike, matching how
        # the other scraping helpers in this notebook resolve links.
        profile_url = urljoin(BASE_URL, link.get('href', ''))
        player_name = safe_extract_text(link)
        player_id = extract_id_from_url(profile_url)

        # Skip anchors without visible text or without an id parameter.
        if player_id and player_name:
            player_data.append({
                'player_id': player_id,
                'player_name': player_name,
                'profile_url': profile_url
            })

    logger.info(f"Found {len(player_data)} players")
    return player_data

# Test function (commented out to avoid making actual requests during setup)
# players = get_player_ids_from_ladder()
# print(f"Function defined. Would find players like: {players[:3] if players else 'None found'}")
print("Player discovery function defined!")


Player discovery function defined!


In [6]:
def get_match_history_urls(player_profile_url, session=None):
    """
    Collect the "Match Details" link targets from a player's profile page.

    Steps:
    1. Fetch the profile page.
    2. Check that a "Match History" heading is present, e.g.
       ``<h3 ...><img ...> Match History <span class="matchNum">(68)</span></h3>``.
    3. Gather every ``<a>`` whose text contains "Match Details" and resolve its
       ``href`` against the profile URL.

    Links whose ``href`` is empty or ``#`` are skipped because they carry no
    target. The sample markup seen so far (``<a href="#">Match Details</a>``)
    suggests these links may be handled by client-side script, in which case
    this function finds nothing - TODO confirm.

    Args:
        player_profile_url: Absolute URL of the player's profile page.
        session: Optional ``requests.Session`` handed to ``make_request``.

    Returns:
        A list of absolute match-detail URLs; empty when the page could not be
        fetched, has no "Match History" heading, or has no usable links.
    """
    logger.info(f"Fetching match history from {player_profile_url}")

    response = make_request(player_profile_url, session)
    if not response:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    def _is_match_history_heading(tag):
        # The heading has child elements (<img>, <span>), so its ``.string``
        # is None and a ``string=`` filter can never match it. Compare against
        # the combined text of the tag instead.
        return tag.name == 'h3' and 'Match History' in tag.get_text()

    match_history_section = soup.find(_is_match_history_heading)
    if not match_history_section:
        logger.warning(f"No match history section found for {player_profile_url}")
        return []

    # Look for Match Details links
    # Pattern: <a data-v-c0edcf0d="" href="#">Match Details</a>
    match_detail_links = soup.find_all('a', string=re.compile(r'Match Details'))

    match_urls = []
    for link in match_detail_links:
        href = link.get('href', '')
        if href and href != '#':
            match_urls.append(urljoin(player_profile_url, href))

    logger.info(f"Found {len(match_urls)} match detail links")
    return match_urls


In [7]:
def extract_army_list_urls_from_match(match_detail_url, session=None):
    """
    Pull the army list links (and a best-effort result string) from a match
    details page.

    The page is scanned for eye icons (``<img src="/assets/icons/eye.png">``).
    The enclosing link of the first icon is stored as the player's list and
    that of the second icon as the opponent's list, provided the link target
    contains ``/app/list/``.

    Returns:
        A dict with keys ``player_list_url``, ``opponent_list_url``,
        ``match_result`` and ``opponent_faction`` (values are ``None`` when not
        found; ``opponent_faction`` is never filled in here), or an empty dict
        when the page could not be fetched.
    """
    if session is None:
        session = requests.Session()

    logger.info(f"Extracting army lists from {match_detail_url}")

    page = make_request(match_detail_url, session)
    if not page:
        return {}

    soup = BeautifulSoup(page.content, 'html.parser')

    army_list_data = dict.fromkeys(
        ('player_list_url', 'opponent_list_url', 'match_result', 'opponent_faction')
    )

    # Icon position on the page decides whose list a link belongs to.
    slot_for_position = {0: 'player_list_url', 1: 'opponent_list_url'}

    for position, icon in enumerate(soup.find_all('img', {'src': '/assets/icons/eye.png'})):
        anchor = icon.find_parent('a')
        if not anchor:
            continue
        target = anchor.get('href', '')
        if not target or '/app/list/' not in target:
            continue
        absolute_target = urljoin(match_detail_url, target)
        slot = slot_for_position.get(position)
        if slot is not None:
            army_list_data[slot] = absolute_target

    # Placeholder result detection: take the first text node that mentions a
    # result keyword. The real markup for match results is not known yet.
    keyword_hits = soup.find_all(string=re.compile(r'(Win|Loss|Draw|Victory|Defeat)'))
    if keyword_hits:
        army_list_data['match_result'] = keyword_hits[0].strip()

    return army_list_data

print("Match detail extraction function defined!")


Match detail extraction function defined!


In [8]:
def parse_army_list(list_url, session=None):
    """
    Fetch an army list page and extract the summary fields that can currently
    be read from it.

    Populated today:
    - ``list_name``: the page ``<title>`` text
    - ``faction``: the part of the title before the first " - " (only when the
      title contains that separator)
    - ``total_points``: the first number in the page text that is followed by
      "pt", "pts", "point" or "points" (optionally wrapped in square brackets)
    - ``raw_html``: the parsed document serialised back to a string, kept for
      debugging and manual review

    The unit categories (``characters``, ``core_units``, ``special_units``,
    ``rare_units``) are returned as empty lists; unit extraction is not
    implemented yet.

    Args:
        list_url: URL of the army list page.
        session: Optional ``requests.Session`` handed to ``make_request``.

    Returns:
        A dict with the keys described above plus ``list_url``, or ``None``
        when the page could not be fetched.
    """
    logger.info(f"Parsing army list from {list_url}")

    response = make_request(list_url, session)
    if not response:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    army_data = {
        'list_url': list_url,
        'faction': None,
        'list_name': None,
        'total_points': None,
        'characters': [],
        'core_units': [],
        'special_units': [],
        'rare_units': [],
        'raw_html': str(soup)  # For debugging and manual review
    }

    # Extract list title and faction
    # This will depend on the specific HTML structure of New Recruit list pages
    title_element = soup.find('title')
    if title_element:
        title_text = safe_extract_text(title_element)
        army_data['list_name'] = title_text

        # Try to extract faction from title
        # Common pattern: "Faction Name - List Name - [Points]"
        if ' - ' in title_text:
            army_data['faction'] = title_text.split(' - ')[0].strip()

    # Extract total points
    # Look for patterns like "[1495pts]" or "1495 points".
    # The pattern is a raw string with single backslashes: doubling them (as an
    # earlier version did) makes the regex look for a literal backslash and
    # removes the capture group, so no points value could ever be extracted.
    points_pattern = re.compile(r'\[?(\d+)\s*(?:pts?|points?)\]?', re.IGNORECASE)
    points_match = points_pattern.search(soup.get_text())
    if points_match:
        army_data['total_points'] = int(points_match.group(1))

    # Extract units by category
    # This is complex and will depend on New Recruit's HTML structure
    # For now, we'll store the raw HTML for manual inspection

    return army_data

def extract_units_from_list_section(soup, section_name):
    """Placeholder for per-section unit extraction; currently always returns an empty list.

    Performs a case-insensitive search for a text node matching
    ``section_name`` (e.g. "Characters", "Core", "Special", "Rare"), but the
    result is not used yet because walking from the header to the units
    depends on New Recruit's HTML structure.
    """
    header_pattern = re.compile(section_name, re.IGNORECASE)

    # The lookup is kept so the eventual implementation has its starting point.
    soup.find(string=header_pattern)

    return []

print("Army list parsing functions defined!")


Army list parsing functions defined!


In [None]:
# Selenium-based approach for dynamic content
def get_players_with_selenium(headless=True, timeout=10):
    """Collect ladder player profiles by driving a Chrome browser with Selenium.

    Opens ``LADDER_URL``, tries to choose the dropdown option whose text
    contains "old world", then gathers every anchor whose ``href`` contains
    ``Profile`` and carries an ``id`` query parameter.

    NOTE(review): a later cell of this notebook defines a function with the
    same name; when the cells run in order, that later definition replaces
    this one.

    Args:
        headless: When true, Chrome is started with ``--headless``.
        timeout: Seconds to wait for each candidate dropdown selector.

    Returns:
        A list of dicts with keys ``player_id``, ``player_name`` and
        ``profile_url``. An empty list is returned when Selenium or
        webdriver-manager cannot be imported, or when any step fails; failures
        are printed rather than raised.
    """

    try:
        from selenium import webdriver
        from selenium.common.exceptions import TimeoutException
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait, Select
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.chrome.options import Options
        from webdriver_manager.chrome import ChromeDriverManager
        from selenium.webdriver.chrome.service import Service

        # Set up Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")

        # Create driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            print("🚀 Opening New Recruit ladder page...")
            driver.get(LADDER_URL)

            # Wait for page to load
            wait = WebDriverWait(driver, timeout)

            # Look for the game system dropdown
            print("🔍 Looking for game system dropdown...")
            try:
                # Try to find dropdown with "Warhammer The Old World" option
                dropdown_selectors = [
                    "select",  # Generic select element
                    "[data-v-5065dd38]",  # From user's HTML snippet
                    ".select",
                    ".dropdown"
                ]

                dropdown = None
                for selector in dropdown_selectors:
                    try:
                        dropdown = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                        print(f"✅ Found dropdown with selector: {selector}")
                        break
                    except TimeoutException:
                        # Only "element never appeared" means "try the next
                        # selector"; any other failure is reported by the
                        # handler below instead of being silently swallowed.
                        continue

                if dropdown:
                    # Try to select "Warhammer The Old World".
                    # Select() raises if the matched element is not a <select>;
                    # that is reported by the handler below.
                    select = Select(dropdown)

                    # Print all available options
                    options = select.options
                    print(f"📋 Found {len(options)} dropdown options:")
                    for i, option in enumerate(options):
                        # textContent may carry template whitespace; strip it
                        # so the listing and the match are clean.
                        option_text = (option.get_attribute('textContent') or option.text or '').strip()
                        print(f"  {i}: {option_text}")

                        # Try to select Old World option
                        if 'old world' in option_text.lower():
                            print(f"🎯 Selecting: {option_text}")
                            # Select by position rather than by text: raw
                            # textContent is not guaranteed to equal the
                            # visible text Selenium matches against.
                            select.select_by_index(i)
                            time.sleep(2)  # Wait for content to load
                            break

                # Now look for player links
                print("🔍 Looking for player profile links...")
                player_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='Profile']")

                players = []
                for link in player_links:
                    href = link.get_attribute('href')
                    text = link.text.strip()
                    if href and text and 'id=' in href:
                        player_id = extract_id_from_url(href)
                        if player_id:
                            players.append({
                                'player_id': player_id,
                                'player_name': text,
                                'profile_url': href
                            })

                print(f"✅ Found {len(players)} players!")
                return players

            except Exception as e:
                print(f"❌ Error finding dropdown or players: {e}")

                # Fallback: Save page source for debugging
                with open('selenium_page_source.html', 'w', encoding='utf-8') as f:
                    f.write(driver.page_source)
                print("💾 Saved page source to selenium_page_source.html")

                return []

        finally:
            # Always shut the browser down, even when scraping failed.
            driver.quit()

    except ImportError as e:
        print(f"❌ Selenium not available: {e}")
        print("Install with: pip install selenium webdriver-manager")
        return []
    except Exception as e:
        print(f"❌ Selenium error: {e}")
        return []

print("✅ Selenium scraper function defined!")


In [None]:
# Let's first test what we're actually getting from the ladder page.
# This cell only prints diagnostics about the raw (non-JavaScript) response.
response = make_request(LADDER_URL)
if response:
    soup = BeautifulSoup(response.content, 'html.parser')

    print("=== PAGE TITLE ===")
    title = soup.find('title')
    if title:
        print(title.get_text())

    print("\n=== LOOKING FOR GAME SYSTEM DROPDOWN ===")
    # Look for dropdown options or game system selection
    options = soup.find_all('option')
    for option in options:
        text = safe_extract_text(option)
        if text:
            print(f"Option: {text}")

    print("\n=== LOOKING FOR GAME SYSTEM REFERENCES ===")
    # Look for any text containing "Warhammer The Old World"
    tow_references = soup.find_all(string=re.compile(r'Warhammer.*Old.*World', re.IGNORECASE))
    for ref in tow_references[:5]:  # First 5 matches
        print(f"Found: {ref.strip()}")

    print("\n=== LOOKING FOR PROFILE LINKS ===")
    # Look for any profile links
    profile_links = soup.find_all('a', href=re.compile(r'Profile'))
    print(f"Found {len(profile_links)} profile links")
    for link in profile_links[:3]:  # First 3
        href = link.get('href', '')
        text = safe_extract_text(link)
        print(f"Link: {text} -> {href}")

    print("\n=== LOOKING FOR DATA ATTRIBUTES ===")
    # Look for Vue.js scoped attributes that might indicate dynamic content.
    # These attributes are named like "data-v-e0ae6b26", so match on the prefix;
    # filtering on an attribute literally called "data-v-" never finds anything.
    vue_elements = soup.find_all(
        lambda tag: any(attr_name.startswith('data-v-') for attr_name in tag.attrs)
    )
    print(f"Found {len(vue_elements)} Vue.js elements")

else:
    print("Failed to fetch the ladder page")


In [None]:
# Run the Selenium scraper against the ladder (visible browser, 15 s waits)
# and preview up to five of the players it returned.
print("🚀 Starting New Recruit ladder scraping with Selenium...")
selenium_players = get_players_with_selenium(headless=False, timeout=15)
print(f"✅ Found {len(selenium_players)} players")

if not selenium_players:
    print("❌ No players found. Let's debug the issue...")
else:
    print("\n📋 First few players:")
    for i, player in enumerate(selenium_players[:5]):
        name_line = f"  {i+1}. {player['player_name']} (ID: {player['player_id']})"
        url_line = f"     URL: {player['profile_url']}"
        print(name_line)
        print(url_line)


In [None]:
# Simple test to see the page content: prints size and type information about
# the raw ladder response and saves the HTML to disk for manual inspection.
response = make_request(LADDER_URL)
if response:
    soup = BeautifulSoup(response.content, 'html.parser')
    page_text = soup.get_text()
    page_text_lower = page_text.lower()

    print(f"Page length: {len(page_text)} characters")
    print(f"Response status: {response.status_code}")
    print(f"Content type: {response.headers.get('content-type', 'unknown')}")

    # Check if this is a single-page application that loads content dynamically
    # (framework name mentioned in the text, or almost no text at all).
    if 'vue' in page_text_lower or 'react' in page_text_lower or len(page_text) < 1000:
        print("\nThis appears to be a single-page application with dynamic content loading")
        print("We may need to use different approach or find API endpoints")

    # Look for any script tags that might indicate API endpoints
    scripts = soup.find_all('script')
    print(f"\nFound {len(scripts)} script tags")

    # Look for any obvious game system identifiers
    if 'old world' in page_text_lower:
        print("\n✅ Found 'Old World' reference in page")
    else:
        print("\n❌ No 'Old World' reference found")

    # Save the raw HTML for inspection
    with open('ladder_page.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print("\nSaved raw HTML to 'ladder_page.html' for inspection")

else:
    print("Failed to fetch ladder page")


In [None]:
# Alternative approach: Try to find API endpoints or different URL patterns
def explore_newrecruit_structure(session=None):
    """Probe a handful of candidate New Recruit URLs and print what each returns.

    For every candidate the function prints whether the request succeeded,
    the body length and content type, and flags JSON bodies that mention
    "players" or non-JSON bodies that mention "old world".

    Args:
        session: Optional ``requests.Session`` to reuse. When omitted, a
            session is created for the duration of the call and closed at the
            end.

    Returns:
        None. All findings are printed.
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    # Try common URL patterns that might have game-specific data
    test_urls = [
        f"{BASE_URL}/ladder",
        f"{BASE_URL}/ladder/tow",  # The Old World abbreviation
        f"{BASE_URL}/ladder/warhammer-the-old-world",
        f"{BASE_URL}/api/ladder",
        f"{BASE_URL}/api/players",
        f"{BASE_URL}/api/games",
        f"{BASE_URL}/app/MySystems",  # From the screenshot URL
    ]

    try:
        for url in test_urls:
            print(f"\nTesting: {url}")
            response = make_request(url, session)
            if response and response.status_code == 200:
                content_length = len(response.text)
                content_type = response.headers.get('content-type', 'unknown')
                print(f"  ✅ Success! Length: {content_length}, Type: {content_type}")

                # Check if it contains useful data
                if 'json' in content_type:
                    try:
                        data = response.json()
                        print(f"  📊 JSON data with {len(data) if isinstance(data, (list, dict)) else 'unknown'} items")
                        if isinstance(data, dict) and 'players' in str(data).lower():
                            print("  🎯 Contains player data!")
                    except ValueError:
                        # response.json() signals an undecodable body with a
                        # ValueError subclass; nothing else is swallowed here.
                        print("  ⚠️ Invalid JSON")
                elif 'old world' in response.text.lower():
                    print("  🎯 Contains Old World reference!")
            else:
                status = response.status_code if response else 'No response'
                print(f"  ❌ Failed: {status}")
    finally:
        # Only close a session this function created itself.
        if owns_session:
            session.close()

# Run the exploration
explore_newrecruit_structure()


In [None]:
# Since we know from the tournament reports that there are lists with specific IDs,
# let's try to directly access some known army list URLs to understand the pattern
def test_known_army_lists(known_list_urls=None):
    """Fetch known army list pages, print a quick summary and save their HTML.

    For each URL the function prints the page title, up to five ``[123pts]``
    style point values found in the page text, and whether the text mentions
    "Empire" or "Old World". The raw HTML is written to ``list_<id>.html`` in
    the current directory, where ``<id>`` is the last path segment of the URL.

    Args:
        known_list_urls: Optional iterable of list URLs to check. Defaults to
            the single list gathered from tournament reports.

    Returns:
        None. All findings are printed.
    """
    if known_list_urls is None:
        # Known URLs from our research
        known_list_urls = [
            "https://www.newrecruit.eu/app/list/Wfirs",  # Empire list from Woehammer
            # We can add more as we find them
        ]

    for list_url in known_list_urls:
        print(f"\n=== TESTING KNOWN LIST: {list_url} ===")
        response = make_request(list_url)
        if not response:
            print(f"❌ Failed to fetch {list_url}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Try to extract basic info
        title = soup.find('title')
        if title:
            print(f"Title: {title.get_text()}")

        # Look for faction and points info
        page_text = soup.get_text()

        # Look for point values
        points_matches = re.findall(r'\[(\d+)pts\]', page_text)
        if points_matches:
            print(f"Found point values: {points_matches[:5]}")  # First 5

        # Look for faction names
        if 'Empire' in page_text:
            print("✅ Contains Empire faction")
        if 'Old World' in page_text:
            print("✅ Contains Old World reference")

        # Save this list for detailed analysis. The id is taken from the URL
        # path so that a trailing slash or query string cannot produce an empty
        # or invalid file name.
        list_id = urlparse(list_url).path.rstrip('/').rsplit('/', 1)[-1]
        filename = f"list_{list_id}.html"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(response.text)
        print(f"Saved to {filename}")

test_known_army_lists()


In [None]:
# Updated approach: Since the main ladder is dynamic, let's try browser automation
# First, let's see if we can detect what we need for Selenium

def check_selenium_requirements():
    """Report whether the Selenium modules needed for browser automation import cleanly.

    Prints the outcome and returns ``True`` when every import succeeds,
    ``False`` when any of them raises ``ImportError``.
    """
    try:
        # The imported names are not used; importing them is the availability check.
        from selenium import webdriver
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.chrome.options import Options
    except ImportError as import_error:
        print(f"❌ Selenium not available: {import_error}")
        print("To install: pip install selenium")
        return False
    else:
        print("✅ Selenium is available")
        return True

# Alternative: Parse the page source more carefully for hidden data
def extract_vue_data_from_page():
    """Return the first inline script of the ladder page that looks like embedded app data.

    A script qualifies when its text contains one of a few marker strings.
    The first 500 characters of the match are printed. Returns ``None`` when
    the page cannot be fetched or no script qualifies.
    """
    page = make_request(LADDER_URL)
    if not page:
        return None

    # Substrings commonly seen where Vue apps embed their initial data.
    markers = ('window.__INITIAL_STATE__', 'window.config', 'gameSystem', 'players')

    for script_tag in BeautifulSoup(page.content, 'html.parser').find_all('script'):
        # ``.string`` is None for external or empty scripts; treat those as empty text.
        script_text = script_tag.string or ""
        for marker in markers:
            if marker in script_text:
                print("Found potential data in script tag!")
                print(script_text[:500])  # First 500 chars
                return script_text

    return None

# Run both probes defined in this cell: first report whether Selenium can be
# imported, then scan the ladder page's <script> tags for embedded data.
print("Checking requirements and looking for embedded data...")
selenium_available = check_selenium_requirements()
vue_data = extract_vue_data_from_page()


In [None]:
# Selenium-based approach for dynamic content
def get_players_with_selenium(headless=True, timeout=10):
    """Collect ladder player profiles by driving a Chrome browser with Selenium.

    Opens ``LADDER_URL``, tries to choose the dropdown option whose text
    contains "old world", then gathers every anchor whose ``href`` contains
    ``Profile`` and carries an ``id`` query parameter.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name; when the cells run in order, this definition replaces it.
    Consider keeping a single definition.

    Args:
        headless: When true, Chrome is started with ``--headless``.
        timeout: Seconds to wait for each candidate dropdown selector.

    Returns:
        A list of dicts with keys ``player_id``, ``player_name`` and
        ``profile_url``. An empty list is returned when Selenium or
        webdriver-manager cannot be imported, or when any step fails; failures
        are printed rather than raised.
    """

    try:
        from selenium import webdriver
        from selenium.common.exceptions import TimeoutException
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait, Select
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.chrome.options import Options
        from webdriver_manager.chrome import ChromeDriverManager
        from selenium.webdriver.chrome.service import Service

        # Set up Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")

        # Create driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            print("🚀 Opening New Recruit ladder page...")
            driver.get(LADDER_URL)

            # Wait for page to load
            wait = WebDriverWait(driver, timeout)

            # Look for the game system dropdown
            print("🔍 Looking for game system dropdown...")
            try:
                # Try to find dropdown with "Warhammer The Old World" option
                dropdown_selectors = [
                    "select",  # Generic select element
                    "[data-v-5065dd38]",  # From user's HTML snippet
                    ".select",
                    ".dropdown"
                ]

                dropdown = None
                for selector in dropdown_selectors:
                    try:
                        dropdown = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                        print(f"✅ Found dropdown with selector: {selector}")
                        break
                    except TimeoutException:
                        # Only "element never appeared" means "try the next
                        # selector"; any other failure is reported by the
                        # handler below instead of being silently swallowed.
                        continue

                if dropdown:
                    # Try to select "Warhammer The Old World".
                    # Select() raises if the matched element is not a <select>;
                    # that is reported by the handler below.
                    select = Select(dropdown)

                    # Print all available options
                    options = select.options
                    print(f"📋 Found {len(options)} dropdown options:")
                    for i, option in enumerate(options):
                        # textContent may carry template whitespace; strip it
                        # so the listing and the match are clean.
                        option_text = (option.get_attribute('textContent') or option.text or '').strip()
                        print(f"  {i}: {option_text}")

                        # Try to select Old World option
                        if 'old world' in option_text.lower():
                            print(f"🎯 Selecting: {option_text}")
                            # Select by position rather than by text: raw
                            # textContent is not guaranteed to equal the
                            # visible text Selenium matches against.
                            select.select_by_index(i)
                            time.sleep(2)  # Wait for content to load
                            break

                # Now look for player links
                print("🔍 Looking for player profile links...")
                player_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='Profile']")

                players = []
                for link in player_links:
                    href = link.get_attribute('href')
                    text = link.text.strip()
                    if href and text and 'id=' in href:
                        player_id = extract_id_from_url(href)
                        if player_id:
                            players.append({
                                'player_id': player_id,
                                'player_name': text,
                                'profile_url': href
                            })

                print(f"✅ Found {len(players)} players!")
                return players

            except Exception as e:
                print(f"❌ Error finding dropdown or players: {e}")

                # Fallback: Save page source for debugging
                with open('selenium_page_source.html', 'w', encoding='utf-8') as f:
                    f.write(driver.page_source)
                print("💾 Saved page source to selenium_page_source.html")

                return []

        finally:
            # Always shut the browser down, even when scraping failed.
            driver.quit()

    except ImportError as e:
        print(f"❌ Selenium not available: {e}")
        print("Install with: pip install selenium webdriver-manager")
        return []
    except Exception as e:
        print(f"❌ Selenium error: {e}")
        return []

# Test the selenium approach: print a banner, run the scraper with a visible
# browser window, and report how many players came back.
print("\n" + "="*50)  # real newline; the doubled backslash printed a literal "\n"
print("TESTING SELENIUM APPROACH")
print("="*50)
selenium_players = get_players_with_selenium(headless=False)  # Set to False to see the browser
print(f"Result: {len(selenium_players)} players found")


In [9]:
def scrape_all_army_lists(max_players=10, max_matches_per_player=5):
    """
    Main function to scrape army lists from New Recruit ladder.

    Pipeline: ladder page -> player profiles -> match detail pages -> army
    list pages. All requests share one HTTP session, which is closed when the
    function finishes, including when a step raises.

    Args:
        max_players: Maximum number of players to scrape (for testing)
        max_matches_per_player: Maximum matches to check per player

    Returns:
        List of army list data dictionaries. Entries for the ladder player
        carry ``player_name``, ``player_id``, ``match_url``, ``match_result``
        and ``opponent_list_url``. Entries for the opponent's list carry
        ``player_name`` set to ``'Opponent'``, ``match_url`` and
        ``is_opponent``. The list is empty when no players were found.
    """
    all_army_data = []

    logger.info("Starting army list scraping...")

    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        # Step 1: Get player IDs from ladder
        players = get_player_ids_from_ladder(session)
        if not players:
            logger.error("No players found on ladder!")
            return []

        logger.info(f"Found {len(players)} players, processing first {max_players}")

        # Step 2: For each player, get their match history
        for i, player in enumerate(players[:max_players]):
            logger.info(f"Processing player {i+1}/{min(len(players), max_players)}: {player['player_name']}")

            # Get match URLs from player profile
            match_urls = get_match_history_urls(player['profile_url'], session)

            # Step 3: For each match, extract army list URLs
            for j, match_url in enumerate(match_urls[:max_matches_per_player]):
                logger.info(f"  Processing match {j+1}/{min(len(match_urls), max_matches_per_player)}")

                match_data = extract_army_list_urls_from_match(match_url, session)

                # Step 4: Parse player's army list
                if match_data.get('player_list_url'):
                    army_data = parse_army_list(match_data['player_list_url'], session)
                    if army_data:
                        # Add match context
                        army_data.update({
                            'player_name': player['player_name'],
                            'player_id': player['player_id'],
                            'match_url': match_url,
                            'match_result': match_data.get('match_result'),
                            'opponent_list_url': match_data.get('opponent_list_url')
                        })
                        all_army_data.append(army_data)

                # Optional: Also parse opponent's army list for matchup data
                if match_data.get('opponent_list_url'):
                    opponent_army_data = parse_army_list(match_data['opponent_list_url'], session)
                    if opponent_army_data:
                        opponent_army_data.update({
                            'player_name': 'Opponent',  # We might not have opponent name easily
                            'match_url': match_url,
                            'is_opponent': True
                        })
                        all_army_data.append(opponent_army_data)

    logger.info(f"Scraping complete! Collected {len(all_army_data)} army lists")
    return all_army_data

print("Main scraping function defined!")


Main scraping function defined!


In [10]:
def analyze_faction_performance(army_data, max_samples=3):
    """Aggregate list counts, win/loss tallies and sample lists per faction.

    Args:
        army_data: Iterable of army dicts. The keys read are 'faction',
            'match_result', 'list_url', 'player_name' and 'total_points';
            every one of them is optional.
        max_samples: Maximum number of sample lists kept per faction
            (defaults to 3, the previously hard-coded limit).

    Returns:
        dict mapping faction name -> stats dict with the keys
        'total_lists', 'wins', 'losses', 'win_rate' and 'sample_lists'.
        'win_rate' is wins / (wins + losses) and stays 0 when the faction
        has no decided games.
    """
    faction_stats = {}

    for army in army_data:
        faction = army.get('faction', 'Unknown')
        stats = faction_stats.setdefault(faction, {
            'total_lists': 0,
            'wins': 0,
            'losses': 0,
            'win_rate': 0,
            'sample_lists': []
        })

        stats['total_lists'] += 1

        # 'match_result' may be present but None (the scraper stores the raw
        # result of a .get()), and dict.get's default only covers a *missing*
        # key. Normalise to a string before lowercasing so such entries are
        # counted as undecided instead of raising AttributeError.
        result = str(army.get('match_result') or '').lower()
        if 'win' in result or 'victory' in result:
            stats['wins'] += 1
        elif 'loss' in result or 'defeat' in result:
            stats['losses'] += 1

        # Keep the first few lists seen for each faction as examples.
        if len(stats['sample_lists']) < max_samples:
            stats['sample_lists'].append({
                'list_url': army.get('list_url'),
                'player_name': army.get('player_name'),
                'total_points': army.get('total_points')
            })

    # Win rate is computed over decided games only (wins + losses).
    for stats in faction_stats.values():
        total_games = stats['wins'] + stats['losses']
        if total_games > 0:
            stats['win_rate'] = stats['wins'] / total_games

    return faction_stats

def save_data_to_files(army_data, base_filename='warhammer_old_world_data'):
    """Write the scraped army data to JSON, CSV and a faction-analysis JSON.

    Three files are produced, all named from ``base_filename``:
    ``<base>.json`` (raw records), ``<base>.csv`` (one flattened row per
    army) and ``<base>_faction_analysis.json`` (output of
    ``analyze_faction_performance``).

    Args:
        army_data: List of army dicts as collected by the scraper.
        base_filename: Path prefix shared by the three output files.

    Returns:
        Tuple ``(df, faction_stats)``: the flattened DataFrame that was
        written to CSV and the per-faction statistics dict.
    """

    def _flatten(entry):
        # One CSV row per army: scalar fields are copied, unit sections are
        # reduced to their lengths. Key insertion order fixes the column order.
        row = {
            'faction': entry.get('faction', ''),
            'list_name': entry.get('list_name', ''),
            'total_points': entry.get('total_points', 0),
        }
        for field in ('player_name', 'player_id', 'match_result',
                      'list_url', 'match_url', 'opponent_list_url'):
            row[field] = entry.get(field, '')
        for section in ('characters', 'core_units', 'special_units', 'rare_units'):
            row[f"{section}_count"] = len(entry.get(section, []))
        return row

    # Raw dump: the records exactly as scraped.
    raw_path = f"{base_filename}.json"
    with open(raw_path, 'w', encoding='utf-8') as raw_file:
        json.dump(army_data, raw_file, indent=2, ensure_ascii=False)
    print(f"Raw data saved to {raw_path}")

    # Tabular export for easy analysis.
    table_path = f"{base_filename}.csv"
    df = pd.DataFrame([_flatten(entry) for entry in army_data])
    df.to_csv(table_path, index=False)
    print(f"CSV data saved to {table_path}")

    # Per-faction summary.
    faction_stats = analyze_faction_performance(army_data)
    analysis_path = f"{base_filename}_faction_analysis.json"
    with open(analysis_path, 'w', encoding='utf-8') as analysis_file:
        json.dump(faction_stats, analysis_file, indent=2, ensure_ascii=False)
    print(f"Faction analysis saved to {analysis_path}")

    return df, faction_stats

# Confirmation marker: printed once this cell has executed, i.e.
# analyze_faction_performance and save_data_to_files are loaded in the kernel.
print("Data analysis and export functions defined!")


Data analysis and export functions defined!


In [11]:
# Usage banner: this cell performs no scraping; it only prints an overview of
# the pipeline, the suggested per-function smoke tests, and the commands for a
# small full run. Emitted with a single print call, one argument per line.
print(
    "="*50,
    "WARHAMMER THE OLD WORLD ARMY LIST SCRAPER",
    "="*50,
    "",
    "This scraper will:",
    "1. Access the New Recruit ladder for Warhammer The Old World",
    "2. Extract player profiles and match histories",
    "3. Parse army lists from match details",
    "4. Analyze faction performance",
    "5. Export data for machine learning and analysis",
    "",
    "Before running the full scraper, test individual functions:",
    "- Run get_player_ids_from_ladder() to test ladder access",
    "- Test a single player profile with get_match_history_urls()",
    "- Parse a known army list URL with parse_army_list()",
    "",
    "To run the full scraper, call:",
    "army_data = scrape_all_army_lists(max_players=5, max_matches_per_player=3)",
    "df, faction_stats = save_data_to_files(army_data)",
    "",
    "READY TO START SCRAPING!",
    sep="\n",
)


WARHAMMER THE OLD WORLD ARMY LIST SCRAPER

This scraper will:
1. Access the New Recruit ladder for Warhammer The Old World
2. Extract player profiles and match histories
3. Parse army lists from match details
4. Analyze faction performance
5. Export data for machine learning and analysis

Before running the full scraper, test individual functions:
- Run get_player_ids_from_ladder() to test ladder access
- Test a single player profile with get_match_history_urls()
- Parse a known army list URL with parse_army_list()

To run the full scraper, call:
army_data = scrape_all_army_lists(max_players=5, max_matches_per_player=3)
df, faction_stats = save_data_to_files(army_data)

READY TO START SCRAPING!


In [12]:
# Smoke test of ladder access: fetch the player entries from the ladder page.
# NOTE(review): the recorded output of this cell logs "Found 0 players", so
# the ladder fetch/parsing returned nothing in that run — confirm it yields
# players before running the full scraper.
players = get_player_ids_from_ladder()

INFO:__main__:Fetching player IDs from ladder...
INFO:__main__:Found 0 players
