In [None]:
import numpy as np 
import pandas as pd 
import re, glob, os
import math

In [49]:
# download match data from lol fandom
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Setup Chrome driver
driver = webdriver.Chrome()

url = "https://lol.fandom.com/Special:RunQuery/MatchHistoryGame?MHG%5Bpreload%5D=Tournament&MHG%5Bspl%5D=yes&MHG%5Btournament%5D=2025%20Season%20World%20Championship/Main%20Event&_run="

try:
    driver.get(url)
    
    # Wait for table to load and images to render
    wait = WebDriverWait(driver, 15)
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
    time.sleep(3)  # Additional wait for dynamic content
    
    # Get page source
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    
    # Debug: Save HTML to file
    with open('debug_page.html', 'w', encoding='utf-8') as f:
        f.write(html)
    print("HTML saved to debug_page.html for inspection\n")
    
    # Helper function to extract champion names from various sources
    def extract_champions(cell):
        """Collect candidate champion names from one table cell.

        Three passes are made over the cell, in this order:
          1. every <span>/<div>/<a>: CSS classes containing "champion" (but not
             "sprite"), the ``title`` attribute, and a fixed set of data/aria
             attributes;
          2. every <img>: ``title``, ``alt`` or ``data-image-name`` with any
             trailing "Link…"/"Image…"/"Square…" suffix removed;
          3. every <a>: the ``title`` attribute, or else the page name taken
             from a ``/wiki/`` href.

        Args:
            cell: an element exposing ``find_all``; the elements it yields must
                expose ``get`` (the caller passes BeautifulSoup ``<td>`` tags).

        Returns:
            list[str]: names in first-seen order, without duplicates and without
            the placeholder values "sprite" / "champion" (case-insensitive).
        """
        noise_markers = ('edit', 'create', 'redirect', 'file:', 'image:')
        data_attrs = ('data-champion', 'data-param1', 'data-tooltip-content',
                      'aria-label', 'data-game-name', 'data-name')

        def is_clean_title(text):
            # A usable tooltip is non-empty and contains none of the wiki-chrome markers.
            lowered = text.lower()
            return bool(text) and not any(marker in lowered for marker in noise_markers)

        found = []

        # Pass 1: sprite-style elements (class names, tooltips, data attributes)
        for node in cell.find_all(['span', 'div', 'a']):
            for css_class in node.get('class', []):
                lowered_class = css_class.lower()
                if 'champion' not in lowered_class or 'sprite' in lowered_class:
                    continue
                pieces = css_class.split('-')
                if len(pieces) <= 1:
                    continue
                # The last dash-separated piece is taken as the champion name
                candidate = pieces[-1].replace('_', ' ').title()
                if len(candidate) > 1 and candidate.lower() != 'sprite':
                    found.append(candidate)

            hover_text = node.get('title', '').strip()
            if is_clean_title(hover_text):
                found.append(hover_text)

            for attr_name in data_attrs:
                raw_value = node.get(attr_name)
                if raw_value and raw_value.strip():
                    found.append(raw_value.strip())

        # Pass 2: image elements
        for image in cell.find_all('img'):
            label = image.get('title') or image.get('alt') or image.get('data-image-name')
            if not label or not label.strip():
                continue
            trimmed = re.sub(r'(Link|Image|Square).*$', '', label.strip(), flags=re.IGNORECASE).strip()
            if len(trimmed) > 1:
                found.append(trimmed)

        # Pass 3: anchors (tooltip first, wiki page name as fallback)
        for anchor in cell.find_all('a'):
            hover_text = anchor.get('title', '').strip()
            target = anchor.get('href', '')

            if is_clean_title(hover_text):
                found.append(hover_text)
            elif '/wiki/' in target and 'File:' not in target and 'Image:' not in target:
                page_name = target.split('/wiki/')[-1].split('#')[0].replace('_', ' ')
                if len(page_name) > 1:
                    found.append(page_name)

        # Drop placeholders, then de-duplicate while keeping first-seen order
        placeholders = ('sprite', 'champion')
        kept = [name for name in found if name and name.lower() not in placeholders]
        return list(dict.fromkeys(kept))
    
    # Helper function to extract team name
    def extract_team_name(cell):
        """Best-effort extraction of a team name from one table cell.

        Sources are tried in order and the first non-empty result wins:
          1. the first <img>: ``title``, ``alt`` or ``data-image-name`` with
             everything from "logo" onwards removed;
          2. the first <a>: its ``title`` (unless it contains "edit"/"create"),
             otherwise the page name taken from a ``/wiki/`` href;
          3. ``data-team`` / ``data-tooltip-content`` / ``aria-label`` on any
             <span>/<div>/<img>/<a>;
          4. the cell's visible text with invisible joiner characters removed.

        Args:
            cell: an element exposing ``find``, ``find_all`` and ``get_text``
                (the caller passes BeautifulSoup ``<td>`` tags).

        Returns:
            str: the team name, or ``'Unknown'`` when no source yields one.
        """
        # Zero-width / joiner code points that render as nothing; a cell made up
        # only of these carries no team name.
        invisible_chars = re.compile(r'[\u200b\u200c\u200d\u2060\ufeff]')

        # Method 1: Check img title (tooltip on hover)
        img = cell.find('img')
        if img is not None:
            team = img.get('title') or img.get('alt') or img.get('data-image-name')
            if team and team.strip():
                clean_team = re.sub(r'logo.*$', '', team.strip(), flags=re.IGNORECASE).strip()
                # A label that was nothing but "logo…" leaves an empty string;
                # keep looking instead of reporting an empty team name.
                if clean_team:
                    return clean_team

        # Method 2: Check link title, then the wiki page name in the href
        link = cell.find('a')
        if link is not None:
            team = link.get('title', '').strip()
            if team and not any(marker in team.lower() for marker in ('edit', 'create')):
                return team

            href = link.get('href', '')
            if '/wiki/' in href:
                team_name = href.split('/wiki/')[-1].split('#')[0].replace('_', ' ')
                if team_name:
                    return team_name

        # Method 3: Check data attributes
        for elem in cell.find_all(['span', 'div', 'img', 'a']):
            for attr in ('data-team', 'data-tooltip-content', 'aria-label'):
                value = elem.get(attr)
                if value and value.strip():
                    return value.strip()

        # Last resort: visible text, ignoring any number of invisible characters
        text = invisible_chars.sub('', cell.get_text(strip=True)).strip()
        if text:
            return text

        return 'Unknown'
    
    # Take the first <table> on the page as the match-history table
    table = soup.find('table')

    # One dict per parsed game
    matches = []

    # All <tr> rows except the first (header) one
    rows = table.find_all('tr')[1:]

    print(f"Found {len(rows)} rows in table\n")

    def _dump_ban_cell_structure(ban_cell):
        """Print a structural summary of a bans cell to help tune the extractors."""
        print("DEBUG - First row structure:")
        print(f"\nCell 5 (Blue Bans):")
        print(f"  Spans: {len(ban_cell.find_all('span'))}")
        print(f"  Divs: {len(ban_cell.find_all('div'))}")
        print(f"  Links: {len(ban_cell.find_all('a'))}")
        print(f"  HTML sample: {str(ban_cell)[:800]}")

        first_span = ban_cell.find('span')
        if first_span:
            print(f"\nFirst span element:")
            print(f"  Classes: {first_span.get('class', [])}")
            print(f"  All attributes: {first_span.attrs}")

        first_link = ban_cell.find('a')
        if first_link:
            print(f"\nFirst link element:")
            print(f"  href: {first_link.get('href', '')}")
            print(f"  title: {first_link.get('title', '')}")
            print(f"  All attributes: {first_link.attrs}")
        print()

    for row_number, table_row in enumerate(rows):
        columns = table_row.find_all('td')

        # Only rows with at least ten <td> cells are treated as game rows
        if len(columns) >= 10:
            # The debug dump fires only when the very first row is itself a game row
            if row_number == 0:
                _dump_ban_cell_structure(columns[5])

            matches.append({
                'Date': columns[0].get_text(strip=True),
                'Patch': columns[1].get_text(strip=True),
                'Blue_Team': extract_team_name(columns[2]),
                'Red_Team': extract_team_name(columns[3]),
                'Winner': extract_team_name(columns[4]),
                'Blue_Bans': extract_champions(columns[5]),
                'Red_Bans': extract_champions(columns[6]),
                'Blue_Picks': extract_champions(columns[7]),
                'Red_Picks': extract_champions(columns[8]),
                # Index 9 always exists here because of the length check above
                'Blue_Roster': columns[9].get_text(strip=True),
                'Red_Roster': columns[10].get_text(strip=True) if len(columns) > 10 else '',
            })

    # Build the frame in one go from the collected records
    df = pd.DataFrame(matches)

    # Report how many game rows were kept
    print(f"\nTotal matches found: {len(matches)}\n")

    # Group games into series (best of 1, 3 or 5). Consecutive rows belong to the
    # same series when they involve the same two teams, regardless of which side
    # each team played. A new series therefore starts whenever the unordered
    # {Blue, Red} pair differs from the previous row's pair; the first row always
    # starts series 1 because its shifted neighbours are missing.
    # Limitation: two back-to-back series between the same two teams cannot be
    # told apart by this rule.
    prev_blue = df['Blue_Team'].shift()
    prev_red = df['Red_Team'].shift()
    same_sides = (df['Blue_Team'] == prev_blue) & (df['Red_Team'] == prev_red)
    swapped_sides = (df['Blue_Team'] == prev_red) & (df['Red_Team'] == prev_blue)
    df['Series_ID'] = (~(same_sides | swapped_sides)).cumsum()
    
    # Save to CSV
    df.to_csv('lol_matches.csv', index=False)
    print("Data saved to 'lol_matches.csv'")

finally:
    driver.quit()

HTML saved to debug_page.html for inspection

Found 82 rows in table


Total matches found: 80

Data saved to 'lol_matches.csv'


In [50]:
# Split the ban, pick and roster columns into one column per slot.
# Start from a deep copy so the scraped `df` keeps its original columns.
df_expanded = df.copy(deep=True)

def expand_ban_lists(row, col_name, prefix, n_slots=5):
    """Spread a list-valued cell over numbered columns ``{prefix}1 … {prefix}N``.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: the row being processed (a pandas Series).
        col_name: label of the cell holding the list of champion names.
        prefix: prefix for the generated column labels.
        n_slots: number of columns to generate (default 5).

    Returns:
        A copy of ``row`` with the extra entries added. Slots beyond the end of
        the list are filled with None; a missing cell (None or float NaN) is
        treated as an empty list.
    """
    champ_list = row[col_name]
    if champ_list is None or isinstance(champ_list, float):
        champ_list = []

    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by apply().
    expanded = row.copy()
    for slot in range(n_slots):
        expanded[f"{prefix}{slot + 1}"] = champ_list[slot] if slot < len(champ_list) else None
    return expanded

def expand_champ_lists(row, col_name, prefix,
                       roles=('Top', 'Jungle', 'Mid', 'ADC', 'Support')):
    """Spread a list-valued cell over one column per role (``{prefix}{role}``).

    Intended for ``DataFrame.apply(..., axis=1)``. The i-th list entry is
    written to the i-th role, so this assumes the list is ordered
    Top -> Support (TODO confirm against the scraped page).

    Args:
        row: the row being processed (a pandas Series).
        col_name: label of the cell holding the list of champion names.
        prefix: prefix for the generated column labels.
        roles: role names, in list order, used as column suffixes.

    Returns:
        A copy of ``row`` with the extra entries added. Roles beyond the end of
        the list are filled with None; a missing cell (None or float NaN) is
        treated as an empty list.
    """
    champ_list = row[col_name]
    if champ_list is None or isinstance(champ_list, float):
        champ_list = []

    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by apply().
    expanded = row.copy()
    for position, role in enumerate(roles):
        expanded[f"{prefix}{role}"] = champ_list[position] if position < len(champ_list) else None
    return expanded

def expand_roster(row, col_name, prefix,
                  roles=('Top', 'Jungle', 'Mid', 'ADC', 'Support')):
    """Split a comma-separated roster string into one column per role.

    Intended for ``DataFrame.apply(..., axis=1)``. The i-th name is written to
    ``{prefix}{roles[i]}``, so this assumes the roster is listed Top -> Support
    (TODO confirm against the scraped page).

    Args:
        row: the row being processed (a pandas Series).
        col_name: label of the cell holding the roster string.
        prefix: prefix for the generated column labels.
        roles: role names, in roster order, used as column suffixes.

    Returns:
        A copy of ``row`` with the extra entries added. Roles without a player
        are filled with None; an empty or non-string cell (e.g. NaN) is treated
        as an empty roster.
    """
    raw_roster = row[col_name]
    # NaN is truthy, so test the type as well as emptiness before splitting
    if isinstance(raw_roster, str) and raw_roster:
        roster = [name.strip() for name in raw_roster.split(',')]
    else:
        roster = []

    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by apply().
    expanded = row.copy()
    for position, role in enumerate(roles):
        expanded[f"{prefix}{role}"] = roster[position] if position < len(roster) else None
    return expanded

# Each step fans one list/string column out into five per-slot columns:
# bans are numbered, picks and rosters are labelled by role.
expansion_steps = [
    (expand_ban_lists, 'Blue_Bans', 'Blue_Ban_'),
    (expand_ban_lists, 'Red_Bans', 'Red_Ban_'),
    (expand_champ_lists, 'Blue_Picks', 'Blue_Pick_'),
    (expand_champ_lists, 'Red_Picks', 'Red_Pick_'),
    (expand_roster, 'Blue_Roster', 'Blue_Player_'),
    (expand_roster, 'Red_Roster', 'Red_Player_'),
]
for expander, source_col, col_prefix in expansion_steps:
    df_expanded = df_expanded.apply(expander, axis=1, col_name=source_col, prefix=col_prefix)

# The packed source columns (and Patch) are not carried into the expanded file
packed_columns = ['Patch', 'Blue_Bans', 'Red_Bans', 'Blue_Picks', 'Red_Picks', 'Blue_Roster', 'Red_Roster']
df_expanded = df_expanded.drop(columns=packed_columns)

df_expanded.to_csv('lol_matches_expanded.csv', index=False)
print("Expanded Data saved to 'lol_matches_expanded.csv'")

Expanded Data saved to 'lol_matches_expanded.csv'
