In [1]:
import json
import os

from bs4 import BeautifulSoup

# 1. Read the saved HTML page.
# The location can be overridden with the LOLDLE_CHAMPS_HTML environment
# variable; when it is unset, the original local path is used.
CHAMPIONS_HTML_PATH = os.environ.get(
    'LOLDLE_CHAMPS_HTML',
    r'C:\Users\jeanb\Documents\misc-code\loldle-versus\backend\all_champs.txt',
)
with open(CHAMPIONS_HTML_PATH, 'r', encoding='utf-8') as f:
    html_content = f.read()

# 2. Build the BeautifulSoup object from the raw HTML.
soup = BeautifulSoup(html_content, 'html.parser')

In [5]:
# Collect every <div> whose CSS class is "classic-answer" from the parsed page.
all_champions = soup.find_all('div', attrs={'class': 'classic-answer'})

In [14]:
# Attribute squares follow the name/image square in this order.
_ATTRIBUTE_FIELDS = (
    'Genre',
    'Rôle',
    'Espèce',
    'Ressource',
    'Type de portée',
    'Région(s)',
    'Année de sortie',
)

# Fields whose value is always returned as a list.
_MULTI_VALUE_FIELDS = frozenset({'Espèce', 'Région(s)'})


def _extract_square_value(square, always_list):
    """
    Return the value displayed in one attribute square.

    Args:
        square: Element for a single ``div.square``.
        always_list: When True, nested values are returned as a list even if
            there is only one of them.

    Returns:
        A string, a list of strings, or None when the square has no
        ``square-content`` div or no usable text.
    """
    square_content = square.find('div', class_='square-content')
    if square_content is None:
        return None

    child_divs = square_content.find_all('div', recursive=False)
    if child_divs:
        # class_='' is used to pick the nested divs that carry no CSS class.
        # NOTE(review): this relies on how BeautifulSoup matches an empty
        # class filter; re-check when upgrading the library.
        inner_divs = child_divs[0].find_all('div', class_='')
        if inner_divs:
            cleaned = (
                div.get_text(strip=True).strip(',').strip() for div in inner_divs
            )
            # dict.fromkeys drops duplicates while keeping first-seen order.
            values = list(dict.fromkeys(text for text in cleaned if text))
            if not values:
                return None
            # A nominally single-value field that holds several distinct
            # values is returned as a list rather than being truncated.
            if always_list or len(values) > 1:
                return values
            return values[0]

    # No nested class-less divs: prefer the first <span>, then the wrapper div.
    span = square_content.find('span')
    if span is not None:
        return span.get_text(strip=True)
    if child_divs:
        return child_divs[0].get_text(strip=True)

    # Neither a wrapper div nor a span: fall back to the square's own text so
    # the value is not silently dropped.
    return square_content.get_text(strip=True) or None


def extract_champion_info(soup_element):
    """
    Extract champion information from a BeautifulSoup grid element.

    The first ``div.square`` provides the champion name (from the
    ``champion-icon-name`` div) and the image URL (``src`` of the first
    ``<img>``). The following squares are mapped, in order, to: Genre, Rôle,
    Espèce, Ressource, Type de portée, Région(s), Année de sortie.

    Args:
        soup_element: Element containing the champion grid squares.

    Returns:
        dict: One entry per field. 'Espèce' and 'Région(s)' are lists when
        present; the other attribute fields are strings, or a list when the
        square holds several distinct values. Any field that cannot be
        found is None.
    """
    champion_data = {'Champion': None, 'Image': None}
    champion_data.update(dict.fromkeys(_ATTRIBUTE_FIELDS))

    squares = soup_element.find_all('div', class_='square')
    if not squares:
        return champion_data

    # The first square holds the champion name and portrait.
    first_square = squares[0]

    name_div = first_square.find('div', class_='champion-icon-name')
    if name_div is not None:
        champion_data['Champion'] = name_div.get_text(strip=True)

    img_tag = first_square.find('img')
    image_src = img_tag.get('src') if img_tag is not None else None
    if image_src:
        champion_data['Image'] = image_src

    # zip stops at the shorter sequence, so a grid with fewer squares than
    # fields simply leaves the trailing fields as None.
    for field, square in zip(_ATTRIBUTE_FIELDS, squares[1:]):
        champion_data[field] = _extract_square_value(
            square, field in _MULTI_VALUE_FIELDS
        )

    return champion_data

In [15]:
# Parse every champion grid into a dictionary of attributes.
all_champions_data = [extract_champion_info(champion) for champion in all_champions]

# Save the result as UTF-8 JSON; ensure_ascii=False writes accented
# characters as-is instead of \u escapes.
with open('champions_data.json', 'w', encoding='utf-8') as f:
    json.dump(all_champions_data, f, ensure_ascii=False, indent=4)
