In [10]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import time
from typing import List, Tuple, Dict
from bs4.element import Tag

In [11]:
comics_url = "https://marvel.fandom.com/wiki/Category:Collections"
domain = "https://marvel.fandom.com"

In [20]:
class ComicsScraper:
    def __init__(self, comics_url: str, domain: str):
        self.comics_url = comics_url
        self.domain = domain
    
    def scrape_comics(self) -> List[Tuple[str, str]]:
        """
            Scrape comics from the homepage
        """
        try:
            response = requests.get(self.comics_url)
            response.raise_for_status()  
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch page: {e}")
            return None
        
        comics = []
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find("div", {"class": "category-page__members-wrapper"})
        links = content.find_all('a', href=True) 

        for link in links:
            comic_title = link.text.strip()
            comic_link = link['href']
            if comic_title and comic_link.startswith("/"): 
                comic_link = f"{self.domain}{comic_link}"
                        
                comics.append((comic_title, comic_link))

        return comics

    def _process_ul(self, ul: Tag) -> List[str]:
        """
            Recursively process a <ul> element to extract all <li> items and their nested lists
            It accounts for cases where we have a main theme of characters as an <li>, not their names
        """
        items = []
        for li in ul.find_all('li', recursive=False):  # only direct children
            text = li.get_text(strip=True)
            items.append(text)

            # check for nested uls
            nested_ul = li.find('ul')
            if nested_ul:
                # recursively process the nested ul and replace the current list
                nested_items = self._process_ul(nested_ul)
                items = nested_items

        return items
    
    def _clean_up_characters_dict(self, characters_dict: Dict[str, Dict[str, List[str]]]) -> Dict[str, Dict[str, List[str]]]:
        """
            Remove roles without a character (empty ones)
        """
        to_remove = []
        for title, characters_sections in characters_dict.items():
            for section_name, section_characters in characters_sections.items():
                if section_characters[0] == '' and len(section_characters) == 1:
                    to_remove.append((title, section_name))

        for title, section_name in to_remove:
            characters_dict[title].pop(section_name)

        return characters_dict
            


    def get_character_info(self) -> Dict[str, Dict[str, List[str]]]:
        """
            Get characters per comic with nested <ul> handling
        """
        comics = self.scrape_comics()
        if not comics:
            return None

        characters_dict = {}

        for title, link in comics:
            try:
                response = requests.get(link)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch character info for {title}: {e}")
                continue

            time.sleep(1)
            soup = BeautifulSoup(response.text, 'html.parser')

            try:
                content = soup.find("div", {"class": "marvel_database_section"})

                character_sections = {}
                section_titles = content.find_all("p")
                for section_title in section_titles:
                    heading_text = section_title.get_text(strip=True)

                    # find the next <ul> sibling for the current heading (e.g. for featured characters)
                    ul = section_title.find_next_sibling('ul')
                    if ul:
                        items = self._process_ul(ul)
                        character_sections[heading_text] = items

                    if heading_text == "Other Characters:":
                        break

                characters_dict[title] = character_sections
            except Exception as e:
                print(f"Failed to fetch character info for {title}: {str(e)}")

        characters_dict = self._clean_up_characters_dict(characters_dict)

        return characters_dict


In [21]:
comics_scraper = ComicsScraper(comics_url=comics_url, domain=domain)
characters_dict = comics_scraper.get_character_info()
characters_dict

Failed to fetch character info for Ultimate Silver Surfer (novel): 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for Ultimate Spider-Man (novel): 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for Spider-Man: Deadly Cure: 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for Ultimate X-Men (novel): 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for Iron Man: Steel Terror: 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for Spider-Man: Global War: 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for Ultimate Hulk (novel): 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for X-Men: Shadows of the Past: 'NoneType' object has no attribute 'find_all'
Failed to fetch character info for Iron Man: The Junior Novel: 'NoneType' object has no attribute 'find_all'


{'Avengers: Battle the Earth-Wrecker': {'Featured Characters:': ['Captain America (Steve Rogers)',
   'Goliath (Hank Pym)',
   'Hawkeye (Clint Barton)',
   'Wasp (Janet Van Dyne)',
   'Iron Man (Tony Stark)'],
  'Antagonists:': ['Karzz the Conqueror']},
 'Fantastic Four in The House of Horrors': {'Featured Characters:': ['Mr. Fantastic (Reed Richards)',
   'Invisible Girl (Susan Storm)',
   'Thing (Ben Grimm)',
   'Human Torch (Johnny Storm)'],
  'Antagonists:': ['Doctor Weird(First appearance)']},
 'Spider-Man Zaps Mr. Zodiac': {'Featured Characters:': ['Spider-Man (Peter Parker)'],
  'Supporting Characters:': ['J. Jonah Jameson', 'Betty Brant', 'Aunt May'],
  'Villains:': ['Mr. Zodiac (Astro)(First appearance; dies)(Main story and flashback)'],
  'Other Characters:': ['Ben', 'Uncle Ben(Mentioned)']},
 'Amazing Spider-Man: Mayhem in Manhattan': {},
 'Incredible Hulk: Stalker From the Stars': {'Featured Characters:': ['Hulk (Bruce Banner)'],
  'Supporting Characters:': ['Richard Jones'

In [22]:
len(characters_dict.keys())

191

In [23]:
import json
with open("comic_characters_dict.json", "w") as fp:
    json.dump(characters_dict , fp) 

In [27]:
characters_dict["Mighty Thor: An Origin Story"]

{'Featured Characters:': ['Thor/Don Blake(First appearance)(Origin revealed)'],
 'Supporting Characters:': ['Fandrall(First appearance)',
  'Volstagg(First appearance)',
  'Hogun(First appearance)',
  'Balder(First appearance)',
  'Lady Sif(First appearance)',
  'Odin(First appearance)',
  'Frigga(First appearance)'],
 'Antagonists:': ['Loki(First appearance)'],
 'Other Characters:': ['Hela(First appearance)(Cameo)',
  'Heimdall(First appearance)(Mentioned)',
  'Munin(First appearance)(Cameo)',
  'Hugin(First appearance)(Cameo)',
  'Freki(First appearance)(Cameo)',
  'Geri(First appearance)(Cameo)']}