In [9]:
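# One-time setup: uncomment to install the dependencies into the kernel's environment.
#%pip install networkx
#%pip install requests
#%pip install beautifulsoup4  # provides the `bs4` module
#%pip install pandas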

In [100]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from collections import defaultdict

In [101]:
# Scheme + host of the MCU fandom wiki; the scraper prepends it to site-relative links.
domain = "https://marvelcinematicuniverse.fandom.com"
# Wiki landing page that the scraper reads the film list from (derived from
# `domain` so the two values cannot drift apart).
movie_url = f"{domain}/wiki/Marvel_Cinematic_Universe_Wiki#Movies"

In [159]:
import time
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

class MovieScraper:
    """Scrape film titles and per-film character links from a fandom wiki.

    Parameters
    ----------
    movie_url : str
        Page whose ``div.tabber`` table lists the films.
    domain : str
        Scheme + host prepended to site-relative (``/...``) film links.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up. Without a
        timeout a stalled connection would block the notebook indefinitely.
    delay : float, optional
        Pause, in seconds, between consecutive character-page requests.
    """

    def __init__(self, movie_url, domain, timeout=10.0, delay=1.0):
        self.movie_url = movie_url
        self.domain = domain
        self.timeout = timeout
        self.delay = delay

    def _fetch_soup(self, url: str, failure_label: str):
        """Download ``url`` and parse it as HTML.

        Prints ``"<failure_label>: <error>"`` and returns ``None`` when the
        request fails or the server answers with an error status.
        """
        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"{failure_label}: {e}")
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def scrape_mcu_movies(self) -> Optional[List[Tuple[str, str]]]:
        """
            Scrape all marvel movies and store as a list

            Returns a list of ``(title, url)`` tuples, an empty list when the
            page has no ``div.tabber`` element, or ``None`` when the page could
            not be fetched.
        """
        soup = self._fetch_soup(self.movie_url, "Failed to fetch page")
        if soup is None:
            return None

        tabber_div = soup.find("div", {"class": "tabber"})
        if tabber_div is None:
            # find() returns None when nothing matches; report it instead of
            # crashing on the attribute access below.
            print(f"No 'tabber' section found on {self.movie_url}")
            return []

        movies = []
        for section in tabber_div.find_all('tr'):
            phase_name = section.find('b')

            for link in section.find_all('a', href=True):  # ensure 'href' exists
                movie_title = link.text.strip()
                # Links without visible text (e.g. poster images) would become
                # films titled '' and trigger pointless requests later on;
                # links mentioning "Phase" are section headers, not films.
                if not movie_title or "Phase" in movie_title:
                    continue
                movie_link = link['href']
                if movie_link.startswith("/"):  # make site-relative links absolute
                    movie_link = f"{self.domain}{movie_link}"
                movies.append((movie_title, movie_link))

            # Rows after the "Unknown Phase" row are not processed.
            if phase_name is not None and phase_name.text.strip() == "Unknown Phase":
                break

        return movies

    def get_character_info(self) -> Optional[Dict[str, List[Dict[str, str]]]]:
        """
            Get characters per movie

            Fetches ``<film url>/Portal`` for every scraped film and collects
            the text links inside its ``div.page-content`` element.

            Returns a ``defaultdict(list)`` mapping film title to a list of
            ``{"name": ..., "href": ...}`` dicts (``href`` is stored exactly as
            it appears in the page), or ``None`` when no films were found.
            Films whose page cannot be fetched or parsed are skipped.
        """
        movies = self.scrape_mcu_movies()
        if not movies:
            return None

        characters_dict = defaultdict(list)
        for index, (title, link) in enumerate(movies):
            # Throttle between requests whether or not the previous one succeeded.
            if index and self.delay > 0:
                time.sleep(self.delay)

            soup = self._fetch_soup(
                f"{link}/Portal", f"Failed to fetch character info for {title}"
            )
            if soup is None:
                continue

            content_element = soup.find("div", {"class": "page-content"})
            if content_element is None:
                continue

            for character_link in content_element.find_all("a", href=True):
                css_classes = character_link.get("class", [])
                name = character_link.text.strip()
                # Skip image/lightbox anchors, anchors without visible text and
                # the film's link to itself (compared after stripping, because
                # the title was stripped when it was scraped).
                if (
                    not name
                    or name == title
                    or "image" in css_classes
                    or "lightbox" in css_classes
                    or character_link.find("img") is not None
                ):
                    continue
                characters_dict[title].append({
                    "name": name,
                    "href": character_link['href']
                })

        return characters_dict

In [None]:
# Build the scraper from the configured URLs, then collect every film's characters.
# The second call requests one extra page per film, so this cell is slow to re-run.
movie_scraper = MovieScraper(movie_url, domain)
characters_dict = movie_scraper.get_character_info()

In [162]:
# Spot-check one film's scraped characters. `characters_dict` is a
# defaultdict(list), so looking up a title that is not present adds an
# empty entry instead of raising KeyError.
characters_dict["The Incredible Hulk"]

[{'name': 'Bruce Banner/Hulk', 'href': '/wiki/Hulk'},
 {'name': 'Betty Ross', 'href': '/wiki/Betty_Ross'},
 {'name': 'Emil Blonsky/Abomination', 'href': '/wiki/Abomination'},
 {'name': 'Thaddeus Ross', 'href': '/wiki/Thaddeus_Ross'}]

In [167]:
# Spot-check a second film's scraped characters.
characters_dict["Iron Man"]

[{'name': 'Tony Stark/Iron Man', 'href': '/wiki/Iron_Man'},
 {'name': 'Obadiah Stane/Iron Monger', 'href': '/wiki/Iron_Monger'},
 {'name': 'Pepper Potts', 'href': '/wiki/Pepper_Potts'}]

In [168]:
# Drop whatever was collected under an empty film title (no error if absent).
# Note that this mutates `characters_dict` in place.
characters_dict.pop('', None)

# Number of films that remain.
len(characters_dict)

35

In [171]:
import json
# Serialise the film -> characters mapping and write it to a JSON file
# in the current working directory.
with open("movie_characters_dict.json", "w") as fp:
    fp.write(json.dumps(characters_dict))