In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# === Your initial list of movies ===
movies = [
    {
        "title": "The Amazing Spider-Man",
        "link": "https://marvel.fandom.com/wiki/The_Amazing_Spider-Man_(2012_film)",
        "universe": "Earth-199999",
        "release_date": "03-May-2002"
    },
    {
        "title": "The Amazing Spider-Man 2",
        "link": "https://marvel.fandom.com/wiki/The_Amazing_Spider-Man_2_(film)",
        "universe": "Earth-199999",
        "release_date": "16-Apr-2014"
    },
    
    

    # Add more movies here...
]

# === Set up headless Chrome for Selenium ===
options = Options()
options.add_argument("--headless")  # Run in headless mode (no GUI)
options.add_argument("--disable-gpu")  # Disable GPU acceleration
options.add_argument("--no-sandbox")  # Required in some restricted environments like Kaggle
options.add_argument("--disable-dev-shm-usage")  # Avoid shared memory crashes
options.add_argument("--window-size=1920,1080")  # Set viewport size
options.add_argument("--ignore-certificate-errors")  # Ignore SSL cert errors
options.add_argument("--allow-insecure-localhost")  # Allow localhost with bad certs
options.add_argument("--allow-running-insecure-content")  # Allow mixed content (HTTP inside HTTPS)
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")  # Realistic UA
driver = webdriver.Chrome(options=options)

# === Helper to extract a section (plot/synopsis) ===
def extract_section(soup, section_id):
    section = soup.find("span", {"class": "mw-headline", "id": section_id})
    if not section:
        return None
    content = []
    for sibling in section.parent.find_next_siblings():
        if sibling.name and sibling.name.startswith("h"):
            break
        content.append(sibling.get_text(strip=True))
    return "\n".join(content)

# === Main scraping loop ===
for movie in movies:
    print(f"Scraping: {movie['title']}")
    driver.get(movie['link'])
    time.sleep(3)  # Wait for the page to load fully

    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    movie["synopsis"] = extract_section(soup, "Synopsis")
    print(movie["synopsis"])
    movie["plot"] = extract_section(soup, "Plot")
    print(movie["plot"])


driver.quit()

# === Output result ===
import pprint
pprint.pprint(movies)

import json

with open("mcu_earth3.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=4)

print("✅ Data exported to mcu_earth3.json.json")

print(f"Total movies scraped: {len(movies)}")



Scraping: The Amazing Spider-Man
None
This life is not an easy one... I've made enemies.Powerfulenemies. Put those I love in danger. But the one thing that has haunted me, my entire life, is find the truth about my parents.—Spider-Man (Peter Parker)YoungPeter Parkerwas left with his aunt and uncle,BenandMay, when his parents,RichardandMarywere forced to leave. Peter grew up with the mystery of his parents' disappearance, becoming an outcast in high school.Flash Thompsonbullied Peter, who caught the eye of fellow classmate and crush,Gwen Stacy.Peter finds his father's briefcase in his house's basement, which reveals to him the plane crash that killed his parents, and that his father had worked forOsCorpwithDr. Curt Connorsin cross-species genetics. He goes to OsCorp searching for Connors and makes his way in by pretending to be a member of the high-school interns. Gwen, who is the guide of the group and assistant to Connors, recognizes Peter very quickly. She allows Peter to stay, so lo