In [1]:
import urllib.request
import urllib.parse
import json
import regex as re
import pandas as pd
from tqdm import tqdm
import os

In [2]:
# List of disney movies
# wiki_url = https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films

# Query-string fragments for the MediaWiki API. They are kept as separate
# module-level names because the per-movie loop further down reuses them.
baseurl = "https://en.wikipedia.org/w/api.php?"
action = "action=query"
title = "titles=List_of_Walt_Disney_Pictures_films"
content = "prop=revisions&rvprop=content"
dataformat = "format=json"
rvslots = "rvslots=main"

query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, title, dataformat, rvslots)

headers = {"User-Agent" : "MyWikipediaClient/1.0 (example@example.com)"} # just use this dict as-is.
# The User-Agent header is needed to get past an HTTP 403 error.
wikirequest = urllib.request.Request(query, None, headers)

# Read the body inside a context manager so the HTTP response is closed
# right away instead of being left open for the rest of the session.
with urllib.request.urlopen(wikirequest) as wikiresponse:
    wikidata = wikiresponse.read()
wikitext = wikidata.decode('utf-8')

In [3]:
# Load JSON and extract page content
wiki_json = json.loads(wikitext)

# Only one title was requested, so 'pages' has a single entry. Take that entry
# instead of hard-coding its numeric page id as a dictionary key.
list_page = next(iter(wiki_json['query']['pages'].values()))
pages = list_page['revisions'][0]['slots']['main']['*']

In [4]:
# Sample of the list wikitext being targeted:
#   {{Date table sorting|March 10, 2000}}\n| ''[[Whispers: An Elephant's Tale]]''\n|\n|-\n|
# The pattern anchors on the date template, moves to the next table cell and
# captures the link target (the part before any '|') of the wiki link there.
pattern = r"Date table sorting[^}]+}}\n\|\s*''?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]"
movies = re.compile(pattern).findall(pages)

In [5]:
# Track the nesting depth of {{ and }} to find the true end of the infobox
def extract_infobox_manual(text):
    """Extract the first infobox template from wikitext by balancing braces.

    The opener is matched case-insensitively and may have whitespace after
    the braces, so ``{{infobox film`` and ``{{ Infobox film`` are found as
    well as ``{{Infobox film``.

    Args:
        text: Wikitext to search. Empty or ``None`` input yields ``None``.

    Returns:
        The substring from the opening ``{{`` of the infobox through its
        matching ``}}`` (nested templates included), or ``None`` when no
        infobox opener is present or its braces never balance.
    """
    if not text:
        return None

    opener = re.search(r'\{\{\s*infobox', text, flags=re.IGNORECASE)
    if opener is None:
        return None
    start = opener.start()

    # Scan only the brace tokens from the opener onwards, left to right and
    # non-overlapping; the first token seen is the infobox's own '{{'.
    depth = 0
    for token in re.compile(r'\{\{|\}\}').finditer(text, start):
        if token.group() == '{{':
            depth += 1
        else:
            depth -= 1
            if depth == 0:
                return text[start:token.end()]

    # Ran out of text before the infobox was closed.
    return None

def clean_wiki_links(text):
    """
    Strip wiki-link brackets and keep only the text a reader would see.

    [[Page|Display]] turns into 'Display'; [[Page]] turns into 'Page'.
    """
    keep_captured = lambda match: match.group(1)
    # Piped links are handled first; the second pass then unwraps whatever
    # bracketed links are still left.
    without_piped = re.sub(r'\[\[[^\]|]+\|([^\]]+)\]\]', keep_captured, text)
    return re.sub(r'\[\[([^\]]+)\]\]', keep_captured, without_piped)

def clean_wiki_markup(text):
    """Remove <ref> footnotes and {{...}} templates from wikitext.

    Args:
        text: Wikitext string to clean.

    Returns:
        The text with self-closing refs, paired ``<ref>...</ref>`` blocks and
        templates (including nested ones) removed.
    """
    # Self-closing refs such as <ref name="x" /> have no closing tag. Drop
    # them first, otherwise the paired pattern below would treat them as an
    # opening tag and delete all text up to the next </ref>.
    text = re.sub(r'<ref\b[^>]*/\s*>', '', text, flags=re.I)
    # Paired references, possibly spanning several lines.
    text = re.sub(r'<ref\b[^>]*>.*?</ref\s*>', '', text, flags=re.S | re.I)

    # Remove templates like {{cite web|...}} or {{Long plot|...}} from the
    # innermost outwards, so nested templates do not leave a stray '}}'.
    removed = 1
    while removed:
        text, removed = re.subn(r'\{\{[^{}]*\}\}', '', text)

    # Fallback for templates that contain a lone '{' or '}' and were therefore
    # skipped by the brace-free pattern above.
    text = re.sub(r'\{\{.*?\}\}', '', text, flags=re.S)
    return text


In [6]:
# Extract infobox and plot for each movie

# Create the output folder up front. Without it every open() below raises and
# the except clause reports a misleading retrieval error for each movie.
os.makedirs("movies", exist_ok=True)

# Regex for the plot section: a "Plot" heading, any HTML comments right after
# it, then the text up to the next heading line (or the end of the page).
# Defined once here because it does not change between iterations.
plot_pattern = r'==\s*Plot\s*==\s*(?:<!--.*?-->\s*)*(.*?)(?=\n==|\Z)'

for name in tqdm(movies):

    # Properly encode the title to handle special characters like :, -, etc.
    encoded_title = urllib.parse.quote(name, safe='')
    movie_name = f"titles={encoded_title}"

    # NOTE(review): the query has no `redirects` parameter, so a title that is
    # a redirect presumably yields the redirect stub rather than the article.
    # Confirm whether redirects should be followed.
    query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, movie_name, dataformat, rvslots)

    try:
        # The User-Agent header is needed to get past an HTTP 403 error.
        movie_wikirequest = urllib.request.Request(query, None, headers)
        # The context manager closes the connection once the body is read,
        # so the loop does not leave one open response per movie.
        with urllib.request.urlopen(movie_wikirequest) as movie_wikiresponse:
            movie_wikidata = movie_wikiresponse.read()
        movie_wikitext = movie_wikidata.decode('utf-8')

        movie_wikijson = json.loads(movie_wikitext)
        # One title per request, so 'pages' holds a single entry keyed by page id.
        movie_pageid = next(iter(movie_wikijson['query']['pages']))
        movie_pagecontent = movie_wikijson['query']['pages'][movie_pageid]['revisions'][0]['slots']['main']['*']

        # Separate extraction of infobox and plot to avoid unwanted content

        # Regex pattern matching for extracting plot
        plot_match = re.search(plot_pattern, movie_pagecontent, re.S)

        # Extract infobox by matching brackets
        infobox_content = extract_infobox_manual(movie_pagecontent)

        # Only save file if a plot exists
        if plot_match:
            plot_content = plot_match.group(1).strip()

            # Remove all HTML comments inside the plot
            plot_content = re.sub(r'<!--.*?-->', '', plot_content, flags=re.S).strip()

            # Remove wiki links and other markup
            plot_content = clean_wiki_links(plot_content)
            plot_content = clean_wiki_markup(plot_content)

            plot_content = "PLOT: " + plot_content

            # Combine infobox and plot if infobox exists
            if infobox_content:
                combined_content = infobox_content + "\n\n" + plot_content
            else:
                combined_content = plot_content

            # Clean movie name for file saving. A '/' in a title would be read
            # as a sub-directory of movies/, so it is replaced as well.
            clean_movie_name = name.replace('_', ' ').replace('/', '-')

            # Save to a text file
            with open(f"movies/{clean_movie_name}.txt", "w", encoding='utf-8') as f:
                f.write(combined_content)
        else:
            print(f"Plot not found for {name}. File not saved.")

    except Exception as e:
        # Best-effort scrape: report the failure and carry on with the next title.
        print(f"Error retrieving data for {name}: {e}.")

  1%|          | 4/516 [00:02<04:50,  1.76it/s]

Plot not found for The Reluctant Dragon (1941 film). File not saved.


  1%|▏         | 7/516 [00:03<04:06,  2.06it/s]

Plot not found for Saludos Amigos. File not saved.


  2%|▏         | 8/516 [00:04<03:48,  2.22it/s]

Plot not found for Victory Through Air Power (film). File not saved.


  2%|▏         | 10/516 [00:04<03:37,  2.33it/s]

Plot not found for Make Mine Music. File not saved.


  3%|▎         | 13/516 [00:06<04:22,  1.91it/s]

Plot not found for Melody Time. File not saved.


  4%|▍         | 22/516 [00:10<03:16,  2.52it/s]

Plot not found for The Living Desert. File not saved.


  5%|▍         | 24/516 [00:11<02:53,  2.83it/s]

Plot not found for The Vanishing Prairie. File not saved.


  5%|▌         | 26/516 [00:12<03:10,  2.58it/s]

Plot not found for Davy Crockett: King of the Wild Frontier (film). File not saved.


  5%|▌         | 28/516 [00:13<03:23,  2.40it/s]

Plot not found for The African Lion. File not saved.


  6%|▌         | 32/516 [00:14<02:37,  3.07it/s]

Plot not found for Secrets of Life. File not saved.


  7%|▋         | 35/516 [00:15<02:17,  3.49it/s]

Plot not found for Perri (film). File not saved.


  7%|▋         | 38/516 [00:15<02:08,  3.72it/s]

Plot not found for White Wilderness (film). File not saved.


  9%|▊         | 44/516 [00:18<02:45,  2.85it/s]

Plot not found for Toby Tyler or 10 Weeks with a Circus (film). File not saved.


  9%|▉         | 48/516 [00:19<02:17,  3.41it/s]

Plot not found for Jungle Cat (film). File not saved.


 23%|██▎       | 121/516 [00:42<01:48,  3.64it/s]

Plot not found for The Best of Walt Disney's True-Life Adventures. File not saved.


 25%|██▍       | 127/516 [00:43<01:42,  3.81it/s]

Plot not found for Escape from the Dark. File not saved.


 29%|██▉       | 151/516 [00:52<02:00,  3.04it/s]

Plot not found for Trenchcoat (film). File not saved.


 39%|███▉      | 201/516 [01:12<01:40,  3.13it/s]

Plot not found for Frank and Ollie. File not saved.


 44%|████▎     | 225/516 [01:21<01:35,  3.05it/s]

Plot not found for Serengeti Symphony. File not saved.


 45%|████▍     | 231/516 [01:23<01:28,  3.23it/s]

Plot not found for Endurance (film). File not saved.


 46%|████▌     | 236/516 [01:26<02:11,  2.12it/s]

Plot not found for Fantasia 2000. File not saved.


 55%|█████▌    | 286/516 [01:48<01:32,  2.49it/s]

Plot not found for Sacred Planet. File not saved.


 56%|█████▌    | 288/516 [01:48<01:18,  2.91it/s]

Plot not found for America's Heart and Soul. File not saved.


 60%|██████    | 311/516 [01:57<01:13,  2.80it/s]

Plot not found for Roving Mars. File not saved.


 64%|██████▍   | 329/516 [02:04<01:11,  2.62it/s]

Plot not found for The Pixar Story. File not saved.


 65%|██████▍   | 333/516 [02:06<01:12,  2.51it/s]

Plot not found for Hannah Montana and Miley Cyrus: Best of Both Worlds Concert. File not saved.


 68%|██████▊   | 351/516 [02:12<00:53,  3.10it/s]

Plot not found for The Boys: The Sherman Brothers' Story. File not saved.


 69%|██████▉   | 356/516 [02:14<01:00,  2.66it/s]

Plot not found for Walt & El Grupo. File not saved.


 71%|███████   | 365/516 [02:18<01:10,  2.15it/s]

Plot not found for Waking Sleeping Beauty. File not saved.


 77%|███████▋  | 398/516 [02:33<00:49,  2.37it/s]

Plot not found for Wings of Life. File not saved.


 84%|████████▎ | 431/516 [02:51<00:49,  1.70it/s]

Plot not found for March of the Penguins#Sequel. File not saved.


 84%|████████▍ | 436/516 [02:53<00:38,  2.10it/s]

Plot not found for Ghost of the Mountains. File not saved.


 88%|████████▊ | 453/516 [03:02<00:26,  2.33it/s]

Plot not found for Frozen II. File not saved.


 89%|████████▉ | 460/516 [03:06<00:26,  2.15it/s]

Plot not found for Hamilton (2020 film). File not saved.


 92%|█████████▏| 477/516 [03:14<00:20,  1.92it/s]

Plot not found for The Beatles: Get Back#The Beatles: Get Back – The Rooftop Concert. File not saved.


 94%|█████████▍| 485/516 [03:18<00:14,  2.08it/s]

Plot not found for Rise (2022 film). File not saved.


 96%|█████████▋| 497/516 [03:24<00:08,  2.35it/s]

Plot not found for World's Best (film). File not saved.


 98%|█████████▊| 504/516 [03:27<00:04,  2.64it/s]

Plot not found for The Beach Boys (film). File not saved.


100%|██████████| 516/516 [03:33<00:00,  2.42it/s]


In [8]:
# Number of entries in the output folder, i.e. how many movie files were saved.
saved_files = os.listdir('movies')
len(saved_files)

478