In [1]:
import urllib.request
import urllib.parse
import json
import regex as re
import pandas as pd
from tqdm import tqdm
import os

In [2]:
# List of disney movies
# wiki_url = https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films

baseurl = "https://en.wikipedia.org/w/api.php?"
action = "action=query"
title = "titles=List_of_Walt_Disney_Pictures_films"
content = "prop=revisions&rvprop=content"
dataformat ="format=json"
rvslots = "rvslots=main"

query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, title, dataformat, rvslots)

headers = {"User-Agent" : "MyWikipediaClient/1.0 (example@example.com)"} # just use this dict as-is.
wikirequest = urllib.request.Request(query,None,headers)    # Needed to pass error 403
wikiresponse = urllib.request.urlopen(wikirequest)
wikidata = wikiresponse.read()
wikitext = wikidata.decode('utf-8')

In [3]:
# Load JSON and extract page content
wiki_json = json.loads(wikitext)
pages = wiki_json['query']['pages']['1970335']['revisions'][0]['slots']['main']['*']

In [4]:
# Example of a movie name listed -- {{Date table sorting|March 10, 2000}}\n| \'\'[[Whispers: An Elephant\'s Tale]]\'\'\n|\n|-\n|
# Build regex to extract all movie names
pattern = r"Date table sorting[^}]+}}\n\|\s*''?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]"
movies = re.findall(pattern, pages)

In [5]:
# Count opening and closing braces to find the true end (generated by an LLM)
def extract_infobox_manual(text):
    """Extract infobox by counting nested braces"""
    start = text.find('{{Infobox')
    if start == -1:
        return None
    
    count = 0
    i = start
    while i < len(text):
        if text[i:i+2] == '{{':
            count += 1
            i += 2
        elif text[i:i+2] == '}}':
            count -= 1
            i += 2
            if count == 0:
                return text[start:i]
        else:
            i += 1
    return None

In [6]:
# Extract infobox and plot for each movie
for name in tqdm(movies):

    # Properly encode the title to handle special characters like :, -, etc.
    encoded_title = urllib.parse.quote(name, safe='')
    movie_name = f"titles={encoded_title}"

    query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, movie_name, dataformat, rvslots)

    try:
        movie_wikirequest = urllib.request.Request(query, None, headers)    # Needed to pass error 403
        movie_wikiresponse = urllib.request.urlopen(movie_wikirequest)
        movie_wikidata = movie_wikiresponse.read()
        movie_wikitext = movie_wikidata.decode('utf-8')

        movie_wikijson = json.loads(movie_wikitext)
        movie_pageid = list(movie_wikijson['query']['pages'].keys())[0]
        movie_pagecontent = movie_wikijson['query']['pages'][movie_pageid]['revisions'][0]['slots']['main']['*']
        
        # Separate extraction of infobox and plot to avoid unwanted content

        # Regex pattern matching for extracting plot
        plot_pattern = r'==\s*Plot\s*==.*?(?=\n==(?!=))'
        plot_match = re.search(plot_pattern, movie_pagecontent, re.DOTALL)

        # Extract infobox by matching brackets since every infobox is not the same
        infobox_content = extract_infobox_manual(movie_pagecontent)

        # Save them together if they both exist
        if infobox_content and plot_match:

            # Get clean plot content
            plot_content = plot_match.group(0)

            # Combine infobox and plot contents
            combined_content = infobox_content + "\n\n" + plot_content

            # Clean movie name for file saving
            clean_movie_name = name.replace('_', ' ')

            # Save to a text file
            with open(f"movies\{clean_movie_name}.txt", "w", encoding='utf-8') as f:
                f.write(combined_content)
        else:
            if not infobox_content:
                print(f"Infobox not found for {name}.")
            if not plot_match:
                print(f"Plot not found for {name}.")

    except Exception as e:
        print(f"Error retrieving data for {name}: {e}.")

  1%|          | 4/516 [00:04<08:11,  1.04it/s]

Plot not found for The Reluctant Dragon (1941 film).


  1%|▏         | 7/516 [00:06<07:17,  1.16it/s]

Plot not found for Saludos Amigos.


  2%|▏         | 8/516 [00:07<07:04,  1.20it/s]

Plot not found for Victory Through Air Power (film).


  2%|▏         | 10/516 [00:09<07:15,  1.16it/s]

Plot not found for Make Mine Music.


  3%|▎         | 13/516 [00:11<07:04,  1.19it/s]

Plot not found for Melody Time.


  4%|▍         | 22/516 [00:19<07:07,  1.16it/s]

Plot not found for The Living Desert.


  5%|▍         | 24/516 [00:21<06:43,  1.22it/s]

Plot not found for The Vanishing Prairie.


  5%|▌         | 26/516 [00:23<06:55,  1.18it/s]

Infobox not found for Davy Crockett: King of the Wild Frontier (film).
Plot not found for Davy Crockett: King of the Wild Frontier (film).


  5%|▌         | 28/516 [00:25<07:12,  1.13it/s]

Plot not found for The African Lion.


  6%|▌         | 32/516 [00:28<06:45,  1.19it/s]

Plot not found for Secrets of Life.


  7%|▋         | 35/516 [00:31<06:49,  1.17it/s]

Plot not found for Perri (film).


  7%|▋         | 38/516 [00:33<06:50,  1.16it/s]

Plot not found for White Wilderness (film).


  9%|▊         | 44/516 [00:38<06:31,  1.20it/s]

Infobox not found for Toby Tyler or 10 Weeks with a Circus (film).
Plot not found for Toby Tyler or 10 Weeks with a Circus (film).


  9%|▉         | 48/516 [00:42<06:54,  1.13it/s]

Plot not found for Jungle Cat (film).


 23%|██▎       | 121/516 [01:49<05:22,  1.22it/s]

Plot not found for The Best of Walt Disney's True-Life Adventures.


 25%|██▍       | 127/516 [01:53<05:02,  1.29it/s]

Infobox not found for Escape from the Dark.
Plot not found for Escape from the Dark.


 29%|██▉       | 151/516 [02:14<05:15,  1.16it/s]

Plot not found for Trenchcoat (film).


 39%|███▉      | 201/516 [03:04<04:30,  1.17it/s]

Plot not found for Frank and Ollie.


 44%|████▎     | 225/516 [03:24<03:56,  1.23it/s]

Plot not found for Serengeti Symphony.


 45%|████▍     | 231/516 [03:30<04:47,  1.01s/it]

Plot not found for Endurance (film).


 46%|████▌     | 236/516 [03:36<04:50,  1.04s/it]

Plot not found for Fantasia 2000.


 55%|█████▌    | 286/516 [04:20<03:10,  1.21it/s]

Plot not found for Sacred Planet.


 56%|█████▌    | 288/516 [04:24<05:45,  1.52s/it]

Plot not found for America's Heart and Soul.


 60%|██████    | 311/516 [04:46<02:52,  1.19it/s]

Plot not found for Roving Mars.


 64%|██████▍   | 329/516 [05:03<02:47,  1.11it/s]

Plot not found for The Pixar Story.


 65%|██████▍   | 333/516 [05:06<02:39,  1.15it/s]

Infobox not found for Hannah Montana and Miley Cyrus: Best of Both Worlds Concert.
Plot not found for Hannah Montana and Miley Cyrus: Best of Both Worlds Concert.


 68%|██████▊   | 351/516 [05:22<02:13,  1.23it/s]

Plot not found for The Boys: The Sherman Brothers' Story.


 69%|██████▉   | 356/516 [05:26<02:21,  1.13it/s]

Plot not found for Walt & El Grupo.


 71%|███████   | 365/516 [05:34<02:12,  1.14it/s]

Plot not found for Waking Sleeping Beauty.


 77%|███████▋  | 398/516 [06:04<01:42,  1.16it/s]

Plot not found for Wings of Life.


 84%|████████▎ | 431/516 [06:36<01:36,  1.13s/it]

Plot not found for March of the Penguins#Sequel.


 84%|████████▍ | 436/516 [06:43<01:36,  1.21s/it]

Plot not found for Ghost of the Mountains.


 88%|████████▊ | 453/516 [07:00<00:58,  1.08it/s]

Infobox not found for Frozen II.
Plot not found for Frozen II.


 89%|████████▉ | 460/516 [07:07<00:56,  1.01s/it]

Plot not found for Hamilton (2020 film).


 92%|█████████▏| 477/516 [07:25<00:37,  1.05it/s]

Plot not found for The Beatles: Get Back#The Beatles: Get Back – The Rooftop Concert.


 94%|█████████▍| 485/516 [07:32<00:28,  1.09it/s]

Infobox not found for Rise (2022 film).
Plot not found for Rise (2022 film).


 96%|█████████▋| 497/516 [07:44<00:17,  1.07it/s]

Infobox not found for World's Best (film).
Plot not found for World's Best (film).


 98%|█████████▊| 504/516 [07:50<00:10,  1.14it/s]

Plot not found for The Beach Boys (film).


100%|██████████| 516/516 [08:02<00:00,  1.07it/s]


In [8]:
len(os.listdir('movies'))

469