In [38]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list
PeopleLinks = set()  # Global set to store unique producer links

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain movie/producers info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 2:
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract film title from the <i> tag within the first <td>
        film_cell = cols[0]  # First <td> contains the movie title
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            print(f"Skipping row due to missing <i> tag")
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag

        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (bold indicates winner)
        is_winner = bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        # Extract producers from the second column (<td>)
        producers_info = []
        producer_cell = cols[1]  # Second <td> contains producer info
        producer_links = producer_cell.find_all('a')  # Find all <a> tags in the cell

        if producer_links:  # Case: Hyperlinked producer names
            for link in producer_links:
                producer_name = link.get_text(strip=True)  # Extract producer name
                producer_url = f"https://en.wikipedia.org{link['href']}"  # Construct full Wikipedia URL

                producers_info.append({
                    'producerName': producer_name,
                    'producerLink': producer_url
                })
                PeopleLinks.add(producer_url)  # Add to global set of unique links
        else:  # Case: No hyperlinked producers, ignore entry
            print(f"Ignoring entry due to no hyperlinked producers: {row}")
            continue

        entry_data = {
            'movieTitle': movie_title,
            'releaseYear': current_year,
            'categoryName': "Best Picture",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'producers': producers_info
        }

        data.append(entry_data)

# Save data to CSV files

# CSV 1: firstName, lastName, movieTitle, releaseYear, categoryName, iteration, isWinner
with open('oscar_full_details2.csv', mode='w', encoding='utf-8', newline='') as file1:
    writer1 = csv.writer(file1)
    writer1.writerow(['firstName', 'lastName', 'movieTitle', 'releaseYear',
                      'categoryName', 'iteration', 'isWinner'])
    for entry in data:
        for producer in entry['producers']:
            full_name = producer['producerName']
            name_parts = full_name.split(" ", 1) if " " in full_name else [full_name, ""]
            first_name, last_name = name_parts[0], name_parts[1]
            writer1.writerow([
                first_name,
                last_name,
                entry['movieTitle'],
                entry['releaseYear'],
                entry['categoryName'],
                entry['iteration'],
                entry['isWinner']
            ])

# CSV 2: firstName, lastName, movieTitle, releaseYear
with open('oscar_producers_basic2.csv', mode='w', encoding='utf-8', newline='') as file2:
    writer2 = csv.writer(file2)
    writer2.writerow(['firstName', 'lastName', 'movieTitle', 'releaseYear'])
    for entry in data:
        for producer in entry['producers']:
            full_name = producer['producerName']
            name_parts = full_name.split(" ", 1) if " " in full_name else [full_name, ""]
            first_name, last_name = name_parts[0], name_parts[1]
            writer2.writerow([
                first_name,
                last_name,
                entry['movieTitle'],
                entry['releaseYear']
            ])

print("Scraped data saved to two CSV files.")
print(f"Total unique people links collected: {len(PeopleLinks)}")


Skipping row due to insufficient columns: <tr>
<th rowspan="6" style="text-align:center"><a href="/wiki/1929_in_film" title="1929 in film">1928/29</a><br/><span style="font-size:85%;"><a href="/wiki/2nd_Academy_Awards" title="2nd Academy Awards">(2nd)</a></span><br/><sup class="reference" id="cite_ref-77"><a href="#cite_note-77"><span class="cite-bracket">[</span>a<span class="cite-bracket">]</span></a></sup>
</th></tr>
Skipping row due to insufficient columns: <tr>
<th rowspan="6" style="text-align:center"><a href="/wiki/1931_in_film" title="1931 in film">1930/31</a><br/><span style="font-size:85%;"><a href="/wiki/4th_Academy_Awards" title="4th Academy Awards">(4th)</a></span>
</th></tr>
Ignoring entry due to no hyperlinked producers: <tr>
<td><i><a href="/wiki/East_Lynne_(1931_film)" title="East Lynne (1931 film)">East Lynne</a></i>
</td>
<td>Fox
</td></tr>
Ignoring entry due to no hyperlinked producers: <tr style="background:#eee;">
<td><i><a href="/wiki/Bad_Girl_(1931_film)" title=

In [41]:


# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain director/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 2:
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract director's name and link from the first <td>
        director_cell = cols[0]  # First <td> contains the director info
        director_link_tag = director_cell.find('a')  # Find the <a> tag inside <td>

        if not director_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing director hyperlink: {row}")
            continue

        director_name = director_link_tag.get_text(strip=True)
        director_link = f"https://en.wikipedia.org{director_link_tag['href']}"
        PeopleLinks.add(director_link)  # Add director link to global set

        # Extract film title from the second <td>
        film_cell = cols[1]  # Second <td> contains the film info
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(director_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Directing",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'director': {
                'firstName': director_name.split(" ", 1)[0],
                'lastName': director_name.split(" ", 1)[1] if " " in director_name else "",
                'link': director_link
            }
        }

        data.append(entry_data)

# Save data to CSV files

# CSV 1: firstName, lastName, movieTitle, releaseYear, categoryName, iteration, isWinner
with open('best_directing_full_details.csv', mode='w', encoding='utf-8', newline='') as file1:
    writer1 = csv.writer(file1)
    writer1.writerow(['firstName', 'lastName', 'movieTitle', 'releaseYear',
                      'categoryName', 'iteration', 'isWinner'])
    for entry in data:
        writer1.writerow([
            entry['director']['firstName'],
            entry['director']['lastName'],
            entry['movieTitle'],
            entry['releaseYear'],
            entry['categoryName'],
            entry['iteration'],
            entry['isWinner']
        ])

print("Scraped data saved to best_directing_full_details.csv.")
print(f"Total unique people links collected: {len(PeopleLinks)}")


Skipping row due to missing director hyperlink: <tr>
<td rowspan="2"><span data-sort-value="Lloyd !">Frank Lloyd</span>
</td>
<td><i><a href="/wiki/Drag_(film)" title="Drag (film)">Drag</a></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><i><a href="/wiki/Weary_River" title="Weary River">Weary River</a></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><i><a href="/wiki/Romance_(1930_film)" title="Romance (1930 film)">Romance</a></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><i><a href="/wiki/Four_Daughters_(1938_film)" title="Four Daughters (1938 film)">Four Daughters</a></i>
</td></tr>
Skipping row due to missing director hyperlink: <tr>
<td><span data-sort-value="Soderbergh !">Steven Soderbergh</span>
</td>
<td><i><a href="/wiki/Erin_Brockovich_(film)" title="Erin Brockovich (film)">Erin Brockovich</a></i>
</td></tr>
Skipping row due to missing director hyperlink: <tr>
<td><div class="center" style="width:auto; margin-left:auto; 

In [44]:
print(len(PeopleLinks))

740


In [45]:


# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain actor/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 3:  # Ensure there are enough columns for actor and film info
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract actor's name and link from the first <td>
        actor_cell = cols[0]  # First <td> contains the actor info
        actor_link_tag = actor_cell.find('a')  # Find the <a> tag inside <td>

        if not actor_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing actor hyperlink: {row}")
            continue

        actor_name = actor_link_tag.get_text(strip=True)
        actor_link = f"https://en.wikipedia.org{actor_link_tag['href']}"
        PeopleLinks.add(actor_link)  # Add actor link to global set

        # Extract film title from the fourth <td>
        film_cell = cols[2] if len(cols) > 2 else None  # Fourth <td> contains the film info
        film_title_tag = film_cell.find('i') if film_cell else None  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(actor_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Actor in a Leading Role",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'actor': {
                'firstName': actor_name.split(" ", 1)[0],
                'lastName': actor_name.split(" ", 1)[1] if " " in actor_name else "",
                'link': actor_link
            }
        }

        data.append(entry_data)

Skipping row due to insufficient columns: <tr>
<th scope="row" style="background:#FAEB86;">§
</th>
<td style="background:#FAEB86;"><b>Indicates winner who refused the award</b>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="background:#FAEB86;">†
</th>
<td style="background:#FAEB86;"><b>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous winner</a></b>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">†
</th>
<td>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous nominee</a>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td style="background:#FAEB86;"><b><span data-sort-value="Schilling !">August Schilling</span></b>
</td>
<td style="background:#FAEB86;"><b><span data-sort-value="Way !"><

In [47]:
print(PeopleLinks)

{'https://en.wikipedia.org/wiki/Alix_Madigan', 'https://en.wikipedia.org/wiki/Emil_Jannings', 'https://en.wikipedia.org/wiki/Ang_Lee', 'https://en.wikipedia.org/wiki/Chris_Columbus_(filmmaker)', 'https://en.wikipedia.org/wiki/Maurice_Chevalier', 'https://en.wikipedia.org/wiki/Ido_Ostrowsky', 'https://en.wikipedia.org/wiki/John_Cassavetes', 'https://en.wikipedia.org/wiki/Pawe%C5%82_Pawlikowski', 'https://en.wikipedia.org/wiki/Todd_Field', 'https://en.wikipedia.org/wiki/Rock_Hudson', 'https://en.wikipedia.org/wiki/John_G._Avildsen', 'https://en.wikipedia.org/wiki/Ron_Yerxa', 'https://en.wikipedia.org/wiki/Steve_Coogan', 'https://en.wikipedia.org/wiki/Andrew_Garfield', 'https://en.wikipedia.org/wiki/Charlie_Chaplin', 'https://en.wikipedia.org/wiki/Jane_Rosenthal', 'https://en.wikipedia.org/wiki/Hugh_Hudson', 'https://en.wikipedia.org/wiki/Uberto_Pasolini', 'https://en.wikipedia.org/wiki/B.P._Schulberg', 'https://en.wikipedia.org/wiki/Charles_B._Wessler', 'https://en.wikipedia.org/wiki/Kei

In [48]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain actor/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 3:  # Ensure there are enough columns for actor and film info
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract actor's name and link from the first <td>
        actor_cell = cols[0]  # First <td> contains the actor info
        actor_link_tag = actor_cell.find('a')  # Find the <a> tag inside <td>

        if not actor_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing actor hyperlink: {row}")
            continue

        actor_name = actor_link_tag.get_text(strip=True)
        actor_link = f"https://en.wikipedia.org{actor_link_tag['href']}"
        PeopleLinks.add(actor_link)  # Add actor link to global set

        # Extract film title from the fourth <td>
        film_cell = cols[2] if len(cols) > 2 else None  # Fourth <td> contains the film info
        film_title_tag = film_cell.find('i') if film_cell else None  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(actor_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Actress in a Leading Role",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'actor': {
                'firstName': actor_name.split(" ", 1)[0],
                'lastName': actor_name.split(" ", 1)[1] if " " in actor_name else "",
                'link': actor_link
            }
        }

        data.append(entry_data)

        print(f"Total unique people links collected: {len(PeopleLinks)}")

Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">†
</th>
<td>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous nominee</a>
</td></tr>
Total unique people links collected: 965
Skipping row due to insufficient columns: <tr>
<td style="background:#FAEB86;"><b>Angela</b>
</td>
<td style="background:#FAEB86;"><i><b><a href="/wiki/Street_Angel_(1928_film)" title="Street Angel (1928 film)">Street Angel</a></b></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td style="background:#FAEB86;"><span data-sort-value="Wife !"><b>The Wife</b></span>
</td>
<td style="background:#FAEB86;"><i><b><a href="/wiki/Sunrise:_A_Song_of_Two_Humans" title="Sunrise: A Song of Two Humans">Sunrise: A Song of Two Humans</a></b></i>
</td></tr>
Total unique people links collected: 966
Total unique people links collected: 967
Total unique people links collected: 

In [49]:
print(len(PeopleLinks))

1202


In [50]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain actor/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 3:  # Ensure there are enough columns for actor and film info
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract actor's name and link from the first <td>
        actor_cell = cols[0]  # First <td> contains the actor info
        actor_link_tag = actor_cell.find('a')  # Find the <a> tag inside <td>

        if not actor_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing actor hyperlink: {row}")
            continue

        actor_name = actor_link_tag.get_text(strip=True)
        actor_link = f"https://en.wikipedia.org{actor_link_tag['href']}"
        PeopleLinks.add(actor_link)  # Add actor link to global set

        # Extract film title from the fourth <td>
        film_cell = cols[2] if len(cols) > 2 else None  # Fourth <td> contains the film info
        film_title_tag = film_cell.find('i') if film_cell else None  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(actor_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Actor in a Supporting Role",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'actor': {
                'firstName': actor_name.split(" ", 1)[0],
                'lastName': actor_name.split(" ", 1)[1] if " " in actor_name else "",
                'link': actor_link
            }
        }

        data.append(entry_data)

Skipping row due to insufficient columns: <tr>
<th scope="row" style="background:#FAEB86;">†
</th>
<td style="background:#FAEB86;"><b>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous winner</a></b>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">†
</th>
<td>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous nominee</a>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">§
</th>
<td>Indicates actor who refused the nomination
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><span data-sort-value="Palance !"><a href="/wiki/Jack_Palance" title="Jack Palance">Jack Palance</a></span>
</td>
<td><span data-sort-value="Wilson !">Jack Wilson</span>
</td></tr>
Skipping row due to

In [51]:
print(len(PeopleLinks))

1445


In [52]:
print(PeopleLinks)

{'https://en.wikipedia.org/wiki/George_Sanders', 'https://en.wikipedia.org/wiki/John_Ireland_(actor)', 'https://en.wikipedia.org/wiki/John_Cassavetes', 'https://en.wikipedia.org/wiki/Pawe%C5%82_Pawlikowski', 'https://en.wikipedia.org/wiki/Todd_Field', 'https://en.wikipedia.org/wiki/John_G._Avildsen', 'https://en.wikipedia.org/wiki/Steve_Coogan', 'https://en.wikipedia.org/wiki/Hugh_Hudson', 'https://en.wikipedia.org/wiki/Broderick_Johnson', 'https://en.wikipedia.org/wiki/RKO_Pictures', 'https://en.wikipedia.org/wiki/Bong_Joon-ho', 'https://en.wikipedia.org/wiki/Carey_Mulligan', 'https://en.wikipedia.org/wiki/Ross_Hunter', 'https://en.wikipedia.org/wiki/Tom_Rosenberg', 'https://en.wikipedia.org/wiki/Walter_Mirisch', 'https://en.wikipedia.org/wiki/Denis_Villeneuve', 'https://en.wikipedia.org/wiki/Henry_Ginsberg', 'https://en.wikipedia.org/wiki/Jason_Miller_(playwright)', 'https://en.wikipedia.org/wiki/Andrew_Kosove', 'https://en.wikipedia.org/wiki/Peter_Glenville', 'https://en.wikipedia.o