In [38]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list
PeopleLinks = set()  # Global set to store unique producer links

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain movie/producers info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 2:
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract film title from the <i> tag within the first <td>
        film_cell = cols[0]  # First <td> contains the movie title
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            print(f"Skipping row due to missing <i> tag")
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag

        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (bold indicates winner)
        is_winner = bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        # Extract producers from the second column (<td>)
        producers_info = []
        producer_cell = cols[1]  # Second <td> contains producer info
        producer_links = producer_cell.find_all('a')  # Find all <a> tags in the cell

        if producer_links:  # Case: Hyperlinked producer names
            for link in producer_links:
                producer_name = link.get_text(strip=True)  # Extract producer name
                producer_url = f"https://en.wikipedia.org{link['href']}"  # Construct full Wikipedia URL

                producers_info.append({
                    'producerName': producer_name,
                    'producerLink': producer_url
                })
                PeopleLinks.add(producer_url)  # Add to global set of unique links
        else:  # Case: No hyperlinked producers, ignore entry
            print(f"Ignoring entry due to no hyperlinked producers: {row}")
            continue

        entry_data = {
            'movieTitle': movie_title,
            'releaseYear': current_year,
            'categoryName': "Best Picture",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'producers': producers_info
        }

        data.append(entry_data)

# Save data to CSV files


Skipping row due to insufficient columns: <tr>
<th rowspan="6" style="text-align:center"><a href="/wiki/1929_in_film" title="1929 in film">1928/29</a><br/><span style="font-size: 85%;"><a href="/wiki/2nd_Academy_Awards" title="2nd Academy Awards">(2nd)</a></span><br/><sup class="reference" id="cite_ref-78"><a href="#cite_note-78"><span class="cite-bracket">[</span>a<span class="cite-bracket">]</span></a></sup>
</th></tr>
Skipping row due to insufficient columns: <tr>
<th rowspan="6" style="text-align:center"><a href="/wiki/1931_in_film" title="1931 in film">1930/31</a><br/><span style="font-size: 85%;"><a href="/wiki/4th_Academy_Awards" title="4th Academy Awards">(4th)</a></span>
</th></tr>
Ignoring entry due to no hyperlinked producers: <tr>
<td><i><a href="/wiki/East_Lynne_(1931_film)" title="East Lynne (1931 film)">East Lynne</a></i>
</td>
<td>Fox
</td></tr>
Ignoring entry due to no hyperlinked producers: <tr style="background:#eee;">
<td><i><a href="/wiki/Bad_Girl_(1931_film)" titl

In [39]:


# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain director/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 2:
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract director's name and link from the first <td>
        director_cell = cols[0]  # First <td> contains the director info
        director_link_tag = director_cell.find('a')  # Find the <a> tag inside <td>

        if not director_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing director hyperlink: {row}")
            continue

        director_name = director_link_tag.get_text(strip=True)
        director_link = f"https://en.wikipedia.org{director_link_tag['href']}"
        PeopleLinks.add(director_link)  # Add director link to global set

        # Extract film title from the second <td>
        film_cell = cols[1]  # Second <td> contains the film info
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(director_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Directing",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'director': {
                'firstName': director_name.split(" ", 1)[0],
                'lastName': director_name.split(" ", 1)[1] if " " in director_name else "",
                'link': director_link
            }
        }

        data.append(entry_data)

# Save data to CSV files



Skipping row due to missing director hyperlink: <tr>
<td rowspan="2"><span data-sort-value="Lloyd !">Frank Lloyd</span>
</td>
<td><i><a href="/wiki/Drag_(film)" title="Drag (film)">Drag</a></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><i><a href="/wiki/Weary_River" title="Weary River">Weary River</a></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><i><a href="/wiki/Romance_(1930_film)" title="Romance (1930 film)">Romance</a></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><i><a href="/wiki/Four_Daughters_(1938_film)" title="Four Daughters (1938 film)">Four Daughters</a></i>
</td></tr>
Skipping row due to missing director hyperlink: <tr>
<td><span data-sort-value="Soderbergh !">Steven Soderbergh</span>
</td>
<td><i><a href="/wiki/Erin_Brockovich_(film)" title="Erin Brockovich (film)">Erin Brockovich</a></i>
</td></tr>
Skipping row due to missing director hyperlink: <tr>
<td><div class="center" style="width:auto; margin-left:auto; 

In [40]:
print(len(PeopleLinks))

740


In [41]:


# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain actor/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 3:  # Ensure there are enough columns for actor and film info
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract actor's name and link from the first <td>
        actor_cell = cols[0]  # First <td> contains the actor info
        actor_link_tag = actor_cell.find('a')  # Find the <a> tag inside <td>

        if not actor_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing actor hyperlink: {row}")
            continue

        actor_name = actor_link_tag.get_text(strip=True)
        actor_link = f"https://en.wikipedia.org{actor_link_tag['href']}"
        PeopleLinks.add(actor_link)  # Add actor link to global set

        # Extract film title from the fourth <td>
        film_cell = cols[2] if len(cols) > 2 else None  # Fourth <td> contains the film info
        film_title_tag = film_cell.find('i') if film_cell else None  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(actor_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Actor in a Leading Role",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'actor': {
                'firstName': actor_name.split(" ", 1)[0],
                'lastName': actor_name.split(" ", 1)[1] if " " in actor_name else "",
                'link': actor_link
            }
        }

        data.append(entry_data)

Skipping row due to insufficient columns: <tr>
<th scope="row" style="background:#FAEB86;">§
</th>
<td style="background:#FAEB86;"><b>Indicates winner who refused the award</b>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="background:#FAEB86;">†
</th>
<td style="background:#FAEB86;"><b>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous winner</a></b>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">†
</th>
<td>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous nominee</a>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td style="background:#FAEB86;"><b><span data-sort-value="Schilling !">August Schilling</span></b>
</td>
<td style="background:#FAEB86;"><b><span data-sort-value="Way !"><

In [42]:
print(PeopleLinks)

{'https://en.wikipedia.org/wiki/Chris_Columbus_(filmmaker)', 'https://en.wikipedia.org/wiki/Irving_Thalberg', 'https://en.wikipedia.org/wiki/Robert_Fox_(producer)', 'https://en.wikipedia.org/wiki/Tom_Hooper', 'https://en.wikipedia.org/wiki/Barry_Levinson', 'https://en.wikipedia.org/wiki/Arnold_Kopelson', 'https://en.wikipedia.org/wiki/Taylor_Hackford', 'https://en.wikipedia.org/wiki/Mark_Gordon_(film)', 'https://en.wikipedia.org/wiki/Trevor_White_(producer)', 'https://en.wikipedia.org/wiki/Guillermo_del_Toro', 'https://en.wikipedia.org/wiki/Rob_Marshall', 'https://en.wikipedia.org/wiki/Grant_Hill_(producer)', 'https://en.wikipedia.org/wiki/Stephen_Rea', 'https://en.wikipedia.org/wiki/Walter_Lang', 'https://en.wikipedia.org/wiki/Brad_Pitt', 'https://en.wikipedia.org/wiki/Robert_Downey_Jr.', 'https://en.wikipedia.org/wiki/James_Cagney', 'https://en.wikipedia.org/wiki/Saul_Zaentz', 'https://en.wikipedia.org/wiki/Robert_Chartoff', 'https://en.wikipedia.org/wiki/Chris_Moore_(film_producer)'

In [43]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain actor/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 3:  # Ensure there are enough columns for actor and film info
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract actor's name and link from the first <td>
        actor_cell = cols[0]  # First <td> contains the actor info
        actor_link_tag = actor_cell.find('a')  # Find the <a> tag inside <td>

        if not actor_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing actor hyperlink: {row}")
            continue

        actor_name = actor_link_tag.get_text(strip=True)
        actor_link = f"https://en.wikipedia.org{actor_link_tag['href']}"
        PeopleLinks.add(actor_link)  # Add actor link to global set

        # Extract film title from the fourth <td>
        film_cell = cols[2] if len(cols) > 2 else None  # Fourth <td> contains the film info
        film_title_tag = film_cell.find('i') if film_cell else None  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(actor_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Actress in a Leading Role",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'actor': {
                'firstName': actor_name.split(" ", 1)[0],
                'lastName': actor_name.split(" ", 1)[1] if " " in actor_name else "",
                'link': actor_link
            }
        }

        data.append(entry_data)

        print(f"Total unique people links collected: {len(PeopleLinks)}")

Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">†
</th>
<td>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous nominee</a>
</td></tr>
Total unique people links collected: 965
Skipping row due to insufficient columns: <tr>
<td style="background:#FAEB86;"><b>Angela</b>
</td>
<td style="background:#FAEB86;"><i><b><a href="/wiki/Street_Angel_(1928_film)" title="Street Angel (1928 film)">Street Angel</a></b></i>
</td></tr>
Skipping row due to insufficient columns: <tr>
<td style="background:#FAEB86;"><span data-sort-value="Wife !"><b>The Wife</b></span>
</td>
<td style="background:#FAEB86;"><i><b><a href="/wiki/Sunrise:_A_Song_of_Two_Humans" title="Sunrise: A Song of Two Humans">Sunrise: A Song of Two Humans</a></b></i>
</td></tr>
Total unique people links collected: 966
Total unique people links collected: 967
Total unique people links collected: 

In [44]:
print(len(PeopleLinks))

1202


In [45]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Continue processing this row as it may also contain actor/film info
            cols.pop(0)  # Remove <th> from columns to process remaining <td> tags

        # Skip rows without enough columns to process (e.g., empty rows)
        if len(cols) < 3:  # Ensure there are enough columns for actor and film info
            print(f"Skipping row due to insufficient columns: {row}")
            continue

        # Extract actor's name and link from the first <td>
        actor_cell = cols[0]  # First <td> contains the actor info
        actor_link_tag = actor_cell.find('a')  # Find the <a> tag inside <td>

        if not actor_link_tag:  # If no hyperlink is found, skip this entry
            print(f"Skipping row due to missing actor hyperlink: {row}")
            continue

        actor_name = actor_link_tag.get_text(strip=True)
        actor_link = f"https://en.wikipedia.org{actor_link_tag['href']}"
        PeopleLinks.add(actor_link)  # Add actor link to global set

        # Extract film title from the fourth <td>
        film_cell = cols[2] if len(cols) > 2 else None  # Fourth <td> contains the film info
        film_title_tag = film_cell.find('i') if film_cell else None  # Find the <i> tag inside <td>
        film_title_link_tag = film_title_tag.find('a') if film_title_tag else None

        film_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else (film_title_tag.get_text(strip=True) if film_title_tag else None)
        )

        if not film_title:  # Skip if no film title is found
            print(f"Skipping row due to missing film title: {row}")
            continue

        # Check winner status (bold indicates winner)
        is_winner = bool(actor_cell.find('b')) or bool(film_cell.find('b'))
        winner_status = "yes" if is_winner else "no"

        entry_data = {
            'movieTitle': film_title,
            'releaseYear': current_year,
            'categoryName': "Best Actor in a Supporting Role",
            'iteration': current_iteration,
            'isWinner': winner_status,
            'actor': {
                'firstName': actor_name.split(" ", 1)[0],
                'lastName': actor_name.split(" ", 1)[1] if " " in actor_name else "",
                'link': actor_link
            }
        }

        data.append(entry_data)

Skipping row due to insufficient columns: <tr>
<th scope="row" style="background:#FAEB86;">†
</th>
<td style="background:#FAEB86;"><b>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous winner</a></b>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">†
</th>
<td>Indicates <a href="/wiki/List_of_posthumous_Academy_Award_winners_and_nominees" title="List of posthumous Academy Award winners and nominees">a posthumous nominee</a>
</td></tr>
Skipping row due to insufficient columns: <tr>
<th scope="row" style="text-align:center">§
</th>
<td>Indicates actor who refused the nomination
</td></tr>
Skipping row due to insufficient columns: <tr>
<td><span data-sort-value="Palance !"><a href="/wiki/Jack_Palance" title="Jack Palance">Jack Palance</a></span>
</td>
<td><span data-sort-value="Wilson !">Jack Wilson</span>
</td></tr>
Skipping row due to

In [46]:
print(len(PeopleLinks))

1445


In [47]:
print(PeopleLinks)

{'https://en.wikipedia.org/wiki/Irving_Thalberg', 'https://en.wikipedia.org/wiki/Randy_Quaid', 'https://en.wikipedia.org/wiki/Rob_Marshall', 'https://en.wikipedia.org/wiki/Rosamund_Pike', 'https://en.wikipedia.org/wiki/Brad_Pitt', 'https://en.wikipedia.org/wiki/Saul_Zaentz', 'https://en.wikipedia.org/wiki/Niki_Marvin', 'https://en.wikipedia.org/wiki/Bruce_Beresford', 'https://en.wikipedia.org/wiki/Richard_Harris', 'https://en.wikipedia.org/wiki/Joan_Fontaine', 'https://en.wikipedia.org/wiki/Duncan_Kenworthy', 'https://en.wikipedia.org/wiki/Juliette_Howell', 'https://en.wikipedia.org/wiki/Sarah_Green_(film_producer)', 'https://en.wikipedia.org/wiki/Elia_Kazan', 'https://en.wikipedia.org/wiki/Metro-Goldwyn-Mayer', 'https://en.wikipedia.org/wiki/Robert_De_Niro', 'https://en.wikipedia.org/wiki/Troy_Kotsur', 'https://en.wikipedia.org/wiki/Jim_Broadbent', 'https://en.wikipedia.org/wiki/Samuel_Goldwyn_Jr.', 'https://en.wikipedia.org/wiki/Fred_Baron_(producer)', 'https://en.wikipedia.org/wiki/

In [48]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page for Best Cinematography
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Cinematography"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 2:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        # Check winner status (background color indicates winner)
        is_winner = "yes" if "background:#FAEB86" in row.get("style", "") else "no"

        # Extract nominees from third column
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

            nominees_info.append({'nomineeName': nominee_name, 'nomineeLink': nominee_url})

        entry_data = {
            'movieTitle': movie_title,
            'releaseYear': current_year,
            'categoryName': "Best Cinematography",
            'iteration': current_iteration,
            'isWinner': is_winner,
            'nominees': nominees_info
        }

        data.append(entry_data)

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 1709


In [49]:
print(PeopleLinks)

{'https://en.wikipedia.org/wiki/Irving_Thalberg', 'https://en.wikipedia.org/wiki/Randy_Quaid', 'https://en.wikipedia.org/wiki/Rob_Marshall', 'https://en.wikipedia.org/wiki/Rosamund_Pike', 'https://en.wikipedia.org/wiki/Brad_Pitt', 'https://en.wikipedia.org/wiki/Karl_Freund', 'https://en.wikipedia.org/wiki/Saul_Zaentz', 'https://en.wikipedia.org/wiki/Joseph_MacDonald', 'https://en.wikipedia.org/wiki/Tony_Gaudio', 'https://en.wikipedia.org/wiki/Niki_Marvin', 'https://en.wikipedia.org/wiki/Barney_McGill', 'https://en.wikipedia.org/wiki/Bruce_Beresford', 'https://en.wikipedia.org/wiki/Richard_Harris', 'https://en.wikipedia.org/wiki/Joan_Fontaine', 'https://en.wikipedia.org/wiki/Duncan_Kenworthy', 'https://en.wikipedia.org/wiki/Juliette_Howell', 'https://en.wikipedia.org/wiki/Sarah_Green_(film_producer)', 'https://en.wikipedia.org/wiki/Elia_Kazan', 'https://en.wikipedia.org/wiki/Metro-Goldwyn-Mayer', 'https://en.wikipedia.org/wiki/Robert_De_Niro', 'https://en.wikipedia.org/wiki/Troy_Kotsur'

In [50]:
with open('peoplelinks.csv', mode='w', encoding='utf-8', newline='') as file1:
    writer1 = csv.writer(file1)
    writer1.writerow(['link'])  # Write the header row
    for link in PeopleLinks:
        writer1.writerow([link])  # Write each link as a single-element list


In [51]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Fetch the Wikipedia page for Best Cinematography
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Film_Editing"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 2:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        # Check winner status (background color indicates winner)
        is_winner = "yes" if "background:#FAEB86" in row.get("style", "") else "no"

        # Extract nominees from third column
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

            nominees_info.append({'nomineeName': nominee_name, 'nomineeLink': nominee_url})

        entry_data = {
            'movieTitle': movie_title,
            'releaseYear': current_year,
            'categoryName': "Best Film Editing",
            'iteration': current_iteration,
            'isWinner': is_winner,
            'nominees': nominees_info
        }

        data.append(entry_data)

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 2033


In [52]:
# Fetch the Wikipedia page for Best Cinematography
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Makeup_and_Hairstyling"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 2:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        # Check winner status (background color indicates winner)
        is_winner = "yes" if "background:#FAEB86" in row.get("style", "") else "no"

        # Extract nominees from third column
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

            nominees_info.append({'nomineeName': nominee_name, 'nomineeLink': nominee_url})

print(len(PeopleLinks))

2241


In [53]:


# Fetch the Wikipedia page for Best Cinematography
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Animated_Short_Film"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 2:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        # Check winner status (background color indicates winner)
        is_winner = "yes" if "background:#FAEB86" in row.get("style", "") else "no"

        # Extract nominees from third column
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

            nominees_info.append({'nomineeName': nominee_name, 'nomineeLink': nominee_url})

        entry_data = {
            'movieTitle': movie_title,
            'releaseYear': current_year,
            'categoryName': "Best Animated Short Film",
            'iteration': current_iteration,
            'isWinner': is_winner,
            'nominees': nominees_info
        }

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 2466


In [54]:
print(PeopleLinks)

{'https://en.wikipedia.org/wiki/Samuel_E._Beetley', 'https://en.wikipedia.org/wiki/Tom_Priestley', 'https://en.wikipedia.org/wiki/Irving_Thalberg', 'https://en.wikipedia.org/wiki/Randy_Quaid', 'https://en.wikipedia.org/wiki/Marcus_D%27Arcy', 'https://en.wikipedia.org/wiki/Rob_Marshall', 'https://en.wikipedia.org/wiki/Rosamund_Pike', 'https://en.wikipedia.org/wiki/Carol_Hemming', 'https://en.wikipedia.org/wiki/Brad_Pitt', 'https://en.wikipedia.org/wiki/Karl_Freund', 'https://en.wikipedia.org/wiki/Carlos_Saldanha', 'https://en.wikipedia.org/wiki/David_Mart%C3%AD', 'https://en.wikipedia.org/wiki/Saul_Zaentz', 'https://en.wikipedia.org/wiki/Joseph_MacDonald', 'https://en.wikipedia.org/wiki/Tony_Gaudio', 'https://en.wikipedia.org/wiki/Burny_Mattinson', 'https://en.wikipedia.org/wiki/Niki_Marvin', 'https://en.wikipedia.org/wiki/Barney_McGill', 'https://en.wikipedia.org/wiki/Bruce_Beresford', 'https://en.wikipedia.org/wiki/Richard_Harris', 'https://en.wikipedia.org/wiki/Joan_Fontaine', 'https

In [55]:
with open('peoplelinks.csv', mode='w', encoding='utf-8', newline='') as file1:
    writer1 = csv.writer(file1)
    writer1.writerow(['link'])  # Write the header row
    for link in PeopleLinks:
        writer1.writerow([link])  # Write each link as a single-element list


In [56]:
# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Original_Screenplay"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for i, row in enumerate(rows):
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols and cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Process the rest of this row if it contains film info
            if len(cols) > 2:  # If there's film info in this row
                film_cell = cols[1]
                nominee_cell = cols[2]
            else:
                continue  # Skip to next row if this row only has year info
        else:
            # For rows without <th>, film is in cols[0] and nominees in cols[1]
            if len(cols) < 2:
                continue  # Skip rows without enough columns

            film_cell = cols[0]
            nominee_cell = cols[1]

        # Extract film title
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (winners are marked with bold text, background color, or ‡)
        is_winner = bool(film_cell.find('b')) or "background:#FAEB86" in str(row) or "‡" in film_cell.get_text()
        winner_status = "yes" if is_winner else "no"

        # Extract nominees from the nominee cell
        nominees_info = []
        nominee_links = nominee_cell.find_all('a')

        for link in nominee_links:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            nominees_info.append({
                'nomineeName': nominee_name,
                'nomineeLink': nominee_url
            })
            PeopleLinks.add(nominee_url)
print(len(PeopleLinks))

2855


In [57]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Adapted_Screenplay"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for i, row in enumerate(rows):
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols and cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Process the rest of this row if it contains film info
            if len(cols) > 2:  # If there's film info in this row
                film_cell = cols[1]
                nominee_cell = cols[2]
            else:
                continue  # Skip to next row if this row only has year info
        else:
            # For rows without <th>, film is in cols[0] and nominees in cols[1]
            if len(cols) < 2:
                continue  # Skip rows without enough columns

            film_cell = cols[0]
            nominee_cell = cols[1]

        # Extract film title
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (winners are marked with bold text, background color, or ‡)
        is_winner = bool(film_cell.find('b')) or "background:#FAEB86" in str(row) or "‡" in film_cell.get_text()
        winner_status = "yes" if is_winner else "no"

        # Extract nominees from the nominee cell
        nominees_info = []
        nominee_links = nominee_cell.find_all('a')

        for link in nominee_links:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            nominees_info.append({
                'nomineeName': nominee_name,
                'nomineeLink': nominee_url
            })
            PeopleLinks.add(nominee_url)

In [58]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Animated_Feature"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for i, row in enumerate(rows):
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols and cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Process the rest of this row if it contains film info
            if len(cols) > 2:  # If there's film info in this row
                film_cell = cols[1]
                nominee_cell = cols[2]
            else:
                continue  # Skip to next row if this row only has year info
        else:
            # For rows without <th>, film is in cols[0] and nominees in cols[1]
            if len(cols) < 2:
                continue  # Skip rows without enough columns

            film_cell = cols[0]
            nominee_cell = cols[1]

        # Extract film title
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (winners are marked with bold text, background color, or ‡)
        is_winner = bool(film_cell.find('b')) or "background:#FAEB86" in str(row) or "‡" in film_cell.get_text()
        winner_status = "yes" if is_winner else "no"

        # Extract nominees from the nominee cell
        nominees_info = []
        nominee_links = nominee_cell.find_all('a')

        for link in nominee_links:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            nominees_info.append({
                'nomineeName': nominee_name,
                'nomineeLink': nominee_url
            })
            PeopleLinks.add(nominee_url)
print(len(PeopleLinks))

3353


In [59]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Documentary_Feature_Film"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for i, row in enumerate(rows):
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols and cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Process the rest of this row if it contains film info
            if len(cols) > 2:  # If there's film info in this row
                film_cell = cols[1]
                nominee_cell = cols[2]
            else:
                continue  # Skip to next row if this row only has year info
        else:
            # For rows without <th>, film is in cols[0] and nominees in cols[1]
            if len(cols) < 2:
                continue  # Skip rows without enough columns

            film_cell = cols[0]
            nominee_cell = cols[1]

        # Extract film title
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (winners are marked with bold text, background color, or ‡)
        is_winner = bool(film_cell.find('b')) or "background:#FAEB86" in str(row)
        winner_status = "yes" if is_winner else "no"

        # Extract nominees from the nominee cell
        nominees_info = []
        nominee_links = nominee_cell.find_all('a')

        for link in nominee_links:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"

            # Split the name and count the number of words
            name_parts = nominee_name.split()

            # Check if it's likely a person (3 or fewer words in name)
            if len(name_parts) <= 3:
                nominees_info.append({
                    'nomineeName': nominee_name,
                    'nomineeLink': nominee_url
                })
                PeopleLinks.add(nominee_url)

print(len(PeopleLinks))

3819


In [60]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Documentary_Short_Film"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []  # Main data list

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for i, row in enumerate(rows):
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols and cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            # Process the rest of this row if it contains film info
            if len(cols) > 2:  # If there's film info in this row
                film_cell = cols[1]
                nominee_cell = cols[2]
            else:
                continue  # Skip to next row if this row only has year info
        else:
            # For rows without <th>, film is in cols[0] and nominees in cols[1]
            if len(cols) < 2:
                continue  # Skip rows without enough columns

            film_cell = cols[0]
            nominee_cell = cols[1]

        # Extract film title
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (winners are marked with bold text, background color, or ‡)
        is_winner = bool(film_cell.find('b')) or "background:#FAEB86" in str(row)
        winner_status = "yes" if is_winner else "no"

        # Extract nominees from the nominee cell
        nominees_info = []
        nominee_links = nominee_cell.find_all('a')

        for link in nominee_links:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"

            # Split the name and count the number of words
            name_parts = nominee_name.split()

            # Check if it's likely a person (3 or fewer words in name)
            if len(name_parts) <= 3:
                nominees_info.append({
                    'nomineeName': nominee_name,
                    'nomineeLink': nominee_url
                })
                PeopleLinks.add(nominee_url)



In [61]:
print(len(PeopleLinks))

3992


In [62]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Original_Score"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 2:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        # Check winner status (background color indicates winner)
        is_winner = "yes" if "background:#FAEB86" in row.get("style", "") else "no"

        # Extract nominees from third column
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)
print(len(PeopleLinks))

4309


In [63]:
# Fetch the Wikipedia page for Best Cinematography
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Original_Song"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 3:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        # Check winner status (background color indicates winner)
        is_winner = "yes" if "background:#FAEB86" in row.get("style", "") else "no"

        # Extract nominees from third column
        nominee_cell = cols[2]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

print(len(PeopleLinks))

4633


In [64]:
PeopleLinks.add('https://en.wikipedia.org/wiki/John_Lennon')
PeopleLinks.add('https://en.wikipedia.org/wiki/Paul_McCartney')
PeopleLinks.add('https://en.wikipedia.org/wiki/Ringo_Starr')
PeopleLinks.add('https://en.wikipedia.org/wiki/George_Harrison')

In [65]:
print(PeopleLinks)

{'https://en.wikipedia.org/wiki/Samuel_E._Beetley', 'https://en.wikipedia.org/wiki/Tom_Priestley', 'https://en.wikipedia.org/wiki/Guy_Glover', 'https://en.wikipedia.org/wiki/Irving_Thalberg', 'https://en.wikipedia.org/wiki/Randy_Quaid', 'https://en.wikipedia.org/wiki/Marcus_D%27Arcy', 'https://en.wikipedia.org/wiki/Sarah_Polley', 'https://en.wikipedia.org/w/index.php?title=Janet_Cole_(filmmaker)&action=edit&redlink=1', 'https://en.wikipedia.org/wiki/Jack_Brooks_(lyricist)', 'https://en.wikipedia.org/wiki/Alex_Bulkley', 'https://en.wikipedia.org/wiki/Rob_Marshall', 'https://en.wikipedia.org/wiki/Rosamund_Pike', 'https://en.wikipedia.org/wiki/Howard_E._Koch', 'https://en.wikipedia.org/wiki/Tab_Murphy', 'https://en.wikipedia.org/wiki/Chris_Miller_(animator)', 'https://en.wikipedia.org/wiki/Kira_Simon-Kennedy', 'https://en.wikipedia.org/wiki/Carol_Hemming', 'https://en.wikipedia.org/wiki/Brad_Pitt', 'https://en.wikipedia.org/wiki/Karl_Freund', 'https://en.wikipedia.org/wiki/Carlos_Saldanha

In [66]:
# Fetch the Wikipedia page for Best Cinematography
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Visual_Effects"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 2:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        is_winner = "yes" if ("background:#FAEB86" in row.get("style", "") or bool(row.find('b'))) else "no"

        # Extract nominees from third column
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 5085


In [67]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Costume_Design"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with a <td> containing year and iteration (spanning multiple rows)
        if cols and len(cols) == 1 and cols[0].name == 'td' and "rowspan" in cols[0].attrs:
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            continue  # Skip to next row since this row only contains year info

        # Skip rows without enough columns to process
        if len(cols) < 2:
            continue

        # Extract film title from the first column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (background color or bold text indicates winner)
        is_winner = "yes" if ("background:#FAEB86" in row.get("style", "") or bool(row.find('b'))) else "no"

        # Extract nominees from the second column
        nominees_info = []
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 5275


In [68]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Live_Action_Short_Film"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None
current_iteration = None

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Extract year and iteration from <th> column
        if cols and cols[0].name == 'th':
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            current_year = int(match_year.group(1)) if match_year else None
            current_iteration = match_iteration.group(1) if match_iteration else None

            cols.pop(0)  # Remove <th> to process remaining columns

        # Skip rows without enough columns
        if len(cols) < 2:
            continue

        # Extract film title from second column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')
        movie_title = film_title_link_tag.get_text(strip=True) if film_title_link_tag else film_title_tag.get_text(
            strip=True)

        # Check winner status (background color indicates winner)
        is_winner = "yes" if "background:#FAEB86" in row.get("style", "") else "no"

        # Extract nominees from third column
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        nominees_info = []
        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 5514


In [69]:

url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Sound"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with a <td> containing year and iteration (spanning multiple rows)
        if cols and len(cols) == 1 and cols[0].name == 'td' and "rowspan" in cols[0].attrs:
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            continue  # Skip to next row since this row only contains year info

        # Skip rows without enough columns to process
        if len(cols) < 2:
            continue

        # Extract film title from the first column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (background color or bold text indicates winner)
        is_winner = "yes" if ("background:#FAEB86" in row.get("style", "") or bool(row.find('b'))) else "no"

        # Extract nominees from the second column
        nominees_info = []
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 5980


In [70]:
# Fetch the Wikipedia page for Best Production Design
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Production_Design"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with a <th> containing year and iteration (spanning multiple rows)
        if cols and cols[0].name == 'th' and "rowspan" in cols[0].attrs:
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            continue  # Skip to next row since this row only contains year info

        # Skip rows without enough columns to process
        if len(cols) < 3:
            continue

        # Extract film title from the first column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (background color or bold text indicates winner)
        is_winner = "yes" if ("background:#FAEB86" in row.get("style", "") or bool(row.find('b'))) else "no"

        # Extract nominees from the second column
        nominees_info = []
        nominee_cell_1 = cols[1]
        nominee_links_tags_1 = nominee_cell_1.find_all('a')

        for link in nominee_links_tags_1:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)
            print("Nominee Added: ", nominee_name)

            nominees_info.append({'nomineeName': nominee_name, 'nomineeLink': nominee_url})

        # If there is a third column, extract additional nominees (for cases where nominees are split across two columns)
        if len(cols) > 2:
            nominee_cell_2 = cols[2]
            nominee_links_tags_2 = nominee_cell_2.find_all('a')

            for link in nominee_links_tags_2:
                nominee_name = link.get_text(strip=True)
                nominee_url = f"https://en.wikipedia.org{link['href']}"
                PeopleLinks.add(nominee_url)
                print("Nominee Added: ", nominee_name,"\n")
                nominees_info.append({'nomineeName': nominee_name, 'nomineeLink': nominee_url})

        entry_data = {
            'movieTitle': movie_title,
            'releaseYear': current_year,
            'categoryName': "Best Production Design",
            'iteration': current_iteration,
            'isWinner': is_winner,
            'nominees': nominees_info
        }


print(f"Total unique people links collected: {len(PeopleLinks)}")


Nominee Added:  Cedric Gibbons
Nominee Added:  Paul Groesse
Nominee Added:  Hans Dreier
Nominee Added:  Robert Usher
Nominee Added:  Vincent Korda
Nominee Added:  Cedric Gibbons
Nominee Added:  John S. Detlie
Nominee Added:  Richard Day
Nominee Added:  Nathan Juran
Nominee Added:  Thomas Little 

Nominee Added:  Perry Ferguson
Nominee Added:  Van Nest Polglase
Nominee Added:  A. Roland Fields 

Nominee Added:  Darrell Silvera 

Nominee Added:  Martin Obzina
Nominee Added:  Jack Otterson
Nominee Added:  Russell A. Gausman 

Nominee Added:  Hans Dreier
Nominee Added:  Robert Usher
Nominee Added:  Samuel M. Comer 

Nominee Added:  Lionel Banks
Nominee Added:  George Montgomery 

Nominee Added:  Stephen Goosson
Nominee Added:  Howard Bristol 

Nominee Added:  John Hughes
Nominee Added:  Fred M. MacLean 

Nominee Added:  John DuCasse Schulze
Nominee Added:  Edward G. Boyle 

Nominee Added:  Alexander Golitzen
Nominee Added:  Richard Irvine 

Nominee Added:  Vincent Korda
Nominee Added:  Jul

In [71]:
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Production_Design"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with a <td> containing year and iteration (spanning multiple rows)
        if cols and cols[0].name == 'th' and "rowspan" in cols[0].attrs:
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            continue  # Skip to next row since this row only contains year info

        # Skip rows without enough columns to process
        if len(cols) < 2:
            continue

        # Extract film title from the first column
        film_cell = cols[0]
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Check winner status (background color or bold text indicates winner)
        is_winner = "yes" if ("background:#FAEB86" in row.get("style", "") or bool(row.find('b'))) else "no"

        # Extract nominees from the second column
        nominees_info = []
        nominee_cell = cols[1]
        nominee_links_tags = nominee_cell.find_all('a')

        for link in nominee_links_tags:
            nominee_name = link.get_text(strip=True)
            nominee_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(nominee_url)
            print("Nominee Added: ", nominee_name)
            nominees_info.append({'nomineeName': nominee_name, 'nomineeLink': nominee_url})

        entry_data = {
            'movieTitle': movie_title,
            'releaseYear': current_year,
            'categoryName': "Best Production Design",
            'iteration': current_iteration,
            'isWinner': is_winner,
            'nominees': nominees_info
        }


print(f"Total unique people links collected: {len(PeopleLinks)}")


Nominee Added:  Harry Oliver
Nominee Added:  Rochus Gliese
Nominee Added:  Cedric Gibbons
Nominee Added:  William Cameron Menzies
Nominee Added:  Mitchell Leisen
Nominee Added:  Hans Dreier
Nominee Added:  Harry Oliver
Nominee Added:  William Cameron Menzies
Nominee Added:  Hans Dreier
Nominee Added:  Jack Okey
Nominee Added:  Hans Dreier
Nominee Added:  Max Rée
Nominee Added:  Stephen Goosson
Nominee Added:  Ralph Hammeras
Nominee Added:  Hans Dreier
Nominee Added:  Anton Grot
Nominee Added:  Richard Day
Nominee Added:  Gordon Wiles
Nominee Added:  Richard Day
Nominee Added:  Lazare Meerson
Nominee Added:  William S. Darling
Nominee Added:  Hans Dreier
Nominee Added:  Roland Anderson
Nominee Added:  Cedric Gibbons
Nominee Added:  Cedric Gibbons
Nominee Added:  Fredric Hope
Nominee Added:  Richard Day
Nominee Added:  Van Nest Polglase
Nominee Added:  Carroll Clark
Nominee Added:  Richard Day
Nominee Added:  Hans Dreier
Nominee Added:  Roland Anderson
Nominee Added:  Carroll Clark
Nomin

In [72]:
# Fetch the Wikipedia page for Best International Feature Film
url = "https://en.wikipedia.org/wiki/List_of_Academy_Award_winners_and_nominees_for_Best_International_Feature_Film"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare data storage
data = []

# Find all wikitable tables on the page
tables = soup.find_all('table', class_='wikitable')

current_year = None  # Track current year
current_iteration = None  # Track current iteration

# Iterate through tables
for table in tables:
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cols = row.find_all(['td', 'th'])

        # Handle rows with <th> (release year and iteration)
        if cols and cols[0].name == 'th':  # Year and iteration are in <th>
            year_text = cols[0].get_text(strip=True)
            match_year = re.search(r'(\d{4})', year_text)
            match_iteration = re.search(r'\((\d+(?:st|nd|rd|th))\)', year_text)

            if match_year:
                current_year = int(match_year.group(1))
            if match_iteration:
                current_iteration = match_iteration.group(1)

            cols.pop(0)

        # Skip rows without enough columns to process
        if len(cols) < 3:
            continue


        # Extract film title from the second column
        directorIndex = 2
        film_cell = cols[0]
        if 'colspan' in film_cell.attrs: directorIndex = 1
        film_title_tag = film_cell.find('i')  # Find the <i> tag inside <td>
        if not film_title_tag:
            continue

        film_title_link_tag = film_title_tag.find('a')  # Check for nested <a> tag
        movie_title = (
            film_title_link_tag.get_text(strip=True)
            if film_title_link_tag
            else film_title_tag.get_text(strip=True)
        )

        # Extract director's name and link from the fourth column
        director_cell = cols[directorIndex]
        director_links_tags = director_cell.find_all('a')

        nominees_info = []
        for link in director_links_tags:
            director_name = link.get_text(strip=True)
            director_url = f"https://en.wikipedia.org{link['href']}"
            PeopleLinks.add(director_url)

print(f"Total unique people links collected: {len(PeopleLinks)}")


Total unique people links collected: 6725


In [73]:
with open('peoplelinks.csv', mode='w', encoding='utf-8', newline='') as file2:
    writer2 = csv.writer(file2)  # Use minimal quoting for links
    writer2.writerow(['link'])
    for link in PeopleLinks:
        writer2.writerow([link])

In [74]:
from bs4 import BeautifulSoup
import requests
import csv
import re
from urllib.parse import urlparse
import calendar

# Prepare data storage
person_data_basic = []  # Data for the first CSV (basic info)
person_data_roles = []  # Data for the second CSV (roles info)
Roles = set()  # Set to store all unique roles
MalformedLinks = []  # List to store malformed or incorrect links
missingData = []

# Function to extract career years
def extract_career_years(years_text):
    # Normalize the text by removing extra spaces and dashes
    years_text = re.sub(r"\s*–\s*", "–", years_text)  # Replace spaces around en dash with a single en dash
    years_text = re.sub(r"\s*-\s*", "–", years_text)  # Replace hyphen with en dash

    # Remove any extra information in parentheses
    years_text = re.sub(r"\(.*?\)", "", years_text)

    # Split by semicolon or 'and' to handle multiple ranges
    year_ranges = re.split(r";|and", years_text)

    # Extract all years from the ranges
    all_years = []
    for year_range in year_ranges:
        years = re.findall(r"\b(\d{4}|present)\b", year_range)
        all_years.extend(years)

    # Convert 'present' to None (indicating ongoing career)
    all_years = [int(year) if year != "present" else None for year in all_years]

    # Filter out None values for min and max calculations
    numeric_years = [year for year in all_years if year is not None]

    # Return the earliest year and latest year (leave latest as None if ongoing)
    if numeric_years:
        earliest_year = min(numeric_years)
        latest_year = max(numeric_years) if None not in all_years else None
        return earliest_year, latest_year
    return None, None

# Function to extract and format death date
def extract_date(date_text):
    # Normalize the text by removing extra spaces and parentheses
    date_text = re.sub(r"\(.*?\)", "", date_text).strip()

    # Extract year, month, and day if available
    match = re.search(r"(\w+)?\s*(\d{1,2})?,?\s*(\d{4})", date_text)
    if match:
        year = match.group(3)
        month = match.group(1) if match.group(1) else "01"
        day = match.group(2) if match.group(2) else "01"

        # Convert month name to number if applicable
        try:
            month = str(list(calendar.month_name).index(month)) if month.isalpha() else month
        except ValueError:
            month = "01"  # Default to January if month is invalid

        # Format as yyyy-mm-dd
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    # If no valid date is found, return None
    return None

# Function to validate a URL
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

print(PeopleLinks)
# Loop over each URL in the list
for url in PeopleLinks:
    if not is_valid_url(url):  # Validate URL before making a request
        MalformedLinks.append(url)
        print(f"Malformed URL: {url}")
        continue

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract firstName and lastName
        first_name, last_name, dob, birth_country, death_date = "", "", "", "", ""
        # Extract birth date and country of birth from the "Born" row
        born_row = soup.find('th', string=re.compile(r'(?i)\bBorn\b[:]?'))  # Match "Born", "born", "Born:", etc.

        if born_row:
            born_data = born_row.find_next_sibling('td')

            if born_data:
                # Extract and clean DOB
                dob_span = born_data.find('span', class_='bday')
                raw_dob_text = dob_span.get_text(strip=True) if dob_span else born_data.get_text(strip=True)
                dob = extract_date(raw_dob_text)  # Use unified extract_date function

                # Extract and clean country of birth
                birthplace_div = born_data.find('div', class_='birthplace')

                if birthplace_div:
                    raw_birth_country = birthplace_div.get_text(strip=True)
                else:
                    # If no <div>, get text from <td> directly (including <a> tags and plain text)
                    raw_birth_country = born_data.get_text(separator=" ", strip=True)

                # Remove text inside parentheses and clean the location
                cleaned_location = re.sub(r"\(.*?\)", "", raw_birth_country).strip()
                birth_country = cleaned_location.split(",")[-1].strip()

        # Extract name from various possible locations
        nameDiv = soup.find('div', class_='fn')  # First check for <div> with class 'fn'

        if not nameDiv:
            # Fallback: Check for <th> with class 'infobox-above' and look for <div>
            name_th = soup.find('th', class_='infobox-above')
            if name_th:
                nameDiv = name_th.find('div')

        if not nameDiv:
            # Fallback: Check for <caption> with class 'infobox-title fn'
            name_caption = soup.find('caption', class_='infobox-title fn')
            if name_caption:
                nameDiv = name_caption

        # Extract the name if found
        first_name, last_name = "", ""
        if nameDiv:
            full_name = nameDiv.get_text(strip=True)
            name_parts = full_name.split(" ")
            first_name = name_parts[0]  # First part of the name
            last_name = name_parts[-1] if len(name_parts) > 1 else ""  # Last part if it exists

        # Extract deathDate if available
        death_row = soup.find('th', string='Died')
        if death_row:
            death_data = death_row.find_next_sibling('td')
            death_date_text = death_data.get_text(strip=True) if death_data else ""
            death_date = extract_date(death_date_text)

        # Extract roleType from the "Occupations" row
        occupations_row = soup.find('th', string=re.compile(r'Occupation(s)?'))
        roles = []
        if occupations_row:
            occupations_data = occupations_row.find_next_sibling('td') or soup.find('td', class_='infobox-data role')
            if occupations_data:
                # Extract roles from <li> tags or split by commas if stored as a single string
                roles_list = occupations_data.find_all('li')
                if roles_list:
                    roles = [role.get_text(strip=True).title() for role in roles_list]
                else:
                    roles_text = occupations_data.get_text(strip=True)
                    roles = [role.strip().title() for role in roles_text.split(",")]

        # Add roles to the global Roles set
        Roles.update(roles)

        # Extract careerStartYYYY and careerEndYYYY from "Years active" row
        years_active_row = soup.find('th', string=re.compile(r'Years\s*active'))
        career_start_year, career_end_year = "", ""
        if years_active_row:
            years_active_data = years_active_row.find_next_sibling('td')
            if years_active_data:
                years_text = years_active_data.get_text(strip=True)
                career_start_year, career_end_year = extract_career_years(years_text)


        if first_name is None:
            missingData.append(url)
        # Store extracted data for basic info (first CSV)
        person_entry_basic = {
            'firstName': first_name,
            'lastName': last_name,
            'DOB': dob,
            'birthCountry': birth_country,
            'deathDate': death_date,
            'careerStartYYYY': career_start_year,
            'careerEndYYYY': career_end_year,
        }
        person_data_basic.append(person_entry_basic)
        print(person_entry_basic)

        # Store extracted data for each role separately (second CSV)
        for role in roles:
            person_entry_roles = {
                'firstName': first_name,
                'lastName': last_name,
                'roleType': role,
                'careerStartYYYY': career_start_year,
                'careerEndYYYY': career_end_year,
            }
            person_data_roles.append(person_entry_roles)

    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        MalformedLinks.append(url)

# # Save data to the first CSV (basic info + career years)
# with open('people_basic_details.csv', mode='w', encoding='utf-8', newline='') as file1:
#     writer1 = csv.writer(file1)
#     writer1.writerow(['firstName', 'lastName', 'DOB', 'birthCountry', 'deathDate',
#                       'careerStartYYYY', 'careerEndYYYY'])
#     for entry in person_data_basic:
#         writer1.writerow([
#             entry['firstName'],
#             entry['lastName'],
#             entry['DOB'],
#             entry['birthCountry'],
#             entry['deathDate'],
#             entry['careerStartYYYY'],
#             entry['careerEndYYYY']
#         ])
#
# # Save data to the second CSV (roles info)
# with open('people_roles_details.csv', mode='w', encoding='utf-8', newline='') as file2:
#     writer2 = csv.writer(file2)
#     writer2.writerow(['firstName', 'lastName', 'roleType'])
#     for entry in person_data_roles:
#         writer2.writerow([
#             entry['firstName'],
#             entry['lastName'],
#             entry['roleType']
#         ])
#
# # Save malformed links to a separate file
# with open('malformed_links.csv', mode='w', encoding='utf-8', newline='') as file3:
#     writer3 = csv.writer(file3)
#     writer3.writerow(['MalformedLink'])
#     for link in MalformedLinks:
#         writer3.writerow([link])

# Print all unique roles collected in the Roles set
print("Unique Roles Collected:")
for role in Roles:
    print(role)

print("\nScraped data saved to people_basic_details.csv, people_roles_details.csv, and malformed_links.csv.")


{'https://en.wikipedia.org/wiki/Guy_Glover', 'https://en.wikipedia.org/wiki/Frank_E._Hughes', 'https://en.wikipedia.org/wiki/Harold_M._Etherington', 'https://en.wikipedia.org/wiki/Jack_Brooks_(lyricist)', 'https://en.wikipedia.org/wiki/Alex_Bulkley', 'https://en.wikipedia.org/wiki/Rob_Marshall', 'https://en.wikipedia.org/wiki/Howard_E._Koch', 'https://en.wikipedia.org/wiki/Kira_Simon-Kennedy', 'https://en.wikipedia.org/wiki/June_Wayne', 'https://en.wikipedia.org/wiki/Charles_Fuller', 'https://en.wikipedia.org/wiki/Carlinhos_Brown', 'https://en.wikipedia.org/wiki/Paul_Brincat', 'https://en.wikipedia.org/wiki/John_Bryan_(art_director)', 'https://en.wikipedia.org/wiki/Alexandre_Tansman', 'https://en.wikipedia.org/wiki/Florian_Henckel_von_Donnersmarck', 'https://en.wikipedia.org/wiki/James_Spione', 'https://en.wikipedia.org/wiki/Richard_Nord', 'https://en.wikipedia.org/wiki/Richard_LaGravenese', 'https://en.wikipedia.org/wiki/Leslie_I._Carey', 'https://en.wikipedia.org/wiki/Nadia_Stacey', 

KeyboardInterrupt: 

In [90]:
# Prepare data storage
person_data = []

# Function to clean country of birth
def clean_country(raw_country):
    cleaned_location = re.sub(r"\(.*?\)", "", raw_country).strip()  # Remove text inside parentheses
    return cleaned_location.split(",")[-1].strip()  # Extract the last part (country)

# Loop over each URL in the list
for url in PeopleLinks:
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract full name
        nameDiv = soup.find('div', class_='fn')  # Check for <div class="fn">
        if not nameDiv:
            # Fallback: Check for <th class="infobox-above"> or <caption class="infobox-title fn">
            name_th = soup.find('th', class_='infobox-above')
            if name_th:
                nameDiv = name_th.find('div')
            if not nameDiv:
                name_caption = soup.find('caption', class_='infobox-title fn')
                if name_caption:
                    nameDiv = name_caption

        full_name = nameDiv.get_text(strip=True) if nameDiv else None

        # Extract country of birth from the "Born" row
        born_row = soup.find('th', string=re.compile(r'(?i)\bBorn\b[:]?'))  # Match variations like "Born", "born", "Born:"
        birth_country = ""
        if born_row:
            born_data = born_row.find_next_sibling('td')
            if born_data:
                birthplace_div = born_data.find('div', class_='birthplace')
                if birthplace_div:
                    raw_birth_country = birthplace_div.get_text(strip=True)
                else:
                    raw_birth_country = born_data.get_text(separator=" ", strip=True)
                birth_country = clean_country(raw_birth_country)

        # Store extracted data
        person_entry = {
            'fullName': full_name,
            'birthCountry': birth_country
        }
        person_data.append(person_entry)

    except Exception as e:
        print(f"Error processing URL {url}: {e}")

# # Save data to CSV file
# with open('people_birth_country_details.csv', mode='w', encoding='utf-8', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(['fullName', 'birthCountry'])
#     for entry in person_data:
#         writer.writerow([
#             entry['fullName'],
#             entry['birthCountry']
#         ])

print("Scraped data saved to people_birth_country_details.csv.")


Error processing URL https://en.wikipedia.orghttps://sv.wikipedia.org/wiki/Lennart_Ehrenborg: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /sv.wikipedia.org/wiki/Lennart_Ehrenborg (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000015B5858C050>: Failed to resolve 'en.wikipedia.orghttps' ([Errno 11001] getaddrinfo failed)"))
Error processing URL https://en.wikipedia.orghttps://de.wikipedia.org/wiki/Gertrude_Ross_Marks: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /de.wikipedia.org/wiki/Gertrude_Ross_Marks (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000015B5858C050>: Failed to resolve 'en.wikipedia.orghttps' ([Errno 11001] getaddrinfo failed)"))
Error processing URL https://en.wikipedia.orghttps://de.wikipedia.org/wiki/Ansgar_Frerich: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /

In [91]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd
from difflib import get_close_matches


# Function to read country names from a CSV file and store them in a list
def read_countries_from_csv(file_path):
    try:
        # Read the CSV file using pandas
        df = pd.read_csv(file_path)
        # Assuming the country names are in the first column
        countries_list = df.iloc[:, 0].tolist()
        return countries_list
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return []

# Example usage (replace 'countries.csv' with the actual file path)
countries = read_countries_from_csv('countries.csv')
person_data = []


# Function to clean and match country from the Born row
def match_country(birth_country):
    # Find the most similar country from the predefined list
    most_similar_country = get_close_matches(birth_country, countries, n=1, cutoff=0.6)
    return most_similar_country[0] if most_similar_country else None

# Loop over each URL in the list
for url in PeopleLinks:
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract full name
        nameDiv = soup.find('div', class_='fn')  # Check for <div class="fn">
        if not nameDiv:
            # Fallback: Check for <th class="infobox-above"> or <caption class="infobox-title fn">
            name_th = soup.find('th', class_='infobox-above')
            if name_th:
                nameDiv = name_th.find('div')
            if not nameDiv:
                name_caption = soup.find('caption', class_='infobox-title fn')
                if name_caption:
                    nameDiv = name_caption

        full_name = nameDiv.get_text(strip=True) if nameDiv else None

        # Extract country of birth from the Born row
        born_row = soup.find('th', string=re.compile(r'(?i)\bBorn\b[:]?'))  # Match variations like "Born", "born", "Born:"
        birth_country = None
        if born_row:
            born_data = born_row.find_next_sibling('td')
            if born_data:
                birthplace_div = born_data.find('div', class_='birthplace')
                if birthplace_div:
                    raw_birth_country = birthplace_div.get_text(strip=True)
                else:
                    raw_birth_country = born_data.get_text(separator=" ", strip=True)
                # Remove text inside parentheses and clean the location
                cleaned_location = re.sub(r"\(.*?\)", "", raw_birth_country).strip()
                extracted_country = cleaned_location.split(",")[-1].strip()
                birth_country = match_country(extracted_country) or extracted_country

        # Store extracted data
        person_entry = {
            'fullName': full_name,
            'birthCountry': birth_country or "Unknown"  # Default to 'Unknown' if no match is found
        }
        person_data.append(person_entry)

    except Exception as e:
        print(f"Error processing URL {url}: {e}")


print("Scraped data saved to people_birth_country_details.csv.")


Error processing URL https://en.wikipedia.orghttps://sv.wikipedia.org/wiki/Lennart_Ehrenborg: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /sv.wikipedia.org/wiki/Lennart_Ehrenborg (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000015B5AAC16D0>: Failed to resolve 'en.wikipedia.orghttps' ([Errno 11001] getaddrinfo failed)"))
Error processing URL https://en.wikipedia.orghttps://de.wikipedia.org/wiki/Gertrude_Ross_Marks: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /de.wikipedia.org/wiki/Gertrude_Ross_Marks (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000015B5D0CD090>: Failed to resolve 'en.wikipedia.orghttps' ([Errno 11001] getaddrinfo failed)"))
Error processing URL https://en.wikipedia.orghttps://de.wikipedia.org/wiki/Ansgar_Frerich: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /

In [75]:
from bs4 import BeautifulSoup
import requests
import csv
import re
from urllib.parse import urlparse
import calendar

# Prepare data storage
person_data_basic = []  # Data for the first CSV (basic info)
person_data_roles = []  # Data for the second CSV (roles info)
Roles = set()  # Set to store all unique roles
MalformedLinks = []  # List to store malformed or incorrect links
missingData = []

# Function to validate a URL
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

# Loop over each URL in the list
for url in PeopleLinks:
    if not is_valid_url(url):  # Validate URL before making a request
        MalformedLinks.append(url)
        print(f"Malformed URL: {url}")
        continue

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract firstName and lastName
        first_name, last_name = "", ""
        # Extract birth date and country of birth from the "Born" row

        # Extract name from various possible locations
        nameDiv = soup.find('div', class_='fn')  # First check for <div> with class 'fn'

        if not nameDiv:
            # Fallback: Check for <th> with class 'infobox-above' and look for <div>
            name_th = soup.find('th', class_='infobox-above')
            if name_th:
                nameDiv = name_th.find('div')

        if not nameDiv:
            # Fallback: Check for <caption> with class 'infobox-title fn'
            name_caption = soup.find('caption', class_='infobox-title fn')
            if name_caption:
                nameDiv = name_caption

        # Extract the name if found
        first_name, last_name = "", ""
        if nameDiv:
            full_name = nameDiv.get_text(strip=True)
            name_parts = full_name.split(" ")
            first_name = name_parts[0]  # First part of the name
            last_name = name_parts[-1] if len(name_parts) > 1 else ""  # Last part if it exists

        # Extract roleType from the "Occupations" row
        occupations_row = soup.find('th', string=re.compile(r'Occupation(s)?'))
        roles = []
        if occupations_row:
            occupations_data = occupations_row.find_next_sibling('td') or soup.find('td', class_='infobox-data role')
            if occupations_data:
                # Extract roles from <li> tags or split by commas if stored as a single string
                roles_list = occupations_data.find_all('li')
                if roles_list:
                    roles = [role.get_text(strip=True).title() for role in roles_list]
                else:
                    roles_text = occupations_data.get_text(strip=True)
                    roles = [role.strip().title() for role in roles_text.split(",")]

        # Add roles to the global Roles set
        Roles.update(roles)

        if first_name is None:
            missingData.append(url)

        for role in roles:
            person_entry_roles = {
                'firstName': first_name,
                'lastName': last_name,
                'roleType': role
            }
            person_data_roles.append(person_entry_roles)

    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        MalformedLinks.append(url)


# Save data to the second CSV (roles info)
with open('people_role.csv', mode='w', encoding='utf-8', newline='') as file2:
    writer2 = csv.writer(file2)
    writer2.writerow(['firstName', 'lastName', 'roleType'])
    for entry in person_data_roles:
        writer2.writerow([
            entry['firstName'],
            entry['lastName'],
            entry['roleType']
        ])


# Print all unique roles collected in the Roles set
print("Unique Roles Collected:")
for role in Roles:
    print(role)

print("\nScraped data saved")


Error processing URL https://en.wikipedia.orghttps://es.wikipedia.org/wiki/Carles_Bosch: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /es.wikipedia.org/wiki/Carles_Bosch (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001C8E1D1EAD0>: Failed to resolve 'en.wikipedia.orghttps' ([Errno 11001] getaddrinfo failed)"))
Error processing URL https://en.wikipedia.orghttps://pt.wikipedia.org/wiki/Rick_Goldsmith: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /pt.wikipedia.org/wiki/Rick_Goldsmith (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001C8E1D1C190>: Failed to resolve 'en.wikipedia.orghttps' ([Errno 11001] getaddrinfo failed)"))
Error processing URL https://en.wikipedia.orghttps://de.wikipedia.org/wiki/Ren%C3%A9_Lafuite: HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: /de.wikipedia.org/

In [78]:
from bs4 import BeautifulSoup
import requests
import csv
import re
from urllib.parse import urlparse
import calendar

# Prepare data storage
person_data_basic = []  # Data for the first CSV (basic info)
person_data_roles = []  # Data for the second CSV (roles info)
Roles = set()  # Set to store all unique roles
MalformedLinks = []  # List to store malformed or incorrect links
missingData = []

# Function to extract career years
def extract_career_years(years_text):
    # Normalize the text by removing extra spaces and dashes
    years_text = re.sub(r"\s*–\s*", "–", years_text)  # Replace spaces around en dash with a single en dash
    years_text = re.sub(r"\s*-\s*", "–", years_text)  # Replace hyphen with en dash

    # Remove any extra information in parentheses
    years_text = re.sub(r"\(.*?\)", "", years_text)

    # Split by semicolon or 'and' to handle multiple ranges
    year_ranges = re.split(r";|and", years_text)

    # Extract all years from the ranges
    all_years = []
    for year_range in year_ranges:
        years = re.findall(r"\b(\d{4}|present)\b", year_range)
        all_years.extend(years)

    # Convert 'present' to None (indicating ongoing career)
    all_years = [int(year) if year != "present" else None for year in all_years]

    # Filter out None values for min and max calculations
    numeric_years = [year for year in all_years if year is not None]

    # Return the earliest year and latest year (leave latest as None if ongoing)
    if numeric_years:
        earliest_year = min(numeric_years)
        latest_year = max(numeric_years) if None not in all_years else None
        return earliest_year, latest_year
    return None, None

# Function to extract and format death date
def extract_date(date_text):
    """
    Extracts a date from a string and formats it as yyyy-mm-dd.
    Handles full dates, partial dates (year only), and month-year formats.
    """
    # Normalize the text by removing extra spaces and parentheses
    date_text = re.sub(r"\(.*?\)", "", date_text).strip()

    # Regular expression to match full dates (e.g., "February 14, 1949"), partial dates (e.g., "1949"), or month-year (e.g., "February 1949")
    match = re.search(r"(?:(\w+)\s+)?(?:(\d{1,2}),?\s+)?(\d{4})", date_text)

    if match:
        year = match.group(3)  # Extract year
        month = match.group(1) if match.group(1) else "01"  # Extract month name or default to "01"
        day = match.group(2) if match.group(2) else "01"  # Extract day or default to "01"

        # Convert month name to number if applicable
        if month.isalpha():  # Check if month is a name (e.g., "February")
            try:
                month = str(list(calendar.month_name).index(month.capitalize()))  # Convert month name to number
            except ValueError:
                month = "01"  # Default to January if month name is invalid

        # Format as yyyy-mm-dd
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    # If no valid date is found, return None
    return None

# Function to validate a URL
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

# Loop over each URL in the list
for url in PeopleLinks:
    if not is_valid_url(url):  # Validate URL before making a request
        MalformedLinks.append(url)
        print(f"Malformed URL: {url}")
        continue

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract firstName and lastName
        first_name, last_name, dob, birth_country, death_date = "", "", "", "", ""
        # Extract birth date and country of birth from the "Born" row
        born_row = soup.find('th', string=re.compile(r'(?i)\bBorn\b[:]?'))  # Match "Born", "born", "Born:", etc.

        if born_row:
            born_data = born_row.find_next_sibling('td')

            if born_data:
                # Extract and clean DOB
                dob_span = born_data.find('span', class_='bday')
                raw_dob_text = dob_span.get_text(strip=True) if dob_span else born_data.get_text(strip=True)
                dob = extract_date(raw_dob_text)  # Use unified extract_date function


        # Extract name from various possible locations
        nameDiv = soup.find('div', class_='fn')  # First check for <div> with class 'fn'

        if not nameDiv:
            # Fallback: Check for <th> with class 'infobox-above' and look for <div>
            name_th = soup.find('th', class_='infobox-above')
            if name_th:
                nameDiv = name_th.find('div')

        if not nameDiv:
            # Fallback: Check for <caption> with class 'infobox-title fn'
            name_caption = soup.find('caption', class_='infobox-title fn')
            if name_caption:
                nameDiv = name_caption

        # Extract the name if found
        first_name, last_name = "", ""
        if nameDiv:
            full_name = nameDiv.get_text(strip=True)
            name_parts = full_name.split(" ")
            first_name = name_parts[0]  # First part of the name
            last_name = name_parts[-1] if len(name_parts) > 1 else ""  # Last part if it exists

        # Extract deathDate if available
        death_row = soup.find('th', string='Died')
        if death_row:
            death_data = death_row.find_next_sibling('td')
            death_date_text = death_data.get_text(strip=True) if death_data else ""
            death_date = extract_date(death_date_text)





        if first_name is None:
            missingData.append(url)
        # Store extracted data for basic info (first CSV)
        person_entry_basic = {
            'firstName': first_name,
            'lastName': last_name,
            'DOB': dob,
        }
        person_data_basic.append(person_entry_basic)
        print(person_entry_basic)

    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        MalformedLinks.append(url)

# # Save data to the first CSV (basic info + career years)
# with open('people_basic_details.csv', mode='w', encoding='utf-8', newline='') as file1:
#     writer1 = csv.writer(file1)
#     writer1.writerow(['firstName', 'lastName', 'DOB', 'birthCountry', 'deathDate',
#                       'careerStartYYYY', 'careerEndYYYY'])
#     for entry in person_data_basic:
#         writer1.writerow([
#             entry['firstName'],
#             entry['lastName'],
#             entry['DOB'],
#             entry['birthCountry'],
#             entry['deathDate'],
#             entry['careerStartYYYY'],
#             entry['careerEndYYYY']
#         ])
#
# # Save data to the second CSV (roles info)
# with open('people_roles_details.csv', mode='w', encoding='utf-8', newline='') as file2:
#     writer2 = csv.writer(file2)
#     writer2.writerow(['firstName', 'lastName', 'roleType'])
#     for entry in person_data_roles:
#         writer2.writerow([
#             entry['firstName'],
#             entry['lastName'],
#             entry['roleType']
#         ])
#
# # Save malformed links to a separate file
# with open('malformed_links.csv', mode='w', encoding='utf-8', newline='') as file3:
#     writer3 = csv.writer(file3)
#     writer3.writerow(['MalformedLink'])
#     for link in MalformedLinks:
#         writer3.writerow([link])

# Print all unique roles collected in the Roles set

print("\nScraped data saved to people_basic_details.csv, people_roles_details.csv, and malformed_links.csv.")


{'firstName': '', 'lastName': '', 'DOB': ''}
{'firstName': 'Frank', 'lastName': 'Hughes', 'DOB': '1893-06-14'}
{'firstName': 'Harold', 'lastName': 'Etherington', 'DOB': ''}
{'firstName': 'Jack', 'lastName': 'Brooks', 'DOB': '1912-01-01'}
{'firstName': '', 'lastName': '', 'DOB': ''}
{'firstName': 'Rob', 'lastName': 'Marshall', 'DOB': '1960-01-01'}
{'firstName': 'Howard', 'lastName': 'Koch', 'DOB': '1901-12-12'}
{'firstName': '', 'lastName': '', 'DOB': ''}
{'firstName': 'June', 'lastName': 'Wayne', 'DOB': '1918-01-01'}
{'firstName': 'Charles', 'lastName': 'Fuller', 'DOB': '1939-01-01'}
{'firstName': 'Carlinhos', 'lastName': 'Brown', 'DOB': '1962-01-01'}
{'firstName': 'Paul', 'lastName': 'Brincat', 'DOB': ''}
{'firstName': '', 'lastName': '', 'DOB': ''}
{'firstName': '', 'lastName': '', 'DOB': ''}
{'firstName': 'Florian', 'lastName': 'Donnersmarck', 'DOB': '1973-01-01'}
{'firstName': 'James', 'lastName': 'Spione', 'DOB': None}
{'firstName': 'Richard', 'lastName': 'Nord', 'DOB': ''}
{'firs

In [85]:
import csv

def read_roles_from_csv(file_path):
    """
    Reads a CSV file and extracts unique role types into a set.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        set: A set of unique role types.
    """
    roles = set()  # Initialize an empty set to store unique roles
    try:
        # Open the CSV file
        with open(file_path, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            # Iterate over each row in the CSV
            for row in reader:
                role = row['roleType']  # Extract the 'roleType' column
                if role:  # Ensure it's not empty or None
                    roles.add(role.strip())  # Add the cleaned role to the set
    except Exception as e:
        print(f"Error reading CSV file: {e}")

    return roles

# Example usage (replace 'roles.csv' with the actual file path)
roles = read_roles_from_csv('C:\\Users\Asus\PycharmProjects\DatabaseProject\SQL\people_role.csv')
print(roles)
print(len(roles))


{'Theater Director', 'Reviewer', 'Storyboard Artist', 'Stage Designer', 'Scriptwriter', 'Music Director', 'Supervising Technical Director', 'Short Story Writer', 'Comedian', 'Matte Artist', 'Record Producer', 'Music Executive', 'Song Lyricist', 'Stage Actor', 'Composer', 'Physician', 'Advertising Executive', 'Screen Production Designer', 'Radio', 'Puppeteer', 'Peace Activist', 'Layout Artist', 'Stand-Up Comedian', 'Humorist', 'Critic', 'Electrical Engineer', 'Orchestral Conductor', 'Theatre Producer', 'Comic Book Writer', 'Sound Design', 'Documentary Filmmaker', 'Visual Effects Director', 'Special Make-Up Effects', 'Psychologist', 'Musician', 'Agent', 'Set Designer', 'Sound Mixer', 'Associate Producer', 'Television Host', 'Visual Effects Animator', 'Music Journalist', 'Costumier', 'Mixer', 'Race Car Driver', 'Curator', 'Guitarist', 'Sound Re-Recording Mixer', 'Community Activist', 'Stage Performer', 'Fabrication Supervisor', 'Animators', 'Stage Director', 'Comic', 'Producer', 'Stop Mot

  roles = read_roles_from_csv('C:\\Users\Asus\PycharmProjects\DatabaseProject\SQL\people_role.csv')


In [86]:
with open('roles.csv', mode='w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['roleType'])  # Write header
            for role in roles:
                writer.writerow([role])
            print(f"Roles successfully written to roles.csv")

Roles successfully written to roles.csv
