In [314]:
#director, lead actors, writers, producers, genre, budget, film length, revenue

In [315]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import urllib.parse
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path
from io import StringIO

In [316]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [317]:
url = "https://editorial.rottentomatoes.com/guide/oscars-best-and-worst-best-pictures/"
r = requests.get(url)

In [318]:
text = r.text
soup = BeautifulSoup(r.text, 'html.parser')

In [319]:
div_tags = soup.find_all('div', {'class': 'article_movie_title'})

In [320]:
titles = []
for div in div_tags:
    a_tags = div.find_all('a')
    for a in a_tags:
        titles.append(a.get_text().strip())

In [321]:
year_tag = soup.find_all("span", {"class":"subtle start-year"})

years = []
for entry in year_tag:
    year = entry.get_text().strip()
    year = year.replace("(","").replace(")","")
    year = int(year)
    years.append(year)

In [322]:
director_tags = soup.find_all('div', {'class': 'info director'})

In [323]:
directors = []
for d in director_tags:
    directors.append(d.get_text().strip().replace('Directed By:', '').strip())

In [324]:
# actor_tags = soup.find_all('div', {'class': 'info cast'})

In [325]:
# names = []
# for actor in actor_tags:
#     a_tags = a.find_all('a')
#     for a in a_tags:
#         names.append(a.get_text().strip())
    
    # name = a.get_text().strip().replace('Starring:', '').strip()
    # names.append(name)
    # for n in names:
    #     print(n)

In [326]:
# actors = []
# for a in actor_tags:
#     a_tags = a.find_all('a')
#     for b in a_tags:
#         actors.append(b.get_text().strip())

In [327]:
links = []
for div in div_tags:
    link = div.find("a")["href"]
    links.append(link)

In [328]:
distributors = []

for link in links:
    r_sub = requests.get(link)
    soup_sub = BeautifulSoup(r_sub.text)
    try:
        distributor = soup_sub.find("rt-text", {"data-qa": "item-value"}).get_text().strip()
        distributors.append(distributor)
    except:     
        distributors.append("None")
    # try:
    #     producer = soup_sub.find("rt-link", {"href": "/celebrity/"}).get_text().strip()
    #     producers.append(producer)
        
    # except:
    #     producers.append("None")

In [329]:
with open('omdb_api.txt', 'r') as file:
    apikey = file.read()

In [330]:
runtimes = []
genres = []
writers = []
revenue = []
actors = []
ratings = []

for title in titles:
    parameters = {'apikey': apikey, 
             't': title}
    
    response = requests.get('http://www.omdbapi.com', params=parameters)
    film_info = response.json()

    length = film_info['Runtime']
    runtimes.append(length)

    genre = film_info['Genre']
    genres.append(genre)

    writer = film_info['Writer']
    writers.append(writer)

    box_office = film_info['BoxOffice']
    revenue.append(box_office)

    actor = film_info['Actors']
    actors.append(actor)

    rating = film_info['imdbRating']
    ratings.append(rating)

In [331]:
best_pic_df = pd.DataFrame({'Movie Title': titles,
                          'Year Released': years,
                          "Writer(s)": writers,
                          'Director': directors,
                          'Leading Actors': actors,
                          "Distributor": distributors,
                          "Genre": genres,
                          "IMDb Rating": ratings,
                          "Runtime": runtimes,
                          "Box Office": revenue})
best_pic_df.head()

Unnamed: 0,Movie Title,Year Released,Writer(s),Director,Leading Actors,Distributor,Genre,IMDb Rating,Runtime,Box Office
0,Parasite,2019,"Bong Joon Ho, Han Jin-won",Bong Joon Ho,"Song Kang-ho, Lee Sun-kyun, Cho Yeo-jeong",NEON,"Drama, Thriller",8.5,132 min,"$53,369,749"
1,Casablanca,1942,"Julius J. Epstein, Howard Koch, Murray Burnett",Michael Curtiz,"Humphrey Bogart, Ingrid Bergman, Paul Henreid",Warner Bros. Pictures,"Drama, Romance, War",8.5,102 min,"$4,219,709"
2,On the Waterfront,1954,"Budd Schulberg, Malcolm Johnson, Robert Siodmak",Elia Kazan,"Marlon Brando, Karl Malden, Lee J. Cobb",Columbia Pictures,"Crime, Drama, Thriller",8.1,108 min,
3,All About Eve,1950,"Joseph L. Mankiewicz, Mary Orr",Joseph L. Mankiewicz,"Bette Davis, Anne Baxter, George Sanders",20th Century Fox,Drama,8.2,138 min,"$63,463"
4,Moonlight,2016,"Barry Jenkins, Tarell Alvin McCraney",Barry Jenkins,"Mahershala Ali, Naomie Harris, Trevante Rhodes",A24,Drama,7.4,111 min,"$27,854,932"


In [332]:
# Wikipedia API URL to fetch the page content
URL = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"

# Get the page HTML
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables on the page
tables = soup.find_all("table", {"class": "wikitable"})

In [333]:
movies = []

# Track the most recent year found
current_year = None  

# Iterate through tables
for table in tables:
    # Convert table to DataFrame
    try:
        df = pd.read_html(StringIO(str(table)))[0]
    except:
        continue  # Skip tables that can't be read

    # Check if "Film" is in the column names
    if "Film" not in df.columns:
        continue  # Skip tables without the correct format

    # Process each row in the table
    for row in table.find_all("tr"):
        cols = row.find_all(["th", "td"])  # Extract headers and data cells

        # Check if this row contains the year
        if cols[0].name == "th":  
            current_year = cols[0].text.strip()  # Update the stored year
            cols = cols[1:]  # Remove the year cell from further processing

        if not current_year:
            continue  # Skip rows until a year is found

        # Check if this row has the yellow background (winner)
        is_winner = 'background:#FAEB86' in row.get('style', '')

        # Extract only the FIRST `<a>` tag, which is the movie name
        for col in cols:  
            movie_link = col.find("a")  # Get the first link (movie name)
            if not movie_link:
                continue
            
            movie_name = movie_link.text.strip()
            movie_url = "https://en.wikipedia.org" + movie_link["href"]  # Full Wikipedia link

            # Append to the results
            movies.append((current_year, movie_name, movie_url, is_winner))
            break  # Stop after getting the first `<a>` (ignores producer links)

# Convert to DataFrame
movies_df = pd.DataFrame(movies, columns=["Year", "Movie", "Wikipedia Link", "Winner"])

In [334]:
# clean the years - note: 1933 is missing??
movies_df.loc[0:2, 'Year'] = 1927
movies_df.loc[3:7, 'Year'] = 1928
movies_df.loc[8:12, 'Year'] = 1929
movies_df.loc[13:17, 'Year'] = 1930
movies_df.loc[18:25, 'Year'] = 1931
movies_df.loc[26:35, 'Year'] = 1932

for i in range(36, len(movies_df)):
    value = movies_df['Year'].iloc[i]  # Get the value of the 'YearString' column at the current index

    # Use regex to extract the year (first 4 digits)
    match = re.search(r'\b(\d{4})\b', value)
    
    # If a match is found, directly update the 'Year' column with the extracted year
    if match:
        movies_df.at[i, 'Year'] = match.group(1)
    else:
        movies_df.at[i, 'Year'] = None

In [335]:
movies_df.head()

Unnamed: 0,Year,Movie,Wikipedia Link,Winner
0,1927,Wings,https://en.wikipedia.org/wiki/Wings_(1927_film),True
1,1927,7th Heaven,https://en.wikipedia.org/wiki/7th_Heaven_(1927...,False
2,1927,The Racket,https://en.wikipedia.org/wiki/The_Racket_(1928...,False
3,1928,The Broadway Melody,https://en.wikipedia.org/wiki/The_Broadway_Melody,True
4,1928,Alibi,https://en.wikipedia.org/wiki/Alibi_(1929_film),False


In [336]:
# budgets = []
# moneys = []
# for movie_url in movies_df["Wikipedia Link"]:
#     response = requests.get(movie_url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     try:
#         # Find all rows that contain the 'Budget' and 'Box office' data
#         budget_row = soup.find('th', string='Budget').find_next('td')
#         # Extract the text from the corresponding <td> elements
#         budget_value = budget_row.get_text(strip=True)
#         budgets.append(budget_value)
#     except:     
#         budgets.append("None")
#     try:
#         box_office_row = soup.find('th', string='Box office').find_next('td') 
#         box_office_value = box_office_row.get_text(strip=True)
#         moneys.append(box_office_value)
#     except:
#         moneys.append("None")

In [None]:
distributors2 = []

for movie_url in movies_df['Wikipedia Link']:
    response = requests.get(movie_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    distributed_by_row = soup.find('th', string='Distributed by')

    if distributed_by_row:
        td_element = distributed_by_row.find_next('td')

        # Check if there is an anchor tag inside the <td>
        a_tag = td_element.find('a')
        if a_tag:
            #print(f"a Tag for {movie_url}")
            dist_company = a_tag.get_text(strip=True)  # Get text from <a> tag
        else:
            #print(f"td tag for {movie_url}")
            dist_company = td_element.get_text(strip=True)  # Get text directly from <td>
    else:
        #print(f"did not find for {movie_url}")
        dist_company = 'N/A'

    distributors2.append(dist_company)


did not find for https://en.wikipedia.org/wiki/Cries_and_Whispers


In [363]:
movies_df['Distributor'] = distributors2

In [None]:
producers2 = []

for movie_url in movies_df['Wikipedia Link']:
    response = requests.get(movie_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    produced_by_row = soup.find('th', string='Produced by')

    if produced_by_row:
        td_element = produced_by_row.find_next('td')
        # Check if there are any <a> tags in the <td>
        a_tags = td_element.find_all('a')
        if a_tags:
            # Extract names if there are <a> tags
            producer_names = [a_tag.get_text(strip=True) for a_tag in a_tags]
        else:
            # Otherwise, it's plain text in the <td>
            producer_names = [td_element.get_text(strip=True)]
    else:
        print("Produced by information not found.")

['Lucien Hubbard', 'Adolph Zukor', 'Jesse L. Lasky', 'B.P. Schulberg', 'Otto Hermann Kahn', '[1]', '[a]']
['William Fox']
['Howard Hughes']
['Irving Thalberg', 'Lawrence Weingarten']
['Roland West']
['Irving Thalberg', 'Harry Rapf']
['Winfield Sheehan']
Produced by information not found.
['Carl Laemmle']
['Irving Thalberg']
['Jack L. Warner', 'Darryl F. Zanuck']
['Robert Z. Leonard']
['Ernst Lubitsch']
['William LeBaron', '[1]']
Produced by information not found.
['Howard Hughes']
['Adolph Zukor', 'Jesse L. Lasky', 'B. P. Schulberg']
['Irving Thalberg']
['Irving Thalberg']
['Samuel Goldwyn']
Produced by information not found.
['Harry Rapf', 'Irving Thalberg']
['Hal B. Wallis']
['Ernst Lubitsch']
['Adolph Zukor']
['Ernst Lubitsch']
['Winfield R. Sheehan']
['Darryl F. Zanuck']
['Edward A. BlattBenjamin Glazer']
['Hal B. Wallis']
['Harry Cohn']
['Merian C. Cooper']
['Alexander KordaLudovico Toeplitz']
['William LeBaron']
['Albert Lewin']
['Henry King']
['Harry Cohn']
['Irving Thalberg']
[

KeyboardInterrupt: 

In [None]:
runtimes2 = []
genres2 = []
writers2 = []
revenue2 = []
actors2 = []
ratings2 = []

for index, row in movies_df.iterrows():
    name = row['Movie']
    year = row['Year']

    parameters = {'apikey': apikey, 
                  't': name,
                  'y': year}
    
    response = requests.get('http://www.omdbapi.com', params=parameters)
    movie_info = response.json()

    # length2 = movie_info['Runtime']
    # runtimes2.append(length2)

    # genre2 = movie_info['Genre']
    # genres2.append(genre2)

    # writer2 = movie_info['Writer']
    # writers2.append(writer2)

    # box_office2 = movie_info['BoxOffice']
    # revenue2.append(box_office2)

    # actor2 = movie_info['Actors']
    # actors2.append(actor2)

    # rating2 = movie_info['imdbRating']
    # ratings2.append(rating2)

    length2 = movie_info.get('Runtime', 'N/A')
    runtimes2.append(length2)

    genre2 = movie_info.get('Genre', 'N/A')
    genres2.append(genre2)

    writer2 = movie_info.get('Writer', 'N/A')
    writers2.append(writer2)

    box_office2 = movie_info.get('BoxOffice', 'N/A')
    revenue2.append(box_office2)

    actor2 = movie_info.get('Actors', 'N/A')
    actors2.append(actor2)

    rating2 = movie_info.get('imdbRating', 'N/A')
    ratings2.append(rating2)

KeyboardInterrupt: 

In [346]:
runtimes2 = []
genres2 = []
writers2 = []
revenue2 = []
actors2 = []
ratings2 = []

for movie in movies_df['Movie']:

    parameters = {'apikey': apikey, 
                  't': movie}
    
    response = requests.get('http://www.omdbapi.com', params=parameters)
    movie_info = response.json()

    # length2 = movie_info['Runtime']
    # runtimes2.append(length2)

    # genre2 = movie_info['Genre']
    # genres2.append(genre2)

    # writer2 = movie_info['Writer']
    # writers2.append(writer2)

    # box_office2 = movie_info['BoxOffice']
    # revenue2.append(box_office2)

    # actor2 = movie_info['Actors']
    # actors2.append(actor2)

    # rating2 = movie_info['imdbRating']
    # ratings2.append(rating2)

    length2 = movie_info.get('Runtime', 'N/A')
    runtimes2.append(length2)

    genre2 = movie_info.get('Genre', 'N/A')
    genres2.append(genre2)

    writer2 = movie_info.get('Writer', 'N/A')
    writers2.append(writer2)

    box_office2 = movie_info.get('BoxOffice', 'N/A')
    revenue2.append(box_office2)

    actor2 = movie_info.get('Actors', 'N/A')
    actors2.append(actor2)

    rating2 = movie_info.get('imdbRating', 'N/A')
    ratings2.append(rating2)

In [365]:
actors2

["Clara Bow, Charles 'Buddy' Rogers, Richard Arlen",
 'Stephen Collins, Catherine Hicks, Jessica Biel',
 'Robert Mitchum, Lizabeth Scott, Robert Ryan',
 'Bessie Love, Anita Page, Charles King',
 'Chester Morris, Harry Stubbs, Mae Busch',
 'Conrad Nagel, Jack Benny, John Gilbert',
 'Edmund Lowe, Warner Baxter, Dorothy Burgess',
 'Mel Gibson, Heath Ledger, Joely Richardson',
 'Felix Kammerer, Albrecht Schuch, Aaron Hilmer',
 'Chester Morris, Wallace Beery, Lewis Stone',
 'George Arliss, Doris Lloyd, David Torrence',
 'Norma Shearer, Robert Montgomery, Chester Morris',
 'Maurice Chevalier, Jeanette MacDonald, Lupino Lane',
 'Richard Dix, Irene Dunne, Estelle Taylor',
 'Ann Harding, Clive Brook, Conrad Nagel',
 'Jack Lemmon, Walter Matthau, Susan Sarandon',
 'Ed Devereaux, Tony Bonner, Ken James',
 'Harry Carey, Edwina Booth, Duncan Renaldo',
 'Greta Garbo, John Barrymore, Joan Crawford',
 'Ronald Colman, Helen Hayes, Richard Bennett',
 'James Dunn, Sally Eilers, Minna Gombell',
 'Jon Voig

In [271]:
# movies_df['Runtime'] = runtimes2
# movies_df['Genre'] = genres2
# movies_df['Writer'] = writers2
# movies_df['BoxOffice'] = revenue2
# movies_df['Actors'] = actors2
# movies_df['IMDbRating'] = ratings2