In [241]:
# Features to collect for each film: director, lead actors, writers, producers, genre, budget, film length, revenue

In [242]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import urllib.parse
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path
from io import StringIO

In [243]:
# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [244]:
# Rotten Tomatoes editorial guide page that the scraping cells below parse.
url = "https://editorial.rottentomatoes.com/guide/oscars-best-and-worst-best-pictures/"
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# makes an HTTP error fail here instead of silently producing an empty scrape.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [245]:
# Keep the raw HTML in `text` and parse that same string with the
# 'html.parser' backend.
text = r.text
soup = BeautifulSoup(text, 'html.parser')

In [246]:
# Every <div class="article_movie_title"> on the page; the title and link
# cells below read from these containers.
div_tags = soup.find_all('div', class_='article_movie_title')

In [247]:
# Visible text of every link found inside each title container, in page order.
titles = [
    anchor.get_text().strip()
    for title_div in div_tags
    for anchor in title_div.find_all('a')
]

In [248]:
year_tag = soup.find_all("span", {"class": "subtle start-year"})

# Drop the surrounding parentheses from each span's text and keep the integer.
years = [
    int(span.get_text().strip().replace("(", "").replace(")", ""))
    for span in year_tag
]

In [249]:
# Every <div class="info director"> block on the page.
director_tags = soup.find_all('div', class_='info director')

In [250]:
# Remove the "Directed By:" label so only the remaining text is kept.
directors = [
    tag.get_text().strip().replace('Directed By:', '').strip()
    for tag in director_tags
]

In [251]:
# actor_tags = soup.find_all('div', {'class': 'info cast'})

In [252]:
# names = []
# for actor in actor_tags:
#     a_tags = a.find_all('a')
#     for a in a_tags:
#         names.append(a.get_text().strip())
    
    # name = a.get_text().strip().replace('Starring:', '').strip()
    # names.append(name)
    # for n in names:
    #     print(n)

In [253]:
# actors = []
# for a in actor_tags:
#     a_tags = a.find_all('a')
#     for b in a_tags:
#         actors.append(b.get_text().strip())

In [254]:
# href of the first link inside each title container, in page order.
links = [title_div.find("a")["href"] for title_div in div_tags]

In [255]:
# One distributor per entry of `links`. The string "None" marks pages where
# the expected element is absent, so the list stays aligned with `links`.
distributors = []

for link in links:
    # Timeout so a single stalled page cannot hang the whole loop.
    r_sub = requests.get(link, timeout=30)
    # Name the parser explicitly (same one used for the main page); otherwise
    # BeautifulSoup picks whichever parser is installed and warns about it.
    soup_sub = BeautifulSoup(r_sub.text, 'html.parser')
    # NOTE(review): this takes the FIRST "item-value" element on the page and
    # treats it as the distributor -- confirm against the page layout.
    distributor_tag = soup_sub.find("rt-text", {"data-qa": "item-value"})
    if distributor_tag is None:
        # Explicit check replaces the former bare `except:`, which would also
        # have hidden unrelated errors.
        distributors.append("None")
    else:
        distributors.append(distributor_tag.get_text().strip())
    # try:
    #     producer = soup_sub.find("rt-link", {"href": "/celebrity/"}).get_text().strip()
    #     producers.append(producer)
        
    # except:
    #     producers.append("None")

In [256]:
# Read the OMDb API key from a local file so it is not hard-coded in the notebook.
# strip() removes surrounding whitespace (e.g. a trailing newline), which
# would otherwise be sent as part of the key.
with open('omdb_api.txt', 'r') as file:
    apikey = file.read().strip()

In [257]:
# Query-string parameters for a single trial lookup by title.
parameters = dict(apikey=apikey, t='parasite')

In [258]:
# Trial request to confirm that the key and endpoint work.
response = requests.get('http://www.omdbapi.com', params=parameters, timeout=30)
print(response.ok)
# The request URL embeds the API key; mask it so the secret does not end up
# in the saved notebook output.
print(re.sub(r'(apikey=)[^&]*', r'\1<redacted>', response.url))

True
http://www.omdbapi.com/?apikey=<redacted>&t=parasite


In [259]:
# Decode the JSON body of the trial response and display its 'Runtime' field.
film_info = response.json()
film_info['Runtime']

'132 min'

In [260]:
# OMDb metadata, one entry per element of `titles` (same order).
runtimes = []
genres = []
writers = []
revenue = []
actors = []
ratings = []

# OMDb response field -> list that collects it.
omdb_targets = {
    'Runtime': runtimes,
    'Genre': genres,
    'Writer': writers,
    'BoxOffice': revenue,
    'Actors': actors,
    'imdbRating': ratings,
}

for title in titles:
    parameters = {'apikey': apikey,
                  't': title}

    # Timeout so one stalled request cannot hang the whole loop.
    response = requests.get('http://www.omdbapi.com', params=parameters, timeout=30)
    film_info = response.json()

    # A response may lack some of these fields (for example an error payload
    # for a title that was not found). .get() with the same 'N/A' placeholder
    # used by the nominee lookup further down keeps every list the same
    # length as `titles` instead of raising KeyError mid-loop.
    for field, collected in omdb_targets.items():
        collected.append(film_info.get(field, 'N/A'))

In [261]:
# Column name -> scraped list. pandas requires all lists to have the same
# length, so this also acts as an alignment check on the scraping steps.
best_pic_columns = {
    'Movie Title': titles,
    'Year Released': years,
    "Writer(s)": writers,
    'Director': directors,
    'Leading Actors': actors,
    "Distributor": distributors,
    "Genre": genres,
    "IMDb Rating": ratings,
    "Runtime": runtimes,
    "Box Office": revenue,
}
best_pic_df = pd.DataFrame(best_pic_columns)
best_pic_df.head()

Unnamed: 0,Movie Title,Year Released,Writer(s),Director,Leading Actors,Distributor,Genre,IMDb Rating,Runtime,Box Office
0,Parasite,2019,"Bong Joon Ho, Han Jin-won",Bong Joon Ho,"Song Kang-ho, Lee Sun-kyun, Cho Yeo-jeong",NEON,"Drama, Thriller",8.5,132 min,"$53,369,749"
1,Casablanca,1942,"Julius J. Epstein, Howard Koch, Murray Burnett",Michael Curtiz,"Humphrey Bogart, Ingrid Bergman, Paul Henreid",Warner Bros. Pictures,"Drama, Romance, War",8.5,102 min,"$4,219,709"
2,On the Waterfront,1954,"Budd Schulberg, Malcolm Johnson, Robert Siodmak",Elia Kazan,"Marlon Brando, Karl Malden, Lee J. Cobb",Columbia Pictures,"Crime, Drama, Thriller",8.1,108 min,
3,All About Eve,1950,"Joseph L. Mankiewicz, Mary Orr",Joseph L. Mankiewicz,"Bette Davis, Anne Baxter, George Sanders",20th Century Fox,Drama,8.2,138 min,"$63,463"
4,Moonlight,2016,"Barry Jenkins, Tarell Alvin McCraney",Barry Jenkins,"Mahershala Ali, Naomie Harris, Trevante Rhodes",A24,Drama,7.4,111 min,"$27,854,932"


In [263]:
# Wikipedia article URL (the regular HTML page, not the MediaWiki API)
URL = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"

# Get the page HTML. The timeout prevents an indefinite hang, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables on the page that carry the "wikitable" class
tables = soup.find_all("table", {"class": "wikitable"})

In [290]:
# Rows collected as (year text, movie name, Wikipedia URL, winner flag).
movies = []

# Track the most recent year found; it carries over to the following rows
# (and tables) until another leading <th> cell replaces it.
current_year = None

# Inline-style fragment that marks a winning row. Row styles are compared
# after removing whitespace and lower-casing, so "background: #faeb86"
# matches as well as "background:#FAEB86".
WINNER_STYLE = 'background:#faeb86'

# Iterate through tables
for table in tables:
    # Convert table to DataFrame (used only to inspect the column names)
    try:
        df = pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        # read_html raises ValueError when it finds no parsable table.
        # Other errors (e.g. a missing parser dependency) now surface instead
        # of silently skipping every table.
        continue

    # Check if "Film" is in the column names
    if "Film" not in df.columns:
        continue  # Skip tables without the correct format

    # Process each row in the table
    for row in table.find_all("tr"):
        cols = row.find_all(["th", "td"])  # Extract headers and data cells
        if not cols:
            continue  # Row without cells: cols[0] below would raise IndexError

        # Check if this row starts with a header cell, which holds the year
        if cols[0].name == "th":
            current_year = cols[0].text.strip()  # Update the stored year
            cols = cols[1:]  # Remove the year cell from further processing

        if not current_year:
            continue  # Skip rows until a year is found

        # Check if this row has the yellow background (winner)
        row_style = ''.join(row.get('style', '').split()).lower()
        is_winner = WINNER_STYLE in row_style

        # Use the first cell that contains a link; its first link is taken
        # as the movie name.
        for col in cols:
            # href=True skips anchors without a target, which would otherwise
            # raise KeyError on movie_link["href"].
            movie_link = col.find("a", href=True)
            if not movie_link:
                continue

            movie_name = movie_link.text.strip()
            movie_url = "https://en.wikipedia.org" + movie_link["href"]  # Full Wikipedia link

            # Append to the results
            movies.append((current_year, movie_name, movie_url, is_winner))
            break  # Stop after the first linked cell (ignores producer links)

# Convert to DataFrame
movies_df = pd.DataFrame(movies, columns=["Year", "Movie", "Wikipedia Link", "Winner"])

In [302]:
# Inspect the Year column (this cell displays the years; it does not count winners)
movies_df['Year']

0      1927
1      1927
2      1927
3      1928
4      1928
5      1928
6      1928
7      1928
8      1929
9      1929
10     1929
11     1929
12     1929
13     1930
14     1930
15     1930
16     1930
17     1930
18     1931
19     1931
20     1931
21     1931
22     1931
23     1931
24     1931
25     1931
26     1932
27     1932
28     1932
29     1932
30     1932
31     1932
32     1932
33     1932
34     1932
35     1932
36     1934
37     1934
38     1934
39     1934
40     1934
41     1934
42     1934
43     1934
44     1934
45     1934
46     1934
47     1934
48     1935
49     1935
50     1935
51     1935
52     1935
53     1935
54     1935
55     1935
56     1935
57     1935
58     1935
59     1935
60     1936
61     1936
62     1936
63     1936
64     1936
65     1936
66     1936
67     1936
68     1936
69     1936
70     1937
71     1937
72     1937
73     1937
74     1937
75     1937
76     1937
77     1937
78     1937
79     1937
80     1938
81     1938
82     1938
83  

In [301]:
# Clean the years.
# NOTE(review): no film ends up labelled 1933 -- presumably because a split
# label covering two years is reduced to its first year; confirm against the page.

# Manual overrides for the earliest rows, addressed by row label.
# NOTE(review): these ranges are hard-coded and will drift if the scraped
# table gains or loses rows.
movies_df.loc[0:2, 'Year'] = 1927
movies_df.loc[3:7, 'Year'] = 1928
movies_df.loc[8:12, 'Year'] = 1929
movies_df.loc[13:17, 'Year'] = 1930
movies_df.loc[18:25, 'Year'] = 1931
movies_df.loc[26:35, 'Year'] = 1932


def _first_year(value):
    """Return the first standalone 4-digit number in `value` as an int, or None.

    Accepts raw scraped text, an already-cleaned int, or a missing value, so
    the cell can be re-run safely.
    """
    if value is None:
        return None
    match = re.search(r'\b(\d{4})\b', str(value))
    return int(match.group(1)) if match else None


# Normalise every row in one pass. The overridden rows are already plain
# years and pass through unchanged, and all remaining rows become ints as
# well, instead of the previous mix of ints and digit strings.
# dtype=object keeps ints as ints and missing years as None (not float NaN).
movies_df['Year'] = pd.Series(
    [_first_year(value) for value in movies_df['Year']],
    index=movies_df.index,
    dtype=object,
)

In [304]:
# Preview the first rows of the nominee table.
movies_df.head()

Unnamed: 0,Year,Movie,Wikipedia Link,Winner
0,1927,Wings,https://en.wikipedia.org/wiki/Wings_(1927_film),True
1,1927,7th Heaven,https://en.wikipedia.org/wiki/7th_Heaven_(1927...,False
2,1927,The Racket,https://en.wikipedia.org/wiki/The_Racket_(1928...,False
3,1928,The Broadway Melody,https://en.wikipedia.org/wiki/The_Broadway_Melody,True
4,1928,Alibi,https://en.wikipedia.org/wiki/Alibi_(1929_film),False


In [266]:
# budgets = []
# moneys = []
# for movie_url in movies_df["Wikipedia Link"]:
#     response = requests.get(movie_url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     try:
#         # Find all rows that contain the 'Budget' and 'Box office' data
#         budget_row = soup.find('th', string='Budget').find_next('td')
#         # Extract the text from the corresponding <td> elements
#         budget_value = budget_row.get_text(strip=True)
#         budgets.append(budget_value)
#     except:     
#         budgets.append("None")
#     try:
#         box_office_row = soup.find('th', string='Box office').find_next('td') 
#         box_office_value = box_office_row.get_text(strip=True)
#         moneys.append(box_office_value)
#     except:
#         moneys.append("None")

In [267]:
# moneys

In [268]:
def _title_or_unknown(value):
    """Return `value` as a string, or 'Unknown' when it is missing."""
    if pd.isna(value):
        return 'Unknown'
    return str(value)


# Guarantee that every movie name is a string.
movies_df['Movie'] = movies_df['Movie'].apply(_title_or_unknown)

In [312]:
# OMDb metadata for every row of movies_df, in row order.
runtimes2 = []
genres2 = []
writers2 = []
revenue2 = []
actors2 = []
ratings2 = []

# Row label -> reason, for every lookup that did not return a film. Without
# this, API errors are indistinguishable from genuinely missing fields
# because both end up as 'N/A'.
omdb_failures2 = {}

# OMDb response field -> list that collects it.
omdb_targets2 = {
    'Runtime': runtimes2,
    'Genre': genres2,
    'Writer': writers2,
    'BoxOffice': revenue2,
    'Actors': actors2,
    'imdbRating': ratings2,
}

for index, row in movies_df.iterrows():
    name = row['Movie']
    year = row['Year']

    parameters = {'apikey': apikey,
                  't': name,
                  'y': year}

    try:
        # Timeout so one stalled request cannot hang the whole loop.
        response = requests.get('http://www.omdbapi.com', params=parameters, timeout=30)
        movie_info = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or non-JSON body: record it and keep the lists
        # aligned with movies_df instead of aborting the whole run.
        movie_info = {'Response': 'False', 'Error': str(exc)}

    # OMDb reports failures in the payload itself via Response == "False".
    if movie_info.get('Response') == 'False':
        omdb_failures2[index] = movie_info.get('Error', 'unknown error')

    # 'N/A' placeholder keeps every list the same length as movies_df.
    for field, collected in omdb_targets2.items():
        collected.append(movie_info.get(field, 'N/A'))

if omdb_failures2:
    print(f"{len(omdb_failures2)} of {len(movies_df)} OMDb lookups failed; "
          "see omdb_failures2 for the reasons")

In [313]:
# Spot-check the collected runtimes; 'N/A' is the placeholder for a missing runtime.
runtimes2

['144 min',
 '110 min',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '95 min',
 '113 min',
 'N/A',
 'N/A',
 '90 min',
 'N/A',
 '107 min',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '108 min',
 '90 min',
 '86 min',
 '89 min',
 'N/A',
 'N/A',
 '93 min',
 'N/A',
 'N/A',
 '80 min',
 '92 min',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '98 min',
 'N/A',
 '105 min',
 '109 min',
 '100 min',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A

In [271]:
# Attach the OMDb lists to the nominee table, one new column per list.
omdb_columns = {
    'Runtime': runtimes2,
    'Genre': genres2,
    'Writer': writers2,
    'BoxOffice': revenue2,
    'Actors': actors2,
    'IMDbRating': ratings2,
}
for column_name, column_values in omdb_columns.items():
    movies_df[column_name] = column_values