In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup
import time
import json
import csv
import pandas as pd
import os

# NOTE

It will take **approximately 10 hours to scrape all the data**, because pages on The-Numbers.com load slowly and every request is preceded by a deliberate 2-second pause.

# The-Numbers.com

## 6500 Movies

In [None]:
def get_data(page_url):
    """Scrape the movie rows from one listing page.

    Sleeps two seconds to throttle requests, downloads ``page_url`` and walks
    every ``<tr>`` on the page except the first one, which is treated as the
    header row.

    Args:
        page_url: URL of the page to download.

    Returns:
        A list of dicts with the keys ``title``, ``release_year``,
        ``production_budget`` and ``domestic_gross``. Every value is the raw
        text of the corresponding table cell.
    """
    time.sleep(2)
    # Without a timeout a single stalled connection would hang the whole run.
    web_request = requests.get(page_url, headers = {'User-agent':'ABC'}, timeout=300).text
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(web_request, 'html.parser')
    data_list = []
    for movie in soup.find_all('tr')[1:]:  # [1:] skips the header row
        cells = movie.find_all('td')
        title_tag = movie.find('b')
        # find_all('tr') sees every row on the page; skip rows that do not
        # look like movie rows instead of failing on them.
        if title_tag is None or len(cells) < 5:
            continue
        data_list.append(
            {
                'title':title_tag.text,
                'release_year':cells[1].text,
                'production_budget':cells[3].text,
                'domestic_gross':cells[4].text
            })
    return data_list

In [None]:
# Generate the list of URLs programmatically
base_url = 'https://www.the-numbers.com/movie/budgets/all'
page_urls = [base_url] + [f"{base_url}/{i}" for i in range(101, 6500, 100)]

In [None]:
data_movie_list = []

for page_url in page_urls:
    print(f"Processing: {page_url}")
    data_movie_list.extend(get_data(page_url))

In [None]:
with open('../data/the_number_movie_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['title','release_year','production_budget','domestic_gross'])
    writer.writeheader()
    writer.writerows(data_movie_list)

## 40000 Movies

In [None]:
def get_data(page_url):
    """Scrape the rows of the first ``<tbody>`` on one listing page.

    Sleeps two seconds to throttle requests before downloading ``page_url``.

    Args:
        page_url: URL of the page to download.

    Returns:
        A list of dicts with the keys ``rank`` (text of the first cell),
        ``year`` (text of the first link in the row), ``title`` (text of the
        first ``<b>`` in the row) and ``domestic_gross`` (text of the fifth
        cell).

    NOTE(review): this notebook feeds the function a "worldwide" listing URL;
    confirm that the fifth cell really is the domestic figure.
    """
    time.sleep(2)
    web_request = requests.get(page_url, headers = {'User-agent':'ABC'}, timeout=10).text
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(web_request, 'html.parser')
    movie_table = soup.find('tbody').find_all('tr')
    data_list = []
    for movie in movie_table:
        cells = movie.find_all('td')  # looked up once per row
        data_list.append({
            'rank':cells[0].text,
            'year':movie.find('a').text,
            'title':movie.find('b').text,
            'domestic_gross':cells[4].text
        })
    return data_list

In [None]:
# Generate the list of URLs programmatically
base_url = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/all-time"
page_urls = [base_url] + [f"{base_url}/{i}" for i in range(101, 40000, 100)]

In [None]:
data_movie_list = []

for page_url in page_urls:
    print(f"Processing: {page_url}")
    data_movie_list.extend(get_data(page_url))

In [None]:
with open('../data/the_number_40000_movie_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['rank','year','title','domestic_gross'])
    writer.writeheader()
    writer.writerows(data_movie_list)

## Franchise Names

In [None]:
def get_data(page_url):
    """Scrape the ``franchise_overview`` table from one page.

    Sleeps two seconds to throttle requests, downloads ``page_url`` and reads
    every row of the table with id ``franchise_overview`` except the first,
    which is treated as the header row.

    Args:
        page_url: URL of the page to download.

    Returns:
        A list of dicts with the keys ``franchise``, ``num_of_movies``,
        ``domestic_gross``, ``adjusted_inflation_domestic_gross``,
        ``worldwide_gross``, ``first_year`` and ``last_year``, taken from the
        first seven cells of each row as raw text.
    """
    time.sleep(2)
    web_request = requests.get(page_url, headers = {'User-agent':'ABC'}, timeout=10).text
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(web_request, 'html.parser')
    table = soup.find('table',attrs = {'id':'franchise_overview'}).find_all('tr')
    data_list = []
    for movie in table[1:]:  # [1:] skips the header row
        cells = movie.find_all('td')  # looked up once per row
        # The eighth cell used to be read into an unused variable; it is no
        # longer touched because nothing is stored from it.
        data_list.append({
            'franchise':cells[0].text,
            'num_of_movies':cells[1].text,
            'domestic_gross':cells[2].text,
            'adjusted_inflation_domestic_gross':cells[3].text,
            'worldwide_gross':cells[4].text,
            'first_year':cells[5].text,
            'last_year':cells[6].text
        })
    return data_list

In [None]:
# Define base URL
base_url = "https://www.the-numbers.com/movies/franchises#franchise_overview=od3"
# Everything after "#" is a URL fragment. Fragments are handled by the
# browser and are not part of the HTTP request, so the paged variants
# ("#franchise_overview=p1:od3" ... "p33:od3") all download exactly the same
# document as base_url. One request returns everything those 34 requests did,
# without the 33 redundant downloads and the duplicate rows they produced.
page_urls = [base_url]

In [None]:
data_movie_list = []

for page_url in page_urls:
    print(f"Processing: {page_url}")
    data_movie_list.extend(get_data(page_url))

In [None]:
with open('../data/the_number_franchise_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['franchise','num_of_movies','domestic_gross',
                                               'adjusted_inflation_domestic_gross','worldwide_gross','first_year','last_year'])
    writer.writeheader()
    writer.writerows(data_movie_list)

## Franchise Movies

In [None]:
the_numbers_franchise_df = pd.read_csv('../data/the_number_franchise_raw_data.csv')

In [None]:
the_numbers_franchise_df.drop_duplicates(keep = 'first',inplace=True)

In [None]:
the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'].str.contains("'s")]

In [None]:
# Remove all occurrences of "'s" from the 'franchise' column
the_numbers_franchise_df["franchise"] = the_numbers_franchise_df["franchise"].str.replace("'s", "s", regex=True)

In [None]:
# Remove all occurrences of ":" from the 'franchise' column
the_numbers_franchise_df["franchise"] = the_numbers_franchise_df["franchise"].str.replace(":", "", regex=True)

In [None]:
# Hand-curated renames, applied in one pass instead of one cell per name.
# Series.replace matches each key against the WHOLE cell value (not a
# substring). No replacement value is itself a key, so applying them together
# gives the same result as applying them one after another.
franchise_renames = {
    "Sony’s Marvel Universe": "Sonys Marvel Universe",
    "Father Chuck O'Malley": "Father Chuck O Malley",
    "Monsters, Inc.": "Monsters Inc",
    "Food, Inc.": "Food Inc",
    "Wreck-It Ralph": "Wreck-It-Ralph",
    "G.I. Joe": "G I Joe",
    "Oh, God!": "Oh God",
    "Daddy’s Home": "Daddys Home",
    "Breakin'": "Breakin",
    "Don’t Breathe": "Dont Breathe",
    "God’s Not Dead": "Gods Not Dead",
    "Nim’s Island": "Nims Island",
    "House of 1,000 Corpses": "House of 1000 Corpses",
    "R.I.P.D": "R I P D",
    "Surfs Up": "Surf s Up",
    "Happily N'Ever After": "Happily Never After",
    "C.H.U.D.": "C H U D",
    "F/X": "FX",
    "A Dog’s Purpose": "Dogs Purpose A",
    "A Fish Called Wanda": "Fish Called Wanda A",
    "The Emperor’s New Groove": "Emperors New Groove The",
    "Crouching Tiger, Hidden Dragon": "Crouching Tiger Hidden Dragon",
    "No Retreat, No Surrender": "No Retreat No Surrender",
}
the_numbers_franchise_df['franchise'] = the_numbers_franchise_df['franchise'].replace(franchise_renames)

In [None]:
def reverse_title_words(title, article="The"):
    """Move a leading article to the end of a title.

    ``"The Sandlot"`` becomes ``"Sandlot The"``. A title whose first word is
    not ``article`` is returned untouched; a matching title is re-joined with
    single spaces between its words.

    Args:
        title: Title text.
        article: Leading word to relocate, compared case-sensitively.
            Defaults to ``"The"``.

    Returns:
        The rearranged title, or ``title`` itself when there is nothing to
        move (including empty and whitespace-only titles).
    """
    words = title.split()
    # An empty or whitespace-only title has no first word to inspect;
    # indexing words[0] there would raise IndexError.
    if not words or words[0] != article:
        return title  # Keep other entries unchanged
    return " ".join(words[1:] + [words[0]])  # Move the article to the end

In [None]:
# Apply the function to the 'franchise' column
the_numbers_franchise_df['franchise'] = the_numbers_franchise_df['franchise'].apply(reverse_title_words)

In [None]:
the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'].str.contains("Marine")]

In [None]:
# Second batch of hand-curated renames, applied in one pass instead of one
# cell per name. Series.replace matches each key against the WHOLE cell value.
# No replacement value is itself a key, so applying them together gives the
# same result as applying them one after another.
# NOTE(review): "Bodguard" is kept exactly as written in the original cell;
# confirm it matches the spelling of the scraped name.
franchise_renames_after_reversal = {
    "Sandlot The": "The Sandlot",
    "Marine The": "The Marine",
    "Art of War The": "The Art of War",
    "An Inconvenient Truth": "Inconvenient Truth An",
    "Kenneth Branaghs Hercule Poirot": "Kenneth Branagh s Hercule Poirot",
    "Hitman’s Bodguard The": "Hitmans Bodguard The",
    "Fox and the Hound The": "The Fox and the Hound",
}
the_numbers_franchise_df['franchise'] = the_numbers_franchise_df['franchise'].replace(
    franchise_renames_after_reversal
)

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "The Poseidon Adventure"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Road To …"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "V/H/S"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Bill & Ted"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "The Emperor’s New Groove"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Harold & Kumar"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "The Hitman’s Bodguard"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Dungeons & Dragons"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Stepfather, The"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Haikyuu!!"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Cheech & Chong"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "A Quiet Place"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Rugrats Movie"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Don"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Sinister"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Donald Hamiltons Matt Helm"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Garry Marshalls Holiday Franchise"]

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'] != "Every Which Way But Loose"]

In [None]:
# Exclude franchises that have only 1 movie
the_numbers_franchise_df = the_numbers_franchise_df.loc[the_numbers_franchise_df['num_of_movies'] != 1]

In [None]:
def reverse_title_words_2(title):
    """Move a leading "A" to the end of a title.

    ``"A Quiet Place"`` becomes ``"Quiet Place A"``. Only the exact word
    ``"A"`` is moved (``"An ..."`` is left alone). A title that does not start
    with it is returned untouched; a matching title is re-joined with single
    spaces between its words.

    Args:
        title: Title text.

    Returns:
        The rearranged title, or ``title`` itself when there is nothing to
        move (including empty and whitespace-only titles).
    """
    words = title.split()
    # An empty or whitespace-only title has no first word to inspect;
    # indexing words[0] there would raise IndexError.
    if not words or words[0] != "A":
        return title  # Keep other entries unchanged
    return " ".join(words[1:] + [words[0]])  # Move "A" to the end

In [None]:
# Apply the function to the 'franchise' column
the_numbers_franchise_df['franchise'] = the_numbers_franchise_df['franchise'].apply(reverse_title_words_2)

In [None]:
the_numbers_franchise_df.loc[the_numbers_franchise_df['franchise'].str.contains("/")]

In [None]:
# Strip "$" and "," from the money columns and convert them to float.
# The "$" pattern is written as a raw string: in a plain literal '\$' is an
# invalid escape sequence, which newer Python versions warn about.
money_columns = ['domestic_gross', 'adjusted_inflation_domestic_gross', 'worldwide_gross']
for column in money_columns:
    the_numbers_franchise_df[column] = (
        the_numbers_franchise_df[column]
        .replace({r'\$':'',',':''}, regex=True)
        .astype(float)
    )

In [None]:
# Exclude franchises that have adjusted_inflation_domestic_gross equal to 0 or NaN
the_numbers_franchise_df = the_numbers_franchise_df.loc[
    (the_numbers_franchise_df['adjusted_inflation_domestic_gross'] != 0) &
    (~the_numbers_franchise_df['adjusted_inflation_domestic_gross'].isna())
]

In [None]:
the_numbers_franchise_df['average_domestic_gross'] = the_numbers_franchise_df['domestic_gross'] / the_numbers_franchise_df['num_of_movies']

In [None]:
# Exclude franchises that have average domestic_gross lower than 1,000,000
the_numbers_franchise_df = the_numbers_franchise_df.loc[
    the_numbers_franchise_df['average_domestic_gross'] > 1000000
]

In [None]:
# Build the hyphenated form of a franchise name
def transform_franchise(name):
    """Return ``name`` with its whitespace-separated words joined by "-".

    A name that already contains a "-" anywhere is returned exactly as given.

    NOTE(review): that includes names with both a hyphen and spaces, whose
    spaces are therefore kept -- confirm this is intended.
    """
    if "-" not in name:
        pieces = name.split()
        return "-".join(pieces)
    return name

In [None]:
# Apply transformation
the_numbers_franchise_df["franchise_transform"] = the_numbers_franchise_df["franchise"].astype(str).apply(transform_franchise)

In [None]:
the_numbers_franchise_df = the_numbers_franchise_df[['franchise','franchise_transform']]

In [None]:
# Generate franchise URLs
the_numbers_franchise_df["franchise_url"] = the_numbers_franchise_df["franchise_transform"].apply(
    lambda x: f"https://www.the-numbers.com/movies/franchise/{x}#tab=summary"
)

In [None]:
# File path
file_path = '../data/the_number_franchise_details.csv'

In [None]:
# Function to read existing data from CSV
def load_existing_data(file_path):
    """Collect the ``title`` values already stored in a CSV file.

    Args:
        file_path: Path of a CSV file that has a ``title`` column.

    Returns:
        A set with every value of the ``title`` column, or an empty set when
        the file does not exist. Titles are used as the identity of a row, so
        repeated titles collapse into one entry.
    """
    if not os.path.exists(file_path):
        return set()
    # newline='' hands line-ending handling to the csv module; without it a
    # quoted field that contains a line break is read back altered.
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        return {row['title'] for row in csv.DictReader(file)}

In [None]:
# Function to scrape data
def get_data(page_url):
    """Scrape the ``franchise_movies_overview`` table from one page.

    Sleeps two seconds to throttle requests before downloading ``page_url``.

    Args:
        page_url: URL of the page to download.

    Returns:
        A list of dicts with the keys ``release_date``, ``title``,
        ``production_budget`` and ``domestic_gross``, taken from the first,
        second, third and fifth cell of every body row, with surrounding
        whitespace stripped.

    Raises:
        ValueError: If the page has no table with id
            ``franchise_movies_overview`` or that table has no ``<tbody>``.
    """
    time.sleep(2)
    web_request = requests.get(page_url, headers={'User-agent': 'DXYZ_2506'}, timeout=300).text
    soup = BeautifulSoup(web_request, 'html.parser')

    table = soup.find('table', attrs={'id': 'franchise_movies_overview'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        # Say which URL failed instead of surfacing an anonymous
        # "'NoneType' object has no attribute 'find'".
        raise ValueError(f"No 'franchise_movies_overview' table found at {page_url}")

    data_list = []

    for movie in table_body.find_all('tr'):
        cells = movie.find_all('td')  # looked up once per row

        data_list.append({
            'release_date': cells[0].text.strip(),
            'title': cells[1].text.strip(),
            'production_budget': cells[2].text.strip(),
            'domestic_gross': cells[4].text.strip()
        })

    return data_list

In [None]:
# Load existing data
existing_titles = load_existing_data(file_path)

In [None]:
# Create a list of unique franchise URLs
page_urls = the_numbers_franchise_df["franchise_url"].unique().tolist()

In [None]:
# Open CSV in append mode
with open(file_path, 'a', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['release_date', 'title', 'production_budget', 'domestic_gross'])

    # If file is new, write header
    if os.stat(file_path).st_size == 0:
        writer.writeheader()

    # Iterate over page URLs
    for page_url in page_urls:
        print(f"Processing: {page_url}")
        new_data = get_data(page_url)

        for movie in new_data:
            if movie['title'] not in existing_titles:
                writer.writerow(movie)
                existing_titles.add(movie['title'])  # Update existing data set
            else:
                print(f"Skipping {movie['title']} (Already scraped)")

## Cast Data

### Leading Cast

In [None]:
def get_data(page_url):
    """Scrape the rows of the first ``<tbody>`` on one listing page.

    Sleeps two seconds to throttle requests before downloading ``page_url``.

    Args:
        page_url: URL of the page to download.

    Returns:
        A list of dicts with the keys ``name`` (text of the first ``<b>`` in
        the row), ``domestic_gross``, ``num_movies`` and ``avg_gross`` (raw
        text of the third, fourth and fifth cell).
    """
    time.sleep(2)
    # Without a timeout a single stalled connection would hang the whole run.
    web_request = requests.get(page_url, headers = {'User-agent':'ABC'}, timeout=300).text
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(web_request, 'html.parser')
    table = soup.find('tbody').find_all('tr')
    data_list = []
    for actor in table:
        cells = actor.find_all('td')  # looked up once per row
        data_list.append({
            'name':actor.find('b').text,
            'domestic_gross':cells[2].text,
            'num_movies':cells[3].text,
            'avg_gross':cells[4].text
        })
    return data_list

In [None]:
# Generate the list of URLs programmatically
base_url = 'https://www.the-numbers.com/box-office-star-records/domestic/lifetime-acting/top-grossing-leading-stars'
page_urls = [base_url] + [f'{base_url}/{i}' for i in range(101,9900,100)]

In [None]:
data_leading_list = []

for page_url in page_urls:
    print(f"Processing: {page_url}")
    data_leading_list.extend(get_data(page_url))

In [None]:
with open('../data/the_number_leading_cast_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['name','domestic_gross','num_movies','avg_gross'])
    writer.writeheader()
    writer.writerows(data_leading_list)

### Supporting Cast

In [None]:
def get_data(page_url):
    """Scrape the rows of the first ``<tbody>`` on one listing page.

    Sleeps two seconds to throttle requests before downloading ``page_url``.

    Args:
        page_url: URL of the page to download.

    Returns:
        A list of dicts with the keys ``name`` (text of the first ``<b>`` in
        the row), ``domestic_gross``, ``num_movies`` and ``avg_gross`` (raw
        text of the third, fourth and fifth cell).
    """
    time.sleep(2)
    # Without a timeout a single stalled connection would hang the whole run.
    web_request = requests.get(page_url, headers = {'User-agent':'ABC'}, timeout=300).text
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(web_request, 'html.parser')
    table = soup.find('tbody').find_all('tr')
    data_list = []
    for actor in table:
        cells = actor.find_all('td')  # looked up once per row
        data_list.append({
            'name':actor.find('b').text,
            'domestic_gross':cells[2].text,
            'num_movies':cells[3].text,
            'avg_gross':cells[4].text
        })
    return data_list

In [None]:
# Generate the list of URLs programmatically
base_url = 'https://www.the-numbers.com/box-office-star-records/domestic/lifetime-acting/top-grossing-supporting-stars'
page_urls = [base_url] + [f'{base_url}/{i}' for i in range(101,70100,100)]

In [None]:
data_supporting_list = []

for page_url in page_urls:
    print(f"Processing: {page_url}")
    data_supporting_list.extend(get_data(page_url))

In [None]:
with open('../data/the_number_supporting_cast_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['name','domestic_gross','num_movies','avg_gross'])
    writer.writeheader()
    writer.writerows(data_supporting_list)

# Consumer Price Index

In [None]:
page_url = 'https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-'
web_request = requests.get(page_url, headers={'User-agent':'ABC'}).text
soup = BeautifulSoup(web_request)

In [None]:
# Find the table body and rows
table = soup.find('tbody')
rows = table.find_all('tr')

In [None]:
# Extract headers
headers = [header.text.strip() for header in rows[0].find_all('th')]

# Extract data
data_list = []
for row in rows[1:]:  # Skip the header row
    cols = row.find_all('div')  # Look for <div> elements within the <td> tags
    if cols:
        year = cols[0].text.strip()
        cpi = cols[1].text.strip()
        inflation = cols[2].text.strip() if len(cols) > 2 else ""
        data_list.append({
            "year": year,
            "cpi": cpi,
            "rate_of_inflation": inflation
        })

In [None]:
with open('../data/consumer_price_index.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['year','cpi','rate_of_inflation'])
    writer.writeheader()
    writer.writerows(data_list)

# DIRECTOR GENDER BENCHMARK

## BBC

In [None]:
page_url = 'https://www.bbc.com/culture/article/20191125-the-100-greatest-films-directed-by-women-poll'
web_request = requests.get(page_url, headers = {'User-agent':'ABC'}).text
soup = BeautifulSoup(web_request)

In [None]:
p_tags = soup.find_all('p', attrs={'class':'sc-eb7bd5f6-0 fYAfXe'})

In [None]:
# Extract the movie list
movies = []
for p in p_tags:
    text = p.get_text().strip()
    # Check for the format of the list items: starts with a number and ends with details in parentheses
    if text and text[0].isdigit() and '(' in text and ')' in text:
        movies.append(text)

In [None]:
# List to store extracted details
data_list = []

for text in movies:
    # Split and process the string
    title = text.split(".")[1].split("(")[0].strip()  # Extract the title
    rest = text.split("(")[1].strip()  # Extract the part after '('
    if "(" in text and "," in rest:
        director = rest.split(",")[0].strip()  # Extract the director's name
        release_year = rest.split(",")[1].split(")")[0].strip()  # Extract the release year
        data_list.append({
            'title': title,
            'director_name': director,
            'release_year': release_year
        })

In [None]:
with open('../data/index_bbc_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['title','director_name','release_year'])
    writer.writeheader()
    writer.writerows(data_list)

## Billboard

In [None]:
pip install selenium

In [None]:
# URL of the webpage
page_url = 'https://www.billboard.com/lists/barbie-highest-grossing-films-women-directors-top-30-all-time-billboard-list'

In [None]:
# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Make sure ChromeDriver is installed and in PATH
driver.get(page_url)

In [None]:
# Wait for the page to fully load (adjust time if necessary)
time.sleep(5)

In [None]:
# Parse the rendered page content with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
# Find the desired div by its class
gallery_slides = soup.find(attrs={'class': 'c-gallery-vertical__slides'})

In [None]:
data_list = []
# Iterate over each movie slide in the gallery
for movie in gallery_slides.find_all('div', class_='c-gallery-vertical__slide-wrapper'):
    # Extract title and release year
    title_text = movie.find('h2', attrs={'class': 'c-gallery-vertical-featured-image__title'}).text.strip()
    title, release_year = title_text.rsplit('(', 1)
    title = title.strip()
    release_year = release_year.replace(')', '').strip()
    
    # Extract director name specific to this movie
    director_tag = movie.find('p', class_='paragraph larva // lrv-u-margin-lr-auto lrv-a-font-body-m')
    if director_tag and 'Director:' in director_tag.text:
        director_name = director_tag.text.replace('Director:', '').strip()
    else:
        director_name = None  # Handle missing director information
        
    # Append to the list
    data_list.append({
        'title': title,
        'release_year': release_year,
        'director_name': director_name
    })

In [None]:
with open('../data/index_billboard_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['title','release_year','director_name'])
    writer.writeheader()
    writer.writerows(data_list)

## Indiewire

In [None]:
page_url = 'https://www.indiewire.com/feature/female-directors-best-movies-directed-by-women-1202045399/'
web_request = requests.get(page_url, headers = {'User-agent':'ABC'}).text
soup = BeautifulSoup(web_request)

In [None]:
movie_list = soup.find_all('h3')

In [None]:
movies = []
for h3 in movie_list:
    text = h3.get_text()
    if "“" in text and "”" in text:  # Check if the format matches
        title = text.split("“")[1].split("”")[0]  # Extract the title
        rest = text.split("”")[1].strip()
        if "(" in rest and "," in rest:
            director = rest.split("(")[1].split(",")[0].strip()
            release_year = rest.split(",")[-1].split(")")[0].strip()
            movies.append({
                'title': title,
                'director_name': director,
                'release_year': release_year
            })

In [None]:
# List to store extracted details
data_list = []

for text in movies:
    # Split and process the string
    title = text.split(".")[1].split("(")[0].strip()  # Extract the title
    rest = text.split("(")[1].strip()  # Extract the part after '('
    if "(" in text and "," in rest:
        director = rest.split(",")[0].strip()  # Extract the director's name
        release_year = rest.split(",")[1].split(")")[0].strip()  # Extract the release year
        data_list.append({
            'title': title,
            'director_name': director,
            'release_year': release_year
        })

In [None]:
with open('../data/index_indiewire_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file,fieldnames = ['title','director_name','release_year'])
    writer.writeheader()
    writer.writerows(data_list)

## Rotten Tomatoes

In [None]:
page_url = 'https://editorial.rottentomatoes.com/guide/best-movies-directed-by-women-of-the-21st-century/'
web_request = requests.get(page_url, headers = {'User-agent':'ABC'}).text
soup = BeautifulSoup(web_request)

In [None]:
# Find all sections with class 'row countdown-item'
sections = soup.find_all('div', class_='row countdown-item')

In [None]:
def get_data(page_url):
    """Scrape the countdown entries from one guide page.

    Unlike the other scrapers in this notebook this function does not sleep;
    the caller is responsible for pacing the requests.

    Args:
        page_url: URL of the page to download.

    Returns:
        A list with one dict per ``div`` of class ``row countdown-item``:
        ``title`` is the text of the link inside the entry's ``<h2>`` (or
        ``None``), ``director`` is the text of the first link inside the
        ``info director`` div (or ``None``), and ``cast`` is a list with the
        text of every link inside the ``info cast`` div (possibly empty).

    NOTE(review): ``cast`` is a list, so a CSV writer stores its ``repr``.
    An unused comma-joined string used to be built here as well -- confirm
    which form downstream consumers expect.
    """
    # Fetch the page content; the timeout keeps a stalled connection from
    # hanging the run.
    response = requests.get(page_url, headers={'User-Agent': 'ABC'}, timeout=30).text
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response, 'html.parser')

    # Find all sections with class 'row countdown-item'
    sections = soup.find_all('div', class_='row countdown-item')

    # List to store scraped data
    data_list = []

    # Iterate over each movie section
    for section in sections:
        # Extract movie title
        title_tag = section.find('h2')
        title_link = title_tag.find('a') if title_tag else None
        title = title_link.text.strip() if title_link else None

        # Extract director information
        director_section = section.find('div', class_='info director')
        director_tag = director_section.find('a') if director_section else None
        director_name = director_tag.text.strip() if director_tag else None

        # Extract cast information
        cast_section = section.find('div', class_='info cast')
        cast_tags = cast_section.find_all('a') if cast_section else []
        cast_names = [cast.text.strip() for cast in cast_tags]

        # Append the extracted data to the list
        data_list.append({
            'title': title,
            'director': director_name,
            'cast': cast_names
        })

    return data_list

In [None]:
all_data = []
urls = [
    'https://editorial.rottentomatoes.com/guide/best-movies-directed-by-women-of-the-21st-century/',
    'https://editorial.rottentomatoes.com/guide/best-movies-directed-by-women-of-the-21st-century/2/',
    'https://editorial.rottentomatoes.com/guide/best-movies-directed-by-women-of-the-21st-century/3/'
]

In [None]:
for url in urls:
    print(f"Scraping: {url}")
    all_data.extend(get_data(url))
    time.sleep(2)

In [None]:
with open('../data/index_rotten_tomatoes_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames = ['title','director','cast'])
    writer.writeheader()
    writer.writerows(all_data)

## Wiki

In [None]:
page_url = 'https://en.wikipedia.org/wiki/List_of_female_film_and_television_directors'
web_request = requests.get(page_url, headers = {'User-agent':'ABC'}).text
soup = BeautifulSoup(web_request)

In [None]:
table = soup.find_all(attrs={'class':'div-col'})

In [None]:
data_list = []

# Loop through each <li> tag inside the table
for section in table:
    for li in section.find_all('li'):  # Iterate through each <li>
        # Find the <a> tag within the <li> (this contains the director's name)
        director_tag = li.find('a')
        if director_tag:
            director_name = director_tag.text.strip()  # Extract clean director name
            data_list.append({
                'director_name': director_name
            })

In [None]:
with open('../data/wiki_raw_data.csv','w',newline='',encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames = ['director_name'])
    writer.writeheader()
    writer.writerows(data_list)