In [None]:
import requests

from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select

import time

import pandas as pd
import numpy as np

In [None]:
# Ratings as used by filmratings.com; the constant 'key' column exists only
# to drive the cross join below.
ratings = pd.DataFrame(['G', 'PG', 'PG-13', 'R', 'NC-17'], columns=['rating'])
ratings['key'] = 1

# Years of interest (1992 through 2022 inclusive), with the same join key.
years = pd.DataFrame(list(range(1992, 2023)), columns=['year'])
years['key'] = 1

# "Cross-join" the two frames on the constant key to get every year/rating
# combination, then discard the helper column. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
year_rating = pd.merge(years, ratings, on='key').drop(columns='key')

year_rating.info()

In [None]:
def get_mpaa_ratings(year, rating):
    """Scrape one year/rating results page from filmratings.com into ``mpaa_df``.

    Drives the module-level Selenium ``driver`` to the search page for the
    given year and rating, opens the "View in HTML" results window, parses
    its ``div`` tags and appends one row per entry to the module-level
    ``mpaa_df`` DataFrame.

    Parameters
    ----------
    year : str or int
        Value interpolated into the ``filmYear`` query parameter.
    rating : str
        Value interpolated into the ``filmRating`` query parameter.

    Returns
    -------
    None
        Results are accumulated in the global ``mpaa_df`` (index is reset on
        every call).
    """
    global mpaa_df

    time.sleep(1)

    # load the filmratings.com search for this year/rating combination
    driver.get(f"https://www.filmratings.com/Search?filmYear={year}&filmRating={rating}")

    # fixed pause before looking for the link on the freshly loaded page
    time.sleep(2)

    # find and click on the "View in HTML" button
    view_html = driver.find_element(by=By.CSS_SELECTOR, value="[href = 'javascript:handleFullResults();']")
    view_html.click()

    time.sleep(1)

    # close the original tab, then move to the window that opened with the
    # results in html (the last remaining handle)
    driver.close()
    driver.switch_to.window(driver.window_handles[-1])

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the parse could differ
    # between machines. The fixed div offsets below depend on a stable parse.
    soup = BS(driver.page_source, 'html.parser')

    # pull out the div tags that contain the information about the movies
    div = soup.find_all('div')

    # there is a pattern that repeats every 7 div tags, allowing us to identify
    # the # of entries per page; clamp at 0 so an empty page reports 0 entries
    entries = max(len(div) // 7 - 1, 0)

    # print a little note so the observer knows where the process is
    print('Pulling', year, rating, 'movies with', entries, 'total entries')

    # Entry i occupies div[8 + 7*i] .. div[13 + 7*i]; the position of each
    # name in this tuple is its offset within that group.
    columns = ('title', 'rating', 'reason', 'distributor', 'alt_titles', 'other')
    temp_df = pd.DataFrame({
        name: [div[8 + offset + 7 * i].text for i in range(entries)]
        for offset, name in enumerate(columns)
    })

    # add this page's rows to the global mpaa_df, renumbering the index
    mpaa_df = pd.concat([mpaa_df, temp_df], ignore_index=True)
    

In [None]:
# Accumulator that get_mpaa_ratings appends to through `global mpaa_df`.
mpaa_df = pd.DataFrame()

#initiate Selenium
driver = webdriver.Chrome()

try:
    # scrape each year/rating combination; the year is passed as a string
    for year, rating in zip(year_rating['year'], year_rating['rating']):
        get_mpaa_ratings(str(year), rating)
finally:
    # Always shut the browser session down, even if a page fails mid-scrape,
    # so no Chrome/chromedriver process is left running.
    driver.quit()
  


In [None]:
# Work on an independent copy: a plain assignment would only alias mpaa_df,
# so the cleaning cells below would silently modify the scraped data in place.
all_mpaa_info = mpaa_df.copy()

In [None]:
# Capture the four digits of the last "(YYYY)" group in each title (the
# leading `.+` is greedy). The result is kept as text; titles without such a
# group get NaN.
# Raw string: "\(" and "\d" are invalid escape sequences in a normal literal.
# expand=False returns a Series for the single capture group.
all_mpaa_info['year'] = all_mpaa_info['title'].str.extract(r'.+\((\d{4})\)', expand=False)

all_mpaa_info

In [None]:
# Remove a trailing "(YYYY)" suffix from each title, then trim whitespace.
# Matching the suffix (instead of blindly cutting the last six characters)
# leaves titles without a year suffix intact and makes this cell safe to
# re-run: a second pass finds nothing left to remove.
all_mpaa_info['title'] = (
    all_mpaa_info['title']
    .str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)
    .str.strip()
)
all_mpaa_info

In [None]:
#all_mpaa_info.to_csv('../data/mpaa_data.csv')