#### Notes
- This notebook uses the IMDb IDs collected in the first scrape to retrieve the parental guide page for each movie
- From this page, the rating reason (if available) and the MPAA certificate number are scraped
- This only applies to movies rated something other than "G", since G-rated movies don't have rating reasons

In [None]:
import requests
from bs4 import BeautifulSoup as BS

import pandas as pd
import numpy as np
import pickle
import re

In [None]:
# Load the IMDb scrape results, one pickle per range of years, and stack them
# into a single DataFrame.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
pickle_paths = (
    '../data/imdb1992-2001.pkl',
    '../data/imdb2002-2009.pkl',
    '../data/imdb2010-2018.pkl',
    '../data/imdb2019-2021.pkl',
    '../data/imdb2022.pkl',
)
first, second, third, fourth, fifth = (
    pd.read_pickle(path) for path in pickle_paths
)

# The original row indices of each frame are kept as-is by the concatenation.
imdb = pd.concat([first, second, third, fourth, fifth])
imdb.info()

In [None]:
# Select the IMDb IDs of every movie whose rating is not exactly 'G';
# the result is re-indexed from 0.
not_g_rated = imdb['imdb_mpaas'] != 'G'
reason_ids = imdb.loc[not_g_rated, 'imdb_ids'].reset_index(drop=True)
len(reason_ids)

In [None]:
# Scrape the IMDb parental guide page of every ID in `reason_ids` and collect
# the MPAA rating reason and the United States certificate note for each one
# into `reason_df` (one row per ID).

# Seconds to wait on a single request before giving up, so that one stalled
# connection cannot hang the whole scrape.
REQUEST_TIMEOUT = 30

# Captures the text that follows "certificate" / "No. " (up to the closing
# parenthesis) on the line after the "United States:" entry.
# Raw string: `\)` is a regex escape, not a valid Python string escape.
NOTE_PATTERN = re.compile(r'United States:.+\n.*(?:certificate|No. )(.+)\)\n')

ids = []
reasons = []
notes = []

for r_id in reason_ids:
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'

    # Sentinels stored when the page or the element is unavailable.
    reason = "None"
    note = ["None"]

    try:
        html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException as err:
        # Keep going: losing one title is better than losing the whole run.
        print(f'Request failed for {r_id}: {err}')
    else:
        # Name the parser explicitly so parsing does not depend on which
        # optional parser libraries happen to be installed.
        soup = BS(html, 'html.parser')

        reason_row = soup.find('tr', attrs={'id': "mpaa-rating"})
        if reason_row is not None:
            reason = reason_row.text.replace('MPAA', '').strip()

        # The certificate text is read from the second matching <td>.
        cells = soup.find_all('td', attrs={'class': ''})
        if len(cells) > 1:
            # findall returns a (possibly empty) list of captured strings.
            note = NOTE_PATTERN.findall(cells[1].text)

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)


reason_df = pd.DataFrame(
    {
        'imdb_id': ids,
        'rating_reasons': reasons,
        'rating_notes': notes,
    }
)

In [None]:
# Persist the scraped rating reasons and certificate notes for later notebooks.
reasons_path = '../data/reasons.pkl'
reason_df.to_pickle(reasons_path)