#### Notes
- This notebook uses the IMDB IDs collected from the first scrape to retrieve the parental guide page for each movie
- From this page, the rating reason will be scraped (if available) as well as the MPAA Certificate #
- The rating reason only applies to movies rated something other than "G", since G rated movies don't have rating reasons; G rated movies are still scraped further down for their MPAA Certificate #

In [None]:
import requests
from bs4 import BeautifulSoup as BS

import pandas as pd
import numpy as np
import pickle
import re
from tqdm import tqdm

In [None]:
# Load the five pickled frames from the first scrape and stack them into one.
scrape_files = (
    '../data/imdb1992-2001.pkl',
    '../data/imdb2002-2009.pkl',
    '../data/imdb2010-2018.pkl',
    '../data/imdb2019-2021.pkl',
    '../data/imdb2022.pkl',
)
first, second, third, fourth, fifth = map(pd.read_pickle, scrape_files)

imdb = pd.concat([first, second, third, fourth, fifth])
imdb.info()

In [None]:
# IMDB ids of every title whose rating is something other than 'G',
# re-indexed from 0.
reason_ids = imdb.loc[imdb['imdb_mpaas'].ne('G'), 'imdb_ids'].reset_index(drop=True)
len(reason_ids)

In [None]:
# Scrape the parental-guide page of every non-G title for its MPAA rating
# reason and for the certificate text on the "United States" line.
ids = []
reasons = []
notes = []

# Raw string: `\)` is a regex escape here. In a plain string literal it is an
# invalid escape sequence that Python warns about.
cert_pattern = re.compile(r'United States:.+\n.*(?:certificate|No. )(.+)\)\n')

# tqdm gives the same progress bar the other scrape loops in this notebook use.
for r_id in tqdm(reason_ids):
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'
    soup = BS(requests.get(url).text)

    # `find` returns None when the page has no "mpaa-rating" row, so `.text`
    # raises AttributeError. Catching only that (instead of a bare except)
    # keeps Ctrl-C and genuine bugs from being silently swallowed.
    try:
        reason = soup.find('tr', attrs={'id': 'mpaa-rating'}).text.replace('MPAA', '').strip()
    except AttributeError:
        reason = "None"

    # The certificate text is read from the second <td> matched by the
    # `class: ''` filter; a page with fewer than two matches raises IndexError.
    try:
        note = cert_pattern.findall(soup.find_all('td', attrs={'class': ''})[1].text)
    except IndexError:
        note = ["None"]

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)

reason_df = pd.DataFrame({
    'imdb_ids': ids,
    'rating_reasons': reasons,
    'rating_notes': notes,
})

In [None]:
# Ids whose first scrape produced the "None" placeholder as rating reason.
missing_reasons = reason_df.loc[reason_df['rating_reasons'].eq('None'), 'imdb_ids']

In [None]:
# Second pass over the titles that came back without a rating reason. This
# pass reads the certificate text from the FIRST <td> matched by the
# `class: ''` filter rather than the second.
ids = []
reasons = []
notes = []

# Raw string: `\)` is a regex escape here. In a plain string literal it is an
# invalid escape sequence that Python warns about.
cert_pattern = re.compile(r'United States:.+\n.*(?:certificate|No. )(.+)\)\n')

for r_id in tqdm(missing_reasons):
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'
    soup = BS(requests.get(url).text)

    # `find` returns None when nothing matches, so `.text` raises
    # AttributeError. Catching only that (instead of a bare except) keeps
    # Ctrl-C and genuine bugs from being silently swallowed.
    try:
        reason = soup.find('tr', attrs={'id': 'mpaa-rating'}).text.replace('MPAA', '').strip()
    except AttributeError:
        reason = "None"

    try:
        note = cert_pattern.findall(soup.find('td', attrs={'class': ''}).text)
    except AttributeError:
        note = ["None"]

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)

missing_reason_df = pd.DataFrame({
    'imdb_ids': ids,
    'rating_reasons': reasons,
    'rating_notes': notes,
})

In [None]:
# Replace the rows that had no rating reason with their re-scraped versions:
# keep every other row, then append the second-pass results.
already_scraped = reason_df.loc[~reason_df['imdb_ids'].isin(missing_reasons)]
reason_df = pd.concat([already_scraped, missing_reason_df])

In [None]:
# IMDB ids of the G-rated titles, re-indexed from 0.
g_ids = imdb.loc[imdb['imdb_mpaas'].eq('G'), 'imdb_ids'].reset_index(drop=True)

In [None]:
# Scrape the G-rated titles. No rating reason is looked up (it is stored as
# an empty string); only the certificate text is extracted.
ids = []
reasons = []
notes = []

# Raw string: `\)` is a regex escape here. In a plain string literal it is an
# invalid escape sequence that Python warns about.
cert_pattern = re.compile(r'United States:.+\n.*(?:certificate|No. |pca|cert|#)(.+)\)\n')

for r_id in tqdm(g_ids):
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'
    soup = BS(requests.get(url).text)

    reason = ""

    # `find` returns None when nothing matches, so `.text` raises
    # AttributeError. Catching only that (instead of a bare except) keeps
    # Ctrl-C and genuine bugs from being silently swallowed.
    try:
        note = cert_pattern.findall(soup.find('td', attrs={'class': ''}).text)
    except AttributeError:
        note = ["None"]

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)

# NOTE: the scraped values are stored under 'mpaa_cert_#' here, whereas the
# non-G frame built earlier in this notebook stores them under 'rating_notes'.
g_reason_df = pd.DataFrame({
    'imdb_ids': ids,
    'rating_reasons': reasons,
    'mpaa_cert_#': notes,
})

In [None]:
# Stack the G-rated results underneath the non-G results.
reason_df = pd.concat(objs=[reason_df, g_reason_df])

In [None]:
# Ids whose rating reason is the "None" placeholder and whose 'rating_notes'
# value is missing.
still_missing = reason_df.loc[
    reason_df['rating_reasons'].eq('None') & reason_df['rating_notes'].isna(),
    'imdb_ids',
]

In [None]:
# Re-scrape the titles listed in `still_missing`. No rating reason is looked
# up (it is stored as an empty string); only the certificate text is extracted.
ids = []
reasons = []
notes = []

# Raw string: `\)` is a regex escape here. In a plain string literal it is an
# invalid escape sequence that Python warns about.
cert_pattern = re.compile(r'United States:.+\n.*(?:certificate|No. |pca|cert|#)(.+)\)\n')

for r_id in tqdm(still_missing):
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'
    soup = BS(requests.get(url).text)

    reason = ""

    # `find` returns None when nothing matches, so `.text` raises
    # AttributeError. Catching only that (instead of a bare except) keeps
    # Ctrl-C and genuine bugs from being silently swallowed.
    try:
        note = cert_pattern.findall(soup.find('td', attrs={'class': ''}).text)
    except AttributeError:
        note = ["None"]

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)

still_missing_reason_df = pd.DataFrame({
    'imdb_ids': ids,
    'rating_reasons': reasons,
    'mpaa_cert_#': notes,
})

In [None]:
# Replace the still-missing rows with their re-scraped versions. The original
# line had unbalanced brackets (a SyntaxError). It also lacked the `~`, so it
# would have kept ONLY the still-missing rows instead of dropping them; this
# now follows the same keep-the-rest-then-append pattern as the earlier merge
# of `missing_reason_df`.
reason_df = pd.concat([
    reason_df.loc[~reason_df['imdb_ids'].isin(still_missing)],
    still_missing_reason_df,
])

In [None]:
# Load the pickled collection of ids to scrape again; the loop in the next
# cell iterates over it directly.
reason_redos = pd.read_pickle('../data/reason_redo_Sat.pkl')

In [None]:
# Redo pass over the ids loaded into `reason_redos`, using a wider certificate
# regex (more spellings of "certificate" / "No.").
ids = []
reasons = []
notes = []

# Raw string: `\.` and `\)` are regex escapes here. In a plain string literal
# they are invalid escape sequences that Python warns about.
cert_pattern = re.compile(
    r'United States:.+\n.*(?:certificate|Certificate|CERTIFICATE|No\.|NO\.|pca|cert|#)(.+)\)\n'
)

for r_id in tqdm(reason_redos):
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'
    soup = BS(requests.get(url).text)

    # `find` returns None when the page has no "mpaa-rating" row, so `.text`
    # raises AttributeError. Catching only that (instead of a bare except)
    # keeps Ctrl-C and genuine bugs from being silently swallowed.
    try:
        reason = soup.find('tr', attrs={'id': 'mpaa-rating'}).text.replace('MPAA', '').strip()
    except AttributeError:
        reason = "None"

    # The certificate text is read from the second <td> matched by the
    # `class: ''` filter; a page with fewer than two matches raises IndexError.
    try:
        note = cert_pattern.findall(soup.find_all('td', attrs={'class': ''})[1].text)
    except IndexError:
        note = ["None"]

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)

reason_redo_df_aa = pd.DataFrame({
    'imdb_ids': ids,
    'rating_reasons': reasons,
    'mpaa_cert_#': notes,
})

In [None]:
# Display the redo frame.
# NOTE(review): `reason_redo_df_a` is never assigned in the visible notebook
# (the cell above builds `reason_redo_df_aa`) - confirm the intended name.
reason_redo_df_a

In [None]:
# One row per regex match: expand the list-valued 'mpaa_cert_#' column and
# renumber the rows from 0.
reason_redo_aa = reason_redo_df_aa.explode('mpaa_cert_#', ignore_index=True)

In [None]:
# Turn every value into a string, trim surrounding whitespace and drop any '#'.
reason_redo_aa['mpaa_cert_#'] = reason_redo_aa['mpaa_cert_#'].map(
    lambda raw: str(raw).strip().replace('#', '')
)

In [None]:
# Split at the first comma: the part before it stays in 'mpaa_cert_#', the
# remainder goes to 'mpaa_cert_note'.
cert_parts = reason_redo_aa['mpaa_cert_#'].str.split(',', n=1, expand=True)
reason_redo_aa[['mpaa_cert_#', 'mpaa_cert_note']] = cert_parts

In [None]:
# Ids that still have neither a certificate number nor a rating reason.
# NOTE(review): reads `reason_redo_a`, which is not assigned in the visible
# notebook (the cells above build `reason_redo_aa`) - confirm.
redo_b = reason_redo_a.loc[
    reason_redo_a['mpaa_cert_#'].isin(['None', 'nan', ''])
    & reason_redo_a['rating_reasons'].eq('None'),
    'imdb_ids',
].reset_index(drop=True)

In [None]:
# Display the ids selected for the next redo pass.
redo_b

In [None]:
# Display the rows that do NOT need another scrape: everything except rows
# missing both the certificate number and the rating reason (the complement
# of the `redo_b` style filter). The second test used to re-check
# 'mpaa_cert_#', which made the `isin` test redundant; it now checks
# 'rating_reasons'.
reason_redo_aa.loc[~(reason_redo_aa['mpaa_cert_#'].isin(['None', 'nan'])
                     &
                     (reason_redo_aa['rating_reasons'] == 'None'))]

In [None]:
# Display the successfully re-scraped rows.
# NOTE(review): `redo_success_a` is only assigned in a later cell of this
# notebook, so this cell fails on a top-to-bottom run - confirm cell order.
redo_success_a

In [None]:
# Persist the first redo pass to disk.
# NOTE(review): `reason_redo_a` is not assigned in the visible notebook (the
# cells above build `reason_redo_aa`) - confirm the intended frame.
reason_redo_a.to_pickle('../data/reason_redo_a.pkl')

In [None]:
# Redo pass over the ids in `redo_b`. This pass reads the certificate text
# from the FIRST <td> matched by the `class: ''` filter.
ids = []
reasons = []
notes = []

# Raw string: `\.` and `\)` are regex escapes here. In a plain string literal
# they are invalid escape sequences that Python warns about.
cert_pattern = re.compile(r'United States:.+\n.*(?:certificate|No\.|NO\.|pca|cert|#)(.+)\)\n')

for r_id in tqdm(redo_b):
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'
    soup = BS(requests.get(url).text)

    # `find` returns None when nothing matches, so `.text` raises
    # AttributeError. Catching only that (instead of a bare except) keeps
    # Ctrl-C and genuine bugs from being silently swallowed.
    try:
        reason = soup.find('tr', attrs={'id': 'mpaa-rating'}).text.replace('MPAA', '').strip()
    except AttributeError:
        reason = "None"

    try:
        note = cert_pattern.findall(soup.find('td', attrs={'class': ''}).text)
    except AttributeError:
        note = ["None"]

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)

reason_redo_df_b = pd.DataFrame({
    'imdb_ids': ids,
    'rating_reasons': reasons,
    'mpaa_cert_#': notes,
})

In [None]:
# One row per regex match: expand the list-valued 'mpaa_cert_#' column and
# renumber the rows from 0.
reason_redo_b = reason_redo_df_b.explode('mpaa_cert_#', ignore_index=True)

In [None]:
# Turn every value into a string, trim surrounding whitespace and drop any '#'.
reason_redo_b['mpaa_cert_#'] = reason_redo_b['mpaa_cert_#'].map(
    lambda raw: str(raw).strip().replace('#', '')
)

In [None]:
# Display the rows of this pass that did get a rating reason.
reason_redo_b[reason_redo_b['rating_reasons'].ne('None')]

In [None]:
# Split at the first comma: the part before it stays in 'mpaa_cert_#', the
# remainder goes to 'mpaa_cert_note'.
cert_parts = reason_redo_b['mpaa_cert_#'].str.split(',', n=1, expand=True)
reason_redo_b[['mpaa_cert_#', 'mpaa_cert_note']] = cert_parts

In [None]:
# Ids whose certificate number is still a placeholder or empty, regardless of
# the rating reason.
# NOTE(review): reads `reason_redo_a`, which is not assigned in the visible
# notebook - confirm.
redo_b = reason_redo_a.loc[
    reason_redo_a['mpaa_cert_#'].isin(['None', 'nan', '']),
    'imdb_ids',
].reset_index(drop=True)

In [None]:
# Rows whose certificate number is not one of the 'None' / 'nan' placeholders.
# NOTE(review): reads `reason_redo_a`, which is not assigned in the visible
# notebook - confirm.
cert_is_placeholder = reason_redo_a['mpaa_cert_#'].isin(['None', 'nan'])
redo_success_a = reason_redo_a[~cert_is_placeholder]

In [None]:
# Display the frame.
# NOTE(review): `additional_years_reason_df_b` is never assigned in the
# visible notebook - presumably left over from a removed cell; confirm.
additional_years_reason_df_b

In [None]:
# Second scrape of the G-rated titles, this time reading the certificate text
# from the SECOND <td> matched by the `class: ''` filter. No rating reason is
# looked up (it is stored as an empty string).
ids = []
reasons = []
notes = []

# Raw string: `\.` and `\)` are regex escapes here. In a plain string literal
# they are invalid escape sequences that Python warns about.
cert_pattern = re.compile(r'United States:.+\n.*(?:certificate|No\. |pca|cert|#)(.+)\)\n')

for r_id in tqdm(g_ids):
    url = f'https://www.imdb.com/title/{r_id}/parentalguide?ref_=tt_stry_pg'
    soup = BS(requests.get(url).text)

    reason = ""

    # The `[1]` used to sit on the attrs dict (`{'class': ''}[1]`), which
    # raised KeyError on every iteration; the bare except hid it, so every
    # note came back as ["None"]. The index now applies to the find_all
    # result, and only the expected "fewer than two matches" case is caught.
    try:
        note = cert_pattern.findall(soup.find_all('td', attrs={'class': ''})[1].text)
    except IndexError:
        note = ["None"]

    ids.append(r_id)
    reasons.append(reason)
    notes.append(note)

g_reason_df_b = pd.DataFrame({
    'imdb_ids': ids,
    'rating_reasons': reasons,
    'mpaa_cert_#': notes,
})

In [None]:
# One row per regex match: expand the list-valued 'mpaa_cert_#' column and
# renumber the rows from 0.
# NOTE(review): `additional_years_reason_df` is not assigned in the visible
# notebook - confirm where it comes from.
additional_reasons = additional_years_reason_df.explode('mpaa_cert_#', ignore_index=True)

In [None]:
# Turn every value into a string, trim surrounding whitespace and drop any '#'.
additional_reasons['mpaa_cert_#'] = additional_reasons['mpaa_cert_#'].map(
    lambda raw: str(raw).strip().replace('#', '')
)

In [None]:
# Split at the first comma: the part before it stays in 'mpaa_cert_#', the
# remainder goes to 'mpaa_cert_note'.
cert_parts = additional_reasons['mpaa_cert_#'].str.split(',', n=1, expand=True)
additional_reasons[['mpaa_cert_#', 'mpaa_cert_note']] = cert_parts

In [None]:
# Persist the cleaned `additional_reasons` frame to disk.
additional_reasons.to_pickle('../data/imdb1991_reasons.pkl')

In [None]:
# One-off check of the certificate regex against a single title's page.
url = 'https://www.imdb.com/title/tt0101252/parentalguide?ref_=tt_stry_pg'
soup = BS(requests.get(url).text)
# Raw string: `\)` is a regex escape here. In a plain string literal it is an
# invalid escape sequence that Python warns about.
re.findall(r'United States:.+\n.*(?:certificate|No. )(.+)\)\n', soup.find('td', attrs={'class': ''}).text)

In [None]:
# Inspect the second <td> matched by the `class: ''` filter on the page
# currently held in `soup`.
soup.find_all('td', attrs = {'class' : ''})[1]

In [None]:
#rating_reasons.to_pickle('../data/rating_reasons.pkl')