In [None]:
import pandas as pd
import numpy as np
import pickle
import re

In [None]:
# Read every IMDB scrape pickle (file names suggest one file per span of release
# years) and stack them into a single frame with a fresh 0..n-1 index.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
imdb_pickles = (
    '../data/imdb1992-2001.pkl',
    '../data/imdb2002-2009.pkl',
    '../data/imdb2010-2018.pkl',
    '../data/imdb2019-2021.pkl',
    '../data/imdb2022.pkl',
    '../data/imdb1991.pkl',
)
first, second, third, fourth, fifth, sixth = map(pd.read_pickle, imdb_pickles)
stacked = pd.concat([first, second, third, fourth, fifth, sixth])
imdb = stacked.reset_index(drop=True)

imdb.info()

In [None]:
# Pull the vote count out of the stringified votes/gross entry; entries that
# never mention 'Votes' get the placeholder '0'.
votes_pattern = re.compile('Votes:\\\\n(.+?)\'')
imdb['votes'] = [
    votes_pattern.search(entry_text)[1] if 'Votes' in entry_text else '0'
    for entry_text in map(str, imdb['imdb_votes_and_gross'])
]

In [None]:
# Pull the gross figure out of the stringified votes/gross entry; entries that
# never mention 'Gross' are left as None.
gross_pattern = re.compile('Gross:\\\\n(.+?)\'')
imdb['gross'] = [
    gross_pattern.search(entry_text)[1] if 'Gross' in entry_text else None
    for entry_text in map(str, imdb['imdb_votes_and_gross'])
]

In [None]:
# Vote counts are still text with comma separators; remove the commas and cast to int.
# Note: this cell is not re-runnable as-is, because `.str` fails once the column is integer.
votes_without_commas = imdb['votes'].str.replace(',', '', regex=False)
imdb['votes'] = votes_without_commas.astype(int)

In [None]:
# Votes and gross now live in their own columns, so the combined source column can go.
imdb = imdb.drop(columns=['imdb_votes_and_gross'])

In [None]:
# The release_years column carries more than the year. First trim any leading or
# trailing 'I', 'X', 'V', space and parenthesis characters, then split once on the
# first space: the left part becomes release_year, the remainder release_note.
trimmed_years = imdb['release_years'].str.strip('I X V ( )')
imdb['release_years'] = trimmed_years

year_and_note = trimmed_years.str.split(pat=' ', n=1, expand=True)
imdb[['release_year', 'release_note']] = year_and_note

# The combined column is no longer needed once it has been split.
imdb = imdb.drop(columns=['release_years'])

In [None]:
# Each imdb_ratings cell holds a list; expand the lists into a frame aligned on
# the existing index and store the result back as the plain rating column.
ratings_expanded = pd.DataFrame(imdb['imdb_ratings'].tolist(), index=imdb.index)
imdb['imdb_ratings'] = ratings_expanded

Do some EDA and look for movies that need to be removed from the dataset.

In [None]:
# Number of titles per release_year value, ordered by the year label
release_year_counts = imdb['release_year'].value_counts()
release_year_counts.sort_index()

In [None]:
# Rows whose release_year is the empty string are dropped: the 3 movies without a
# year were confirmed not to be among those with official MPAA ratings.
has_release_year = imdb['release_year'].ne('')
imdb = imdb.loc[has_release_year].reset_index(drop=True)

In [None]:
# Tally how often each certificate label occurs in imdb_mpaas
mpaa_labels = imdb['imdb_mpaas']
mpaa_labels.value_counts()

In [None]:
# Keep only titles whose label is one of the MPAA ratings listed below;
# every other label is dropped and the index renumbered.
ratings = ['G', 'PG', 'PG-13', 'R', 'NC-17']

has_mpaa_rating = imdb['imdb_mpaas'].isin(ratings)
imdb = imdb.loc[has_mpaa_rating].reset_index(drop=True)

In [None]:
# Summarise the filtered frame (columns, dtypes, non-null counts) after the year and rating filters
imdb.info()

#### Bring in Rating Reason info

In [None]:
# Load the rating-reasons pickle and summarise its columns.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
reasons_df = pd.read_pickle('../data/rating_reasons.pkl')
reasons_df.info()

In [None]:
# Load the second rating-reasons pickle and summarise its columns.
# Its imdb_ids take precedence over reasons_df when the sources are combined below.
more_reasons = pd.read_pickle('../data/more_rating_reasons.pkl')
more_reasons.info()

In [None]:
# Load the rating-reasons pickle whose file name points at the 1991 scrape and summarise it.
additional_reasons = pd.read_pickle('../data/imdb1991_reasons.pkl')
additional_reasons.info()

In [None]:
# Load the "redo" rating-reasons pickle and summarise it.
# Its imdb_ids take precedence over both reasons_df and more_reasons when combined below.
reason_redo_a = pd.read_pickle('../data/reason_redo_a.pkl')
reason_redo_a.info()

In [None]:
# Stack the four reason sources, with later sources taking precedence:
#   * reasons_df rows are kept only if their id is in neither more_reasons nor reason_redo_a
#   * more_reasons rows are kept only if their id is not in reason_redo_a
#   * additional_reasons and reason_redo_a are appended unfiltered
superseding_ids = pd.concat([more_reasons['imdb_ids'], reason_redo_a['imdb_ids']])
base_rows = reasons_df.loc[~reasons_df['imdb_ids'].isin(superseding_ids)]
more_rows = more_reasons.loc[~more_reasons['imdb_ids'].isin(reason_redo_a['imdb_ids'])]

reasons = pd.concat([base_rows, more_rows, additional_reasons, reason_redo_a])

reasons.info()

In [None]:
# Rewrite the text 'nan' as 'None' in the certificate numbers. This is a substring
# replacement; 'None' is the missing-certificate marker compared against later.
cert_text = reasons['mpaa_cert_#']
reasons['mpaa_cert_#'] = cert_text.str.replace('nan', 'None', regex=False)

In [None]:
# Strip label text from the certificate-number strings by removing a fixed list of
# fragments in order, then trimming surrounding whitespace.
# Order matters: 'subtitled' is removed before the step that deletes every 's', and
# 'number' / 'ificate' are matched against text that has already lost its 's' and '.'.
# NOTE(review): .str.replace('s', '') removes every 's' in the value; later cells compare
# against the resulting text (e.g. 'uitable for young children'), so do not change it lightly.
# NOTE(review): 'no.' and '.' contain a regex metacharacter and no `regex=` argument is
# passed. pandas changed the default from regex=True to regex=False in 2.0, so these steps
# may behave differently across versions -- TODO confirm which behaviour the data was
# cleaned with and pin `regex=` explicitly.
reasons['mpaa_cert_#'] = (
    reasons['mpaa_cert_#']
    .str.replace('no.', '')
    .str.replace(':', '')
    .str.replace('subtitled', '')
    .str.replace('dubbed', '')
    .str.replace('s', '')
    .str.replace('.', '')
    .str.replace('number', '')
    .str.replace('ificate', '')
    .str.strip()
)

In [None]:
# Trim leading and trailing '.', '(' and ')' from the rating reasons.
# Punctuation inside the text is left untouched.
reason_text = reasons['rating_reasons']
reasons['rating_reasons'] = reason_text.str.strip('.()')

In [None]:
# Collapse rows that are identical in every column, then renumber the index
unique_reason_rows = reasons.drop_duplicates()
reasons = unique_reason_rows.reset_index(drop=True)
reasons.info()

In [None]:
# Rows that agree on id, reason and certificate number differ only in
# mpaa_cert_note; keep the first of each such group.
identity_columns = ['imdb_ids', 'rating_reasons', 'mpaa_cert_#']
reasons = (
    reasons
    .drop_duplicates(subset=identity_columns, keep='first')
    .reset_index(drop=True)
)
reasons.info()

In [None]:
# Some id/reason pairs appear both with and without a certificate number.
# Collect the index labels of the copies whose certificate number is 'None'.
shares_id_and_reason = reasons.duplicated(subset=['imdb_ids', 'rating_reasons'], keep=False)
lacks_cert_number = reasons['mpaa_cert_#'].eq('None')
indices = reasons.loc[shares_id_and_reason & lacks_cert_number].index

In [None]:
# Remove the certificate-less duplicate rows collected in `indices` and renumber
without_certless_copies = reasons.drop(index=indices)
reasons = without_certless_copies.reset_index(drop=True)
reasons.info()

In [None]:
# Remaining duplication of IMDB ids is due to movies having multiple MPAA certificate numbers
repeated_id = reasons['imdb_ids'].duplicated(keep=False)
reasons.loc[repeated_id]

#### Merge the two dataframes

In [None]:
# Left-join the reasons onto the IMDB titles by id. Every IMDB row is kept:
# titles without a match get missing reason columns, and an id that occurs
# several times in `reasons` yields one output row per occurrence.
imdb_reasons = imdb.merge(reasons, how='left', on='imdb_ids')
imdb_reasons.info()

In [None]:
# How often each certificate number occurs after the merge
merged_cert_numbers = imdb_reasons['mpaa_cert_#']
merged_cert_numbers.value_counts()

In [None]:
# Research confirmed that, among the rows sharing certificate 40417,
# only id tt0113799 is a movie that was rated by the MPAA.
is_cert_40417 = imdb_reasons['mpaa_cert_#'].eq('40417')
imdb_reasons.loc[is_cert_40417]

In [None]:
# Remove the other movies from the list above (the ones not rated by the MPAA)
ids_to_remove = ['tt1530535', 'tt6264824', 'tt6266826']
is_removed_id = imdb_reasons['imdb_ids'].isin(ids_to_remove)
imdb_reasons = imdb_reasons.loc[~is_removed_id].reset_index(drop=True)

In [None]:
# These all appear to be part of what the MPAA treated as a single entity, so they stay in
is_cert_44423 = imdb_reasons['mpaa_cert_#'].eq('44423')
imdb_reasons.loc[is_cert_44423]

In [None]:
# Rows whose certificate number reads 'n/a' -- the MPAA did not rate any of these
is_cert_na = imdb_reasons['mpaa_cert_#'].eq('n/a')
imdb_reasons.loc[is_cert_na]

In [None]:
# Remove every row whose certificate number reads 'n/a' (the list shown above)
cert_is_not_na = imdb_reasons['mpaa_cert_#'].ne('n/a')
imdb_reasons = imdb_reasons.loc[cert_is_not_na]

In [None]:
# Rows whose cleaned certificate text is 'uitable for young children';
# none of these rows need to be kept
is_young_children_text = imdb_reasons['mpaa_cert_#'].eq('uitable for young children')
imdb_reasons.loc[is_young_children_text]

In [None]:
# Remove every row whose cleaned certificate text is 'uitable for young children'
cert_is_not_young_children_text = imdb_reasons['mpaa_cert_#'].ne('uitable for young children')
imdb_reasons = imdb_reasons.loc[cert_is_not_young_children_text]

In [None]:
# These two titles were confirmed not to have been rated by the MPAA
confirmed_unrated_ids = ['tt10158634', 'tt1563725']
is_confirmed_unrated = imdb_reasons['imdb_ids'].isin(confirmed_unrated_ids)
imdb_reasons = imdb_reasons.loc[~is_confirmed_unrated]

In [None]:
# Looking at instances when a rating reason plus the certificate # appear repeatedly.
# Rows without a certificate number are excluded. A missing number is either the
# string 'None' (the reasons cleanup rewrites 'nan' to 'None') or a real NaN
# (titles with no match in the left merge). The previous comparison against 'nan'
# excluded neither, so every certificate-less row showed up as a "duplicate".
cert_numbers = imdb_reasons['mpaa_cert_#']
has_cert_number = cert_numbers.notna() & cert_numbers.ne('None')
repeats_reason_and_cert = imdb_reasons.duplicated(subset=['rating_reasons', 'mpaa_cert_#'], keep=False)

imdb_reasons.loc[has_cert_number & repeats_reason_and_cert].sort_values('mpaa_cert_#')

In [None]:
# None of these titles were rated by the MPAA (confirmed), so remove them
unrated_ids = [
    'tt0106405', 'tt2727252', 'tt0173796', 'tt0216487',
    'tt0275624', 'tt0290759', 'tt12069870', 'tt0438908',
    'tt1249415', 'tt1522334', 'tt4789192', 'tt7098674',
    'tt6769936', 'tt8332364', 'tt7779590', 'tt6657132',
]
is_unrated = imdb_reasons['imdb_ids'].isin(unrated_ids)
imdb_reasons = imdb_reasons.loc[~is_unrated]

In [None]:
# Rows that share rating reason, certificate # and certificate note with another row;
# used to identify which of them needed to be removed.
# Rows without a certificate number are excluded. A missing number is either the
# string 'None' (the reasons cleanup rewrites 'nan' to 'None') or a real NaN
# (titles with no match in the left merge). The previous comparison against 'nan'
# excluded neither.
cert_numbers = imdb_reasons['mpaa_cert_#']
has_cert_number = cert_numbers.notna() & cert_numbers.ne('None')
repeats_reason_cert_note = imdb_reasons.duplicated(
    subset=['rating_reasons', 'mpaa_cert_#', 'mpaa_cert_note'], keep=False
)

imdb_reasons.loc[has_cert_number & repeats_reason_cert_note].sort_values('mpaa_cert_#')

In [None]:
# None of these titles were rated by the MPAA (confirmed), so remove them
unrated_ids = [
    'tt0103888', 'tt1213590', 'tt5852632', 'tt8246646',
    'tt0903606', 'tt9278312', 'tt2990140', 'tt10525124',
]
is_unrated = imdb_reasons['imdb_ids'].isin(unrated_ids)
imdb_reasons = imdb_reasons.loc[~is_unrated]

In [None]:
# The cert # for the 2018 movie should be 51820; the rating reasons are the same but for one word
is_the_guilty = imdb_reasons['imdb_titles'].eq('The Guilty')
imdb_reasons.loc[is_the_guilty]

In [None]:
# For the 2018 'The Guilty', blank the rating reason to the 'None' marker and
# set the certificate number to 51820.
is_the_guilty = imdb_reasons['imdb_titles'].eq('The Guilty')
released_2018 = imdb_reasons['release_year'].eq('2018')
corrected_columns = ['rating_reasons', 'mpaa_cert_#']
imdb_reasons.loc[is_the_guilty & released_2018, corrected_columns] = ['None', '51820']

In [None]:
# tt1663333 (Triggerman) appears with more than one certificate number; drop the
# row carrying '46137' and keep the rest.
# NOTE(review): the original note names 46371 as the correct number, while the row
# removed here is 46137 -- confirm which digit order is right.
is_other_title = imdb_reasons['imdb_ids'].ne('tt1663333')
is_other_cert = imdb_reasons['mpaa_cert_#'].ne('46137')
imdb_reasons = imdb_reasons.loc[is_other_title | is_other_cert]

In [None]:
# The remaining certificate # is correct, but the reason is NOT, so reset it to 'None'.
# NOTE(review): rows are matched by title, so any other title named 'Triggerman'
# would be reset too -- confirm, or match on imdb_ids instead.
is_triggerman = imdb_reasons['imdb_titles'].eq('Triggerman')
imdb_reasons.loc[is_triggerman, 'rating_reasons'] = 'None'

In [None]:
# Rows whose certificate # is shared with another row; used to identify which of
# them need to be removed from the data set.
# Rows without a certificate number are excluded. A missing number is either the
# string 'None' (the reasons cleanup rewrites 'nan' to 'None') or a real NaN
# (titles with no match in the left merge). The previous comparison against 'nan'
# excluded neither, so every certificate-less row was listed as well.
cert_numbers = imdb_reasons['mpaa_cert_#']
has_cert_number = cert_numbers.notna() & cert_numbers.ne('None')
repeats_cert = imdb_reasons.duplicated(subset=['mpaa_cert_#'], keep=False)

imdb_reasons.loc[has_cert_number & repeats_cert].sort_values('mpaa_cert_#')

In [None]:
# Remove the movies identified in the shared-certificate listing above
shared_cert_ids_to_remove = [
    'tt3216920', 'tt0112995', 'tt0116084', 'tt0393010',
    'tt0291823', 'tt0326441', 'tt1705056', 'tt2290641',
    'tt1645754', 'tt2764834', 'tt2196059', 'tt2828140',
    'tt2949588', 'tt1937101', 'tt5564792', 'tt7812828',
    'tt1492705', 'tt12281046', 'tt11110770', 'tt7402432',
    'tt5594776', 'tt7145178', 'tt8761814', 'tt5828918',
    'tt1441912', 'tt1188484', 'tt1472122', 'tt1072458',
    'tt1362064', 'tt0472102', 'tt0928372', 'tt1077258',
    'tt1006947', 'tt0488967', 'tt2022357', 'tt1111231',
]
is_flagged = imdb_reasons['imdb_ids'].isin(shared_cert_ids_to_remove)
imdb_reasons = imdb_reasons.loc[~is_flagged]

In [None]:
# Looking into the remaining instances where an MPAA certificate # appears
# multiple times in the dataset: show the 60 most frequent values.
cert_frequency = imdb_reasons['mpaa_cert_#'].value_counts()
cert_frequency.head(60)

In [None]:
# Inspect the rows that share certificate number 40726
is_cert_40726 = imdb_reasons['mpaa_cert_#'].eq('40726')
imdb_reasons.loc[is_cert_40726]

In [None]:
# Rows still lacking a certificate number (marker 'None').
# The disabled lines around the original expression selected ['imdb_ids'] from this
# result into `reason_redo` and pickled it to '../data/reason_redo_Sat.pkl';
# that export stays disabled here.
lacks_cert_number = imdb_reasons['mpaa_cert_#'].eq('None')
imdb_reasons.loc[lacks_cert_number]

In [None]:
# Movies from IMDB that have neither an MPAA certificate # nor a rating reason in their
# database (7408 when this was written; the figure is not recomputed here)
lacks_reason = imdb_reasons['rating_reasons'].eq('None')
lacks_cert_number = imdb_reasons['mpaa_cert_#'].eq('None')
imdb_reasons.loc[lacks_reason & lacks_cert_number]

In [None]:
# Renumber the index after the row removals above, then summarise the frame
imdb_reasons = imdb_reasons.reset_index(drop=True)
imdb_reasons.info()

In [None]:
# Manual correction for the 2007 'Shattered' entry that has 32825 votes:
# set its certificate number to 43167 and its rating reason to the empty string.
# NOTE(review): other cells mark a missing reason with 'None' rather than '' --
# confirm the empty string is intended. Matching on an exact vote count is also
# brittle if the scrape is refreshed.
is_shattered = imdb_reasons['imdb_titles'].eq('Shattered')
released_2007 = imdb_reasons['release_year'].eq('2007')
has_32825_votes = imdb_reasons['votes'].eq(32825)
target_row = is_shattered & released_2007 & has_32825_votes
imdb_reasons.loc[target_row, ['rating_reasons', 'mpaa_cert_#']] = ['', '43167']

In [None]:
# Drop entries that have neither vote information nor a certificate number.
# These entries either weren't really rated by the MPAA or, even if they were, the
# lack of "success" information makes attempting to merge them lack meaning.
#
# Missing votes are stored as 0, not NaN: the votes extraction fills '0' when no
# 'Votes' text is present and the column is then cast to int. The previous
# `votes.isna()` test was therefore always False and this cell removed nothing.
# NaN is still accepted in case the column is ever left nullable.
# The unused preview expression that preceded the filter was dropped; it was not
# the last statement in the cell, so it was never displayed.
votes = imdb_reasons['votes']
lacks_votes = votes.isna() | votes.eq(0)
lacks_cert_number = imdb_reasons['mpaa_cert_#'].eq('None')

imdb_reasons = imdb_reasons.loc[~(lacks_votes & lacks_cert_number)].reset_index(drop=True)

In [None]:
# Persist the merged and cleaned frame; the inspection cells below do not modify it
imdb_reasons.to_pickle('../data/imdb_reasons.pkl')

In [None]:
# Distribution of IMDB's rating label among rows that still lack a certificate number
lacks_cert_number = imdb_reasons['mpaa_cert_#'].eq('None')
imdb_reasons.loc[lacks_cert_number, 'imdb_mpaas'].value_counts()

In [None]:
# Occurrences of each certificate number, ordered by the certificate label
final_cert_counts = imdb_reasons['mpaa_cert_#'].value_counts()
final_cert_counts.sort_index()

In [None]:
# Inspect every title that IMDB lists with a G rating
is_g_rated = imdb_reasons['imdb_mpaas'].eq('G')
imdb_reasons.loc[is_g_rated]

In [None]:
# Inspect the rows titled 'Babes in Toyland'
is_babes_in_toyland = imdb_reasons['imdb_titles'].eq('Babes in Toyland')
imdb_reasons.loc[is_babes_in_toyland]