In [None]:
import pandas as pd
import numpy as np
import pickle
import re

In [None]:
# Load the pickled `imdb_reasons` frame and print its column/dtype summary.
# Pickle files can execute code on load, so only read files from a trusted source.
imdb_reasons_path = '../data/imdb_reasons.pkl'
imdb_reasons = pd.read_pickle(imdb_reasons_path)
imdb_reasons.info()

In [None]:
# Rows that share an MPAA certificate number with at least one other row
# (ignoring the 'None' placeholder) are set aside to be handled separately.
imdb_shares_cert = imdb_reasons.duplicated(subset=['mpaa_cert_#'], keep=False)
imdb_has_real_cert = imdb_reasons['mpaa_cert_#'].ne('None')
imdb_dup_certs = (
    imdb_reasons[imdb_shares_cert & imdb_has_real_cert]
    .sort_values('mpaa_cert_#')
    .reset_index(drop=True)
)

In [None]:
# Keep only the rows whose certificate number is not among the duplicated ones.
imdb_cert_is_duplicated = imdb_reasons['mpaa_cert_#'].isin(imdb_dup_certs['mpaa_cert_#'])
imdb_reasons = imdb_reasons[~imdb_cert_is_duplicated].reset_index(drop=True)

In [None]:
# Load the pickled `mpaa_details` frame and print its column/dtype summary.
# Pickle files can execute code on load, so only read files from a trusted source.
mpaa_details_path = '../data/mpaa_details.pkl'
mpaa_details = pd.read_pickle(mpaa_details_path)
mpaa_details.info()

In [None]:
# MPAA rows that share a certificate number with another row are set aside
# to be handled separately.
mpaa_shares_cert = mpaa_details.duplicated(subset=['mpaa_cert_#'], keep=False)
mpaa_dup_certs = (
    mpaa_details[mpaa_shares_cert]
    .sort_values('mpaa_cert_#')
    .reset_index(drop=True)
)

In [None]:
# Keep only the rows whose certificate number is not among the duplicated ones.
mpaa_cert_is_duplicated = mpaa_details['mpaa_cert_#'].isin(mpaa_dup_certs['mpaa_cert_#'])
mpaa_details = mpaa_details[~mpaa_cert_is_duplicated].reset_index(drop=True)

#### Merge the two main datasets

In [None]:
# Outer join on the certificate number so unmatched rows from either side
# are kept (with NaN in the other side's columns).
outer_merge = imdb_reasons.merge(mpaa_details, how='outer', on='mpaa_cert_#')

In [None]:
# Amazing Stories was a TV show (unclear why the MPAA rated it), so those rows are dropped.
# Missing titles are treated as empty strings, so they never match and are kept.
is_amazing_stories = outer_merge['mpaa_title'].fillna('').str.contains('Amazing Stories')
outer_merge = outer_merge[~is_amazing_stories]

In [None]:
# Rows with no MPAA title came only from the IMDb side; rows with no IMDb id
# came only from the MPAA side.
lacks_mpaa_title = outer_merge['mpaa_title'].isna()
lacks_imdb_id = outer_merge['imdb_ids'].isna()
imdb_no_match = outer_merge[lacks_mpaa_title]
mpaa_no_match = outer_merge[lacks_imdb_id]

In [None]:
# Pull the original source rows for everything that failed to match.
is_unmatched_imdb_row = imdb_reasons['imdb_ids'].isin(imdb_no_match['imdb_ids'])
is_unmatched_mpaa_row = mpaa_details['mpaa_cert_#'].isin(mpaa_no_match['mpaa_cert_#'])
imdb_info = imdb_reasons[is_unmatched_imdb_row]
mpaa_info = mpaa_details[is_unmatched_mpaa_row]

In [None]:
# Report how many rows on each side are still unmatched (IMDb first, then MPAA).
for unmatched_frame in (imdb_info, mpaa_info):
    print(len(unmatched_frame))

In [None]:
# These movies originally released prior to the 1992-2022 timeframe except for one that was released in 2023.
# `na=False` treats a missing `other_info` as "no match" so the boolean mask
# never contains NaN (which `.loc` would reject).
is_re_rating = mpaa_info['other_info'].str.contains('Re-Rating', na=False)
mpaa_info.loc[is_re_rating].sort_values('rating_year')

In [None]:
# These are all older movies that were originally released prior to rating reasons being required, will drop them.
# `na=False` treats a missing `other_info` as "no match" so the boolean mask
# never contains NaN (which `.loc` would reject).
is_re_issue = mpaa_info['other_info'].str.contains('Re-Issue', na=False)
mpaa_info.loc[is_re_issue]

In [None]:
# These are also all older movies that were re-rated to use the current rating system.
# `na=False` treats a missing `other_info` as "no match" so the boolean mask
# never contains NaN (which `.loc` would reject).
is_rating_symbol_change = mpaa_info['other_info'].str.contains('Rating Symbol', na=False)
mpaa_info.loc[is_rating_symbol_change]

In [None]:
# Drop MPAA-only rows (no IMDb id) whose `other_info` mentions a re-rating,
# a re-issue, or a rating-symbol change.
# After the outer merge, `other_info` is NaN for every IMDb-only row;
# `na=False` makes those rows an explicit non-match instead of relying on
# pandas coercing NaN inside the boolean operators.
other_info_text = outer_merge['other_info']
is_rerated_or_reissued = (
    other_info_text.str.contains('Re-Rating', na=False)
    | other_info_text.str.contains('Re-Issue', na=False)
    | other_info_text.str.contains('Rating Symbol', na=False)
)
is_mpaa_only_row = outer_merge['imdb_ids'].isna()
outer_merge = (
    outer_merge.loc[~(is_rerated_or_reissued & is_mpaa_only_row)]
    .reset_index(drop=True)
)

In [None]:
# Unmatched IMDb rows with no rating reasons, no certificate number, but a release note.
# After looking at a selection of these movies, they do not appear to have been
# rated by the MPAA, so their ids are collected for removal.
no_reason_no_cert_with_note = imdb_info.loc[
    (imdb_info['rating_reasons'] == 'None')
    & (imdb_info['mpaa_cert_#'] == 'None')
    & (imdb_info['release_note'].notna())
]
imdbids_to_remove_a = no_reason_no_cert_with_note['imdb_ids']

# Must be the last expression in the cell, otherwise the rows are never displayed.
no_reason_no_cert_with_note

In [None]:
# Drop the merged rows whose IMDb id was flagged in `imdbids_to_remove_a`.
flagged_for_removal_a = outer_merge['imdb_ids'].isin(imdbids_to_remove_a)
outer_merge = outer_merge[~flagged_for_removal_a]

In [None]:
# Unmatched IMDb rows whose rating reasons are an empty string.
# None of these 3 were rated by the MPAA despite IMDb having a certificate # for them,
# so their ids are collected for removal.
blank_reason_rows = imdb_info.loc[imdb_info['rating_reasons'] == '']
imdbids_to_remove_b = blank_reason_rows['imdb_ids']

# Must be the last expression in the cell, otherwise the rows are never displayed.
blank_reason_rows

In [None]:
# Drop the merged rows whose IMDb id was flagged in `imdbids_to_remove_b`.
flagged_for_removal_b = outer_merge['imdb_ids'].isin(imdbids_to_remove_b)
outer_merge = outer_merge[~flagged_for_removal_b]

In [None]:
# Unmatched IMDb rows tagged with the 'Short' genre that have no rating reasons.
# After looking at a selection of these movies, they do not appear to have been
# rated by the MPAA, so their ids are collected for removal.
# `na=False` treats a missing genre as "not a short" so the mask never contains NaN.
short_without_reason_rows = imdb_info.loc[
    (imdb_info['imdb_genres'].str.contains('Short', na=False))
    & (imdb_info['rating_reasons'] == 'None')
]
imdbids_to_remove_c = short_without_reason_rows['imdb_ids']

# Must be the last expression in the cell, otherwise the rows are never displayed.
short_without_reason_rows

In [None]:
# Drop the merged rows whose IMDb id was flagged in `imdbids_to_remove_c`.
flagged_for_removal_c = outer_merge['imdb_ids'].isin(imdbids_to_remove_c)
outer_merge = outer_merge[~flagged_for_removal_c]

In [None]:
# Second pass: recompute which merged rows are still missing one side.
lacks_mpaa_title_b = outer_merge['mpaa_title'].isna()
lacks_imdb_id_b = outer_merge['imdb_ids'].isna()
imdb_no_match_b = outer_merge[lacks_mpaa_title_b]
mpaa_no_match_b = outer_merge[lacks_imdb_id_b]

In [None]:
# Second pass: source rows for everything that is still unmatched.
is_unmatched_imdb_row_b = imdb_reasons['imdb_ids'].isin(imdb_no_match_b['imdb_ids'])
is_unmatched_mpaa_row_b = mpaa_details['mpaa_cert_#'].isin(mpaa_no_match_b['mpaa_cert_#'])
imdb_info_b = imdb_reasons[is_unmatched_imdb_row_b]
mpaa_info_b = mpaa_details[is_unmatched_mpaa_row_b]

In [None]:
# Report the remaining unmatched row counts (IMDb first, then MPAA).
for unmatched_frame_b in (imdb_info_b, mpaa_info_b):
    print(len(unmatched_frame_b))

In [None]:
# Unmatched IMDb rows with a missing `imdb_ratings` value and no rating reasons.
# After looking at a selection of these movies, they do not appear to have been
# rated by the MPAA, so their ids are collected for removal.
no_rating_no_reason_rows = imdb_info_b.loc[
    (imdb_info_b['imdb_ratings'].isna())
    & (imdb_info_b['rating_reasons'] == 'None')
]
imdbids_to_remove_d = no_rating_no_reason_rows['imdb_ids']

# Must be the last expression in the cell, otherwise the rows are never displayed.
no_rating_no_reason_rows

In [None]:
# Drop the merged rows whose IMDb id was flagged in `imdbids_to_remove_d`.
flagged_for_removal_d = outer_merge['imdb_ids'].isin(imdbids_to_remove_d)
outer_merge = outer_merge[~flagged_for_removal_d]

In [None]:
# Unmatched IMDb rows with release year '1991' (compared as a string) and no rating reasons.
# These movies were rated prior to reasons being required, so their ids are
# collected for removal.
released_1991_no_reason_rows = imdb_info_b.loc[
    (imdb_info_b['release_year'] == '1991')
    & (imdb_info_b['rating_reasons'] == 'None')
]
imdbids_to_remove_e = released_1991_no_reason_rows['imdb_ids']

# Must be the last expression in the cell, otherwise the rows are never displayed.
released_1991_no_reason_rows

In [None]:
# Drop the merged rows whose IMDb id was flagged in `imdbids_to_remove_e`.
flagged_for_removal_e = outer_merge['imdb_ids'].isin(imdbids_to_remove_e)
outer_merge = outer_merge[~flagged_for_removal_e]

In [None]:
# Third pass: recompute which merged rows are still missing one side.
lacks_mpaa_title_c = outer_merge['mpaa_title'].isna()
lacks_imdb_id_c = outer_merge['imdb_ids'].isna()
imdb_no_match_c = outer_merge[lacks_mpaa_title_c]
mpaa_no_match_c = outer_merge[lacks_imdb_id_c]

In [None]:
# Third pass: source rows for everything that is still unmatched.
is_unmatched_imdb_row_c = imdb_reasons['imdb_ids'].isin(imdb_no_match_c['imdb_ids'])
is_unmatched_mpaa_row_c = mpaa_details['mpaa_cert_#'].isin(mpaa_no_match_c['mpaa_cert_#'])
imdb_info_c = imdb_reasons[is_unmatched_imdb_row_c]
mpaa_info_c = mpaa_details[is_unmatched_mpaa_row_c]

In [None]:
# Report the remaining unmatched row counts (IMDb first, then MPAA).
for unmatched_frame_c in (imdb_info_c, mpaa_info_c):
    print(len(unmatched_frame_c))

In [None]:
# Unmatched IMDb rows that carry a certificate number but have no rating reasons.
# These movies do not align with any MPAA rated movies OR they were rated prior
# to rating reasons, so their ids are collected for removal.
cert_without_reason_rows = imdb_info_c.loc[
    (imdb_info_c['mpaa_cert_#'] != 'None')
    & (imdb_info_c['rating_reasons'] == 'None')
]
imdbids_to_remove_f = cert_without_reason_rows['imdb_ids']

# Must be the last expression in the cell, otherwise the rows are never displayed.
cert_without_reason_rows

In [None]:
# Drop the merged rows whose IMDb id was flagged in `imdbids_to_remove_f`.
flagged_for_removal_f = outer_merge['imdb_ids'].isin(imdbids_to_remove_f)
outer_merge = outer_merge[~flagged_for_removal_f]

Now I need to deal with the movies that appear multiple times, sometimes with complete information and sometimes without.

In [None]:
# IMDb ids from the unmatched set that nevertheless also appear on a merged row
# that does have an MPAA title, i.e. the id occurs more than once.
is_in_unmatched_set = outer_merge['imdb_ids'].isin(imdb_info_c['imdb_ids'])
has_mpaa_title = ~outer_merge['mpaa_title'].isna()
dup_ids = outer_merge.loc[is_in_unmatched_set & has_mpaa_title, 'imdb_ids']

In [None]:
# Index labels of the title-less copies of those duplicated IMDb ids.
is_duplicated_id = outer_merge['imdb_ids'].isin(dup_ids)
is_titleless_copy = outer_merge['mpaa_title'].isna()
dup_imbdid_indices = outer_merge[is_duplicated_id & is_titleless_copy].index

In [None]:
# Remove the title-less duplicate rows by label, then renumber the index.
outer_merge = outer_merge.drop(dup_imbdid_indices, axis='index').reset_index(drop=True)

In [None]:
# The duplicated ids are matched after all, so take them out of the unmatched set.
is_resolved_duplicate = imdb_info_c['imdb_ids'].isin(dup_ids)
imdb_info_c = imdb_info_c[~is_resolved_duplicate]

In [None]:
# Fourth pass: recompute which merged rows are still missing one side.
lacks_mpaa_title_d = outer_merge['mpaa_title'].isna()
lacks_imdb_id_d = outer_merge['imdb_ids'].isna()
imdb_no_match_d = outer_merge[lacks_mpaa_title_d]
mpaa_no_match_d = outer_merge[lacks_imdb_id_d]

In [None]:
# Fourth pass: source rows for everything that is still unmatched.
is_unmatched_imdb_row_d = imdb_reasons['imdb_ids'].isin(imdb_no_match_d['imdb_ids'])
is_unmatched_mpaa_row_d = mpaa_details['mpaa_cert_#'].isin(mpaa_no_match_d['mpaa_cert_#'])
imdb_info_d = imdb_reasons[is_unmatched_imdb_row_d]
mpaa_info_d = mpaa_details[is_unmatched_mpaa_row_d]

In [None]:
# Report the remaining unmatched row counts (IMDb first, then MPAA).
for unmatched_frame_d in (imdb_info_d, mpaa_info_d):
    print(len(unmatched_frame_d))

Now comes the more complicated efforts to merge the datasets

First, I will save the component of the merged data that worked just using the MPAA certificate number, removing the remaining portions that didn't merge.

I will then attempt to merge the remaining data (including those elements of each data set that had duplicate MPAA certificate numbers) based on movie title (after removing "The", making lower case, and replacing all & symbols with "and") along with other data (rating, maybe years).

In [None]:
# Keep the rows that matched on certificate number: neither the IMDb id nor the
# certificate number may belong to the still-unmatched sets.
imdb_side_is_matched = ~outer_merge['imdb_ids'].isin(imdb_info_d['imdb_ids'])
mpaa_side_is_matched = ~outer_merge['mpaa_cert_#'].isin(mpaa_info_d['mpaa_cert_#'])
main_merge = outer_merge[imdb_side_is_matched & mpaa_side_is_matched].reset_index(drop=True)

In [None]:
# Pool the still-unmatched rows with the duplicate-certificate rows set aside
# earlier; these are the candidates for title-based matching.
imdbs = pd.concat([imdb_info_d, imdb_dup_certs])
mpaas = pd.concat([mpaa_info_d, mpaa_dup_certs])
for candidate_frame in (imdbs, mpaas):
    print(len(candidate_frame))

In [None]:
# Build the normalised join key: lower-case, '&' -> 'and', every 'the ' / 'a '
# substring removed, then surrounding punctuation and spaces stripped.
# The same substitutions are applied on the MPAA side, so the two keys stay comparable.
imdb_match_key = imdbs['imdb_titles'].str.lower()
for old_text, new_text in (('&', 'and'), ('the ', ''), ('a ', '')):
    imdb_match_key = imdb_match_key.str.replace(old_text, new_text)
imdbs['match_titles'] = imdb_match_key.str.strip('.:!? ,')
imdbs

In [None]:
# Build the normalised join key for the MPAA side: lower-case, '&' -> 'and',
# every 'the ' / 'a ' substring removed (same as the IMDb side), article
# suffixes such as "Matrix, The" removed, then punctuation/spaces stripped.
mpaas['match_titles'] = (
    mpaas['mpaa_title']
    .str.lower()
    .str.replace('&', 'and')
    .str.replace('the ', '')
    .str.replace('a ', '')
    # Only strip ", the" / ", a" when the article is a whole word. The previous
    # plain ', a' replacement also ate the start of the next word (", and" ->
    # "nd"), which made the key differ from the IMDb side for such titles.
    .str.replace(r', (?:the|a)\b', '', regex=True)
    .str.strip('.:!? ,')
)
mpaas

In [None]:
# Show the MPAA rows whose original title contains ', The'.
title_has_comma_the = mpaas['mpaa_title'].str.contains(', The')
mpaas[title_has_comma_the]

In [None]:
# Inner join on normalised title + rating + year: only rows that agree on all
# three keys survive.
title_rating_year_merge = imdbs.merge(
    mpaas,
    how='inner',
    left_on=['match_titles', 'imdb_mpaas', 'release_year'],
    right_on=['match_titles', 'mpaa_rating', 'rating_year'],
)
title_rating_year_merge

Going to have to think how to remove the MPAA rows out of mpaas dataframe since cert numbers can no longer be assumed to be unique

Need to remove Sam and Kate (as opposed to Sam & Kate) for imdb_title from the dataframe

In [None]:
# `title_merge` was used here without ever being assigned in the notebook, so
# the cell failed on a fresh kernel.
# NOTE(review): reconstructed as an outer merge on the normalised title alone,
# because the cells that use it look for rows missing either side — confirm
# this is the merge that was intended.
title_merge = pd.merge(imdbs, mpaas, on='match_titles', how='outer')

# Rows where both the IMDb side and the MPAA side are present.
present_on_both_sides = title_merge['imdb_ids'].notna() & title_merge['mpaa_title'].notna()
title_merge.loc[present_on_both_sides]

In [None]:
# Rows of the title-based merge that have no MPAA title.
# Assumes `title_merge` has been defined by an earlier cell.
title_merge_lacks_mpaa = title_merge['mpaa_title'].isna()
title_merge[title_merge_lacks_mpaa]

In [None]:
# Look up a single certificate number (stored as a string) in the MPAA table.
is_cert_32854 = mpaa_details['mpaa_cert_#'].eq('32854')
mpaa_details[is_cert_32854]