In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
#load the raw export, discarding the unnamed column it carries
all_mpaa_info = (
    pd.read_csv('../data/mpaa_data.csv')
    .drop(columns='Unnamed: 0')
)

#working copy without fully identical rows
mpaa_info = all_mpaa_info.drop_duplicates()
mpaa_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21907 entries, 0 to 21928
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        21907 non-null  object
 1   rating       21907 non-null  object
 2   reason       21054 non-null  object
 3   distributor  21894 non-null  object
 4   alt_titles   6274 non-null   object
 5   other        1519 non-null   object
 6   year         21907 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.3+ MB


In [3]:
#number of rows per rating value in the de-duplicated table
mpaa_info['rating'].value_counts()

R        13266
PG-13     4832
PG        2908
G          845
NC-17       56
Name: rating, dtype: int64

In [None]:
#dropping reissued movies
#rows with no 'other' note count as "not a re-issue" (na=False)
is_reissue = mpaa_info['other'].str.contains('Re-Issue', na=False)
mpaa_info = mpaa_info.loc[~is_reissue]
mpaa_info.info()

In [None]:
#dropping movies that only appear in this time frame because their rating symbol was updated
#a missing 'other' note is treated as "symbol not changed" (na=False)
symbol_changed = mpaa_info['other'].str.contains('Rating Symbol Changed', na=False)
mpaa_info = mpaa_info.loc[~symbol_changed]
mpaa_info.info()

In [None]:
#dropping movies that only appear in this time frame because they were re-rated for release on home video
#a row is removed only when BOTH conditions hold; missing values never match
was_rerated = mpaa_info['other'].str.contains('Re-Rating', na=False)
home_distributor = mpaa_info['distributor'].str.contains('Home ', na=False)
mpaa_info = mpaa_info.loc[~(was_rerated & home_distributor)]
mpaa_info.info()

In [None]:
#dropping rows that are only different in alt_titles or other columns (4 rows)
#rows are considered the same entry when these three fields agree; the first occurrence is kept
dedupe_keys = ['title', 'reason', 'year']
mpaa_info = mpaa_info.drop_duplicates(subset=dedupe_keys, keep='first')
mpaa_info.info()

In [None]:
#clean up the distributor column a bit

#Ordered (old, new) substitutions, applied top to bottom as LITERAL text.
#Order matters: several entries come in pairs that first collapse a long
#spelling to a short one and then expand every short spelling back to the
#long one, so both variants end up with a single canonical name.
DISTRIBUTOR_REPLACEMENTS = [
    (' Inc.', ''),
    (' LLC', ''),
    (' L.P.', ''),
    ('Co.', 'Company'),
    ('Corp.', 'Corporation'),
    ('Lions Gate', 'Lionsgate'),
    ('Lionsgate Entertainment', 'Lionsgate'),
    ('Paramount Classics, A Division Of Paramount Pictures Corporation', 'Paramount Classics'),
    ('Paramount Classics', 'Paramount Classics, A Division Of Paramount Pictures Corporation'),
    ('Paramount Pictures Corporation', 'Paramount Pictures'),
    ('Paramount Pictures', 'Paramount Pictures Corporation'),
    ('Universal Home Entertainment Productions', 'Universal Home Entertainment'),
    ('Universal Home Entertainment', 'Universal Pictures Home Entertainment'),
    ('Universal Studios Home Entertainment Productions', 'Universal Studios Home Entertainment'),
    ('Universal 1440 Entertainment', 'Universal 1440'),
    ('Universal 1440', 'Universal 1440 Entertainment'),
    ('Ltd.', 'Limited'),
    #NOTE(review): this pair also rewrites words that merely start with
    #"Distribut" (e.g. "Distributors" -> "Distributionors") -- confirm that is intended
    ('Distribution', 'Distribut'),
    ('Distribut', 'Distribution'),
    ('Walt Disney Studios Motion Pictures', 'Walt Disney Studios Motion Picture'),
    ('Walt Disney Studios Motion Picture', 'Walt Disney Studios Motion Pictures'),
    ('Trimark Pictures A Division Of Vidmark', 'Trimark Pictures, A Division Of Vidmark'),
    ('Screen Media Ventures', 'Screen Media'),
    ('Screen Media Films', 'Screen Media'),
    ('Screen Media', 'Screen Media Ventures'),
    ("Int'l", 'International'),
    ('Weinstein Company,, The', 'The Weinstein Company'),
    ('Weinstein Company, The', 'The Weinstein Company'),
    ('Weinstein Company, LCC, The', 'The Weinstein Company'),
    ('Concorde-New Horizons Pictures Corporation', 'Concorde - New Horizons Corporation'),
    ('  ', ' '),
]


def normalize_distributor(names):
    """Return a copy of ``names`` with distributor spellings normalised.

    Parameters
    ----------
    names : pandas.Series
        Distributor names; missing values are passed through unchanged.

    Returns
    -------
    pandas.Series
        Same index as ``names``, with every entry of
        ``DISTRIBUTOR_REPLACEMENTS`` applied in order, followed by removal of
        leading/trailing commas and then leading/trailing whitespace.
    """
    cleaned = names
    for old, new in DISTRIBUTOR_REPLACEMENTS:
        #regex=False: the '.' in ' Inc.', 'Co.', 'Ltd.' etc. must match a real
        #period only, and the result must not depend on the pandas version's
        #default for the regex argument
        cleaned = cleaned.str.replace(old, new, regex=False)
    return cleaned.str.strip(',').str.strip()


#assign() builds a new frame instead of writing a column into a filtered
#slice, which avoids SettingWithCopyWarning
mpaa_info = mpaa_info.assign(distributor=normalize_distributor(mpaa_info['distributor']))

In [None]:
#number of rows per distributor name, most frequent first
mpaa_info['distributor'].value_counts()

In [None]:
#build a profiling report object over the current state of mpaa_info
profile = ProfileReport(mpaa_info, title="Pandas Profiling Report")

In [None]:
#render the profiling report inline in the notebook
profile.to_notebook_iframe()

In [None]:
#pulling instances where "Rerate After Appeal" is noted in the "other" column
#rows without any note are treated as non-matches
rerated_after_appeal = mpaa_info['other'].str.contains("Rerate After Appeal", na=False)
rerate_titles = mpaa_info.loc[rerated_after_appeal, 'title']

In [None]:
#looking specifically at instances when the rerating may have resulted in a second entry for the same film
#both masks are built on the full table and combined, so no index re-alignment is needed
is_rerate_title = mpaa_info['title'].isin(rerate_titles)
shares_title_and_year = mpaa_info.duplicated(subset=['title', 'year'], keep=False)
mpaa_info.loc[is_rerate_title & shares_title_and_year].sort_values('title')

In [None]:
#looking for instances when the rerate may have occurred in a different, but adjacent, year
#here a title counts as repeated whenever it occurs more than once, regardless of year
is_rerate_title = mpaa_info['title'].isin(rerate_titles)
title_repeats = mpaa_info['title'].duplicated(keep=False)
mpaa_info.loc[is_rerate_title & title_repeats].sort_values('title')

In [None]:
#pulling instances of any type of appeal that was noted in the "other" column
#rows without any note are treated as non-matches
mentions_appeal = mpaa_info['other'].str.contains("Appeal", na=False)
appeal_titles = (
    mpaa_info
    .loc[mentions_appeal]
    .sort_values('title')
    ['title']
)

appeal_titles

In [None]:
#looking for instances when the appeal appears to have created a second entry for the same film
#both masks are built on the full table and combined, so no index re-alignment is needed
is_appeal_title = mpaa_info['title'].isin(appeal_titles)
shares_title_and_year = mpaa_info.duplicated(subset=['title', 'year'], keep=False)
mpaa_info.loc[is_appeal_title & shares_title_and_year].sort_values('title')

In [None]:
#appeal titles that occur more than once in the table, regardless of year
is_appeal_title = mpaa_info['title'].isin(appeal_titles)
title_repeats = mpaa_info['title'].duplicated(keep=False)
mpaa_info.loc[is_appeal_title & title_repeats].sort_values('title')

In [None]:
#Edited versions can be truly alternate versions that were released, trying to not count those as duplicates
#The duplicate flag is computed AFTER the "Edited Version" rows are removed:
#flagging on the full table would still report a film whose only
#same-title/same-year twin is its edited version.
non_edited = mpaa_info.loc[~mpaa_info['other'].str.contains("Edited Version", na=False)]
(
    non_edited
    .loc[non_edited.duplicated(subset=['title', 'year'], keep=False)]
    .sort_values('title')
    .tail(40)
)

In [None]:
#here looking at Re-Rating NOT associated with appeals, necessarily
#rows without any note are treated as non-matches
has_rerating_note = mpaa_info['other'].str.contains("Re-Rating", na=False)
mpaa_info.loc[has_rerating_note].sort_values('title')

In [None]:
#export of the cleaned table is currently disabled; uncomment to write it
#NOTE(review): consider passing index=False when enabling this -- otherwise the
#index is written as an unnamed first column, like the 'Unnamed: 0' column
#that has to be dropped after loading the raw file
#mpaa_info.to_csv('../data/clean_mpaa_data.csv')