In [None]:
import pandas as pd
import numpy as np
import pickle
import re

In [None]:
# Load the pickled DataFrame that the rest of this notebook cleans and summarises.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
imdb_mpaa = pd.read_pickle('../data/final_imdb_mpaa.pkl')

In [None]:
# Cast the raw columns to proper numeric dtypes in a single pass.
# Existing columns are replaced in place; `gross_num` is appended as a new column.
imdb_mpaa = imdb_mpaa.assign(
    # Blank metascores become missing values so the column can be a nullable integer.
    metascores=lambda frame: frame['metascores'].replace('', np.nan).astype("Int64"),
    imdb_ratings=lambda frame: frame['imdb_ratings'].astype(float),
    # Strip leading/trailing 'M' and '$' characters from `gross`, then multiply by 1,000,000.
    gross_num=lambda frame: frame['gross'].str.strip('M$').astype(float) * 1_000_000,
    # Nullable Int64 keeps the years integral even when some are missing.
    release_year=lambda frame: frame['release_year'].astype("Int64"),
    rating_year=lambda frame: frame['rating_year'].astype("Int64"),
)

In [None]:
# Pre-multiply votes by rating so that a vote-weighted average rating for any
# group is simply sum(vote*rating) / sum(votes).
imdb_mpaa['vote*rating'] = imdb_mpaa['votes'].mul(imdb_mpaa['imdb_ratings'])

In [None]:
# Inspect column dtypes and non-null counts of the frame at this point.
imdb_mpaa.info()

In [None]:
# Named aggregations for the per-rating summary: output column -> (source column, function).
# Dict order determines the column order of the resulting table.
rating_summary_spec = {
    # Box-office gross
    'count_gross': ('gross_num', 'count'),
    'avg_gross': ('gross_num', 'mean'),
    'med_gross': ('gross_num', 'median'),
    'max_gross': ('gross_num', 'max'),
    'sum_gross': ('gross_num', 'sum'),
    # Vote counts
    'count_votes': ('votes', 'count'),
    'avg_votes': ('votes', 'mean'),
    'med_votes': ('votes', 'median'),
    'max_votes': ('votes', 'max'),
    'sum_votes': ('votes', 'sum'),
    # Numerator for a vote-weighted average rating
    'sum_vote_rating': ('vote*rating', 'sum'),
    # IMDb ratings
    'count_imdb_ratings': ('imdb_ratings', 'count'),
    'avg_imdb_ratings': ('imdb_ratings', 'mean'),
    'med_imdb_ratings': ('imdb_ratings', 'median'),
    'max_imdb_ratings': ('imdb_ratings', 'max'),
    # Metascores
    'count_metascores': ('metascores', 'count'),
    'avg_metascores': ('metascores', 'mean'),
    'med_metascores': ('metascores', 'median'),
    'max_metascores': ('metascores', 'max'),
    # Number of non-null titles per rating
    'total_movies': ('clean_titles', 'count'),
}

imdb_mpaa.groupby('mpaa_rating').agg(**rating_summary_spec)

In [None]:
# Display the current column names.
imdb_mpaa.columns

In [None]:
# Current column name -> name used by the data setup in the Shiny app.
shiny_column_names = {
    'clean_titles': 'title',
    'rating_year': 'year',
    'mpaa_rating': 'rating',
    'mpaa_reason': 'reason',
    'mpaa_cert_#': 'mpaa_certificate',
    'vote*rating': 'vote_x_rating',
}
imdb_mpaa = imdb_mpaa.rename(columns=shiny_column_names)

In [None]:
# Restrict the data to rating years that can be compared fairly:
#   * 2023 is excluded because a full year of data is not available;
#   * 1991 and earlier are excluded because rating reasons were not yet required.
# Rows whose year is missing fail both comparisons and are dropped as well.
rated_year = imdb_mpaa['year']
is_complete_year = rated_year < 2023
has_required_reasons = rated_year > 1991
imdb_mpaa = imdb_mpaa.loc[is_complete_year & has_required_reasons].reset_index(drop=True)

In [None]:
# Three non-G movies from 1992 have an empty reason; drop them too.
# A row is kept when it either has a reason or is rated G.
has_reason = imdb_mpaa['reason'] != ''
is_g_rated = imdb_mpaa['rating'] == 'G'
imdb_mpaa = imdb_mpaa.loc[has_reason | is_g_rated].reset_index(drop=True)

In [None]:
# Give every G-rated movie the fixed reason text 'Rated G'.
g_rated_rows = imdb_mpaa['rating'].eq('G')
imdb_mpaa.loc[g_rated_rows, 'reason'] = 'Rated G'

In [None]:
# Re-check row count, dtypes and non-null counts of the frame at this point.
imdb_mpaa.info()

In [None]:
# Show rows whose title contains the literal text "CERTIFICATE".
# na=False keeps the mask strictly boolean when a title is missing (a NaN in the
# mask makes .loc raise); regex=False matches the text literally.
imdb_mpaa.loc[imdb_mpaa['title'].str.contains("CERTIFICATE", na=False, regex=False)]

In [None]:
# Replace the title of every row containing "Beginner's Guide To Skinny" with the
# full, correctly punctuated title.
# na=False keeps the mask strictly boolean when a title is missing (a NaN in the
# mask makes .loc raise); regex=False matches the text literally.
skinny_dipping_rows = imdb_mpaa['title'].str.contains("Beginner's Guide To Skinny", na=False, regex=False)
imdb_mpaa.loc[skinny_dipping_rows, "title"] = "The Beginner's Guide to Skinny-Dipping"

In [None]:
# Replace the title of every row containing "Babies" with just "Babies".
# NOTE(review): this is a substring match, so any title that contains "Babies"
# is overwritten -- confirm that only the intended movie matches.
# na=False keeps the mask strictly boolean when a title is missing (a NaN in the
# mask makes .loc raise); regex=False matches the text literally.
babies_rows = imdb_mpaa['title'].str.contains("Babies", na=False, regex=False)
imdb_mpaa.loc[babies_rows, "title"] = "Babies"

In [None]:
# Replace the title of every row containing "Hobbit: An" with the Extended Edition title.
# NOTE(review): this is a substring match, so every row whose title contains
# "Hobbit: An" receives the Extended Edition name -- confirm that no other cut
# of the film is present in the data.
# na=False keeps the mask strictly boolean when a title is missing (a NaN in the
# mask makes .loc raise); regex=False matches the text literally.
hobbit_rows = imdb_mpaa['title'].str.contains("Hobbit: An", na=False, regex=False)
imdb_mpaa.loc[hobbit_rows, "title"] = "The Hobbit: An Unexpected Journey Extended Edition"

In [None]:
# Write the cleaned frame to CSV.
# NOTE(review): to_csv defaults to index=True, so the integer row index is written
# as an unnamed first column -- confirm the downstream consumer expects that.
imdb_mpaa.to_csv('../data/imdb_mpaa_clean.csv')

In [None]:
# Summary statistics for the `votes` column of the cleaned data.
imdb_mpaa['votes'].describe()

In [None]:
# Show rows whose title contains "Showgirls".
# na=False keeps the mask strictly boolean when a title is missing (a NaN in the
# mask makes .loc raise); regex=False matches the text literally.
imdb_mpaa.loc[imdb_mpaa['title'].str.contains('Showgirls', na=False, regex=False)]

In [None]:
# Rows that have a vote count recorded but no IMDb rating.
has_votes = imdb_mpaa['votes'].notna()
lacks_rating = imdb_mpaa['imdb_ratings'].isna()
imdb_mpaa.loc[has_votes & lacks_rating]

In [None]:
# Mean gross per (rating, year), computed only from rows with a known gross.
(
    imdb_mpaa
    .dropna(subset=['gross_num'])
    .groupby(['rating', 'year'])['gross_num']
    .mean()
    .reset_index()
)

In [None]:
# How often each rating reason occurs among rows with a known gross.
gross_is_known = imdb_mpaa['gross_num'].notna()
imdb_mpaa.loc[gross_is_known, 'reason'].value_counts()

In [None]:
# Show the reason text stored for G-rated movies.
g_rated_only = imdb_mpaa['rating'].eq('G')
imdb_mpaa.loc[g_rated_only, 'reason']