In [1]:
import pandas as pd
import numpy as np
import math
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the raw movie metadata table from the CSV file in the working directory.
movies = pd.read_csv("movie_metadata.csv")

In [3]:
# Turn the pipe-delimited 'genres' and 'plot_keywords' strings into Python lists and
# store them in new columns on `movies`. Rows with a missing value stay NaN.
movies['genre_list'] = movies['genres'].str.split('|')
movies['keyword_list'] = movies['plot_keywords'].str.split('|')


# Keep only the columns with items that movie studios have control over.
# Take an explicit copy so this frame is independent of `movies`.
theatre_choices = movies[['color', 'director_name', 'duration',
                          'actor_2_name', 'actor_1_name', 'movie_title', 'actor_3_name',
                          'facenumber_in_poster', 'language', 'country', 'content_rating',
                          'budget', 'title_year', 'gross', 'aspect_ratio',
                          'genre_list', 'keyword_list']].copy()


# Every distinct genre label that appears in any movie's genre list.
# Missing genre lists (NaN) are skipped instead of raising a TypeError.
genres = {genre
          for genre_list in theatre_choices['genre_list'].dropna()
          for genre in genre_list}

# Create one boolean column per genre.
# Approach adapted from:
# https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
#
# Iterate in sorted order: a set of strings has no stable iteration order between
# kernel restarts, which would otherwise shuffle the genre columns from run to run.
# `genre=genre` binds the current label to the lambda at definition time.
genre_dict = {
    genre: theatre_choices['genre_list'].apply(
        lambda entries, genre=genre: isinstance(entries, list) and genre in entries
    )
    for genre in sorted(genres)
}

# Assemble the flags into a dataframe and convert True/False to 1/0.
# The explicit index keeps the frame row-aligned with `theatre_choices` even when
# no genre was found at all.
genre_frame = pd.DataFrame(genre_dict, index=theatre_choices.index).astype(int)


# Attach the genre indicator columns to the studio-controlled columns, matching on row index.
studios_complete = pd.merge(theatre_choices, genre_frame, left_index=True, right_index=True)


In [4]:
# In the inflation document, the countries on the left were given the values for the
# country or region on the right:
# Argentina: Latin America & Caribbean
# Finland: EU
# Indonesia: East Asia & Pacific (excluding high income)
# New Line: looked up, changed to USA
# Official site: looked up, changed to USA
# South Korea: East Asia & Pacific
# Taiwan: China
# Thailand: East Asia & Pacific (excluding high income)
# West Germany: Russia
# Soviet Union: Russia
inflation = pd.read_csv("price_index.csv")

# Replace missing values in these columns with the placeholder "Missing".
# The result is assigned back to the frame: calling fillna(inplace=True) on a column
# selection is a chained in-place operation that is not guaranteed to modify the parent
# frame (it is deprecated and is a no-op under pandas copy-on-write).
strings = ['color', 'director_name',  'actor_2_name', 'actor_1_name', 'content_rating',
           'movie_title', 'actor_3_name',  'language', 'country', 'genre_list', 'aspect_ratio']

for item in strings:
    studios_complete[item] = studios_complete[item].fillna("Missing")


# Calculate the mean price index for each country across all numeric (year) columns.
# It is used for movies released before 1960 and wherever a year's value is missing.
inflation['mean'] = inflation.mean(axis=1, numeric_only=True)
inflation = inflation.set_index('country')

# Fill missing years with 2005 and missing countries with USA (most common).
studios_complete['title_year'] = studios_complete['title_year'].fillna(2005).astype(int)
studios_complete['country'] = studios_complete['country'].replace('Missing', 'USA')


def _inflation_rate(price_index, country, year):
    """Return the price index to use for a movie from `country` released in `year`.

    `price_index` is indexed by country name and has one column per year (named by the
    year as a string) plus a 'mean' column. Movies from 1960 onwards use their release
    year's value; earlier movies, and years whose value is NaN, use the country mean.

    Raises KeyError when the country (or, for year >= 1960, the year column) is not
    present in `price_index`.
    """
    # Select by a list of labels and take the first match so a duplicated country
    # label still yields a single row.
    country_row = price_index.loc[[str(country)]].iloc[0]
    if year >= 1960:
        rate = country_row[str(year)]
        if pd.notna(rate):
            return rate
    return country_row['mean']


# Determine the inflation rate for each movie individually.
# (Assigning a scalar to studios_complete['rate'] inside a row loop would overwrite the
# whole column on every iteration and leave every movie with the last row's rate.)
studios_complete['rate'] = [
    _inflation_rate(inflation, country, year)
    for year, country in zip(studios_complete['title_year'], studios_complete['country'])
]

# Find budget and gross adjusted for inflation.
studios_complete['budget_infl'] = (studios_complete['budget'] / studios_complete['rate']) * 100
studios_complete['gross_infl'] = (studios_complete['gross'] / studios_complete['rate']) * 100

# Record whether a title carries a TV rating, then consolidate ratings into fewer categories.
# 'format' must be computed before the TV-* ratings are replaced below.
studios_complete['format'] = np.where(studios_complete['content_rating'].str.match('TV'), "TV", "Movie")
# A dict replace is applied in one pass (replacements are not chained), so 'Passed' is
# mapped straight to 'Unrated'; mapping it to 'Not Rated' would leave a stray 'Not Rated'
# category behind after the original 'Not Rated' entries were renamed.
studios_complete['content_rating'] = studios_complete['content_rating'].replace(
    {'Not Rated': 'Unrated', 'Approved': 'Unrated',
     'TV-G': 'G',  'TV-PG': 'PG', 'TV-MA': 'R',
     'TV-Y': 'G', 'TV-14': 'PG-13', 'Passed': 'Unrated',
     'TV-Y7': 'PG', 'M': 'PG', 'GP': 'PG'})

studios_complete['profit'] = studios_complete['gross_infl'] - studios_complete['budget_infl']

# Drop rows that repeat the same movie, judged on the original descriptive columns only.
studios_complete = studios_complete.drop_duplicates(
    subset=['color', 'director_name', 'duration',
            'actor_2_name', 'actor_1_name', 'movie_title', 'actor_3_name',
            'facenumber_in_poster', 'language', 'country', 'content_rating',
            'budget', 'title_year', 'gross', 'aspect_ratio'])

In [5]:
# Build a profiling report for the cleaned studio dataset and export it as an
# HTML file in the working directory.
studios_profile = ProfileReport(
    studios_complete,
    title="Movie Report",
)
studios_profile.to_file("studios.html")


HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=62.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


