In [1]:
# to store and manipulate data
import pandas as pd
import numpy as np
import math

# for quick data analysis
from pandas_profiling import ProfileReport

# for normalizing profits
from sklearn.preprocessing import MinMaxScaler

# for measuring the accuracy of the predictions with bag of words
from sklearn import metrics

# used to select most predictive key words
from textblob.classifiers import NaiveBayesClassifier
from textblob.blob import TextBlob 

## for processing
import re
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, metrics 
from sklearn import pipeline, manifold, preprocessing, feature_selection

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tasbe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tasbe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
movies_df = pd.read_csv('movie_metadata.csv')

#creates a profile document which analyses each of the variables in the dataframe 
# this takes awhile uncomment code below to run

# movies_profile = ProfileReport(movies_df, title = "Movie Report", explorative=True)
# movies_profile.to_file("movies.html")

# Information Gained From the Profile

**Numerical**

director_facebook_likes, *duration*,  actor_1_facebook_likes, actor_2_facebook_likes, actor_3_facebook_likes, cast_total_facebook_likes, movie_facebook_likes, 

num_voted_users, num_critic_for_reviews,  num_user_for_reviews   

gross, budget,

*imbd_score*, facenumber_in_poster

*Italitized items have close to a normal distribution. The rest of the values are skewed.*

**Categorical**

*director_name, actor_1_name, actor_2_name, actor_3_name*

language, country, content_rating

*movie_title, movie_imdb_link*,  aspect_ratio, color

*Italitized items have high cardinality.*

**Imbalanced Variables**: color, language, country

**List Variables**: genres, plot keywords

**Large Percentage Missing**: director_facebook_likes, gross, budget, title_year

**Extreme Values** : Most of the numerical values have outliers/extreme values. 

# Getting More Accurate Data

: When looking through the budget and profit data to explore some of the more extreme values, 
: I noticed that the gross value seemed to refer to different IMBD values.  Sometimes it referred to the
first weekend in the US and Canada and other times it was close, but not an exact match, to the all time gross.
This would obviously affect the results of the income.  To solve this problem, I downloaded world-wide revenue 
values from [The Movie Database](https://developers.themoviedb.org/3). I also downloaded the budget values. With this information, I calculated the profits for each movie. This was done in this [notebook](https://github.com/MariannBea/Movie-Studio-Analysis/blob/1ea166b5340df00c818f158ac52404fd979c0e74/Notebooks/Get%20Budget%20Info.ipynb)

This information will be mereged with the movies_df in the cell below.

In [3]:
movies_df.drop(columns = ['gross', 'budget'], inplace = True)

# read budget, revenue and profit data in from csv.
budget_data = pd.read_csv('movie_profits.csv')
budget_data.drop_duplicates(inplace = True)
budget_data.columns = ['movie_title', 'budget', 'revenue', 'profit']

movies_df = pd.merge(movies_df, budget_data, on='movie_title')

In [4]:
#analyse the merged dataframe
# uncomment the lines below to run the profile report again

# movies_profile = ProfileReport(movie_info, title = "Merged_Movies", explorative=True)
# movies_profile.to_file("merged_movies.html")

In [5]:
# remove 0 values for budget and revenue. This was the value used by the Movie Database to indicate
# a value had not yet been entered.
movies_df = movies_df[movies_df.budget != 0]
movies_df = movies_df[movies_df.revenue != 0]

Profit information was adjusted for inflation over time. 

Inflation information was found at: https://data.worldbank.org/indicator/FP.CPI.TOTL.ZG 

Some countries were not present in the data. The countries on the left were give the values for the country or region on the right as they were closest economic match. 

* Argentina: Latin America & Caribbean 
* Finland: EU Indonesia: East Asia & Pacific (excluding high income) 
* South Korea: East Asia & Pacific 
* Taiwan: China 
* Thailand: East Asia & Pacific (excluding high income) 
* West Germany - Russia Soviet Union - Russia

In [6]:
inflation = pd.read_csv("price_index.csv")

# calculate mean inflation rate for each country.  Use this rate to fill in NaN values    
inflation['mean'] = inflation.mean(axis = 1, numeric_only = True)
inflation = inflation.set_index('country')

# fill missing years with 2005, missing countries with USA (most common)
movies_df['title_year'].fillna(2005, inplace = True)
movies_df['country'].fillna('USA', inplace = True)

#determine inflation rate for each movie, movies before 1960 given mean inflation rate
movies_df['title_year'] = movies_df['title_year'].astype(int)

for index, year, country in movies_df[['title_year', 'country']].itertuples():
    stryear = str(year)
    strcountry = str(country)
    if year >= 1960:
        movies_df['rate'] = inflation.loc[[strcountry],[stryear]].iloc[0][0]
    else:
        movies_df['rate'] = inflation.loc[[strcountry],['mean']].iloc[0][0]

# find budget, gross and profits adjusted for inflation
movies_df['budget_infl'] = (movies_df['budget']/movies_df['rate']) * 100
movies_df['gross_infl'] = (movies_df['revenue']/movies_df['rate']) * 100
movies_df['profit_infl'] = (movies_df['profit']/movies_df['rate']) * 100

movies_df.drop_duplicates(inplace = True)

In [22]:
# find the firstand third quartile amounts to use to categorize the values
Q1 = movies_df['profit_infl'].quantile(0.25)
Q3 = movies_df['profit_infl'].quantile(0.75) 

#normalize the inflation-adjusted profit values
scaler = MinMaxScaler()
movies_df['normalized_profit'] = scaler.fit_transform(movies_df['profit_infl'].values.reshape(-1, 1))

11929309505.107433


In [8]:
# divide the profits into categories to use when creating decision trees
movies_df['profit_str'] = None

movies_df['profit_str'] = np.where(movies_df['profit_infl'] >= Q3, 'Success', movies_df['profit_str'])

movies_df['profit_str'] = np.where(movies_df['profit_infl'].between(Q1, Q3),
                                              'Some Profits', movies_df['profit_str'])
movies_df['profit_str'] = np.where(movies_df['profit_infl'].between(0, Q1),
                                              'Low Profits', movies_df['profit_str'])
movies_df['profit_str'] = np.where(movies_df['profit_infl'] <= 0,
                                              'Failure', movies_df['profit_str'])

#drop any values that were not filled
movies_df.dropna(how='any', inplace=True)

#check to see how many items are in each category
print(movies_df['profit_str'].value_counts())

Some Profits    1160
Success          598
Failure          593
Name: profit_str, dtype: int64


In [9]:
# find the first, second and third quartile of the inflation-adjusted budget
Q1 = movies_df['budget_infl'].quantile(0.25)
Q2 = movies_df['budget_infl'].quantile(0.50)
Q3 = movies_df['budget_infl'].quantile(0.75)

# divide budget into categories to use in decision tree analysis
movies_df['budget_quartile'] = None

movies_df['budget_quartile'] = np.where(movies_df['budget_infl'] >= Q3, 100, movies_df['budget_quartile'])

movies_df['budget_quartile'] = np.where(movies_df['budget_infl'].between(Q2, Q3),
                                              75, movies_df['budget_quartile'])
movies_df['budget_quartile'] = np.where(movies_df['budget_infl'].between(Q1, Q2),
                                              50, movies_df['budget_quartile'])
movies_df['budget_quartile'] = np.where(movies_df['budget_infl'].between(0, Q1),
                                              25, movies_df['budget_quartile'])

In [10]:
# transform the 'genres' column into binary columns for each genre

# create a new column, transform 'genres' into  a lists, 
# store the lists in the new columns
movies_df['genre_list'] = movies_df['genres'].copy()
movies_df['genre_list'] = movies_df['genre_list'].str.split('|')
movies_df['genre_count'] =  movies_df['genre_list'].str.len()

# create boolean columns for each genre

genres = set([item for lists in movies_df['genre_list'] for item in lists])

# Below code was taken from: 
# https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
# Create empty dict
genre_dict = {}

# Loop through all the tags
for i, item in enumerate(genres):
    # Apply boolean mask
    genre_dict[item] = movies_df['genre_list'].apply(lambda x: item in x)

# Return the results as a dataframe, change True and false values to 0 and 1
genre_frame =  pd.DataFrame(genre_dict)
genre_frame = genre_frame.astype(int)

#merge the genre dataframe back with the original one
movies_df = pd.merge(movies_df, genre_frame, left_index = True, right_index = True)
# delete list so that duplicates can be dropped
movies_df.drop(columns = ['genre_list'], inplace = True)
movies_df.drop_duplicates(inplace = True)

#recreate list
movies_df['genre_list'] = movies_df['genres'].copy()
movies_df['genre_list'] = movies_df['genre_list'].str.split('|')

In [11]:
#consolidate ratings into smaller categories
movies_df['content_rating'].replace({'Not Rated': 'Unrated', 'Approved': 'Unrated', 
                                           'TV-G': 'G',  'TV-PG': 'PG', 'TV-MA': 'R',
                                           'TV-Y': 'G', 'TV-14': 'PG-13', 'Passed': 'Unrated',
                                           'TV-Y7': 'PG', 'M': 'PG', 'GP': 'PG'}, inplace = True)
movies_df['ratings'] = movies_df['content_rating'].copy()
movies_df = pd.get_dummies(movies_df, columns=['ratings'])

In [12]:
#count how many movies a director or actor took part in up to the current movie

# sort movies by year so that the earliest movie a director made will have a count of one.
movies_df = movies_df.sort_values(by = ['title_year']).reset_index(drop = True)

# dictionary to store the count for each director
director_count = {}

# The first time a director is encountered in the dataframe, they will be given a count of one.  
# Each additional movie will add to the total
for index, title, director, year in movies_df[['movie_title','director_name', 'title_year']].itertuples():
    if director in director_count:
        movies_df.loc[movies_df['movie_title'] == title,'director_count'] = (director_count[director] + 1)
        director_count[director] += 1
    else:
        movies_df.loc[movies_df['movie_title'] == title,'director_count'] = 1
        director_count[director] = 1
        
# indicate whether or not a director made 5 or more movies        
movies_df['directed_5_plus'] = 'no'
movies_df['directed_5_plus'] = np.where(movies_df['director_count'] >= 5, 'yes', movies_df['directed_5_plus'])
movies_df = pd.get_dummies(movies_df, columns=['directed_5_plus'])

In [13]:
# This code replicates the code above, but for each actor
actor_count = {}

for index, title, actor1, actor2, actor3 in movies_df[['movie_title','actor_1_name', 'actor_2_name', 'actor_3_name']].itertuples():
    if actor1 in actor_count:
        movies_df.loc[movies_df['movie_title'] == title,'actor1_count'] = (actor_count[actor1] + 1)
        actor_count[actor1] += 1
    else:
        movies_df.loc[movies_df['movie_title'] == title,'actor1_count'] = 1
        actor_count[actor1] = 1
    if actor2 in actor_count:
        movies_df.loc[movies_df['movie_title'] == title,'actor2_count'] = (actor_count[actor2] + 1)
        actor_count[actor1] += 1
    else:
        movies_df.loc[movies_df['movie_title'] == title,'actor2_count'] = 1
        actor_count[actor2] = 1
    if actor3 in actor_count:
        movies_df.loc[movies_df['movie_title'] == title,'actor3_count'] = (actor_count[actor3] + 1)
        actor_count[actor1] += 1
    else:
        movies_df.loc[movies_df['movie_title'] == title,'actor3_count'] = 1
        actor_count[actor3] = 1
        
movies_df['actor_count'] = movies_df['actor1_count'] + movies_df['actor2_count'] + movies_df['actor3_count']

In [14]:
dtf = movies_df[['movie_title','profit_str', 'plot_keywords']]

## rename columns
dtf = dtf.rename(columns={"profit_str":"y", "plot_keywords":"text"})

#https://towardsdatascience.com/text-analysis-feature-engineering-with-nlp-502d6ea9225d

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    
    text = re.sub(r'[^\w\s]', ' ', str(text).lower())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
   
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
            
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
        
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
     
     ## remove duplicate words
    lst_text = set(lst_text)
    
#     ## back to string from list
    text = ", ".join(lst_text)
    return text

lst_stopwords = nltk.corpus.stopwords.words("english")


dtf["text_clean"] = dtf["text"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords= lst_stopwords))

dtf.drop(columns = ['text'], axis=1, inplace = True)


In [15]:
## split dataset
dtf_train, dtf_test = model_selection.train_test_split(dtf, test_size=0.3)

## get target
y_train = dtf_train["y"].values
y_test = dtf_test["y"].values

## Count (classic BoW)
vectorizer = feature_extraction.text.CountVectorizer(max_features=10000, ngram_range=(1,2))

## Tf-Idf (advanced variant of BoW)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

corpus = dtf_train["text_clean"]
vectorizer.fit(corpus)

X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

y = dtf_train["y"]

X_names = vectorizer.get_feature_names()

p_value_limit = 0.85

dtf_features = pd.DataFrame()

for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    
    dtf_features = dtf_features.append(pd.DataFrame(
                   {"feature":X_names, "score":1-p, "y":cat}))
    
    dtf_features = dtf_features.sort_values(["y","score"], 
                    ascending=[True,False])
    
    dtf_features = dtf_features[dtf_features["score"]>p_value_limit]
    
X_names = dtf_features["feature"].unique().tolist()

for cat in np.unique(y):
    print("# {}:".format(cat))
    print("  . selected features:",
         len(dtf_features[dtf_features["y"]==cat]))
    print("  . top features:", ",".join(
dtf_features[dtf_features["y"]==cat]["feature"].values[:5]))
    print(" ")

#create a list to store the top features found
features = ""

features += " ".join(dtf_features[dtf_features["y"]=='Failure']["feature"].values[:25])

features += " ".join(dtf_features[dtf_features["y"]=='Average']["feature"].values[:25])

features += " ".join(dtf_features[dtf_features["y"]=='Success']["feature"].values[:30])

features = features.split()

vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
vectorizer.fit(corpus)

X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

classifier = naive_bayes.MultinomialNB()

## pipeline
model = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifier)])
## train classifier
model["classifier"].fit(X_train, y_train)

## test
X_test = dtf_test["text_clean"].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

# classes = np.unique(y_test)
# y_test_array = pd.get_dummies(y_test, drop_first=False).values
    
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test, predicted)

print("Accuracy:",  round(accuracy,2))

print("Detail:")
print(metrics.classification_report(y_test, predicted))

# # create a column with only key words that are from the feature list created above
movies_df['features'] = dtf['text_clean'].apply(lambda x: ''.join([word for word in x.split() if word in (features)]))

# Failure:
  . selected features: 23
  . top features: sex,fighter,friendship friend,immigrant,bus
 
# Some Profits:
  . selected features: 4
  . top features: bear,epic,artist,combat
 
# Success:
  . selected features: 51
  . top features: epic,tale,bear,fish,british
 
Accuracy: 0.5
Detail:
              precision    recall  f1-score   support

     Failure       0.45      0.03      0.05       183
Some Profits       0.51      0.94      0.66       356
     Success       0.38      0.09      0.14       167

    accuracy                           0.50       706
   macro avg       0.45      0.35      0.29       706
weighted avg       0.47      0.50      0.38       706



In [16]:
# # Below code was taken from: 
# # https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
# #Create empty dict

feature_dict = {}


# Loop through all the tags
for i, item in enumerate(features):
    # Apply boolean mask
    feature_dict[item] = movies_df['features'].apply(lambda x: item in x)

# Return the results as a dataframe, change True and false values to 0 and 1
feature_frame =  pd.DataFrame(feature_dict)
feature_frame = feature_frame.astype(int)

# #merge the genre dataframe back with the original one
movies_df = pd.merge(movies_df, feature_frame, left_index = True, right_index = True)

More recent movies are more likely to have a predictive value for what features in future movies are likely to lead to success.  1999 was chosen as the initial cut off date because it was the low end of the interquartile range from the orignal set of data.  However, both a wider and narrower date range will be explored.

In [17]:
import pickle

# save the model to disk
filename = 'model1.sav'
pickle.dump(model, open(filename, 'wb'))

In [18]:
#save the dataframe to a csv file so that it can be used in analysis in another notebook
movies_df.fillna({'actor_3_facebook_likes': movies_df.actor_3_facebook_likes.mean(),
                                 'actor_2_facebook_likes': movies_df.actor_2_facebook_likes.mean(),
                                 'facenumber_in_poster': movies_df.facenumber_in_poster.mean(), 
                                 'actor_1_facebook_likes': movies_df.actor_1_facebook_likes.mean(), 
                                 'aspect_ratio': movies_df.aspect_ratio.mean(), 
                                 'content_rating': movies_df.aspect_ratio.mode(), 
                                 'actor_2_name': 'unknown', 'actor_1_name': 'unknown',
                                 'actor_3_name': 'unknown','plot_keywords': 'missing'}, inplace=True)

In [19]:
#create a dataframe for movies that are after 1999
movies_df = movies_df.loc[movies_df['title_year'] >= 1999]
movies_df = movies_df.drop(columns = ['color', 'num_voted_users', 'country', 'num_user_for_reviews', 'num_critic_for_reviews'])

In [20]:
#store cleaned data in a csv to be used in analysis
movies_df.to_csv("movies_df_cleaned.csv", index = False)


In [21]:
#creates a profile document which analyses each of the variables in the dataframe 
# uncomment the lines below to run the profile report again

# profitable_profile = ProfileReport(recent_profitable_movies, title = "Movie Report", explorative=True)
# profitable_profile.to_file("profitable.html")