In [1]:
import pandas as pd
import numpy as np
import math

from pandas_profiling import ProfileReport

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn import metrics

from graphviz import Source
from sklearn import tree

Because there seemed to be more of a correlation with genre and profits than the other features explored through linear regression, I wanted to see if a decision tree would be able to indicate which genres were correlated more often with more profits. 

In [2]:
# read in csv created in data cleaning notebook
recent_profitable_movies = pd.read_csv("recent_profitable_movies.csv")

In [3]:
#create a decision tree
X = recent_profitable_movies[['Thriller', 'Fantasy', 'Comedy',
       'Animation', 'History', 'Western', 'Music', 'Family', 
       'Action', 'Drama', 'Mystery', 'Sci-Fi', 'War', 
       'Horror', 'Crime', 'Romance']]

y = recent_profitable_movies['profit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
enc = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='kmeans')

ytrain_reshape = y_train.values.reshape(-1, 1)
y_train = enc.fit_transform(ytrain_reshape)
ytest_reshape = y_test.values.reshape(-1, 1)
y_test = enc.fit_transform(ytest_reshape)

# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.35294117647058826


In [4]:
# create a textual representation
text_representation = tree.export_text(clf, feature_names=['Thriller', 'Fantasy', 'Comedy',
       'Animation', 'History', 'Western', 'Music', 'Family', 
       'Action', 'Drama', 'Mystery', 'Sci-Fi', 'War', 
       'Horror', 'Crime', 'Romance'])
print(text_representation)


|--- Drama <= 0.50
|   |--- Animation <= 0.50
|   |   |--- Action <= 0.50
|   |   |   |--- Fantasy <= 0.50
|   |   |   |   |--- Sci-Fi <= 0.50
|   |   |   |   |   |--- Romance <= 0.50
|   |   |   |   |   |   |--- Family <= 0.50
|   |   |   |   |   |   |   |--- Horror <= 0.50
|   |   |   |   |   |   |   |   |--- Comedy <= 0.50
|   |   |   |   |   |   |   |   |   |--- Thriller <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- Music <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |   |--- Music >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 2
|   |   |   |   |   |   |   |   |   |--- Thriller >  0.50
|   |   |   |   |   |   |   |   |   |   |--- Mystery <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- Mystery >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |  

Looking at the textual representation of the tree, it seemed that some genres could be recommended as those most likely to make profits.  However, I was unsure what the class numbers indicated.  In order to determine this, I printed the edges of the bins that the KBinsDiscretizer created.  I then created my own categories based on these bins. After creating a decision tree with these bins, I experimented with changing the values to see if this would result in more accurate preditions. I also removed some of the genres as features.  In addition, I limited the decision tree to a maximum depth of 7.  With these modifications, I was able to increase the accuracy score from .35 to .55. 

In [5]:
# find out where the edges of the bins are
print(enc.bin_edges_ )

[array([5.70616136e+06, 7.51826751e+09, 1.86467591e+10, 3.17545252e+10,
       4.62234339e+10, 6.21267164e+10, 7.72910933e+10, 9.30068715e+10,
       1.10125224e+11, 1.25313274e+11, 1.33771328e+11])]


In [7]:
#use the edges to make categories that easier to understand in the decision tree representation
recent_profitable_movies['profit_str'] = np.nan

recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'] >= 110125224001, '10 most profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(93006871501, 11012522400),
                                              '9 high profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(77291093301, 93006871500),
                                              '8 excellent profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(62126716401, 77291093300),
                                              '7 very good profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(46223433901, 62126716400),
                                              '6 good profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(31754525201, 46223433900),
                                              '5 mediocre profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(18646759101, 31754525200),
                                              '4 some profits',recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(7518267511, 18646759100),
                                              '3 few profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(5706161, 7518267511),
                                              '2 very few profits', recent_profitable_movies['profit_str'])
recent_profitable_movies['profit_str'] = np.where(recent_profitable_movies['profit'].between(0, 5706161),
                                              '1 least profits', recent_profitable_movies['profit_str'])

In [8]:
# create a new decision tree using the categories above
X = recent_profitable_movies[['Adventure','Thriller', 'Fantasy', 'Comedy',
       'Animation', 'History', 'Music', 'Family', 
       'Action', 'Drama', 'Mystery', 'Sci-Fi',
       'Musical', 'Romance']] # Features

y = recent_profitable_movies['profit_str']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create Decision Tree classifer object
clf = DecisionTreeClassifier(max_depth= 7)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.5574229691876751


In [9]:
# print textual representation of the tree
text_representation = tree.export_text(clf, feature_names=['Adventure','Thriller', 'Fantasy', 'Comedy',
       'Animation', 'History', 'Music', 'Family', 
       'Action', 'Drama', 'Mystery', 'Sci-Fi',
       'Musical', 'Romance'])
print(text_representation)

|--- Adventure <= 0.50
|   |--- Drama <= 0.50
|   |   |--- Musical <= 0.50
|   |   |   |--- Animation <= 0.50
|   |   |   |   |--- Romance <= 0.50
|   |   |   |   |   |--- Action <= 0.50
|   |   |   |   |   |   |--- Fantasy <= 0.50
|   |   |   |   |   |   |   |--- class: 2 very few profits
|   |   |   |   |   |   |--- Fantasy >  0.50
|   |   |   |   |   |   |   |--- class: 2 very few profits
|   |   |   |   |   |--- Action >  0.50
|   |   |   |   |   |   |--- Mystery <= 0.50
|   |   |   |   |   |   |   |--- class: 2 very few profits
|   |   |   |   |   |   |--- Mystery >  0.50
|   |   |   |   |   |   |   |--- class: 3 few profits
|   |   |   |   |--- Romance >  0.50
|   |   |   |   |   |--- Action <= 0.50
|   |   |   |   |   |   |--- Music <= 0.50
|   |   |   |   |   |   |   |--- class: 2 very few profits
|   |   |   |   |   |   |--- Music >  0.50
|   |   |   |   |   |   |   |--- class: 3 few profits
|   |   |   |   |   |--- Action >  0.50
|   |   |   |   |   |   |--- Thriller <= 0.50


In [None]:
# fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (40,40), dpi=600)

# tree.plot_tree(clf,
#                feature_names=[ 'News', 'Thriller', 'Fantasy', 'Comedy',
#        'Animation', 'History', 'Western', 'Music', 'Family', 'Short',
#        'Game-Show', 'Action', 'Sport', 'Drama', 'Mystery', 'Sci-Fi',
#        'Biography', 'Documentary', 'War', 'Film-Noir', 'Horror', 'Crime',
#        'Reality-TV', 'Musical', 'Adventure', 'Romance'], 
#                filled = True);
# fig.savefig('imagename.png')

In [10]:
unique_label = np.unique([y_test, y_pred])
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=unique_label), 
    index=['true:{:}'.format(x) for x in unique_label], 
    columns=['pred:{:}'.format(x) for x in unique_label])
print(cmtx)

                          pred:10 most profits  pred:2 very few profits  \
true:10 most profits                         0                        1   
true:2 very few profits                      0                      182   
true:3 few profits                           0                       82   
true:4 some profits                          0                       20   
true:5 mediocre profits                      0                       11   
true:6 good profits                          0                        5   
true:7 very good profits                     0                        1   
true:8 excellent profits                     0                        3   
true:nan                                     0                        1   

                          pred:3 few profits  pred:4 some profits  \
true:10 most profits                       1                    1   
true:2 very few profits                    7                    5   
true:3 few profits                        

#### Analysis of Results

Looking carefully over the decision tree, I found that these were the most profitable combinations of genres:

adventure, animation, family, musical: 5
adventure, animation: 6
adventure, action, animation, comedy: 6
adventure,  romance, drama: 6
adventure, action, animation: 7
adventure, fantasy, romance: 7
adventure, fantasy, mystery, family: 8

It seems that movies that fit into the adventure genre are generally a good bet when it comes to making movies. However, looking at the confusion matrix and the accuracy score, it appears that this model could be refined further before any definite recommendations are made.