In [1]:
#used to store and manipulate data
import pandas as pd
import numpy as np

#https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

#used to create training and testing sets
from sklearn.model_selection import train_test_split 
#from sklearn.preprocessing import KBinsDiscretizer

#used to create decision tree models
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier 

# used to determine accuracy of models
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import RepeatedStratifiedKFold

# import seaborn as sns
# import matplotlib.pyplot as plt

In [2]:
# Load the cleaned movie table that the data cleaning notebook wrote to disk.
cleaned_csv_path = 'movies_df_cleaned.csv'
movies_df = pd.read_csv(cleaned_csv_path)

In [3]:
# Decision tree: how well do the genre indicator columns predict the profit class?

# Features are the genre columns; the target is the profit_str label.
genre_columns = ['Thriller', 'Fantasy', 'Animation', 'Family',
                 'Action', 'Drama', 'Sci-Fi', 'Crime', 'Adventure', 'Romance']
X = movies_df[genre_columns]
y = movies_df['profit_str']

# Hold out 20% of the rows for testing (seeded, so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Over- and under-sample the TRAINING rows only to even out the class counts;
# pipeline pattern taken from:
# https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/
# Each dict maps a class label to the number of rows that class should have
# after resampling. Both samplers are randomised, so they are seeded to make
# the cell reproduce the same training set on every run.
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so resampled 0/1 columns may hold fractional values -- confirm
# this is acceptable for indicator features.
over = SMOTE(sampling_strategy={"Success": 600, "Failure": 500}, random_state=1)
under = RandomUnderSampler(sampling_strategy={"Some Profits": 650}, random_state=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Replace the training split with its resampled version.
X_train, y_train = pipeline.fit_resample(X_train, y_train)

# Fit a depth-limited decision tree classifier; seeded because the tree
# breaks ties between equally good splits at random.
clf = DecisionTreeClassifier(max_depth=7, random_state=1)
clf = clf.fit(X_train, y_train)

# Predict the profit class for the untouched test rows.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Confusion matrix over every label seen in either the truth or the predictions.
unique_label = np.union1d(y_test, y_pred)
print()
print("Confusion Matrix")
print()
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=unique_label),
    index=['true:{}'.format(label) for label in unique_label],
    columns=['pred:{}'.format(label) for label in unique_label])
print(cmtx)

Accuracy: 0.49469214437367304

Confusion Matrix

                   pred:Failure  pred:Some Profits  pred:Success
true:Failure                  9                 75            21
true:Some Profits            31                167            43
true:Success                 10                 58            57


In [4]:
# Render the fitted tree's split rules as indented plain text.
# The names must be listed in the same order as the columns the tree was fit on.
tree_feature_names = [
    'Thriller', 'Fantasy', 'Animation', 'Family', 'Action',
    'Drama', 'Sci-Fi', 'Crime', 'Adventure', 'Romance',
]
text_representation = tree.export_text(clf, feature_names=tree_feature_names)
print(text_representation)


|--- Adventure <= 0.50
|   |--- Drama <= 0.50
|   |   |--- Animation <= 0.50
|   |   |   |--- Action <= 0.50
|   |   |   |   |--- Thriller <= 0.50
|   |   |   |   |   |--- Sci-Fi <= 0.50
|   |   |   |   |   |   |--- Crime <= 0.50
|   |   |   |   |   |   |   |--- class: Some Profits
|   |   |   |   |   |   |--- Crime >  0.50
|   |   |   |   |   |   |   |--- class: Failure
|   |   |   |   |   |--- Sci-Fi >  0.50
|   |   |   |   |   |   |--- Romance <= 0.50
|   |   |   |   |   |   |   |--- class: Some Profits
|   |   |   |   |   |   |--- Romance >  0.50
|   |   |   |   |   |   |   |--- class: Failure
|   |   |   |   |--- Thriller >  0.50
|   |   |   |   |   |--- Crime <= 0.50
|   |   |   |   |   |   |--- Fantasy <= 0.50
|   |   |   |   |   |   |   |--- class: Some Profits
|   |   |   |   |   |   |--- Fantasy >  0.50
|   |   |   |   |   |   |   |--- class: Some Profits
|   |   |   |   |   |--- Crime >  0.50
|   |   |   |   |   |   |--- Romance <= 0.50
|   |   |   |   |   |   |   |--- class

In [5]:
# Decision tree: how well do the rating indicator columns predict the profit class?
rating_columns = ['ratings_G', 'ratings_PG', 'ratings_PG-13', 'ratings_R']
X = movies_df[rating_columns]  # Features
y = movies_df['profit_str']    # Target labels

# Hold out 20% of the rows for testing (seeded, so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Over- and under-sample the TRAINING rows only to even out the class counts;
# pipeline pattern taken from:
# https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/
# Each dict maps a class label to its row count after resampling. The samplers
# are randomised, so they are seeded to keep the training set identical
# between runs.
over = SMOTE(sampling_strategy={"Success": 600, "Failure": 550}, random_state=1)
under = RandomUnderSampler(sampling_strategy={"Some Profits": 600}, random_state=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Replace the training split with its resampled version.
X_train, y_train = pipeline.fit_resample(X_train, y_train)

# Fit a depth-limited decision tree classifier; seeded because the tree
# breaks ties between equally good splits at random.
clf = DecisionTreeClassifier(max_depth=7, random_state=1)
clf = clf.fit(X_train, y_train)

# Predict the profit class for the untouched test rows.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Confusion matrix over every label seen in either the truth or the predictions.
unique_label = np.union1d(y_test, y_pred)
print()
print("Confusion Matrix")
print()
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=unique_label),
    index=['true:{}'.format(label) for label in unique_label],
    columns=['pred:{}'.format(label) for label in unique_label])
print(cmtx)

Accuracy: 0.32696390658174096

Confusion Matrix

                   pred:Failure  pred:Some Profits  pred:Success
true:Failure                 55                  0            50
true:Some Profits           123                  0           118
true:Success                 26                  0            99


In [6]:
# Render the fitted tree's split rules as indented plain text.
# The names must be listed in the same order as the columns the tree was fit on.
tree_feature_names = ['ratings_G', 'ratings_PG', 'ratings_PG-13', 'ratings_R']
text_representation = tree.export_text(clf, feature_names=tree_feature_names)
print(text_representation)

|--- ratings_R <= 0.50
|   |--- ratings_PG <= 0.50
|   |   |--- ratings_G <= 0.50
|   |   |   |--- ratings_PG-13 <= 0.50
|   |   |   |   |--- class: Failure
|   |   |   |--- ratings_PG-13 >  0.50
|   |   |   |   |--- class: Success
|   |   |--- ratings_G >  0.50
|   |   |   |--- class: Success
|   |--- ratings_PG >  0.50
|   |   |--- class: Success
|--- ratings_R >  0.50
|   |--- class: Failure



In [7]:
# Decision tree: combine the rating and genre indicator columns to predict
# the profit class.
rating_and_genre_columns = [
    'ratings_G', 'ratings_PG', 'ratings_PG-13', 'ratings_R',
    'Thriller', 'Fantasy', 'Animation', 'Family',
    'Action', 'Drama', 'Sci-Fi', 'Crime', 'Adventure', 'Romance',
]
X = movies_df[rating_and_genre_columns]  # Features
y = movies_df['profit_str']              # Target labels

# Hold out 20% of the rows for testing (seeded, so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Over- and under-sample the TRAINING rows only to even out the class counts;
# pipeline pattern taken from:
# https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/
# Each dict maps a class label to its row count after resampling. The samplers
# are randomised, so they are seeded to keep the training set identical
# between runs.
over = SMOTE(sampling_strategy={"Success": 600, "Failure": 500}, random_state=1)
under = RandomUnderSampler(sampling_strategy={"Some Profits": 635}, random_state=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Replace the training split with its resampled version.
X_train, y_train = pipeline.fit_resample(X_train, y_train)

# Fit a depth-limited decision tree classifier; seeded because the tree
# breaks ties between equally good splits at random.
clf = DecisionTreeClassifier(max_depth=5, random_state=1)
clf = clf.fit(X_train, y_train)

# Predict the profit class for the untouched test rows.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Confusion matrix over every label seen in either the truth or the predictions.
unique_label = np.union1d(y_test, y_pred)
print()
print("Confusion Matrix")
print()
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=unique_label),
    index=['true:{}'.format(label) for label in unique_label],
    columns=['pred:{}'.format(label) for label in unique_label])
print(cmtx)

Accuracy: 0.4989384288747346

Confusion Matrix

                   pred:Failure  pred:Some Profits  pred:Success
true:Failure                 27                 51            27
true:Some Profits            50                144            47
true:Success                 10                 51            64


In [8]:
# Render the fitted tree's split rules as indented plain text.
# The names must be listed in the same order as the columns the tree was fit on.
tree_feature_names = [
    'ratings_G', 'ratings_PG', 'ratings_PG-13', 'ratings_R',
    'Thriller', 'Fantasy', 'Animation', 'Family', 'Action',
    'Drama', 'Sci-Fi', 'Crime', 'Adventure', 'Romance',
]
text_representation = tree.export_text(clf, feature_names=tree_feature_names)
print(text_representation)

|--- Adventure <= 0.50
|   |--- ratings_R <= 0.50
|   |   |--- Animation <= 0.50
|   |   |   |--- Action <= 0.50
|   |   |   |   |--- ratings_G <= 0.50
|   |   |   |   |   |--- class: Some Profits
|   |   |   |   |--- ratings_G >  0.50
|   |   |   |   |   |--- class: Success
|   |   |   |--- Action >  0.50
|   |   |   |   |--- ratings_PG-13 <= 0.50
|   |   |   |   |   |--- class: Failure
|   |   |   |   |--- ratings_PG-13 >  0.50
|   |   |   |   |   |--- class: Success
|   |   |--- Animation >  0.50
|   |   |   |--- Crime <= 0.50
|   |   |   |   |--- ratings_PG-13 <= 0.50
|   |   |   |   |   |--- class: Success
|   |   |   |   |--- ratings_PG-13 >  0.50
|   |   |   |   |   |--- class: Failure
|   |   |   |--- Crime >  0.50
|   |   |   |   |--- class: Some Profits
|   |--- ratings_R >  0.50
|   |   |--- Drama <= 0.50
|   |   |   |--- Thriller <= 0.50
|   |   |   |   |--- Sci-Fi <= 0.50
|   |   |   |   |   |--- class: Some Profits
|   |   |   |   |--- Sci-Fi >  0.50
|   |   |   |   |   |

In [9]:
# Decision tree: predict the profit class from a hand-picked set of word
# columns (presumably plot-keyword indicators -- confirm against the data
# cleaning notebook).

# Distinct non-null values of the 'features' column. Nothing else in this
# cell reads it; kept in case a later cell does.
features = movies_df['features'].dropna().unique()

keyword_columns = [
    'breast', 'immigrant', 'fish', 'drug', 'sex', 'mafia', 'toy',
    'escape', 'friend', 'battle', 'epic', 'sword',
    'princess', 'secret', 'panic', 'sequel', 'hard',
]
X = movies_df[keyword_columns]  # Features
y = movies_df['profit_str']     # Target labels

# Hold out 20% of the rows for testing (seeded, so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Over- and under-sample the TRAINING rows only to even out the class counts;
# pipeline pattern taken from:
# https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/
# Each dict maps a class label to its row count after resampling. The samplers
# are randomised, so they are seeded to keep the training set identical
# between runs.
over = SMOTE(sampling_strategy={"Success": 600, "Failure": 500}, random_state=1)
under = RandomUnderSampler(sampling_strategy={"Some Profits": 600}, random_state=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Replace the training split with its resampled version.
X_train, y_train = pipeline.fit_resample(X_train, y_train)

# Fit a depth-limited decision tree classifier; seeded because the tree
# breaks ties between equally good splits at random.
clf = DecisionTreeClassifier(max_depth=7, random_state=1)
clf = clf.fit(X_train, y_train)

# Predict the profit class for the untouched test rows.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Confusion matrix over every label seen in either the truth or the predictions.
unique_label = np.union1d(y_test, y_pred)
print()
print("Confusion Matrix")
print()
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=unique_label),
    index=['true:{}'.format(label) for label in unique_label],
    columns=['pred:{}'.format(label) for label in unique_label])
print(cmtx)

Accuracy: 0.2802547770700637

Confusion Matrix

                   pred:Failure  pred:Some Profits  pred:Success
true:Failure                  5                  1            99
true:Some Profits             3                  4           234
true:Success                  2                  0           123


In [10]:
# Decision tree: predict the profit class from genre and rating indicators
# plus the facenumber_in_poster and budget_quartile columns.
combined_columns = [
    'Thriller', 'Fantasy', 'Animation', 'Family', 'Action', 'Drama',
    'Sci-Fi', 'Crime', 'Adventure', 'Romance', 'facenumber_in_poster',
    'ratings_G', 'ratings_PG', 'ratings_PG-13', 'ratings_R',
    'budget_quartile',
]
X = movies_df[combined_columns]  # Features
y = movies_df['profit_str']      # Target labels

# Hold out 20% of the rows for testing (seeded, so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Over- and under-sample the TRAINING rows only to even out the class counts;
# pipeline pattern taken from:
# https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/
# Each dict maps a class label to its row count after resampling. The samplers
# are randomised, so they are seeded to keep the training set identical
# between runs.
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so count-like and 0/1 columns may receive fractional values in
# the resampled training set -- confirm this is acceptable here.
over = SMOTE(sampling_strategy={"Success": 600, "Failure": 550}, random_state=1)
under = RandomUnderSampler(sampling_strategy={"Some Profits": 650}, random_state=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Replace the training split with its resampled version.
X_train, y_train = pipeline.fit_resample(X_train, y_train)

# Fit a depth-limited decision tree classifier; seeded because the tree
# breaks ties between equally good splits at random.
clf = DecisionTreeClassifier(max_depth=5, random_state=1)
clf = clf.fit(X_train, y_train)

# Predict the profit class for the untouched test rows.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Confusion matrix over every label seen in either the truth or the predictions.
unique_label = np.union1d(y_test, y_pred)
print()
print("Confusion Matrix")
print()
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=unique_label),
    index=['true:{}'.format(label) for label in unique_label],
    columns=['pred:{}'.format(label) for label in unique_label])
print(cmtx)

Accuracy: 0.4968152866242038

Confusion Matrix

                   pred:Failure  pred:Some Profits  pred:Success
true:Failure                 38                 51            16
true:Some Profits            72                124            45
true:Success                 19                 34            72


In [11]:
# Render the fitted tree's split rules as indented plain text.
# The names must be listed in the same order as the columns the tree was fit on.
tree_feature_names = [
    'Thriller', 'Fantasy', 'Animation', 'Family', 'Action', 'Drama',
    'Sci-Fi', 'Crime', 'Adventure', 'Romance', 'facenumber_in_poster',
    'ratings_G', 'ratings_PG', 'ratings_PG-13', 'ratings_R',
    'budget_quartile',
]
text_representation = tree.export_text(clf, feature_names=tree_feature_names)
print(text_representation)

|--- budget_quartile <= 87.50
|   |--- budget_quartile <= 62.50
|   |   |--- ratings_R <= 0.50
|   |   |   |--- facenumber_in_poster <= 1.94
|   |   |   |   |--- Thriller <= 0.50
|   |   |   |   |   |--- class: Some Profits
|   |   |   |   |--- Thriller >  0.50
|   |   |   |   |   |--- class: Some Profits
|   |   |   |--- facenumber_in_poster >  1.94
|   |   |   |   |--- Family <= 0.50
|   |   |   |   |   |--- class: Some Profits
|   |   |   |   |--- Family >  0.50
|   |   |   |   |   |--- class: Some Profits
|   |   |--- ratings_R >  0.50
|   |   |   |--- Crime <= 0.50
|   |   |   |   |--- Drama <= 0.50
|   |   |   |   |   |--- class: Some Profits
|   |   |   |   |--- Drama >  0.50
|   |   |   |   |   |--- class: Failure
|   |   |   |--- Crime >  0.50
|   |   |   |   |--- budget_quartile <= 37.50
|   |   |   |   |   |--- class: Some Profits
|   |   |   |   |--- budget_quartile >  37.50
|   |   |   |   |   |--- class: Failure
|   |--- budget_quartile >  62.50
|   |   |--- Drama <= 0.50