In [212]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from joblib import dump, load

In [213]:
data = pd.read_csv(os.path.join('Added_good_or_bad_8_column.csv'))
data.head()

Unnamed: 0,imdb_title_id,original_title,year,genre,duration,language,production_company,avg_vote,votes,budget,Good_or_Bad_Movie,number_of_genres,number_of_languages,first_genre,Good_or_Bad_Movie_6_or_higher,Good_or_Bad_Movie_8_or_higher
0,tt7939428,10 Things We Should Do Before We Break Up,2020,Romance,74,English,,4.7,486,25000000,0,1,1,Romance,0,0
1,tt10740584,Abigail Haunting,2020,Horror,85,English,Indie Film Factory,3.8,398,60000,0,1,1,Horror,0,0
2,tt11032990,Malibu Rescue: The Next Wave,2020,"Action, Adventure, Comedy",68,English,Pacific Bay Entertainment,4.5,386,5000000,0,3,1,Action,0,0
3,tt8461224,The Tax Collector,2020,"Action, Crime, Drama",95,English,Cedar Park Entertainment,4.7,4862,30000000,0,3,1,Action,0,0
4,tt11547496,Camp Blood 8: Revelations,2020,Horror,82,English,Sterling Entertainment,2.2,114,10000,0,1,1,Horror,0,0


In [214]:
avg_vote_rounded = list(map(round, data["avg_vote"]))
data["avg_vote_rounded"]=avg_vote_rounded

In [215]:
# Create a new dataframe with the columns needed for ML
data2 = data.drop(columns=['imdb_title_id','original_title','genre','language','production_company','first_genre','votes'])
data2.head()

Unnamed: 0,year,duration,avg_vote,budget,Good_or_Bad_Movie,number_of_genres,number_of_languages,Good_or_Bad_Movie_6_or_higher,Good_or_Bad_Movie_8_or_higher,avg_vote_rounded
0,2020,74,4.7,25000000,0,1,1,0,0,5
1,2020,85,3.8,60000,0,1,1,0,0,4
2,2020,68,4.5,5000000,0,3,1,0,0,4
3,2020,95,4.7,30000000,0,3,1,0,0,5
4,2020,82,2.2,10000,0,1,1,0,0,2


In [216]:
#Try to bin the budget, add a new column to the dataframe and run the model
cut_bins = [100000,1000000,10000000,100000000,1000000000,10000000000]
cut_labels = ["<100K","100K-1M","1M-10M","10M-100M","100M-1B"]
data2['budget_bin'] = pd.cut(data2['budget'], bins=cut_bins, labels=cut_labels)

In [217]:
#Try to bin the duration, see what range has the most movies
pd.cut(data['duration'], bins=6).value_counts()

(86.0, 130.0]     7530
(41.736, 86.0]    2291
(130.0, 174.0]     456
(174.0, 218.0]      49
(218.0, 262.0]      13
(262.0, 306.0]       6
Name: duration, dtype: int64

In [314]:
#Try to bin the duration, add a new column to the dataframe and run the model
cut_bins_dur = [30,60,90,120,305,500]
cut_labels_dur = ["<30min","30-60min","60-90min","90-120min",">120min"]

#cut_bins_dur = [30,40,50,60,70,80,90,100,110,120,305,500]
#cut_labels_dur = ["<30min","30-40min","40-50min","50-60min","60-70min","70-80min","80-90min","90-100min","100-110min","110-120min",">120min"]

data2['duration_bin'] = pd.cut(data2['duration'], bins=cut_bins_dur, labels=cut_labels_dur)


In [315]:
# Budget_bin is a categorical variable, so split up the categorical variables determine if that leads to a better model
data3 = pd.get_dummies(data2)
data3.head()

Unnamed: 0,year,duration,avg_vote,budget,Good_or_Bad_Movie,number_of_genres,number_of_languages,Good_or_Bad_Movie_6_or_higher,Good_or_Bad_Movie_8_or_higher,avg_vote_rounded,budget_bin_<100K,budget_bin_100K-1M,budget_bin_1M-10M,budget_bin_10M-100M,budget_bin_100M-1B,duration_bin_<30min,duration_bin_30-60min,duration_bin_60-90min,duration_bin_90-120min,duration_bin_>120min
0,2020,74,4.7,25000000,0,1,1,0,0,5,0,0,1,0,0,0,1,0,0,0
1,2020,85,3.8,60000,0,1,1,0,0,4,0,0,0,0,0,0,1,0,0,0
2,2020,68,4.5,5000000,0,3,1,0,0,4,0,1,0,0,0,0,1,0,0,0
3,2020,95,4.7,30000000,0,3,1,0,0,5,0,0,1,0,0,0,0,1,0,0
4,2020,82,2.2,10000,0,1,1,0,0,2,0,0,0,0,0,0,1,0,0,0


In [317]:
# Assign X (data) and y (target)
#X = data3[["budget_bin_<100K","budget_bin_100K-1M","budget_bin_1M-10M","budget_bin_10M-100M","budget_bin_100M-1B","duration_bin_<30min","duration_bin_30-40min","duration_bin_40-50min","duration_bin_50-60min","duration_bin_60-70min","duration_bin_70-80min","duration_bin_80-90min","duration_bin_90-100min","duration_bin_100-110min","duration_bin_110-120min","duration_bin_>120min"]]
#X = data3[["duration","budget_bin_<100K","budget_bin_100K-1M","budget_bin_1M-10M","budget_bin_10M-100M","budget_bin_100M-1B"]]
X = data3[["budget_bin_<100K","budget_bin_100K-1M","budget_bin_1M-10M","budget_bin_10M-100M","budget_bin_100M-1B","duration_bin_<30min","duration_bin_30-60min","duration_bin_60-90min","duration_bin_90-120min","duration_bin_>120min"]]
y = data3["Good_or_Bad_Movie_8_or_higher"]
print(X.shape, y.shape)

(10345, 10) (10345,)


In [318]:
#Split our data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

## Try Logistic Model First

In [319]:
#Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(class_weight="balanced")
#classifier = LogisticRegression()
classifier


LogisticRegression(class_weight='balanced')

In [320]:
#Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [321]:
#Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8937991990056622
Testing Data Score: 0.8914304123711341


In [322]:
# Need the duration and budget inputs as a list, then put into another list and call predict on it to get the predicted good or bad movie
#predictions = classifier.predict([[100,1000000,1,1]])
#predictions

In [323]:
#Make predictions
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[1000:5000]}")
print(f"First 10 Actual labels: {y_test[1000:5000].tolist()}")

First 10 Predictions:   [0 1 0 ... 0 1 0]
First 10 Actual labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

In [324]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
3099,0,0
3100,1,0
3101,0,0
3102,1,0


## Try a Random Forest Model

In [325]:
# Instantiate model 2 (e.g., random forest model)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(X_train, y_train)
model_training_score = round(model.score(X_train, y_train)*100,3)
base_accuracy = round(model.score(X_test, y_test)*100,3)
print(f"Training Data Score: {model_training_score} %")
print(f"Testing Data Score: {base_accuracy} %")

Training Data Score: 97.984 %
Testing Data Score: 97.97 %


In [326]:
#Make predictions
predictions = model.predict(X_test)
print(f"First 10 Predictions:   {predictions[200:300]}")
print(f"First 10 Actual labels: {y_test[200:300].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [327]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
3099,0,0
3100,0,0
3101,0,0
3102,0,0


#### In an attempt to tune hyperparameters I have linked the resource I have used
https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [17]:

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.828645 using {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828645 (0.001545) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.828613 (0.001378) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.828452 (0.001209) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.828388 (0.001423) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.828452 (0.001227) wit

