In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from joblib import dump, load

In [28]:
data = pd.read_csv(os.path.join('Added_good_or_bad_8_column.csv'))
data.head()

Unnamed: 0,imdb_title_id,original_title,year,genre,duration,language,production_company,avg_vote,votes,budget,Good_or_Bad_Movie,number_of_genres,number_of_languages,first_genre,Good_or_Bad_Movie_6_or_higher,Good_or_Bad_Movie_8_or_higher
0,tt7939428,10 Things We Should Do Before We Break Up,2020,Romance,74,English,,4.7,486,25000000,0,1,1,Romance,0,0
1,tt10740584,Abigail Haunting,2020,Horror,85,English,Indie Film Factory,3.8,398,60000,0,1,1,Horror,0,0
2,tt11032990,Malibu Rescue: The Next Wave,2020,"Action, Adventure, Comedy",68,English,Pacific Bay Entertainment,4.5,386,5000000,0,3,1,Action,0,0
3,tt8461224,The Tax Collector,2020,"Action, Crime, Drama",95,English,Cedar Park Entertainment,4.7,4862,30000000,0,3,1,Action,0,0
4,tt11547496,Camp Blood 8: Revelations,2020,Horror,82,English,Sterling Entertainment,2.2,114,10000,0,1,1,Horror,0,0


In [29]:
# Create a new dataframe with the columns needed for ML
data2 = data.drop(columns=['imdb_title_id','original_title','genre','language','production_company','first_genre','votes'])
data2.head()

Unnamed: 0,year,duration,avg_vote,budget,Good_or_Bad_Movie,number_of_genres,number_of_languages,Good_or_Bad_Movie_6_or_higher,Good_or_Bad_Movie_8_or_higher
0,2020,74,4.7,25000000,0,1,1,0,0
1,2020,85,3.8,60000,0,1,1,0,0
2,2020,68,4.5,5000000,0,3,1,0,0
3,2020,95,4.7,30000000,0,3,1,0,0
4,2020,82,2.2,10000,0,1,1,0,0


In [30]:
#Bin the budget, add a new column to the dataframe and run the model
cut_bins = [100000,1000000,10000000,100000000,1000000000,10000000000]
cut_labels = ["<100K","100K-1M","1M-10M","10M-100M","100M-1B"]
data2['budget_bin'] = pd.cut(data2['budget'], bins=cut_bins, labels=cut_labels)

In [31]:
#Bin the duration, add a new column to the dataframe and run the model
cut_bins_dur = [30,60,90,120,305,500]
cut_labels_dur = ["<30min","30-60min","60-90min","90-120min",">120min"]
data2['duration_bin'] = pd.cut(data2['duration'], bins=cut_bins_dur, labels=cut_labels_dur)

In [32]:
# Budget_bin is a categorical variable, so split up the categorical variables determine if that leads to a better model
data3 = pd.get_dummies(data2)
data3.head()

Unnamed: 0,year,duration,avg_vote,budget,Good_or_Bad_Movie,number_of_genres,number_of_languages,Good_or_Bad_Movie_6_or_higher,Good_or_Bad_Movie_8_or_higher,budget_bin_<100K,budget_bin_100K-1M,budget_bin_1M-10M,budget_bin_10M-100M,budget_bin_100M-1B,duration_bin_<30min,duration_bin_30-60min,duration_bin_60-90min,duration_bin_90-120min,duration_bin_>120min
0,2020,74,4.7,25000000,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0
1,2020,85,3.8,60000,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0
2,2020,68,4.5,5000000,0,3,1,0,0,0,1,0,0,0,0,1,0,0,0
3,2020,95,4.7,30000000,0,3,1,0,0,0,0,1,0,0,0,0,1,0,0
4,2020,82,2.2,10000,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0


In [33]:
# Assign X (data) and y (target)
X = data3[["budget_bin_<100K","budget_bin_100K-1M","budget_bin_1M-10M","budget_bin_10M-100M","budget_bin_100M-1B","duration_bin_<30min","duration_bin_30-60min","duration_bin_60-90min","duration_bin_90-120min","duration_bin_>120min"]]
y = data3["Good_or_Bad_Movie_8_or_higher"]
print(X.shape, y.shape)

(10345, 10) (10345,)


In [34]:
#Split our data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

## Logistic Regression Model

In [36]:
#Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(class_weight="balanced")

In [37]:
#Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [38]:
#Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8937991990056622
Testing Data Score: 0.8914304123711341


In [39]:
# Need the duration and budget inputs as a list, then put into another list and call predict on it to get the predicted good or bad movie
#predictions = classifier.predict([[100,1000000,1,1]])
#predictions

In [40]:
#Make predictions
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [41]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
3099,0,0
3100,1,0
3101,0,0
3102,1,0


In [42]:
dump(classifier,'Final_ML_Model.joblib')

['Final_ML_Model.joblib']