In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from joblib import dump, load

import os

In [2]:
# Read the movie CSV (a relative path, resolved against the current working
# directory) into a DataFrame and preview its first rows.
data = pd.read_csv('Dropped_Less_than_1000_budget_and_other_columns.csv')
data.head()

Unnamed: 0,imdb_title_id,original_title,year,genre,duration,language,production_company,avg_vote,votes,budget,Good_or_Bad_Movie
0,tt7939428,10 Things We Should Do Before We Break Up,2020,Romance,74,English,,4.7,486,25000000,0
1,tt10740584,Abigail Haunting,2020,Horror,85,English,Indie Film Factory,3.8,398,60000,0
2,tt11032990,Malibu Rescue: The Next Wave,2020,"Action, Adventure, Comedy",68,English,Pacific Bay Entertainment,4.5,386,5000000,0
3,tt8461224,The Tax Collector,2020,"Action, Crime, Drama",95,English,Cedar Park Entertainment,4.7,4862,30000000,0
4,tt11547496,Camp Blood 8: Revelations,2020,Horror,82,English,Sterling Entertainment,2.2,114,10000,0


In [3]:
# Round every average vote to an integer with Python's built-in round().
avg_vote_rounded = [round(vote) for vote in data["avg_vote"]]

In [4]:
# Attach the rounded ratings as a new "avg_vote_rounded" column. This mutates
# `data` in place, so the frame previewed when it was loaded no longer shows
# every column.
# NOTE(review): none of the cells visible in this notebook read this column —
# confirm it is still needed.
data["avg_vote_rounded"]=avg_vote_rounded

In [5]:
# Assign X (features: running time and budget) and y (the binary
# Good_or_Bad_Movie label), then report their shapes as a sanity check.
X = data.loc[:, ["duration", "budget"]]
y = data.loc[:, "Good_or_Bad_Movie"]
print(X.shape, y.shape)

(10345, 2) (10345,)


In [6]:
# Preview the first five target labels.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Good_or_Bad_Movie, dtype: int64

In [7]:
# Split our data into training and testing sets: 30% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Create a Logistic Regression model with scikit-learn's default settings and
# display its repr as the cell output.
# NOTE(review): the features are fed in unscaled, and the budget values in the
# data preview run into the tens of millions while durations are around 100 —
# consider standardising the features before fitting.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier


LogisticRegression()

In [9]:
# Display the estimator again; this repeats the output of the cell that
# created it and can be removed.
classifier

LogisticRegression()

In [10]:
# Fit (train) our model using the training data, then persist the fitted
# estimator with joblib so it can be reloaded later without retraining.
# dump() returns the list of written file names, which becomes the cell output.
classifier.fit(X_train, y_train)
dump(classifier, 'filename.joblib')

['filename.joblib']

In [11]:
# Validate the model: score it on the rows it was trained on and on the
# held-out test rows, then report both numbers.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8247479629885375
Testing Data Score: 0.8369845360824743


In [12]:
# Predict good/bad for one hypothetical movie (100 minutes, 1,000,000 budget).
# The model was fitted on a DataFrame whose columns are "duration" and "budget",
# so the sample is passed as a one-row DataFrame with the same column names:
# this makes the feature order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning that a bare nested list triggers.
sample_movie = pd.DataFrame([[100, 1000000]], columns=["duration", "budget"])
predictions = classifier.predict(sample_movie)
predictions

array([0], dtype=int64)

In [13]:
# Make predictions for every held-out row and compare the first ten of them
# with the true labels.
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test.head(10).tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [14]:
# Lift pandas' row-display limit so the whole comparison table is rendered.
# NOTE(review): this option is global and stays in effect for every later cell;
# it also makes this cell print one row per test sample.
pd.options.display.max_rows = None

# Side-by-side table of predicted and actual labels. Building it from plain
# arrays gives a fresh 0..n-1 index directly.
pd.DataFrame({"Prediction": predictions, "Actual": y_test.to_numpy()})

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


#### To tune the hyperparameters, I followed the approach described in this resource:
https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [15]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Candidate settings: three solvers, the L2 penalty only, and five values of
# the regularisation parameter C.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {"solver": solvers, "penalty": penalty, "C": c_values}

# Stratified 10-fold cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Exhaustive search scored by accuracy, using all available cores.
# NOTE(review): error_score=0 records a failed fit as accuracy 0 instead of
# raising, so a misconfigured combination would not stop the search.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)

# NOTE(review): the search is run on the full X, y rather than on the
# training split only — confirm that this is intended.
grid_result = grid_search.fit(X, y)

# Summarize results: the best combination first, then the mean and standard
# deviation of the accuracy for every combination tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.828710 using {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828259 (0.001159) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.828710 (0.001328) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.828452 (0.001412) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.828645 (0.001360) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.828420 (0.000400) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.828420 (0.000400) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.828516 (0.001236) with



In [16]:
# Reload the estimator that was saved to 'filename.joblib' and use it to
# predict a single new sample, demonstrating that the persisted model works.
classifier = load('filename.joblib')

# The sample lives in its own named DataFrame instead of rebinding `X`:
# overwriting `X` with a nested list would break the train/test split and the
# grid search if those cells were re-run afterwards. Naming the columns also
# matches how the model was fitted and avoids scikit-learn's feature-name warning.
# NOTE(review): by column order these values mean duration=500000 and budget=2,
# which looks swapped — confirm the intended input.
new_movie = pd.DataFrame([[500000, 2]], columns=["duration", "budget"])
result = classifier.predict(new_movie)
print('result :', result)

result : [0]
