# Movie Popularity Predictor

## Imports and Global Variables

In [None]:
import os
import utils
import metrics
import pathlib
import numpy as np
import model_runner as mr

# Base directory: the parent of the current working directory, fully resolved.
PARENT_PATH = pathlib.Path.cwd().parent.resolve()
# Directory the pickled train/test datasets are read from.
DB_PATH = PARENT_PATH.joinpath("StaticDB")
# Directory fitted models are saved to and loaded from.
MODEL_PATH = PARENT_PATH.joinpath("models")
print(PARENT_PATH)

## Data Preprocessing

In [None]:
# Load the pickled training set from the static database directory.
train_df = utils.unpickle_df(DB_PATH / "train_movies.pickle")
# Dataset label; it becomes the first component of saved model file names.
data_name = "DATA"
"Unpacked training data"

## Model Customization

### Included Features

In [None]:
# Each spec pairs the source column name(s) with the `metrics` extractor applied
# to them; specs are wrapped in `metrics.Feature` in the order listed.
# Commented-out specs are currently disabled features.
feature_tup = tuple(
    metrics.Feature(columns, extractor)
    for columns, extractor in (
        (("budget",), metrics.get_numeric),
        (("belongs_to_collection",), metrics.get_belongs_to_collection),
        (("genres",), metrics.get_genres),
        (("original_language",), metrics.get_original_language),
        # (("popularity",), metrics.get_numeric),
        (("release_date",), metrics.get_release_year),
        (("revenue",), metrics.get_numeric),
        (("runtime",), metrics.get_numeric),
        (("spoken_languages",), metrics.get_num_spoken_languages),
        # (("vote_average", "vote_count",), metrics.get_vote_popularity),
    )
)
"Defined feature tuple"

### ML Models

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm against the installed scikit-learn version.
model_type = LinearRegression(normalize=True)

# Name and mode flag read by the later fit / save cells.
model_name = "LinearRegression"
is_grid_search = False

# Estimator class and hyper-parameter grid picked up by the grid-search cell.
params = {
    "normalize": [True, False],
}
empty_model_type = LinearRegression

In [None]:
# Ridge Regression
from sklearn.linear_model import Ridge

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm against the installed scikit-learn version.
model_type = Ridge(alpha=1, normalize=True)

# Name and mode flag read by the later fit / save cells.
model_name = "RidgeRegression"
is_grid_search = False

# Estimator class and hyper-parameter grid picked up by the grid-search cell:
# 1000 log-spaced alphas from 1e-9 to 1e9, each with and without normalization.
params = {
    "alpha": np.logspace(-9, 9, num=1000),
    "normalize": [True, False],
}
empty_model_type = Ridge

In [None]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
model_type = DecisionTreeRegressor()

# Hyper-parameter grid picked up by the grid-search cell.
params = {
    "splitter": ["best"],
    "max_depth": np.linspace(1, 15, 5, dtype=int),
    "min_samples_leaf": np.linspace(1, 10, 5, dtype=int),
    # Fractions must stay floats: casting to int truncated every candidate to
    # 0, so the grid only ever tried the default. scikit-learn accepts values
    # in [0.0, 0.5] for this parameter, so the range is capped at 0.5.
    "min_weight_fraction_leaf": np.linspace(0.0, 0.5, 3),
    # NOTE(review): "auto" was deprecated for max_features in newer
    # scikit-learn releases -- confirm against the installed version.
    "max_features": ["auto"],
    "max_leaf_nodes": [None] + list(np.linspace(10, 90, 3, dtype=int)),
}
empty_model_type = DecisionTreeRegressor

# Name and mode flag read by the later fit / save cells.
is_grid_search = False
model_name = "DecisionTreeRegressor"

In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
model_type = RandomForestRegressor()

# Name and mode flag read by the later fit / save cells.
model_name = "RandomForestRegressor"
is_grid_search = False

# Estimator class and hyper-parameter grid picked up by the grid-search cell.
empty_model_type = RandomForestRegressor
params = {
    "max_depth": np.linspace(1, 15, 5, dtype=int),
    "min_samples_leaf": np.linspace(1, 10, 5, dtype=int),
    # NOTE(review): "auto" was deprecated for max_features in newer
    # scikit-learn releases -- confirm against the installed version.
    "max_features": ["auto"],
    # None (no limit) followed by three evenly spaced integer caps.
    "max_leaf_nodes": [None, *np.linspace(10, 90, 3, dtype=int)],
}

In [None]:
# Support Vector Machine (SVM) Regressor
from sklearn.svm import LinearSVR
model_type = LinearSVR(epsilon=1e-1, tol=0.1, C=2, max_iter=5000)

# Hyper-parameter grid picked up by the grid-search cell.
params = {
    # `tol` and `epsilon` are sub-1 floats: the previous dtype=int cast
    # truncated every candidate to 0, so the search only ever tried
    # tol=0 / epsilon=0. Keep them as floats.
    "tol": np.linspace(1e-5, 1e-3, 5),
    "epsilon": np.linspace(1e-5, 1e-2, 5),
    # Integer grids are intentional for the remaining numeric parameters.
    "C": np.linspace(1, 10, 3, dtype=int),
    "fit_intercept": [True, False],
    "intercept_scaling": np.linspace(1, 10, 5, dtype=int),
}
empty_model_type = LinearSVR

# Name and mode flag read by the later fit / save cells.
is_grid_search = False
model_name = "SVMRegressor"

In [None]:
# Grid search
# Wraps whichever estimator class / parameter grid was defined by the most
# recently executed model cell in a 10-fold cross-validated grid search.
from sklearn.model_selection import GridSearchCV
model_type = GridSearchCV(empty_model_type(), params, cv=10)

is_grid_search = True
# Guard the prefix so re-running this cell does not stack it repeatedly
# ("GridSearch_GridSearch_...") in the saved model's file name.
if not model_name.startswith("GridSearch_"):
    model_name = "GridSearch_" + model_name

## Fit the Model

In [None]:
import warnings

# Wrap the selected estimator and fit it on the training data with the chosen
# feature tuple.
model = mr.ModelRunner(model_type, is_grid_search=is_grid_search)
if not is_grid_search:
    model.fit(train_df, feature_tup)
else:
    # All warnings are silenced for the duration of the grid search and the
    # reporting of its best result.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model.fit(train_df, feature_tup)
        print("Best Cross Validation Model Score: ", model.get_best_score())
        print("Params for Best Model Score", model.get_best_params())

## Save Model as Pickle File

In [None]:
from datetime import datetime

# File name layout: <data_name>_<model_name>_<MM_DD_YYYY__HH_MM>, using the
# local time at which this cell runs.
now = datetime.now()
file = MODEL_PATH / f"{data_name}_{model_name}_{now:%m_%d_%Y__%H_%M}"
model.save(file)
# Display the path the model was written to.
file

## Run the Model

### Load the Model from Pickle

In [None]:
# Path of a previously saved model; edit the file name to load a different run.
# NOTE(review): the section heading says this loads from a pickle -- only load
# model files from trusted sources.
file = MODEL_PATH / "DATA_DecisionTreeRegressor_11_19_2021__12_01"
model = mr.load_model(file)

### Predict and Score Testing Data

In [None]:
# Load the pickled test set from the static database directory.
test_df = utils.unpickle_df(DB_PATH / "test_movies.pickle")
# Run the model on the test rows and keep the predictions for inspection.
predicted_views_per_day = model.predict(test_df)
# Report the model's score on the test set.
print("Model Predictions Score (R^2):", model.get_score(test_df))

### Explain Rows of Testing Data

In [None]:
import random

# Pick up to 10 distinct random row positions from the test set. Capping the
# sample size at the number of available rows keeps random.sample from raising
# ValueError when the test set has fewer than 10 rows.
random_rows = tuple(
    random.sample(range(len(test_df)), min(10, len(test_df)))
)
model.explain_notebook(train_df, test_df, rows=random_rows)