# Movie Popularity Predictor

## Imports and Global Variables

In [None]:
import utils
import metrics
import pathlib
import numpy as np
import model_runner as mr

import os
# Project root: the directory one level above the notebook's working directory.
PARENT_PATH = pathlib.Path.cwd().parent.resolve()

# Sub-directories of the project root referenced by the cells below.
DB_PATH = PARENT_PATH / "StaticDB"
MODEL_PATH = PARENT_PATH / "models"

# Echo the resolved root so a wrong working directory is easy to spot.
print(PARENT_PATH)

## Data Preprocessing

In [None]:
# Training data, read from the pickle stored under DB_PATH.
train_df = utils.unpickle_df(DB_PATH / "train_movies.pickle")

# Dataset label; it becomes the prefix of the saved model's file name.
data_name = "DATA"

## Model Customization

### Included Features

In [None]:
# Features handed to the model runner's fit step. Each row of the table below
# supplies the two positional arguments of utils.Feature: a tuple of names and
# a function from the metrics module.
# NOTE(review): presumably the names are the input columns the metrics
# function reads -- confirm against utils.Feature.
feature_tup = tuple(
    utils.Feature(names, getter)
    for names, getter in (
        (("budget",), metrics.get_budget),
        (("belongs_to_collection",), metrics.get_belongs_to_collection),
    )
)

### ML Models

In [None]:
# Linear Regression
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in release 1.2 -- confirm the scikit-learn version this notebook pins.
from sklearn.linear_model import LinearRegression

# Label used in the saved file name; grid search stays off unless the
# grid-search cell is run afterwards.
model_name = "LinearRegression"
is_grid_search = False

# Estimator class and candidate hyperparameters read by the grid-search cell.
empty_model_type = LinearRegression
params = dict(normalize=[True, False])

# Estimator that is fitted when the grid-search cell is skipped.
model_type = empty_model_type(normalize=True)

In [None]:
# Ridge Regression
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in release 1.2 -- confirm the scikit-learn version this notebook pins.
from sklearn.linear_model import Ridge

# Label used in the saved file name; grid search stays off unless the
# grid-search cell is run afterwards.
model_name = "RidgeRegression"
is_grid_search = False

# Estimator class and candidate hyperparameters read by the grid-search cell:
# 1000 log-spaced alphas between 1e-9 and 1e9, with and without normalization.
empty_model_type = Ridge
params = {
    "alpha": np.logspace(-9, 9, num=1000),
    "normalize": [True, False],
}

# Estimator that is fitted when the grid-search cell is skipped.
model_type = empty_model_type(alpha=1, normalize=True)

In [None]:
# RANSAC Regression with Linear/Ridge Regression as the base model
# Run this cell once, directly after the Linear or Ridge Regression cell: it
# wraps the estimator class that cell stored in `empty_model_type` and reuses
# that cell's `params` and `model_name`.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.1;
# the old name is kept here to match the release the rest of the notebook
# targets -- confirm the pinned version.
from sklearn.linear_model import RANSACRegressor

# RANSAC clones its base estimator, so it needs an estimator *instance*;
# passing the bare class makes fitting fail.
model_type = RANSACRegressor(empty_model_type())

# The hyperparameters collected so far belong to the wrapped estimator, not to
# RANSAC itself, so they must be addressed with the nested
# "base_estimator__<name>" syntax or the grid search rejects them.
params = {"base_estimator__" + name: values for name, values in params.items()}
params["base_estimator"] = [empty_model_type()]
empty_model_type = RANSACRegressor

is_grid_search = False
model_name = "RANSACRegression_" + model_name

In [None]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
model_type = DecisionTreeRegressor()

# Candidate hyperparameters read by the grid-search cell.
# Count-valued settings (max_depth, min_samples_leaf, max_leaf_nodes) are plain
# ints: np.linspace yields floats, which scikit-learn either rejects or, for
# min_samples_leaf, interprets as a fraction of the samples.
# min_weight_fraction_leaf is only valid in [0, 0.5], so the grid stops at 0.5.
# NOTE(review): "auto" for max_features was removed in scikit-learn 1.3 --
# confirm the pinned version.
params = {
    "splitter": ["best", "random"],
    "max_depth": list(range(1, 16)),
    "min_samples_leaf": list(range(1, 11)),
    "min_weight_fraction_leaf": np.linspace(0.1, 0.5, 5),
    "max_features": ["auto", "log2", "sqrt", None],
    "max_leaf_nodes": [None] + list(range(10, 100, 10)),
}
empty_model_type = DecisionTreeRegressor

is_grid_search = False
model_name = "DecisionTreeRegressor"

In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
model_type = RandomForestRegressor()

# Candidate hyperparameters read by the grid-search cell.
# Count-valued settings (n_estimators, max_depth, min_samples_leaf,
# max_leaf_nodes) are plain ints: np.linspace yields floats (64.44 trees is
# not a valid forest size), which scikit-learn either rejects or, for
# min_samples_leaf, interprets as a fraction of the samples.
# min_weight_fraction_leaf is only valid in [0, 0.5], so the grid stops at 0.5.
# NOTE(review): "auto" for max_features was removed in scikit-learn 1.3 --
# confirm the pinned version.
params = {
    # Ten evenly spaced forest sizes from 10 to 500, truncated to whole trees.
    "n_estimators": np.linspace(10, 500, 10, dtype=int).tolist(),
    "max_depth": list(range(1, 16)),
    "min_samples_leaf": list(range(1, 11)),
    "min_weight_fraction_leaf": np.linspace(0.1, 0.5, 5),
    "max_features": ["auto", "log2", "sqrt", None],
    "max_leaf_nodes": [None] + list(range(10, 100, 10)),
}
empty_model_type = RandomForestRegressor

is_grid_search = False
model_name = "RandomForestRegressor"

In [None]:
# Support Vector Machine (SVM) Regressor
from sklearn.svm import LinearSVR
model_type = LinearSVR()

# Candidate hyperparameters read by the grid-search cell.
# C must be strictly positive, so the grid starts at 1 instead of 0 (C=0 is
# rejected when the model is fitted).
params = {
    "tol": np.linspace(1e-5, 1e-3, 20),
    "epsilon": np.linspace(0, 1e-2, 10),
    "C": np.linspace(1, 10, 10),
    "fit_intercept": [True, False],
    "intercept_scaling": np.linspace(1, 10, 10),
}
empty_model_type = LinearSVR

is_grid_search = False
model_name = "SVMRegressor"

In [None]:
# Grid search
# Optional: run after one of the model cells above to replace the single
# estimator with a 10-fold cross-validated search over that cell's `params`.
from sklearn.model_selection import GridSearchCV

# Flag and label picked up by the fit and save cells.
is_grid_search = True
model_name = f"GridSearch_{model_name}"

# A fresh, default-constructed estimator of the selected class is searched.
model_type = GridSearchCV(estimator=empty_model_type(), param_grid=params, cv=10)

## Fit the Model

In [None]:
# Wrap the estimator selected above in the project's ModelRunner and fit it on
# the training frame with the chosen features.
model = mr.ModelRunner(model_type, is_grid_search=is_grid_search)
model.fit(train_df, feature_tup)

# The best-score / best-params report is only printed when the grid-search
# cell was run.
if is_grid_search:
    print("Best Cross Validation Model Score: ", model.get_best_score())
    print("Params for Best Model Score", model.get_best_params())

## Save Model as Pickle File

In [None]:
from datetime import datetime

# File name layout: <data_name>_<model_name>_<MM_DD_YYYY__HH_MM>, stamped with
# the local time at which this cell runs.
now = datetime.now()
file = MODEL_PATH / f"{data_name}_{model_name}_{now:%m_%d_%Y__%H_%M}"

model.save(file)

# Last expression of the cell: the notebook displays the saved path.
file

## Run the Model

### Load the Model from Pickle

In [None]:
# Path of the pickled model to load from MODEL_PATH.
# NOTE(review): "name" looks like a placeholder for a file written by the save
# cell -- confirm and replace before running.
file = MODEL_PATH / "name"

model = mr.load_model(file)

### Predict and Score Testing Data

In [None]:
# Held-out data, read from the pickle stored under DB_PATH.
test_df = utils.unpickle_df(DB_PATH / "test_movies.pickle")

# Predictions for every test row, kept for inspection in later cells.
predicted_views_per_day = model.predict(test_df)

# Report the model's score on the test data.
print("Model Predictions Score (R^2):", model.score(test_df))

### Explain Row of Testing Data

In [None]:
# Explain the model's prediction for one row of the test data (row=150).
# NOTE(review): presumably train_df serves as the reference data for the
# explainer -- confirm in model_runner.
model.explain_notebook(train_df, test_df, row=150)