# Movie Popularity Predictor

## Imports and Global Variables

In [None]:
import os
import utils
import metrics
import pathlib
import numpy as np
import pandas as pd
import model_runner as mr

# Project root: the resolved parent of the current working directory.
PARENT_PATH = pathlib.Path.cwd().parent.resolve()
# Data and saved models live in sibling folders directly under the project root.
DB_PATH = PARENT_PATH.joinpath("StaticDB")
MODEL_PATH = PARENT_PATH.joinpath("models")
print(PARENT_PATH)

## Data Preprocessing

In [None]:
# Load the pickled training set and record its size before any cleaning.
train_df_pickled = utils.unpickle_df(DB_PATH.joinpath("train_movies.pickle"))
pre_cleaned_len = len(train_df_pickled)

# Drop rows with missing data, then split off the validation hold-out.
# NOTE(review): 3 looks like a missing-field threshold, 20 a validation percentage
# (the message below says 20%) and 42 a random seed -- confirm against utils.
train_df_pickled = utils.rm_rows_missing_data(train_df_pickled, 3)
train_df, validation_df = utils.train_test_split(train_df_pickled, 20, 42)

# Tag reused when naming the saved model file.
data_name = f"noextras_datalen_{len(train_df)}"
print(
    f"Unpacked {pre_cleaned_len} rows of training data. Cut down dataset to {len(train_df)+len(validation_df)} training rows."
    " 20% will be dedicated to validation."
)

## Model Customization

### Included Features

In [None]:
# Each entry pairs the dataframe column(s) a feature reads with the extractor applied
# to them; the tuple keeps the order in which the specs are listed.
feature_tup = tuple(
    metrics.Feature(keys, extractor)
    for keys, extractor in (
        (("budget",), metrics.get_numeric),                               # has zeros to remove
        (("belongs_to_collection",), metrics.get_belongs_to_collection),  # not a strike
        (("genres",), metrics.get_genres),                                # has some empty lists to remove
        (("original_language",), metrics.get_original_language),          # cannot tell whether there are empty values
        (("views_per_day",), metrics.get_numeric),
        (("release_date",), metrics.get_release_year),                    # has null values; unclear whether there are zeros
        (("revenue",), metrics.get_numeric),                              # has zero values
        (("runtime",), metrics.get_numeric),                              # could have zeros, int64
        (("spoken_languages",), metrics.get_num_spoken_languages),        # has some empty lists
        (("vote_average", "vote_count",), metrics.get_vote_popularity),
    )
)
# Last expression of the cell, so the notebook displays it as the cell output.
f"Defined feature tuple. Using features {[keys for feature in feature_tup for keys in feature.feature_keys]}"

### Correlation Matrix between Training Data and Output

In [None]:
# Print the correlations computed for feature_tup against "popularity",
# ordered by descending absolute value.
print(
    mr.ModelRunner.get_corr_matrix(train_df, feature_tup, "popularity")
    .abs()
    .sort_values(ascending=False)
)

### ML Models
Run exactly ONE model cell below. To run a grid search, first run the cell for the desired model, then run the grid search cell; it wraps the chosen model's class and parameter grid in a grid search. Afterwards, run the training cell under "Fit the Model".

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# NOTE(review): the `normalize` argument only exists in older scikit-learn releases
# (deprecated in 1.0, removed in 1.2) -- confirm the pinned version.
model_name = "LinearRegression"
is_grid_search = False

# Class and parameter grid consumed by the grid search cell.
empty_model_type = LinearRegression
params = {"normalize": [True, False]}

# Estimator used when training directly, without a grid search.
model_type = LinearRegression(normalize=True)

In [None]:
# Ridge Regression
from sklearn.linear_model import Ridge

# NOTE(review): the `normalize` argument only exists in older scikit-learn releases
# (deprecated in 1.0, removed in 1.2) -- confirm the pinned version.
model_name = "RidgeRegression"
is_grid_search = False

# Class and parameter grid consumed by the grid search cell:
# 1000 log-spaced alphas between 1e-9 and 1e9, each with and without normalization.
empty_model_type = Ridge
params = {"alpha": np.logspace(-9, 9, num=1000), "normalize": [True, False]}

# Estimator used when training directly, without a grid search.
model_type = Ridge(alpha=0.14, normalize=True)

In [None]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

# max_features=None considers every feature at each split. For regressors that is what
# the "auto" alias meant; the alias was deprecated in scikit-learn 1.1 and removed in 1.3,
# while None is accepted by old and new releases alike.
model_type = DecisionTreeRegressor(max_depth=8, max_features=None, max_leaf_nodes=50, min_samples_leaf=5, min_weight_fraction_leaf=0, splitter='best')

# Parameter grid consumed by the grid search cell.
params = {
    "splitter": ["best"],
    "max_depth": np.linspace(1, 15, 5, dtype=int),
    "min_samples_leaf": np.linspace(1, 10, 5, dtype=int),
    # Must be a float grid within [0, 0.5]. The previous
    # np.linspace(0.1, 0.9, 3, dtype=int) truncated every candidate to 0, so the
    # search tried the same value three times (and 0.9 is out of range anyway).
    "min_weight_fraction_leaf": np.linspace(0.0, 0.5, 3),
    "max_features": [None],
    "max_leaf_nodes": [None] + list(np.linspace(10, 90, 3, dtype=int))
}
empty_model_type = DecisionTreeRegressor

is_grid_search = False
model_name = "DecisionTreeRegressor"

In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# max_features=None considers every feature at each split. For regressors that is what
# the "auto" alias meant; the alias was deprecated in scikit-learn 1.1 and removed in 1.3,
# while None is accepted by old and new releases alike.
model_type = RandomForestRegressor(max_depth=8, max_features=None, max_leaf_nodes=50, min_samples_leaf=20)

# Parameter grid consumed by the grid search cell.
params = {
    "max_depth": np.linspace(1, 15, 5, dtype=int),                      # max depth of each tree
    "min_samples_leaf": np.linspace(1, 10, 5, dtype=int),               # min number of samples required at a leaf node
    "max_features": [None],                                             # number of features when looking for best split (None = all)
    "max_leaf_nodes": [None] + list(np.linspace(10, 90, 3, dtype=int))  # maximum number of leaf nodes
}
empty_model_type = RandomForestRegressor

is_grid_search = False
model_name = "RandomForestRegressor"

In [None]:
# Support Vector Machine (SVM) Regressor
from sklearn.svm import LinearSVR

model_name = "SVMRegressor"
is_grid_search = False

# Estimator used when training directly, without a grid search.
model_type = LinearSVR(C=2, epsilon=1e-1, tol=0.1, max_iter=5000)

# Class and parameter grid consumed by the grid search cell.
empty_model_type = LinearSVR
params = {
    "tol": np.linspace(1e-5, 1e-3, 5, dtype=float),
    "epsilon": np.linspace(1e-5, 1e-2, 5, dtype=float),
    "C": np.linspace(1, 10, 3, dtype=int),
    "fit_intercept": [True, False],
    "intercept_scaling": np.linspace(1, 10, 5, dtype=int),
}

In [None]:
# Grid search
from sklearn.model_selection import GridSearchCV

# Wrap a fresh instance of the class set by the most recently run model cell, together
# with that cell's parameter grid, in a 10-fold cross-validated search.
model_type = GridSearchCV(empty_model_type(), params, cv=10)

is_grid_search = True
# Add the prefix only once: re-running this cell used to stack it
# ("GridSearch_GridSearch_..."), which then leaked into the saved model's file name.
if not model_name.startswith("GridSearch_"):
    model_name = "GridSearch_" + model_name

## Fit the Model

In [None]:
import warnings

# Wrap whichever estimator the model / grid search cells selected.
model = mr.ModelRunner(model_type, is_grid_search=is_grid_search, prediction_col="popularity")
if not is_grid_search:
    model.fit(train_df, feature_tup)
else:
    # Silence every warning raised while the search fits and reports its best result.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model.fit(train_df, feature_tup)
        print("Best Cross Validation Model Score: ", model.get_best_score())
        print("Params for Best Model Score", model.get_best_params())

## Save Model as Pickle File

In [None]:
from datetime import datetime

now = datetime.now()
# File name layout: <data tag>_<model name>_<MM_DD_YYYY__HH_MM>, under the models folder.
file = MODEL_PATH.joinpath(
    "_".join((data_name, model_name, now.strftime("%m_%d_%Y__%H_%M")))
)
model.save(file)
# Last expression of the cell, so the notebook displays the saved path.
file

## Validate the Model

In [None]:
# NOTE(review): the model is created with prediction_col="popularity", so despite its
# name this variable presumably holds predicted popularity -- confirm.
predicted_views_per_day = model.predict(validation_df)
# validation_df is a single hold-out split, so this is a plain validation score;
# the previous label called it a cross-validation score.
print("Model Validation Score (R^2):", model.get_score(validation_df))

## Run the Model

### Load the Model from Pickle (Optional)

In [None]:
# Alternative saved model (swap in by uncommenting):
# file = MODEL_PATH/"maxStrikes_2_noextras_datalen_7229_GridSearch_RandomForestRegressor_11_28_2021__13_32"
# NOTE(review): the section heading says this loads a pickle -- only point it at
# model files you trust.
file = MODEL_PATH.joinpath("DATA_GridSearch_RandomForestRegressor_11_21_2021__11_13")
model = mr.load_model(file)

### Predict Testing Data

In [None]:
# Load the pickled test set and drop rows with missing data before predicting.
# NOTE(review): the training data is cleaned with 3 as the second argument, the test
# data with 2 -- confirm the difference is intentional.
test_df = utils.rm_rows_missing_data(
    utils.unpickle_df(DB_PATH.joinpath("test_movies.pickle")),
    2,
)
predicted_views_per_day = model.predict(test_df)

### Explain Rows of Testing Data

In [None]:
import random

# Explain up to 10 distinct, randomly chosen test rows. The sample size is capped at the
# number of available rows because random.sample raises ValueError when asked for more
# items than the population holds.
random_rows = tuple(random.sample(range(len(test_df)), min(10, len(test_df))))
model.explain_notebook(train_df, test_df, rows=random_rows)