## Import libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import tikzplotlib

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_predict, validation_curve, learning_curve

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import mutual_info_regression

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer

from utils.preprocessing import preprocess_duplicated_and_missing, preprocess_irrelevant_features, one_hot_encode_genres_feature, one_hot_encode_studio_feature, other_fixes, standardize
from utils.feature_extraction import extract_embeddings_features, pca_on_embeddings
from utils.feature_selection import get_mutual_information_matrix, normalize_mutual_information_matrix, select_features_MI, mrmr
from utils.model_selection import linreg, perform_grid_search, perform_random_search, perform_halving_random_search, perform_bayesian_search, evaluate_model, validate_model, compare_models, test_model
from utils.plots import plot_correlation_matrix, plot_mutual_information_matrix, plot_mutual_information_with_target, plot_residuals, plot_predictions, validate_model_with_feature_selection

In [6]:
# Shared model-selection settings: every model below is tuned with the same
# 5-fold shuffled split (fixed seed) and scored with the same RMSE scorer.
kf = KFold(
	n_splits=5,
	shuffle=True,
	random_state=0,
)
# squared=False turns the mean squared error into its root (RMSE);
# greater_is_better=False makes the scorer return the sign-flipped value.
rmse = make_scorer(
	mean_squared_error,
	squared=False,
	greater_is_better=False,
)

In [7]:
# show up to 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

## Load datasets

In [8]:
def read_datasets(data_dir="datasets"):
	"""Load the modeling and prediction datasets from CSV files.

	Parameters
	----------
	data_dir : str or path-like, default "datasets"
		Directory that contains `X1.csv`, `Y1.csv` and `X2.csv`.

	Returns
	-------
	df : pandas.DataFrame
		Content of `X1.csv` without its "Unnamed: 0" column, with the values
		of `Y1.csv` appended as a "revenues" column.
	X2 : pandas.DataFrame
		Content of `X2.csv`, exactly as read.
	"""
	# the literal string \N marks a missing value in the feature files
	X1 = pd.read_csv(f"{data_dir}/X1.csv", na_values="\\N")
	# Y1.csv has no header row: its single column is named explicitly
	Y1 = pd.read_csv(f"{data_dir}/Y1.csv", header=None, names=["revenues"])
	X2 = pd.read_csv(f"{data_dir}/X2.csv", na_values="\\N")

	# drop the unnamed leading column of X1.csv without mutating in place
	X1 = X1.drop(columns="Unnamed: 0")
	# features and target are joined side by side (aligned on the row index)
	df = pd.concat([X1, Y1], axis=1)

	print(f"X1 dataset contains {X1.shape[0]} observations and {X1.shape[1]} features")
	print(f"X2 dataset (for prediction only) contains {X2.shape[0]} observations")

	return df, X2

## Preprocessing

In [9]:
def preprocess(df, dataset_name):
	"""Run the project's preprocessing helpers on `df`, one after the other.

	`dataset_name` is only used in the banner printed before processing.
	Returns the frame produced by the last helper.
	"""
	separator = "-" * 25
	print(separator)
	print(f"PREPROCESSING {dataset_name}...")
	print(separator)

	# applied in this exact order; each step receives the previous step's output
	steps = (
		preprocess_duplicated_and_missing,  # duplicated observations + missing values
		preprocess_irrelevant_features,     # (obvious) irrelevant/redundant features
		one_hot_encode_studio_feature,      # high-cardinality fix + one-hot encoding of studio
		one_hot_encode_genres_feature,      # one-hot encoding of genres
		other_fixes,                        # minor fixes
	)
	for step in steps:
		df = step(df)
	return df

## Feature extraction and dimension reduction

In [10]:
def extract_features(X, Y, run_pca=True, non_linear=True):
	"""Split the data and build the train/test feature matrices.

	The data is split 80/20 (shuffled, seed 0). The "img_embeddings" and
	"text_embeddings" columns are converted with `extract_embeddings_features`
	and passed to `pca_on_embeddings`; all remaining columns go through
	`standardize`. The three parts are then concatenated column-wise.

	Parameters
	----------
	X : pandas.DataFrame
		Input features; must contain "img_embeddings" and "text_embeddings".
	Y : pandas.Series
		Target values aligned with `X`.
	run_pca, non_linear : bool
		Forwarded unchanged to `pca_on_embeddings`.

	Returns
	-------
	X_train, X_test, y_train, y_test
	"""
	print("-" * 25)
	print("FEATURE EXTRACTION...")
	print("-" * 25)

	X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size = 0.8, test_size = 0.2, shuffle = True, random_state = 0)

	print(f"training dataset dimension: X_train: {X_train.shape}, y_train: {y_train.shape}")
	print(f"testing dataset dimension: X_test: {X_test.shape}, y_test: {y_test.shape}")

	# remove outliers only on train set
	# as test set should be representative of the reality
	#remove_outliers(X_train)

	# extract feature vectors
	X_train_img_embeddings = extract_embeddings_features(X_train["img_embeddings"])
	X_test_img_embeddings = extract_embeddings_features(X_test["img_embeddings"])

	X_train_text_embeddings = extract_embeddings_features(X_train["text_embeddings"])
	X_test_text_embeddings = extract_embeddings_features(X_test["text_embeddings"])

	# image embeddings: target 60% of explained variance
	X_train_img_df, X_test_img_df = pca_on_embeddings(X_train_img_embeddings, X_test_img_embeddings, X_train.index, X_test.index, prefix="img_feature", total_variance_explained=0.6, run_pca=run_pca, non_linear=non_linear)

	# text embeddings: target 80% of explained variance
	X_train_text_df, X_test_text_df = pca_on_embeddings(X_train_text_embeddings, X_test_text_embeddings, X_train.index, X_test.index, prefix="text_feature", total_variance_explained=0.8, run_pca=run_pca, non_linear=non_linear)

	# The raw embedding columns are no longer needed. Drop them by reassignment
	# rather than in place: the frames come out of train_test_split, and an
	# in-place drop on them can trigger pandas' SettingWithCopyWarning.
	embedding_columns = ["img_embeddings", "text_embeddings"]
	X_train = X_train.drop(columns=embedding_columns)
	X_test = X_test.drop(columns=embedding_columns)

	# standardize other features (the scaler returned as third value is not used here)
	X_train, X_test, _scaler = standardize(X_train, X_test)

	X_train = pd.concat([X_train, X_train_img_df, X_train_text_df], axis=1)
	X_test = pd.concat([X_test, X_test_img_df, X_test_text_df], axis=1)

	# TODO: the prediction dataset (X2) is not transformed here; it needs the
	# same feature extraction before it can be fed to a fitted model.

	return X_train, X_test, y_train, y_test

## Model

In [11]:
# load the modeling data (features + target) and the prediction-only data
df, X2 = read_datasets()

# both datasets go through the same preprocessing steps
df = preprocess(df, "modeling dataset")
X2 = preprocess(X2, "prediction dataset")

# separate the target column from the input features
Y = df["revenues"]
X = df.drop(columns="revenues")

# train/test split, embedding reduction and standardization
X_train, X_test, y_train, y_test = extract_features(
	X,
	Y,
	run_pca=True,
	non_linear=False,
)

X1 dataset contains 3540 observations and 13 features
X2 dataset (for prediction only) contains 1518 observations
-------------------------
PREPROCESSING modeling dataset...
-------------------------
[X] Removing duplicated and missing values
[X] Removing irrelevant features
[X] One-Hot encoding studio feature
[X] One-Hot encoding genres feature
[X] Minor fixes
-------------------------
PREPROCESSING prediction dataset...
-------------------------
[X] Removing duplicated and missing values
[X] Removing irrelevant features
[X] One-Hot encoding studio feature
[X] One-Hot encoding genres feature
[X] Minor fixes
-------------------------
FEATURE EXTRACTION...
-------------------------
training dataset dimension: X_train: (2484, 54), y_train: (2484,)
testing dataset dimension: X_test: (621, 54), y_test: (621,)


extracting features:   0%|          | 0/2484 [00:00<?, ?it/s]

extracting features:   0%|          | 0/621 [00:00<?, ?it/s]

extracting features:   0%|          | 0/2484 [00:00<?, ?it/s]

extracting features:   0%|          | 0/621 [00:00<?, ?it/s]

successfully reduced from 2048 features to 56 features keeping 60.0% of variance explained
successfully reduced from 768 features to 3 features keeping 80.0% of variance explained


In [None]:
# run the project's mrmr helper on the training data; it returns the scores
# together with the matching column labels
scores, columns = mrmr(X_train, y_train)

# one bar per feature, labelled with its column name
scores_df = pd.Series(data=scores, index=columns)
scores_df.plot(kind="bar", figsize=(20, 5))

In [12]:
# Hard-coded list of retained features.
# Presumably the result of the following expression — confirm:
#columns[scores > 0.001]

KEPT_FEATURES = [
	"n_votes", "studio_other", "release_year", "production_year",
	"Adventure", "runtime", "studio_Uni.", "studio_BV", "studio_WB",
	"Action", "studio_Par.", "studio_SPC", "studio_Fox",
	"text_feature2", "img_feature43", "text_feature0", "studio_Col.",
	"studio_MGM", "studio_Sony", "studio_Orion", "studio_Strand",
	"studio_Magn.", "studio_IFC", "studio_Mira.", "studio_Eros",
	"Animation", "studio_Reg.", "studio_Gold.", "studio_NL", "Comedy",
	"Mystery", "Horror", "Fantasy", "Drama", "studio_FoxS",
]

# restrict both splits to the retained columns, in the order listed above
X_train = X_train.loc[:, KEPT_FEATURES]
X_test = X_test.loc[:, KEPT_FEATURES]

In [None]:
# Names of the columns to be treated as discrete variables.
# NOTE(review): "ratings" is not part of KEPT_FEATURES, and the genre columns
# kept there ('Adventure', 'Action', ...) carry no "genre" prefix, so the
# prefix filter below may well return an empty list — confirm the intended names.
BASE_DISCRETE_FEATURES = ["ratings", "production_year", "release_year"]
STUDIOS_FEATURES = X_train.columns[X_train.columns.str.startswith("studio")].tolist()
# fixed: the attribute is `columns` (was misspelled `coumns`, an AttributeError)
GENRES_FEATURES = X_train.columns[X_train.columns.str.startswith("genre")].tolist()

# fixed: np.concatenate expects ONE sequence of arrays, not three positional
# arguments. The lists are joined first and converted once, which also keeps
# a string dtype when one of the lists is empty.
DISCRETE_FEATURES = np.array(
	BASE_DISCRETE_FEATURES + STUDIOS_FEATURES + GENRES_FEATURES,
	dtype=str,
)

In [None]:
# draw the figure with the project's plotting helper (training data only)
plot_mutual_information_with_target(X_train, y_train)

# export the figure as a .tex file for the report
mi_figure_path = "report/figures/MI_with_target.tex"
tikzplotlib.save(mi_figure_path)

### Linear Regression

In [13]:
banner = "+" * 25
print(banner, "Linear Regression", banner, sep="\n")

# judging by the names used here, linreg returns the validation score first
# and the test-set RMSE second — confirm against utils.model_selection
val_score, rmse_score = linreg(X_train, y_train, X_test, y_test, kf, rmse)
print(val_score, rmse_score, sep="\n")

# percentiles_candidates = [40, 45, 50, 55, 60]
# val_scores = []
# rmse_scores = []
# f1_scores = []

# # test different percentage of features to keep (MI)
# for percentile in percentiles_candidates:
# 	X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=percentile)
# 	val_score, rmse_score = linreg(X_train_MI, y_train, X_test_MI, y_test, kf, rmse)
# 	val_scores.append(np.round(lr_score, 3))
# 	rmse_scores.append(np.round(rmse_score, 3))

# pd.DataFrame({
# 	"Features keps [%]": percentiles_candidates,
# 	"val scores (RMSE)": val_scores,
# 	"test scores (RMSE)": rmse_scores
# })

# plt.plot(percentiles_candidates, scores, color="blue", marker="o")
# plt.title("Linear Regression: RMSE for different percentages of feature kept (MI)")
# plt.xlabel("percentage of features kepts")
# plt.ylabel("score (RMSE)")

# compare with RFE
#X_train_RFE, X_test_RFE = select_features_RFE(X_train, y_train, X_test)
#lr_score = linreg(X_train_RFE, y_train, X_test_RFE, y_test, kf, rmse)

#print("[RFE] Linear Regression RMSE: {:.3f}".format(lr_score))

+++++++++++++++++++++++++
Linear Regression
+++++++++++++++++++++++++
69728273.214273
78881846.08175164


### K-Nearest Neighbors

In [16]:
print("+" * 25)
# fixed typo in the banner ("K-Nearset" -> "K-Nearest")
print("K-Nearest Neighbors")
print("+" * 25)

# The regressor is fitted on log(revenues); predictions are mapped back with exp.
# NOTE(review): np.log requires strictly positive targets — confirm that
# preprocessing leaves no zero revenues.
KNN_pipe = Pipeline([
	("model", TransformedTargetRegressor(regressor=KNeighborsRegressor(), func=np.log, inverse_func=np.exp))
])

# Model description consumed by test_model:
#   instance         -> estimator to tune
#   hyperparameters  -> search space, keyed by full pipeline parameter names
#   n_iter           -> presumably the number of sampled candidates — confirm
#   validation_param -> key of `hyperparameters` used for validation plots
KNN = {
	"instance": KNN_pipe,
	"hyperparameters": {
		"model__regressor__n_neighbors": np.linspace(20, 40, 10, dtype=int),
		"model__regressor__p": [1], #[1, 2],
		"model__regressor__weights": ["uniform", "distance"]
	},
	"n_iter": 10,
	"validation_param": "model__regressor__n_neighbors"
}

best_estimator, best_params, best_score = test_model(
	model=KNN, 
	name="K-Nearest Neighbors", 
	X_train=X_train, 
	y_train=y_train, 
	X_test=X_test, 
	y_test=y_test, 
	kf=kf, 
	scorer=rmse
)

print(best_estimator)
print(best_params)
# the rmse scorer is sign-flipped (greater_is_better=False): negate for display
print(-best_score)

# percentiles_candidates = [40, 50, 60]

# estimators = []
# scores = []

# for percentile in percentiles_candidates:
# 	X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=percentile)

# 	best_estimator, best_params, best_score = test_model(
# 		model=KNN, 
# 		name="K-Nearest Neighbors", 
# 		X_train=X_train_MI, 
# 		y_train=y_train, 
# 		X_test=X_test_MI, 
# 		y_test=y_test, 
# 		kf=kf, 
# 		scorer=rmse
# 	)

# 	estimators.append(best_estimator)
# 	scores.append(best_score)

# validate_model_with_feature_selection(percentiles_candidates, estimators, "K-Nearest Neighbors", KNN["validation_param"], KNN["hyperparameters"][KNN["validation_param"]], X_train_MI, y_train, X_test_MI, y_test, kf, rmse)

# pd.DataFrame({
# 	"Features keps [%]": percentiles_candidates,
# 	"val scores": scores
# })

+++++++++++++++++++++++++
K-Nearset Neighbors
+++++++++++++++++++++++++
Pipeline(steps=[('model',
                 TransformedTargetRegressor(func=<ufunc 'log'>,
                                            inverse_func=<ufunc 'exp'>,
                                            regressor=KNeighborsRegressor()))])
{'model__regressor__weights': 'distance', 'model__regressor__p': 1, 'model__regressor__n_neighbors': 20}
K-Nearest Neighbors RMSE: -8551450384677128.000
Pipeline(steps=[('model',
                 TransformedTargetRegressor(func=<ufunc 'log'>,
                                            inverse_func=<ufunc 'exp'>,
                                            regressor=KNeighborsRegressor(n_neighbors=20,
                                                                          p=1,
                                                                          weights='distance')))])
{'model__regressor__weights': 'distance', 'model__regressor__p': 1, 'model__regressor__n_neighbors': 20}

In [None]:
# This plot consumes the outputs of the MI feature-selection sweep for KNN.
# That sweep is currently commented out in the K-Nearest Neighbors cell, so on
# a fresh kernel these names do not exist and the bare call raised NameError.
# The call is therefore guarded: it runs only when the sweep results are available.
_sweep_outputs = ("percentiles_candidates", "estimators", "X_train_MI", "X_test_MI")
_missing = [name for name in _sweep_outputs if name not in globals()]

if _missing:
	print(f"Skipping K-Nearest Neighbors validation plot: {', '.join(_missing)} not defined (run the MI feature-selection sweep first)")
else:
	validate_model_with_feature_selection(percentiles_candidates, estimators, "K-Nearest Neighbors", KNN["validation_param"], KNN["hyperparameters"][KNN["validation_param"]], X_train_MI, y_train, X_test_MI, y_test, kf, rmse)

### Multi-Layer Perceptron

In [None]:
# The regressor is fitted on log(revenues); predictions are mapped back with exp.
# random_state is fixed so that weight initialisation is reproducible across runs.
MLP_pipe = Pipeline([
	("model", TransformedTargetRegressor(regressor=MLPRegressor(random_state=0), func=np.log, inverse_func=np.exp))
])

# Model description consumed by test_model (same layout as the KNN one).
MLP = {
	"instance": MLP_pipe,
	"hyperparameters": {
		"model__regressor__hidden_layer_sizes": [(25,25,25),(25,25),(25,)],
		"model__regressor__activation": ["identity", "logistic", "tanh", "relu"],
		"model__regressor__alpha": 10.0 ** -np.arange(1, 7), # https://scikit-learn.org/stable/modules/neural_networks_supervised.html,
		"model__regressor__max_iter": [int(x) for x in np.linspace(10, 10000, 100)]
	},
	"n_iter": 10,
	# fixed: must be a key of `hyperparameters` (full pipeline parameter name),
	# as in the KNN description; the bare "hidden_layer_sizes" is not one
	"validation_param": "model__regressor__hidden_layer_sizes"
}

best_estimator, best_params, best_score = test_model(
	model=MLP, 
	name="Multi-Layer Perceptron", 
	X_train=X_train, 
	y_train=y_train, 
	X_test=X_test, 
	y_test=y_test, 
	kf=kf, 
	scorer=rmse
)

print(best_estimator)
print(best_params)
# the rmse scorer is sign-flipped (greater_is_better=False): negate for display,
# consistently with the K-Nearest Neighbors cell
print(-best_score)

# percentiles_candidates = [40, 50, 60]

# estimators = []
# scores = []

# for percentile in percentiles_candidates:
# 	X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=percentile)

# 	best_estimator, best_params, best_score = test_model(
# 		model=MLP, 
# 		name="Multi-Layer Perceptron", 
# 		X_train=X_train_MI, 
# 		y_train=y_train, 
# 		X_test=X_test_MI, 
# 		y_test=y_test, 
# 		kf=kf, 
# 		scorer=rmse
# 	)

# 	estimators.append(best_estimator)
# 	scores.append(best_score)

# pd.DataFrame({
# 	"Features keps [%]": percentiles_candidates,
# 	"val scores": scores
# })

### Random Forest

In [None]:
# The regressor is fitted on log(revenues); predictions are mapped back with exp.
# fixed: criterion="gini" is a classification criterion and is not valid for
# RandomForestRegressor; the criterion is tuned through the search space below.
# random_state is fixed so that the forests are reproducible across runs.
random_forest_pipe = Pipeline([
	("model", TransformedTargetRegressor(regressor=RandomForestRegressor(min_samples_split=30, random_state=0), func=np.log, inverse_func=np.exp))
])

# Model description consumed by test_model (same layout as the KNN one).
# fixed: a trailing comma after the closing brace turned this dict into a 1-tuple.
random_forest = {
	"instance": random_forest_pipe,
	"hyperparameters": {
		"model__regressor__n_estimators": [int(x) for x in np.linspace(25, 40, 10)],
		"model__regressor__criterion": ["squared_error", "absolute_error", "poisson"],
		# 1.0 (= all features) is what "auto" meant for regressors; "auto" itself
		# was deprecated in scikit-learn 1.1 and removed in 1.3
		"model__regressor__max_features": [1.0, "sqrt"],
		"model__regressor__max_depth": [3, 5, 7, 10, 12, None] # none means unbounded max depth
	},
	"n_iter": 10,
	# fixed: must be a key of `hyperparameters` (full pipeline parameter name)
	"validation_param": "model__regressor__max_depth"
}

# Baseline on the current feature set.
# fixed: this call used X_train_MI / X_test_MI before the loop below defined them.
test_model(
	model=random_forest, 
	name="Random Forest", 
	X_train=X_train, 
	y_train=y_train, 
	X_test=X_test, 
	y_test=y_test, 
	kf=kf, 
	scorer=rmse
)

# Sweep over the percentage of features kept by mutual-information selection.
percentiles_candidates = np.arange(40, 65, 5)

estimators = []
scores = []

for percentile in percentiles_candidates:
	X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=percentile)

	best_estimator, best_params, best_score = test_model(
		model=random_forest, 
		name="Random Forest", 
		X_train=X_train_MI, 
		y_train=y_train, 
		X_test=X_test_MI, 
		y_test=y_test, 
		kf=kf, 
		scorer=rmse
	)

	estimators.append(best_estimator)
	scores.append(best_score)

#validate_model_with_feature_selection(percentiles_candidates, estimators, "Random Forest", random_forest["validation_param"], random_forest["hyperparameters"][random_forest["validation_param"]], X_train_MI, y_train, X_test_MI, y_test, kf, rmse)

# summary table: one row per tested percentile (label typo "keps" fixed)
pd.DataFrame({
	"Features kept [%]": percentiles_candidates,
	"val scores": scores
})

## Prediction

We now predict the revenues of the movies listed in `X2.csv`.