## Import libraries

In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_predict, validation_curve, learning_curve

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, make_scorer

from utils.preprocessing import preprocess_duplicated_and_missing, preprocess_irrelevant_features, one_hot_encode_genres_feature, count_encode_studio_feature, other_fixes, standardize
from utils.process_embeddings import extract_embeddings_features, pca_on_embeddings
from utils.feature_selection import select_features_correlation, select_features_MI, select_features_RFE
from utils.model_selection import linreg, perform_grid_search, perform_random_search, perform_bayesian_search, evaluate_model, validate_model, compare_models
from utils.plots import plot_correlation_matrix, plot_residuals, plot_predictions

In [3]:
# Show up to 50 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 50

## Load datasets

In [4]:
def read_datasets():
	"""Load the modeling and prediction datasets from the `datasets/` folder.

	Reads `X1.csv` (features), `Y1.csv` (header-less single column, named
	"revenues") and `X2.csv` (features used for prediction only). The literal
	`\\N` marker is parsed as a missing value in both feature files. The
	"Unnamed: 0" column is removed from X1 before the revenues column is
	appended to it.

	Returns:
		(df, X2): X1 joined column-wise with the revenues, and X2 as read.
	"""
	missing_marker = "\\N"
	features = pd.read_csv("datasets/X1.csv", na_values=missing_marker)
	revenues = pd.read_csv("datasets/Y1.csv", header=None, names=["revenues"])
	prediction_features = pd.read_csv("datasets/X2.csv", na_values=missing_marker)

	# only X1 loses its "Unnamed: 0" column here; X2 is returned untouched
	features = features.drop(columns="Unnamed: 0")
	modeling_df = pd.concat([features, revenues], axis=1)

	n_observations, n_features = features.shape
	print(f"X1 dataset contains {n_observations} observations and {n_features} features")
	print(f"X2 dataset (for prediction only) contains {len(prediction_features)} observations")

	return modeling_df, prediction_features

## Preprocessing

In [5]:
def preprocess(df, dataset_name):
	"""Run the preprocessing helpers on `df`, in a fixed order.

	Args:
		df: dataset to preprocess; each helper receives the previous
			helper's return value.
		dataset_name: label printed in the banner only.

	Returns:
		The value returned by the last helper (`other_fixes`).
	"""
	print(f"PREPROCESSING {dataset_name}...")
	print("--------------------------")

	steps = (
		preprocess_duplicated_and_missing,  # duplicated observations and missing values
		preprocess_irrelevant_features,     # (obvious) irrelevant/redundant features
		one_hot_encode_genres_feature,      # one-hot encode the genres feature
		count_encode_studio_feature,        # count encode the studio feature
		other_fixes,                        # minor fixes
	)

	result = df
	for step in steps:
		result = step(result)
	return result

## Feature extraction and dimension reduction

In [6]:
def extract_features(X, Y, run_pca=True, non_linear=True, total_variance_explained=0.8, random_state=0):
	"""Split the data and assemble the train/test feature matrices.

	The rows are split 80/20 (shuffled). The "img_embeddings" and
	"text_embeddings" columns are expanded with `extract_embeddings_features`
	and passed to `pca_on_embeddings`; the remaining columns go through
	`standardize`; the three parts are then concatenated column-wise.

	Args:
		X: feature DataFrame containing "img_embeddings" and
			"text_embeddings" columns.
		Y: target values aligned with `X`.
		run_pca: forwarded unchanged to `pca_on_embeddings`.
		non_linear: forwarded unchanged to `pca_on_embeddings`.
		total_variance_explained: forwarded to `pca_on_embeddings` for both
			embedding blocks (default 0.8, the previously hard-coded value).
		random_state: seed given to `train_test_split` (default 0, the
			previously hard-coded value).

	Returns:
		(X_train, X_test, y_train, y_test)
	"""
	print("FEATURE EXTRACTION...")
	print("--------------------------")

	X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size = 0.8, test_size = 0.2, shuffle = True, random_state = random_state)

	print(f"training dataset dimension: X_train: {X_train.shape}, y_train: {y_train.shape}")
	print(f"testing dataset dimension: X_test: {X_test.shape}, y_test: {y_test.shape}")

	# NOTE: outlier removal (intended for the train split only, so the test
	# split stays representative) is currently disabled.

	# extract feature vectors from the raw embedding columns
	X_train_img_embeddings = extract_embeddings_features(X_train["img_embeddings"])
	X_test_img_embeddings = extract_embeddings_features(X_test["img_embeddings"])

	X_train_text_embeddings = extract_embeddings_features(X_train["text_embeddings"])
	X_test_text_embeddings = extract_embeddings_features(X_test["text_embeddings"])

	X_train_img_df, X_test_img_df = pca_on_embeddings(X_train_img_embeddings, X_test_img_embeddings, X_train.index, X_test.index, prefix="img_feature", total_variance_explained=total_variance_explained, run_pca=run_pca, non_linear=non_linear)

	X_train_text_df, X_test_text_df = pca_on_embeddings(X_train_text_embeddings, X_test_text_embeddings, X_train.index, X_test.index, prefix="text_feature", total_variance_explained=total_variance_explained, run_pca=run_pca, non_linear=non_linear)

	# the raw embedding columns are no longer needed; rebind instead of
	# mutating the split frames in place
	embedding_columns = ["img_embeddings", "text_embeddings"]
	X_train = X_train.drop(columns=embedding_columns)
	X_test = X_test.drop(columns=embedding_columns)

	# standardize the remaining features; the third return value
	# (presumably the fitted scaler) is not used here
	X_train, X_test, _ = standardize(X_train, X_test)

	X_train = pd.concat([X_train, X_train_img_df, X_train_text_df], axis=1)
	X_test = pd.concat([X_test, X_test_img_df, X_test_text_df], axis=1)

	# TODO: the same feature extraction still has to be applied to X2

	return X_train, X_test, y_train, y_test

## Model

In [10]:
df, X2 = read_datasets()

# preprocessing
df = preprocess(df, "modeling dataset")
X2 = preprocess(X2, "prediction dataset")

# splitting input and target
X = df.drop(columns="revenues")
Y = df["revenues"]

# train-test split, PCA on embeddings, standardization
X_train, X_test, y_train, y_test = extract_features(X, Y, run_pca=True, non_linear=False)

# feature selection
X_train_corr, X_test_corr = select_features_correlation(X_train, y_train, X_test, percentile=20)
X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=20)
#X_train_RFE, X_test_RFE = select_features_RFE(X_train, y_train, X_test)

# model
kf = KFold(n_splits=5)
# RMSE is a loss (lower is better): greater_is_better=False makes scikit-learn
# sign-flip it, so hyperparameter searches keep the configuration with the
# LOWEST error instead of the highest one.
# NOTE(review): scores produced through this scorer are negated RMSE values;
# confirm that the helpers in utils.model_selection negate them for display.
rmse = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

models = {
	"Random Forest": {
		# "gini" is a classification criterion; rely on the regressor's
		# default criterion and let the search explore the list below
		"instance": RandomForestRegressor(min_samples_split=30, random_state=0),
		"hyperparameters": {
			# n_estimators must be an integer: 100, 200, ..., 1000
			"n_estimators": np.arange(100, 1001, 100),
			"criterion": ["squared_error", "absolute_error", "poisson"],
			"max_depth": [3, 10, None] # None means unbounded max depth
		},
		"n_iter": 10,
		"validation_param": "n_estimators"
	},
	"KNN": {
		"instance": KNeighborsRegressor(),
		"hyperparameters":  {
			"n_neighbors": np.arange(1, 55, 5),
			"p": [1, 2],
			"weights": ["uniform", "distance"]
		},
		"n_iter": 5,
		"validation_param": "n_neighbors"
	},
	"MLP": {
		"instance": MLPRegressor(random_state=0),
		"hyperparameters": {
			"hidden_layer_sizes": [(100,75,50),(75,50,25),(50,25,10),(25,25),(10,10),(10,)],
			"activation": ["identity", "logistic", "tanh", "relu"],
			# see https://scikit-learn.org/stable/modules/neural_networks_supervised.html
			"alpha": 10.0 ** -np.arange(1, 7),
			# max_iter must be an integer: 10, 20, ..., 100
			"max_iter": np.arange(10, 101, 10)
		},
		"n_iter": 40,
		"validation_param": "hidden_layer_sizes"
	}
}

# NOTE(review): the recorded run of this cell stopped inside compare_models
# with "TypeError: __init__() got an unexpected keyword argument 'scorer'".
# Presumably a search object in utils.model_selection is built with `scorer=`
# where scikit-learn expects `scoring=` - to be confirmed and fixed there.

#print("Correlation coefficient")
#compare_models(models, X_train_corr, y_train, X_test_corr, y_test, kf, rmse)

print("Mutual Information")
compare_models(models, X_train_MI, y_train, X_test_MI, y_test, kf, rmse)

#print("Recursive Feature Elimination")
#compare_models(models, X_train_RFE, y_train, X_test_RFE, y_test, kf, rmse)


X1 dataset contains 3540 observations and 13 features
X2 dataset (for prediction only) contains 1518 observations
PREPROCESSING modeling dataset...
--------------------------
[X] Removing duplicated and missing values
[X] Removing irrelevant features
[X] One-Hot encoding
[X] Count encoding
[X] Minor fixes
PREPROCESSING prediction dataset...
--------------------------
[X] Removing duplicated and missing values
[X] Removing irrelevant features
[X] One-Hot encoding
[X] Count encoding
[X] Minor fixes
FEATURE EXTRACTION...
--------------------------
training dataset dimension: X_train: (2484, 34), y_train: (2484,)
testing dataset dimension: X_test: (621, 34), y_test: (621,)


extracting features:   0%|          | 0/2484 [00:00<?, ?it/s]

extracting features:   0%|          | 0/621 [00:00<?, ?it/s]

extracting features:   0%|          | 0/2484 [00:00<?, ?it/s]

extracting features:   0%|          | 0/621 [00:00<?, ?it/s]

successfully reduced from 2048 features to 125 features keeping 80.0% of variance explained
successfully reduced from 768 features to 3 features keeping 80.0% of variance explained
FEATURE SELECTION (CORRELATION MATRIX)...
reduced from 160 features to 32 features
FEATURE SELECTION (MUTUAL INFORMATION)...
reduced from 160 features to 32 features
Mutual Information
COMPARING MODELS...
Linear Regression RMSE: 2.456


TypeError: __init__() got an unexpected keyword argument 'scorer'

## Prediction

We will now predict the revenues of the movies listed in `X2.csv`.