## Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import tikzplotlib

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_predict, validation_curve, learning_curve

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import mutual_info_regression

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

from utils.preprocessing import preprocess_duplicated_and_missing, preprocess_irrelevant_features, one_hot_encode_genres_feature, one_hot_encode_studio_feature, remove_outliers, other_fixes, standardize
from utils.feature_extraction import extract_embeddings_features, pca_on_embeddings
from utils.feature_selection import get_mutual_information_matrix, normalize_mutual_information_matrix, select_features_MI_kbest, mrmr, select_features_RFECV
from utils.model_selection import linreg, perform_grid_search, perform_random_search, ModelSelection
from utils.plots import plot_correlation_matrix, plot_mutual_information_matrix, plot_mutual_information_with_target, plot_residuals, plot_predictions, validate_model_with_feature_selection

In [3]:
# declare variables for model selection
N_SPLITS = 5

# shuffled K-fold splitter reused by the cross-validated cells below (fixed seed => same folds on every run)
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=0)


def _root_mean_squared_error(y_true, y_pred):
	"""Return the root mean squared error between `y_true` and `y_pred`.

	Computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
	the `squared` keyword is deprecated in recent scikit-learn releases, whereas
	this form runs on every version. For a single-output target (the case in this
	notebook) both give the same value.
	"""
	return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False => the scorer yields the NEGATED RMSE, so "higher is better" holds during model selection
rmse = make_scorer(_root_mean_squared_error, greater_is_better=False)

In [4]:
# show up to 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

## Load datasets

In [5]:
def read_datasets(data_dir="datasets"):
	"""Load the modeling and the prediction datasets from CSV files.

	Parameters
	----------
	data_dir : str, default "datasets"
		Directory that contains `X1.csv`, `Y1.csv` and `X2.csv`.

	Returns
	-------
	df : pandas.DataFrame
		Columns of `X1.csv` (without "Unnamed: 0") plus the target read from
		`Y1.csv`, stored in a `revenues` column.
	X2 : pandas.DataFrame
		Columns of `X2.csv` (without "Unnamed: 0"); there is no target for it.

	Raises
	------
	ValueError
		If `X1.csv` and `Y1.csv` do not contain the same number of rows.
	KeyError
		If a feature file has no "Unnamed: 0" column.
	"""
	# the literal string "\N" is read as a missing value in the feature files
	X1 = pd.read_csv(f"{data_dir}/X1.csv", na_values="\\N")
	# Y1 is read without a header row, as a single column named "revenues"
	Y1 = pd.read_csv(f"{data_dir}/Y1.csv", header=None, names=["revenues"])
	X2 = pd.read_csv(f"{data_dir}/X2.csv", na_values="\\N")

	# "Unnamed: 0" is the name pandas gives to a header-less first column (presumably a saved row index)
	X1 = X1.drop(columns="Unnamed: 0")
	X2 = X2.drop(columns="Unnamed: 0")

	# concat(axis=1) aligns on the index and would silently pad a shorter frame with NaN
	if len(X1) != len(Y1):
		raise ValueError(f"X1 has {len(X1)} rows but Y1 has {len(Y1)} rows; features and targets must match")
	df = pd.concat([X1, Y1], axis=1)

	print(f"X1 dataset contains {X1.shape[0]} observations and {X1.shape[1]} features")
	print(f"X2 dataset (for prediction only) contains {X2.shape[0]} observations")

	return df, X2

## Preprocessing

In [6]:
def preprocess(df, train, X2, dataset_name):
	"""Apply the same cleaning steps to the modeling frame and the prediction frame.

	Steps, in order: duplicates/missing values, irrelevant features, one-hot
	encoding of the studio feature, one-hot encoding of the genres feature,
	minor fixes. Per-frame steps handle `df` first, then `X2`.

	Parameters
	----------
	df : modeling frame.
	train : forwarded as second argument of `preprocess_duplicated_and_missing`
		for both frames.
	X2 : prediction frame.
	dataset_name : label shown in the progress banner.

	Returns
	-------
	(df, X2) after all steps.
	"""
	separator = "-" * 25
	print(separator)
	print(f"PREPROCESSING {dataset_name}...")
	print(separator)

	# duplicated observations and missing values
	df, X2 = [preprocess_duplicated_and_missing(frame, train) for frame in (df, X2)]

	# (obvious) irrelevant/redundant features
	df, X2 = [preprocess_irrelevant_features(frame) for frame in (df, X2)]

	# encoders receive both frames in a single call: studio (high cardinality) first, then genres
	for encode in (one_hot_encode_studio_feature, one_hot_encode_genres_feature):
		df, X2 = encode(df, X2)

	# minor fixes
	df, X2 = [other_fixes(frame) for frame in (df, X2)]
	return df, X2

## Feature extraction and dimension reduction

In [7]:
def remove_outliers_and_split(X, Y):
	"""Split (X, Y) 80/20 into train and test parts, then drop outliers from the train part.

	The split is shuffled with a fixed seed. Outliers are removed from the
	training part only, so the test part stays representative of reality.

	Returns
	-------
	(X_train, y_train, X_test, y_test); note the order differs from `train_test_split`.
	"""
	banner = "-" * 25
	print(banner)
	print("REMOVING OUTLIERS AND TRAIN-TEST SPLIT...")
	print(banner)

	split_options = dict(train_size=0.8, test_size=0.2, shuffle=True, random_state=0)
	features_train, features_test, target_train, target_test = train_test_split(X, Y, **split_options)

	# dimensions are reported before the outlier removal
	print(f"training dataset dimension: X_train: {features_train.shape}, y_train: {target_train.shape}")
	print(f"testing dataset dimension: X_test: {features_test.shape}, y_test: {target_test.shape}")

	# columns handed to remove_outliers
	outlier_columns = ["runtime", "production_year", "release_year"]
	features_train, target_train = remove_outliers(features_train, target_train, outlier_columns)

	return features_train, target_train, features_test, target_test

def extract_features(X_train, y_train, X_test, y_test, X2, run_pca=True, non_linear=True):
	"""Turn the raw embedding columns into features and standardize the other columns.

	For each of "img_embeddings" and "text_embeddings", the column is parsed with
	`extract_embeddings_features` for the three frames and handed to
	`pca_on_embeddings` (target explained variance: 0.6 for images, 0.8 for text).
	The two raw columns are then dropped, the remaining columns go through
	`standardize`, and the embedding features are appended column-wise.

	The input frames are NOT modified: columns are dropped on copies, so the
	function can be called again on the same inputs.

	Parameters
	----------
	X_train, X_test, X2 : pandas.DataFrame
		Frames containing "img_embeddings" and "text_embeddings" columns.
	y_train, y_test :
		Targets, returned unchanged.
	run_pca, non_linear :
		Forwarded as-is to `pca_on_embeddings`.

	Returns
	-------
	(X_train, y_train, X_test, y_test, X2) with the new feature frames.
	"""
	print("-" * 25)
	print("FEATURE EXTRACTION...")
	print("-" * 25)

	def _embedding_features(column, prefix, variance):
		# parse the raw column for train / test / X2, then reduce the three sets in a single call
		parsed = [extract_embeddings_features(frame[column]) for frame in (X_train, X_test, X2)]
		return pca_on_embeddings(
			*parsed,
			X_train.index, X_test.index, X2.index,
			prefix=prefix,
			total_variance_explained=variance,
			run_pca=run_pca,
			non_linear=non_linear
		)

	X_train_img_df, X_test_img_df, X2_img_df = _embedding_features("img_embeddings", "img_feature", 0.6)
	X_train_text_df, X_test_text_df, X2_text_df = _embedding_features("text_embeddings", "text_feature", 0.8)

	# drop the raw embedding columns on copies (no inplace=True): the caller's frames stay intact
	embedding_columns = ["img_embeddings", "text_embeddings"]
	X_train = X_train.drop(columns=embedding_columns)
	X_test = X_test.drop(columns=embedding_columns)
	X2 = X2.drop(columns=embedding_columns)

	# standardize other features (the fourth value returned by standardize is not needed here)
	X_train, X_test, X2, _ = standardize(X_train, X_test, X2)

	X_train = pd.concat([X_train, X_train_img_df, X_train_text_df], axis=1)
	X_test = pd.concat([X_test, X_test_img_df, X_test_text_df], axis=1)
	X2 = pd.concat([X2, X2_img_df, X2_text_df], axis=1)

	return X_train, y_train, X_test, y_test, X2

## Model

In [8]:
# raw CSV data: modeling frame (features + revenues) and prediction-only frame
df, X2 = read_datasets()

# second handle on the raw modeling frame, handed to preprocess() as its `train` argument
train_set = df

# preprocessing
df, X2 = preprocess(
	df=df,
	train=train_set,
	X2=X2,
	dataset_name="modeling and prediction datasets"
)

X1 dataset contains 3540 observations and 13 features
X2 dataset (for prediction only) contains 1518 observations
-------------------------
PREPROCESSING modeling and prediction datasets...
-------------------------
[X] Removing duplicated and missing values
[X] Removing duplicated and missing values
[X] Removing irrelevant features
[X] Removing irrelevant features
[X] One-Hot encoding studio feature
[X] One-Hot encoding genres feature
[X] Minor fixes
[X] Minor fixes


In [9]:
# spliting input and target
X = df.drop("revenues", axis=1)
Y = df["revenues"]

# remove outliers and train-test split
X_train, y_train, X_test, y_test = remove_outliers_and_split(X, Y)

# extract features + standardize (and pca)
X_train, y_train, X_test, y_test, X2 = extract_features(X_train, y_train, X_test, y_test, X2, run_pca=True, non_linear=False)

-------------------------
REMOVING OUTLIERS AND TRAIN-TEST SPLIT...
-------------------------
training dataset dimension: X_train: (2484, 54), y_train: (2484,)
testing dataset dimension: X_test: (621, 54), y_test: (621,)
-------------------------
FEATURE EXTRACTION...
-------------------------


extracting features:   0%|          | 0/2408 [00:00<?, ?it/s]

extracting features:   0%|          | 0/621 [00:00<?, ?it/s]

extracting features:   0%|          | 0/2408 [00:00<?, ?it/s]

extracting features:   0%|          | 0/621 [00:00<?, ?it/s]

extracting features:   0%|          | 0/1425 [00:00<?, ?it/s]

extracting features:   0%|          | 0/1425 [00:00<?, ?it/s]

successfully reduced from 2048 features to 56 features keeping 60.0% of variance explained
successfully reduced from 768 features to 3 features keeping 80.0% of variance explained


In [10]:
# Feature selection. The Linear Regression, K-Nearest Neighbors and Multi-Layer Perceptron
# cells read X_train_filtered / X_test_filtered, so one selector has to run on a fresh kernel
# (with both lines commented out those cells fail with a NameError).
# Alternative selector (recursive feature elimination with cross-validation):
#X_train_filtered, X_test_filtered = select_features_RFECV(X_train, y_train, X_test, kf, rmse)
X_train_filtered, X_test_filtered = select_features_MI_kbest(X_train, y_train, X_test, k=75)

In [11]:
# scores, columns = mrmr(X_train, y_train)

# scores_df = pd.Series(scores, index=columns)
# scores_df.plot.bar(figsize=(20, 5))

In [None]:
# KEPT_FEATURES = ['n_votes', 'studio_other', 'release_year', 'production_year',
#        'Adventure', 'runtime', 'studio_Uni.', 'studio_BV', 'studio_WB',
#        'Action', 'studio_Par.', 'studio_SPC', 'studio_Fox',
#        'text_feature2', 'img_feature43', 'text_feature0', 'studio_Col.',
#        'studio_MGM', 'studio_Sony', 'studio_Orion', 'studio_Strand',
#        'studio_Magn.', 'studio_IFC', 'studio_Mira.', 'studio_Eros',
#        'Animation', 'studio_Reg.', 'studio_Gold.', 'studio_NL', 'Comedy',
#        'Mystery', 'Horror', 'Fantasy', 'Drama', 'studio_FoxS']

# OTHER_KEPT_FEATURES = ['n_votes', 'studio_other', 'release_year', 'Adventure', 'runtime',
#        'studio_WB', 'studio_Uni.', 'Action', 'studio_BV', 'studio_Par.',
#        'studio_SPC', 'img_feature0', 'studio_Fox', 'studio_Sony',
#        'studio_Col.', 'studio_MGM', 'studio_Magn.', 'studio_Strand',
#        'studio_Orion', 'studio_Mira.', 'Comedy', 'studio_IFC', 'Family']

# X_train_filtered = X_train[columns[scores > 0.001]]
# X_test_filtered = X_test[columns[scores > 0.001]]

In [None]:
# DISCRETE_FEATURES = ["ratings", "production_year", "release_year"]
# STUDIOS_FEATURES = X_train.columns[X_train.columns.str.startswith('studio')].tolist()
# GENRES_FEATURES = X_train.coumns[X_train.columns.str.startswith("genre")].tolist()

# DISCRETE_FEATURES = np.concatenate(DISCRETE_FEATURES, STUDIOS_FEATURES, GENRES_FEATURES)

In [None]:
# plot_mutual_information_with_target(X_train, y_train)
# tikzplotlib.save("report/figures/MI_with_target.tex")

### Linear Regression

In [None]:
banner = "+" * 25
print(banner)
print("Linear Regression")
print(banner)

# same evaluation twice: first on the selected features, then on the full feature set
for features_train, features_test in ((X_train_filtered, X_test_filtered), (X_train, X_test)):
	val_score, rmse_score, r2 = linreg(features_train, y_train, features_test, y_test, kf, rmse)
	print(f"val rmse: {round(val_score, 3)}")
	print(f"train rmse: {round(rmse_score, 3)}")
	print(f"train r2: {round(r2, 3)}")

# percentiles_candidates = [40, 45, 50, 55, 60]
# val_scores = []
# rmse_scores = []
# f1_scores = []

# # test different percentage of features to keep (MI)
# for percentile in percentiles_candidates:
# 	X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=percentile)
# 	val_score, rmse_score = linreg(X_train_MI, y_train, X_test_MI, y_test, kf, rmse)
# 	val_scores.append(np.round(lr_score, 3))
# 	rmse_scores.append(np.round(rmse_score, 3))

# pd.DataFrame({
# 	"Features keps [%]": percentiles_candidates,
# 	"val scores (RMSE)": val_scores,
# 	"test scores (RMSE)": rmse_scores
# })

# plt.plot(percentiles_candidates, scores, color="blue", marker="o")
# plt.title("Linear Regression: RMSE for different percentages of feature kept (MI)")
# plt.xlabel("percentage of features kepts")
# plt.ylabel("score (RMSE)")

# compare with RFE
#X_train_RFE, X_test_RFE = select_features_RFE(X_train, y_train, X_test)
#lr_score = linreg(X_train_RFE, y_train, X_test_RFE, y_test, kf, rmse)

#print("[RFE] Linear Regression RMSE: {:.3f}".format(lr_score))

### K-Nearest Neighbors

In [None]:
print("+" * 25)
print("K-Nearest Neighbors")
print("+" * 25)

# KNN is fitted on log(target); predictions are mapped back with exp.
# np.log needs strictly positive targets (log(0) is -inf).
KNN_pipe = Pipeline([
	("model", TransformedTargetRegressor(regressor=KNeighborsRegressor(), func=np.log, inverse_func=np.exp))
])

KNN = {
	"instance": KNN_pipe,
	"hyperparameters": {
		# 10 integer values spread between 1 and 50 neighbors
		"model__regressor__n_neighbors": np.linspace(1, 50, 10, dtype=int),
		# Minkowski power: 1 = Manhattan distance, 2 = Euclidean distance
		"model__regressor__p": [1, 2],
		"model__regressor__weights": ["uniform", "distance"]
	},
	"n_iter": 100,
	"validation_param": "model__regressor__n_neighbors"
}

# `test_model` is a method of ModelSelection, not a free function: the bare call raised a NameError
ms = ModelSelection(
	X_train=X_train_filtered,
	y_train=y_train,
	X_test=X_test_filtered,
	y_test=y_test,
	kf=kf,
	scorer=rmse
)

best_estimator, best_params, best_score = ms.test_model(
	model=KNN,
	name="K-Nearest Neighbors"
)

print(best_params)
# labelled "val" to agree with the Random Forest tuning cell, which tabulates the same value as "val scores"
print(f"val rmse: {round(best_score, 3)}")

# percentiles_candidates = [40, 50, 60]

# estimators = []
# scores = []

# for percentile in percentiles_candidates:
# 	X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=percentile)

# 	best_estimator, best_params, best_score = test_model(
# 		model=KNN, 
# 		name="K-Nearest Neighbors", 
# 		X_train=X_train_MI, 
# 		y_train=y_train, 
# 		X_test=X_test_MI, 
# 		y_test=y_test, 
# 		kf=kf, 
# 		scorer=rmse
# 	)

# 	estimators.append(best_estimator)
# 	scores.append(best_score)

# validate_model_with_feature_selection(percentiles_candidates, estimators, "K-Nearest Neighbors", KNN["validation_param"], KNN["hyperparameters"][KNN["validation_param"]], X_train_MI, y_train, X_test_MI, y_test, kf, rmse)

# pd.DataFrame({
# 	"Features keps [%]": percentiles_candidates,
# 	"val scores": scores
# })

In [None]:
# `validate_model` is neither defined nor imported in the visible cells (the call raised a NameError),
# so the validation curve is computed directly with scikit-learn's validation_curve.
knn_param = KNN["validation_param"]
knn_param_range = KNN["hyperparameters"][knn_param]

# one row per parameter value, one column per CV fold
knn_train_scores, knn_val_scores = validation_curve(
	best_estimator,
	X_train_filtered,
	y_train,
	param_name=knn_param,
	param_range=knn_param_range,
	cv=kf,
	scoring=rmse
)

# the scorer returns the negated RMSE (greater_is_better=False): flip the sign before plotting
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(knn_param_range, -knn_train_scores.mean(axis=1), marker="o", label="train")
ax.plot(knn_param_range, -knn_val_scores.mean(axis=1), marker="o", label="validation")
ax.set(title="K-Nearest Neighbors: validation curve", xlabel=knn_param, ylabel="RMSE")
ax.legend();

### Multi-Layer Perceptron

In [None]:
# MLP fitted on log1p(target); predictions are mapped back with expm1.
# random_state makes weight initialisation (and thus the reported scores) reproducible.
MLP_pipe = Pipeline([
	("model", TransformedTargetRegressor(regressor=MLPRegressor(random_state=42), func=np.log1p, inverse_func=np.expm1))
])

MLP = {
	"instance": MLP_pipe,
	"hyperparameters": {
		"model__regressor__hidden_layer_sizes": [(25,25,25),(25,25),(25,)],
		"model__regressor__activation": ["identity", "logistic", "tanh", "relu"],
		# L2 penalty from 1e-1 down to 1e-6, see https://scikit-learn.org/stable/modules/neural_networks_supervised.html
		"model__regressor__alpha": 10.0 ** -np.arange(1, 7),
		# 100 evenly spaced iteration budgets between 10 and 10000
		"model__regressor__max_iter": [int(x) for x in np.linspace(10, 10000, 100)]
	},
	"n_iter": 10,
	"validation_param": "model__regressor__hidden_layer_sizes"
}

# `test_model` is a method of ModelSelection, not a free function: the bare call raised a NameError
ms = ModelSelection(
	X_train=X_train_filtered,
	y_train=y_train,
	X_test=X_test_filtered,
	y_test=y_test,
	kf=kf,
	scorer=rmse
)

best_estimator, best_params, best_score = ms.test_model(
	model=MLP,
	name="Multi-Layer Perceptron"
)

print(best_params)
# labelled "val" to agree with the Random Forest tuning cell, which tabulates the same value as "val scores"
print(f"val rmse: {round(best_score, 3)}")

# percentiles_candidates = [40, 50, 60]

# estimators = []
# scores = []

# for percentile in percentiles_candidates:
# 	X_train_MI, X_test_MI = select_features_MI(X_train, y_train, X_test, percentile=percentile)

# 	best_estimator, best_params, best_score = test_model(
# 		model=MLP, 
# 		name="Multi-Layer Perceptron", 
# 		X_train=X_train_MI, 
# 		y_train=y_train, 
# 		X_test=X_test_MI, 
# 		y_test=y_test, 
# 		kf=kf, 
# 		scorer=rmse
# 	)

# 	estimators.append(best_estimator)
# 	scores.append(best_score)

# pd.DataFrame({
# 	"Features keps [%]": percentiles_candidates,
# 	"val scores": scores
# })

### Random Forest

#### Baseline

In [None]:
# Baseline: one untuned random forest per K-fold training split. Each fold model is
# scored on the held-out test set, and its X2 predictions are averaged across folds.
oof_rmse = 0  # running SUM of the per-fold test RMSE (name kept for compatibility)

pred_rf_test = np.zeros(len(X2))

# the validation indices of each fold are not used: scoring is done on the test set
for n_fold, (train_index, _) in enumerate(kf.split(X_train, y_train)):
    X_train_stra = X_train.iloc[train_index, :]
    y_train_stra = y_train.iloc[train_index]

    print()
    print(f"Fold: {n_fold}")
    print()

    # setting up a basic random forest on log1p(target)
    random_forest_pipe = Pipeline([
        ("model", TransformedTargetRegressor(regressor=RandomForestRegressor(random_state=42, n_estimators=100), func=np.log1p, inverse_func=np.expm1))
    ])

    # train the model on this fold's training part (plain shuffled K-fold, not stratified)
    random_forest_pipe.fit(X_train_stra, y_train_stra)

    # predict regression on the whole test set
    pred = random_forest_pipe.predict(X_test)
    # NOT named `rmse`: that name is the scorer object passed as `scorer=rmse` to the model-selection cells
    fold_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))
    print(f"Fold {n_fold} -- test RMSE: {fold_rmse}")

    oof_rmse += fold_rmse

    # average the fold models' predictions on X2 (columns aligned with the training frame)
    pred_rf_test += random_forest_pipe.predict(X2[X_train.columns]) / N_SPLITS

# the value is computed on X_test, so it is a mean test RMSE rather than an out-of-fold score
print(f"Mean test RMSE over folds: {oof_rmse / N_SPLITS}")

fig, ax = plt.subplots(2, figsize=(12, 7))
sns.set(rc={"figure.figsize": (9, 14)})

# histplot(kde=True, stat="density") replaces the deprecated distplot
sns.histplot(y_train, kde=True, stat="density", ax=ax[0])
ax[0].set_title("Training target (revenues)")
sns.histplot(pred_rf_test, kde=True, stat="density", ax=ax[1])
ax[1].set_title("Averaged baseline predictions on X2")
fig.tight_layout()

#### Tuning

In [13]:
# Random forest fitted on log(target); predictions are mapped back with exp.
# np.log needs strictly positive targets (log(0) is -inf).
random_forest_pipe = Pipeline([
	("model", TransformedTargetRegressor(regressor=RandomForestRegressor(random_state=42), func=np.log, inverse_func=np.exp))
])

# discrete candidates (lists of values)
rf = {
	"instance": random_forest_pipe,
	"hyperparameters": {
		"model__regressor__n_estimators": [int(x) for x in np.linspace(35, 100, 10)],
		"model__regressor__criterion": ["absolute_error"], #["squared_error", "absolute_error", "poisson"],
		# 1.0 = use all features at each split; same meaning as the former "auto" for regressors,
		# which recent scikit-learn releases no longer accept
		"model__regressor__max_features": [1.0], #[1.0, "sqrt"],
		"model__regressor__max_depth": [10, 12, None] #[3, 5, 7, 10, 12, None] # none means unbounded max depth
	},
	"n_iter": 10,
	"validation_param": "model__regressor__max_depth"
}

# two-element tuples instead of candidate lists -- presumably (low, high) bounds
# for a Bayesian search inside ModelSelection; TODO confirm
rf_bayes = {
	"instance": random_forest_pipe,
	"hyperparameters": {
		"model__regressor__n_estimators": (35, 100),
		"model__regressor__criterion": ["absolute_error"], #["squared_error", "absolute_error", "poisson"],
		"model__regressor__max_depth": (7, 12), #[3, 5, 7, 10, 12, None] # none means unbounded max depth
		"model__regressor__min_samples_split": (2, 20),
		"model__regressor__min_samples_leaf": (1, 20)
	},
	"n_iter": 12,
	"validation_param": "model__regressor__max_depth"
}

# best_estimator, best_params, best_score = test_model(
# 	model=rf, 
# 	name="Random Forest", 
# 	X_train=X_train_filtered, 
# 	y_train=y_train, 
# 	X_test=X_test_filtered, 
# 	y_test=y_test, 
# 	kf=kf, 
# 	scorer=rmse
# )

# print(best_params)
# print(f"train rmse: {round(best_score, 3)}")

# numbers of features to keep (absolute counts, not percentages)
k_candidates = [10, 20]

estimators = []
scores = []

for k in k_candidates:
	# keep k features with the mutual-information selector
	X_train_filtered, X_test_filtered = select_features_MI_kbest(X_train, y_train, X_test, k=k)

	ms = ModelSelection(
		X_train=X_train_filtered,
		y_train=y_train,
		X_test=X_test_filtered,
		y_test=y_test,
		kf=kf,
		scorer=rmse
	)

	best_estimator, best_params, best_score = ms.test_model(
		model=rf_bayes,
		name="Random Forest"
	)

	estimators.append(best_estimator)
	scores.append(round(best_score, 3))

# NOTE: after the loop X_train_filtered / X_test_filtered hold the features of the LAST k tried
results = pd.DataFrame({
	# k is a feature count, so the column is no longer labelled as a percentage
	"Features kept (k)": k_candidates,
	"val scores": scores
})

results

-------------------------
FEATURE SELECTION (MUTUAL INFORMATION)...
-------------------------
reduced from 111 features to 10 features
-75834639.83237118
Random Forest RMSE: 75834639.832
-------------------------
FEATURE SELECTION (MUTUAL INFORMATION)...
-------------------------
reduced from 111 features to 20 features
-74690427.60908094
Random Forest RMSE: 74690427.609


Unnamed: 0,Features kepts [%],val scores
0,10,75834640.0
1,20,74690430.0


#### Validation of the best model (overfitting, underfitting)

In [14]:
# plot validation curve of the best model
best_idx = np.argmin(scores)

best_estimator = estimators[best_idx]
best_number_of_faetures = k_candidates[best_idx]

validation_param = rf["validation_param"]

validate_model(
	model=best_estimator, 
	model_name="Random Forest", 
	param_name=validation_param, 
	param_range=rf["hyperparameters"][validation_param], 
	X_train=X_train_filtered, 
	y_train=y_train, 
	X_test=X_test_filtered, 
	y_test=y_test, 
	kf=kf, 
	scorer=rmse
)

NameError: name 'validate_model' is not defined

## Prediction

We now predict the revenues of the movies listed in `X2.csv`.