In [None]:
import os

import joblib
import numpy as np
import pandas as pd

In [None]:
# Seed passed as `random_state` to the data splits, estimators and searches below.
random_state = 100
# Number of folds passed as `cv` to the hyperparameter searches.
cv_folds = 5

## Data Loading

Set the filename here

In [None]:
# Path of the feature table to load; edit this to point at your own CSV.
CSV_FILENAME = "compiled_features_dataset.csv" #<-- update csv name
df_features = pd.read_csv(CSV_FILENAME)
# Preview the first rows to sanity-check the load.
df_features.head()

```
df_features = df_features.drop(columns = ['Unnamed: 0', 'path', 'source_w', 'source_h', 'face_index'])
df_features = df_features.loc[df_features["e_bbox_yf"] < 3, :]
print(df_features.columns)
```

Change string types to numeric types

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One LabelEncoder instance is re-fit on each column in turn, so after this cell
# `encoder` only holds the classes of the last column it processed.
encoder = LabelEncoder()
# Columns with bool or object dtype are the ones that get integer-encoded.
categorical_columns = df_features.select_dtypes(include=[bool, object]).columns
encoded_columns = df_features[categorical_columns].apply(encoder.fit_transform)
encoded_columns

In [None]:
# Work on a copy so `df_features` keeps the original (un-encoded) values.
df_encoded_features = df_features.copy()
# Overwrite only the bool/object columns with their integer codes.
df_encoded_features[categorical_columns] = encoded_columns
# NOTE(review): this displays the whole frame; `.head()` would keep the output short.
df_encoded_features

Split into X and Y

In [None]:
# Region tags spliced into feature column names; one feature list is built per tag.
feat_regions = ["bbox_", "mask_"]

# For each color space, the column-name prefixes of its three channels.
# A full column name is assembled later as <prefix><region><index>.
color_spaces = {
    "RGB": ("R_BIN_", "G_BIN_", "B_BIN_"),
    "HSV": ("H_HSV_BIN_", "S_HSV_BIN_", "V_HSV_BIN_"),
    "HSL": ("H_HSL_BIN_", "S_HSL_BIN_", "L_HSL_BIN_"),
    "LAB": ("L_LAB_BIN_", "A_LAB_BIN_", "B_LAB_BIN_"),
    "YCBCR": ("Y_BIN_", "CR_BIN_", "CB_BIN_"),
}

In [None]:
ASSIGNED_COLOR_SPACE = "HSL" #<-- pick a colorspace

# Column-name prefixes appended after the color channels, paired with how many
# numbered columns (0 .. count-1) each one contributes.
_extra_blocks = (
    ("LBP_BIN_", 26),
    ("SOBELX_BIN_", 21),
    ("SOBELY_BIN_", 21),
    ("SOBEL_BIN_", 21),
)

# Maps each region tag to the ordered list of column names used as model inputs.
features = {}
for region in feat_regions:
    # Region-independent columns come first in every list.
    columns = ["source_w", "source_h", "x1", "y1", "x2", "y2"]
    # Three channels of the chosen color space, 26 numbered columns each.
    for channel_prefix in color_spaces[ASSIGNED_COLOR_SPACE]:
        columns.extend(f"{channel_prefix}{region}{i}" for i in range(26))
    for prefix, count in _extra_blocks:
        columns.extend(f"{prefix}{region}{i}" for i in range(count))
    features[region] = columns

In [None]:
# Mask-region inputs are paired with the `e_face_yf` target column;
# `.values` turns the target into a plain NumPy array.
X_features_mask =  df_encoded_features.loc[:,  features["mask_"]]
y_features_mask = df_encoded_features.loc[:, "e_face_yf"].values

# Bbox-region inputs are paired with the `e_bbox_yf` target column.
X_features_bbox =  df_encoded_features.loc[:,  features["bbox_"]]
y_features_bbox = df_encoded_features.loc[:, "e_bbox_yf"].values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% as the test set, then 25% of the remainder as validation
# (0.25 * 0.8 = 0.2), i.e. a 60/20/20 train/val/test split overall.
X_train_mask, X_test_mask, y_train_mask, y_test_mask = train_test_split(
    X_features_mask, y_features_mask, test_size=0.2, random_state=random_state
)
X_train_mask, X_val_mask, y_train_mask, y_val_mask = train_test_split(
    X_train_mask, y_train_mask, test_size=0.25, random_state=random_state
)

print("Split shapes")
for label, split in (
    ("X_train: ", X_train_mask),
    ("y_train: ", y_train_mask),
    ("X_val: ", X_val_mask),
    ("y_val: ", y_val_mask),
    ("X_test: ", X_test_mask),
    ("y_test: ", y_test_mask),
):
    print(label, split.shape)

In [None]:
# Hold out 20% as the test set, then 25% of the remainder as validation
# (0.25 * 0.8 = 0.2), i.e. a 60/20/20 train/val/test split overall.
X_train_bbox, X_test_bbox, y_train_bbox, y_test_bbox = train_test_split(
    X_features_bbox, y_features_bbox, test_size=0.2, random_state=random_state
)
X_train_bbox, X_val_bbox, y_train_bbox, y_val_bbox = train_test_split(
    X_train_bbox, y_train_bbox, test_size=0.25, random_state=random_state
)

print("Split shapes")
for label, split in (
    ("X_train: ", X_train_bbox),
    ("y_train: ", y_train_bbox),
    ("X_val: ", X_val_bbox),
    ("y_val: ", y_val_bbox),
    ("X_test: ", X_test_bbox),
    ("y_test: ", y_test_bbox),
):
    print(label, split.shape)

## Base Models

### Model Evaluation Functions

In [None]:
def rmse(predictions, actual):
    """Return the root-mean-squared error between two equally shaped arrays.

    Both arguments must support elementwise subtraction (e.g. NumPy arrays).
    """
    residuals = predictions - actual
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

In [None]:
def mae(predictions, actual):
    """Return the mean absolute error between two equally shaped arrays.

    Both arguments must support elementwise subtraction (e.g. NumPy arrays).
    """
    absolute_errors = np.abs(predictions - actual)
    return np.mean(absolute_errors)

### Random Forest

Train RF model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest on the mask-region features, scored on the validation split.
rfr = RandomForestRegressor(random_state=random_state)
rfr.fit(X_train_mask, y_train_mask)
rfr_pred = rfr.predict(X_val_mask)
print("RMSE:", rmse(rfr_pred, y_val_mask))
print("MAE:", mae(rfr_pred, y_val_mask))

# Save the fitted model inside model_dumps/; previously the directory was
# created but the pickle was written to the working directory instead.
os.makedirs("model_dumps", exist_ok=True)
joblib.dump(rfr, os.path.join("model_dumps", "rfr_base_mask_" + ASSIGNED_COLOR_SPACE + ".pkl"))

In [None]:
# Baseline random forest on the bbox-region features, scored on the validation split.
rfr = RandomForestRegressor(random_state=random_state)
rfr.fit(X_train_bbox, y_train_bbox)
rfr_pred = rfr.predict(X_val_bbox)
print("RMSE:", rmse(rfr_pred, y_val_bbox))
print("MAE:", mae(rfr_pred, y_val_bbox))

# Save the fitted model inside model_dumps/; previously the directory was
# created but the pickle was written to the working directory instead.
os.makedirs("model_dumps", exist_ok=True)
joblib.dump(rfr, os.path.join("model_dumps", "rfr_base_bbox_" + ASSIGNED_COLOR_SPACE + ".pkl"))

### Support Vector Machine

In [None]:
from sklearn.svm import SVR

In [None]:
# Baseline support vector regressor (default hyperparameters) on the
# mask-region features, scored on the validation split.
svr = SVR()
svr.fit(X_train_mask, y_train_mask)
svr_pred = svr.predict(X_val_mask)
print("RMSE:", rmse(svr_pred, y_val_mask))
print("MAE:", mae(svr_pred, y_val_mask))

# Save the fitted model inside model_dumps/; previously the directory was
# created but the pickle was written to the working directory instead.
os.makedirs("model_dumps", exist_ok=True)
joblib.dump(svr, os.path.join("model_dumps", "svr_base_mask_" + ASSIGNED_COLOR_SPACE + ".pkl"))

In [None]:
# Baseline support vector regressor (default hyperparameters) on the
# bbox-region features, scored on the validation split.
# This cell previously built a RandomForestRegressor by copy-paste, so the
# "svr" bbox baseline was really a second random forest.
svr = SVR()
svr.fit(X_train_bbox, y_train_bbox)
svr_pred = svr.predict(X_val_bbox)
print("RMSE:", rmse(svr_pred, y_val_bbox))
print("MAE:", mae(svr_pred, y_val_bbox))

# Save the fitted model inside model_dumps/; previously the directory was
# created but the pickle was written to the working directory instead.
os.makedirs("model_dumps", exist_ok=True)
joblib.dump(svr, os.path.join("model_dumps", "svr_base_bbox_" + ASSIGNED_COLOR_SPACE + ".pkl"))

### Multilayer Perceptron



In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Baseline multilayer perceptron on the mask-region features, scored on the validation split.
mpr = MLPRegressor(random_state=random_state)
mpr.fit(X_train_mask, y_train_mask)
mpr_pred = mpr.predict(X_val_mask)
print("RMSE:", rmse(mpr_pred, y_val_mask))
print("MAE:", mae(mpr_pred, y_val_mask))

# Save the fitted model inside model_dumps/; previously the directory was
# created but the pickle was written to the working directory instead.
os.makedirs("model_dumps", exist_ok=True)
joblib.dump(mpr, os.path.join("model_dumps", "mpr_base_mask_" + ASSIGNED_COLOR_SPACE + ".pkl"))

In [None]:
# Baseline multilayer perceptron on the bbox-region features, scored on the validation split.
# This cell previously built a RandomForestRegressor by copy-paste, so the
# "mpr" bbox baseline was really a second random forest.
mpr = MLPRegressor(random_state=random_state)
mpr.fit(X_train_bbox, y_train_bbox)
mpr_pred = mpr.predict(X_val_bbox)
print("RMSE:", rmse(mpr_pred, y_val_bbox))
print("MAE:", mae(mpr_pred, y_val_bbox))

# Save the fitted model inside model_dumps/; previously the directory was
# created but the pickle was written to the working directory instead.
os.makedirs("model_dumps", exist_ok=True)
joblib.dump(mpr, os.path.join("model_dumps", "mpr_base_bbox_" + ASSIGNED_COLOR_SPACE + ".pkl"))

___

In [None]:
# Presumably a deliberate hard stop so a full run ends after the base models.
# NOTE(review): the cells below use X_train / y_train / X_val / y_val, which no
# cell visible in this notebook defines; they must be assigned before running on.
raise Exception("STOP HERE")

## Feature selection and Hyperparameter tuning



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.base import clone as clone_model

### Random Forest

**Grid Search**

```
param_space = [
    {
        "n_estimators": list(range(100, 351, 50)),
        "criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
        "max_depth": [None, 1, 10, 30, 80, 150],
        "min_samples_split": [2, 4, 8, 16, 32],
        "min_samples_leaf": [1, 5, 10, 20],
        "max_features": [None, "sqrt", "log2", 0.25, 0.5, 0.75],
        "max_leaf_nodes": [None, 50, 100, 300],
        "min_impurity_decrease": [0.0, 0.5],
        "bootstrap": [False, True],
        "ccp_alpha": [0.0, 0.5, 1.0],
    }
]
```

```
rfr = RandomForestRegressor()
gs_rfr = GridSearchCV(rfr, param_space, n_jobs=-1, cv=cv_folds)
gs_rfr.fit(X_train, y_train)
```

```
print('Best parameters found:\n', gs_svr.best_params_)
y_pred = gs_rfr.predict(X_val)
print("Best accuracy: ", rmse(y_val, y_pred))
print(classification_report(y_val, y_pred))
```

**Bayesian Optimization**

In [None]:
# Hyperparameter ranges for the Bayesian search over RandomForestRegressor
# (passed to BayesSearchCV in the next cell). Integer/Real bounds are inclusive
# ranges to sample from; Categorical lists the discrete choices.
search_space = {
    "n_estimators": Integer(100, 350),
    "criterion": Categorical(["squared_error", "absolute_error", "friedman_mse", "poisson"]),
    "max_depth": Integer(1, 300),
    "min_samples_split": Integer(2, 32),
    "min_samples_leaf": Integer(1, 20),
    "max_features": Categorical([None, "sqrt", "log2", 0.25, 0.5, 0.75]),
    "max_leaf_nodes": Integer(50, 300),
    "min_impurity_decrease": Real(0.0, 2.0),
    "bootstrap": Categorical([False, True]),
    "ccp_alpha": Real(0.0, 2.0),
}

In [None]:
# Seed the forest as well as the search: with an unseeded estimator the
# cross-validation scores (and so the chosen parameters) change between runs.
rfr = RandomForestRegressor(random_state=random_state)
bo_rfr = BayesSearchCV(rfr, search_space, n_iter=50, n_jobs=-1, cv=cv_folds, random_state=random_state)
# NOTE(review): X_train / y_train are not defined by any cell visible here;
# assign them (e.g. from the mask or bbox split) before running this cell.
bo_rfr.fit(X_train, y_train)

In [None]:
# Report the winning configuration and its error on the validation split.
print('Best parameters found:\n', bo_rfr.best_params_)
y_pred = bo_rfr.predict(X_val)
for label, metric in (("RMSE: ", rmse), ("MAE: ", mae)):
    print(label, metric(y_val, y_pred))

### Feature Selection (Random Forest)

In [None]:
# Seed the forest as well as the search: with an unseeded estimator the
# cross-validation scores (and so the chosen features/parameters) change between runs.
rfr = RandomForestRegressor(random_state=random_state)
# Univariate feature selection (f_regression scores) followed by the forest.
pipe = Pipeline([('selector', SelectKBest(f_regression)), ('rfr', rfr)])

# `selector__k` ranges from half of the columns up to all but one; the
# `rfr__*` entries are the forest hyperparameters, addressed through the pipeline step name.
search_space = {
    "selector__k": Integer(X_train.shape[1] // 2, X_train.shape[1] - 1),
    "rfr__n_estimators": Integer(100, 350),
    "rfr__criterion": Categorical(["squared_error", "absolute_error", "friedman_mse", "poisson"]),
    "rfr__max_depth": Integer(1, 300),
    "rfr__min_samples_split": Integer(2, 32),
    "rfr__min_samples_leaf": Integer(1, 20),
    "rfr__max_features": Categorical([None, "sqrt", "log2", 0.25, 0.5, 0.75]),
    "rfr__max_leaf_nodes": Integer(50, 300),
    "rfr__min_impurity_decrease": Real(0.0, 2.0),
    "rfr__bootstrap": Categorical([False, True]),
    "rfr__ccp_alpha": Real(0.0, 2.0),
}

bo_rfr = BayesSearchCV(pipe, search_space, n_iter=75, n_jobs=-1, cv=cv_folds, random_state=random_state)
bo_rfr.fit(X_train, y_train)

# Boolean mask over X_train's columns marking the features the best selector kept.
selected_feat = bo_rfr.best_estimator_.named_steps["selector"].get_support()
# Unfitted copy of the best pipeline, refit on the selected columns only.
# NOTE(review): the clone still contains the selector with the same k, so the
# selection step runs a second time on the already-reduced column set.
best_rfr = clone_model(bo_rfr.best_estimator_)
best_rfr.fit(X_train.loc[:, selected_feat], y_train)

In [None]:
# Score the refit pipeline on the validation split, restricted to the selected columns.
y_pred = best_rfr.predict(X_val.loc[:, selected_feat])
print('Best params:\n', bo_rfr.best_params_)
print('Best features found:\n', X_train.columns[selected_feat])
for label, metric in (("RMSE: ", rmse), ("MAE: ", mae)):
    print(label, metric(y_val, y_pred))

### Support Vector Machine

**Grid Search**

```
param_space = [
    {
        "kernel": "poly",
        "degree": [3, 5, 10, 15, 20],
        "gamma": ["auto", "scale"],
        "coef0": [0.0, 2.5, 5.0],
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "C": [0.0001, 0.01, 1.0, 100.0, 1000.0],
        "epsilon": [0.05, 0.1, .5],
        "shrinking": [False, True],
        "max_iter": [-1, 100, 500, 1000],
    },
    {
        "kernel": "rbf",
        "gamma": ["auto", "scale"],
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "C": [0.0001, 0.01, 1.0, 100.0, 1000.0],
        "epsilon": [0.05, 0.1, .5],
        "shrinking": [False, True],
        "max_iter": [-1, 100, 500, 1000],
    },
    {
        "kernel": "sigmoid",
        "gamma": ["auto", "scale"],
        "coef0": [0.0, 2.5, 5.0],
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "C": [0.0001, 0.01, 1.0, 100.0, 1000.0],
        "epsilon": [0.05, 0.1, .5],
        "shrinking": [False, True],
        "max_iter": [-1, 100, 500, 1000],
    },
    {
        "kernel": ["linear"],
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "C": [0.0001, 0.01, 1.0, 100.0, 1000.0],
        "epsilon": [0.05, 0.1, .5],
        "shrinking": [False, True],
        "max_iter": [-1, 100, 500, 1000],
    }
]
```

```
svr = SVR()
gs_svr = GridSearchCV(svr, param_space, n_jobs=-1, cv=cv_folds)
gs_svr.fit(X_train, y_train)
```

```
print('Best parameters found:\n', gs_svr.best_params_)
y_pred = gs_svr.predict(X_val)
print("Best accuracy: ", rmse(y_val, y_pred))
print(classification_report(y_val, y_pred))
```

**Bayesian Optimization**

In [None]:
# Hyperparameter ranges for the Bayesian search over SVR (passed to
# BayesSearchCV in the next cell). All kernels share one space, so kernel-specific
# settings (per the scikit-learn docs, `degree` for "poly", `coef0` for
# "poly"/"sigmoid") are sampled even when the drawn kernel ignores them.
search_space = {
    "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
    "degree": Integer(3, 20),
    "gamma": Categorical(["auto", "scale"]),
    "coef0": Real(0.0, 5.0),
    "tol": Real(0.0001, 0.1),
    "C": Real(0.0001, 1000.0),
    "epsilon": Real(0.05, .5),
    "shrinking": Categorical([False, True]),
    "max_iter": Integer(100, 5000),
}

In [None]:
# Bayesian search (50 sampled configurations, `cv_folds`-fold CV) over the SVR space above.
svr = SVR()
bo_svr = BayesSearchCV(svr, search_space, n_iter=50, n_jobs=-1, cv=cv_folds, random_state=random_state)
# NOTE(review): X_train / y_train are not defined by any cell visible here;
# assign them (e.g. from the mask or bbox split) before running this cell.
bo_svr.fit(X_train, y_train)

In [None]:
# Report the winning configuration and its error on the validation split.
print('Best parameters found:\n', bo_svr.best_params_)
y_pred = bo_svr.predict(X_val)
for label, metric in (("RMSE: ", rmse), ("MAE: ", mae)):
    print(label, metric(y_val, y_pred))

### Feature Selection (Support Vector Machine)

In [None]:
svr = SVR()
# Univariate feature selection (f_regression scores) followed by the SVR.
pipe = Pipeline([('selector', SelectKBest(f_regression)), ('svr', svr)])

# `selector__k` ranges from half of the columns up to all but one; the
# `svr__*` entries are the SVR hyperparameters, addressed through the pipeline step name.
search_space = {
    "selector__k": Integer(X_train.shape[1] // 2, X_train.shape[1] - 1),
    "svr__kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
    "svr__degree": Integer(3, 20),
    "svr__gamma": Categorical(["auto", "scale"]),
    "svr__coef0": Real(0.0, 5.0),
    "svr__tol": Real(0.0001, 0.1),
    "svr__C": Real(0.0001, 1000.0),
    "svr__epsilon": Real(0.05, .5),
    "svr__shrinking": Categorical([False, True]),
    "svr__max_iter": Integer(100, 5000),
}

bo_svr = BayesSearchCV(pipe, search_space, n_iter=75, n_jobs=-1, cv=cv_folds, random_state=random_state)
bo_svr.fit(X_train, y_train)

# Boolean mask over X_train's columns marking the features the best selector kept.
selected_feat = bo_svr.best_estimator_.named_steps["selector"].get_support()
# Unfitted copy of the best pipeline, refit on the selected columns only.
# NOTE(review): the clone still contains the selector with the same k, so the
# selection step runs a second time on the already-reduced column set.
best_svr = clone_model(bo_svr.best_estimator_)
best_svr.fit(X_train.loc[:, selected_feat], y_train)

In [None]:
# Score the refit pipeline on the validation split, restricted to the selected columns.
y_pred = best_svr.predict(X_val.loc[:, selected_feat])
print('Best params:\n', bo_svr.best_params_)
print('Best features found:\n', X_train.columns[selected_feat])
for label, metric in (("RMSE: ", rmse), ("MAE: ", mae)):
    print(label, metric(y_val, y_pred))

### Multilayer Perceptron

**Grid Search**

```
param_space = [
    {
        "solver": "lbfgs",
        "hidden_layer_sizes": [(100,), (50, 50,), (50, 25, 25,)], # pick better ones
        "activation": ["identity", "logistic", "tanh", "relu"],
        "alpha": [0.00001, 0.0001, 0.001],
        "max_iter": [200, 500, 1000],
        "random_state": random_state,
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "max_fun": [10000, 15000],
    },
    {
        "solver": "adam",
        "hidden_layer_sizes": [(100,), (50, 50,), (50, 25, 25,)], # pick better ones
        "activation": ["identity", "logistic", "tanh", "relu"],
        "alpha": [0.00001, 0.0001, 0.001],
        "max_iter": [200, 500, 1000],
        "random_state": random_state,
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "batch_size": ['auto', n_samples // 5, n_samples // 10],
        "learning_rate_init": [0.0005, 0.001, 0.005],
        "shuffle": [False, True],
        "early_stopping": [False, True],
        "validation_fraction": [0.1, 0.15],
        "n_iter_no_change": [10, 15],
        # not sure abt these values, should we be changing these?
        "beta_1": [0.75, 0.9],
        "beta_2": [0.85, 0.999],
        "epsilon": [1e-07, 1e-08],
    },
    {
        "solver": "sgd",
        "hidden_layer_sizes": [(100,), (50, 50,), (50, 25, 25,)], # pick better ones
        "activation": ["identity", "logistic", "tanh", "relu"],
        "alpha": [0.00001, 0.0001, 0.001],
        "max_iter": [200, 500, 1000],
        "random_state": random_state,
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "batch_size": ['auto', n_samples // 5, n_samples // 10],
        "learning_rate_init": [0.0005, 0.001, 0.005],
        "shuffle": [False, True],
        "early_stopping": [False, True],
        "validation_fraction": [0.1, 0.15],
        "n_iter_no_change": [10, 15],
        "learning_rate": ["constant", "invscaling", "adaptive"],
        "momentum": [0.75, 0.9],
        "nesterovs_momentum": [False, True],
    }
]
```

```
mpr = MLPRegressor()
gs_mpr = GridSearchCV(mpr, param_space, n_jobs=-1, cv=cv_folds)
gs_mpr.fit(X_train, y_train)
```

```
print('Best parameters found:\n', gs_mpr.best_params_)
y_pred = gs_mpr.predict(X_val)
print("Best accuracy: ", rmse(y_val, y_pred))
print(classification_report(y_val, y_pred))
```

**Bayesian Optimization**

In [None]:
# Width of the first hidden layer: two thirds of the input-layer size plus one
# output unit. The input layer is as wide as the number of feature columns
# (shape[1]); the previous shape[0] used the number of training rows instead,
# which makes the layers scale with dataset size rather than with the inputs.
hidden_sz = X_train.shape[1] * 2 // 3 + 1

# Hyperparameter ranges for the Bayesian search over MLPRegressor. All solvers
# share one space, so solver-specific settings (per the scikit-learn docs,
# momentum for "sgd", beta_1/beta_2/epsilon for "adam", max_fun for "lbfgs")
# are sampled even when the drawn solver ignores them.
search_space = {
    "activation": Categorical(["identity", "logistic", "tanh", "relu"]),
    "solver": Categorical(["lbfgs", "adam", "sgd"]),
    "alpha": Real(0.00001, 0.001),
    "learning_rate": Categorical(["constant", "invscaling", "adaptive"]),
    "learning_rate_init": Real(0.0005, 0.005),
    "max_iter": Integer(200, 1000),
    "shuffle": Categorical([False, True]),
    "tol": Real(0.0001, 0.1),
    "momentum": Real(0.75, 0.9),
    "nesterovs_momentum": Categorical([False, True]),
    "early_stopping": Categorical([False, True]),
    "validation_fraction": Real(0.1, 0.15),
    "beta_1": Real(0.75, 0.9),
    "beta_2": Real(0.85, 0.999),
    "epsilon": Real(1e-08, 1e-07),
    "n_iter_no_change": Integer(10, 15),
    "max_fun": Integer(10000, 15000),
}

In [None]:
# Three hidden layers, each two thirds the width of the previous one.
mpr = MLPRegressor(random_state=random_state, hidden_layer_sizes=(hidden_sz, hidden_sz * 2 // 3, hidden_sz * 4 // 9))
# Seed the search like the RF and SVR searches do, so the sampled
# configurations are reproducible.
bo_mpr = BayesSearchCV(mpr, search_space, n_iter=50, n_jobs=-1, cv=cv_folds, random_state=random_state)
# NOTE(review): X_train / y_train are not defined by any cell visible here;
# assign them (e.g. from the mask or bbox split) before running this cell.
bo_mpr.fit(X_train, y_train)

In [None]:
# Report the winning configuration and its error on the validation split.
print('Best parameters found:\n', bo_mpr.best_params_)
y_pred = bo_mpr.predict(X_val)
# This is a regression task, so report the same error metrics as the RF and SVR
# searches. The previous classification_report call was never imported and
# does not apply to continuous targets.
print("RMSE: ", rmse(y_val, y_pred))
print("MAE: ", mae(y_val, y_pred))

### Feature Selection (Multilayer Perceptron)

In [None]:
# Three hidden layers, each two thirds the width of the previous one;
# `hidden_sz` comes from the MLP search-space cell above.
mpr = MLPRegressor(random_state=random_state, hidden_layer_sizes=(hidden_sz, hidden_sz * 2 // 3, hidden_sz * 4 // 9))
# Univariate feature selection (f_regression scores) followed by the MLP.
pipe = Pipeline([('selector', SelectKBest(f_regression)), ('mpr', mpr)])

# `selector__k` ranges from half of the columns up to all but one; the
# `mpr__*` entries are the MLP hyperparameters, addressed through the pipeline step name.
search_space = {
    "selector__k": Integer(X_train.shape[1] // 2, X_train.shape[1] - 1),
    "mpr__activation": Categorical(["identity", "logistic", "tanh", "relu"]),
    "mpr__solver": Categorical(["lbfgs", "adam", "sgd"]),
    "mpr__alpha": Real(0.00001, 0.001),
    "mpr__learning_rate": Categorical(["constant", "invscaling", "adaptive"]),
    "mpr__learning_rate_init": Real(0.0005, 0.005),
    "mpr__max_iter": Integer(200, 1000),
    "mpr__shuffle": Categorical([False, True]),
    "mpr__tol": Real(0.0001, 0.1),
    "mpr__momentum": Real(0.75, 0.9),
    "mpr__nesterovs_momentum": Categorical([False, True]),
    "mpr__early_stopping": Categorical([False, True]),
    "mpr__validation_fraction": Real(0.1, 0.15),
    "mpr__beta_1": Real(0.75, 0.9),
    "mpr__beta_2": Real(0.85, 0.999),
    "mpr__epsilon": Real(1e-08, 1e-07),
    "mpr__n_iter_no_change": Integer(10, 15),
    "mpr__max_fun": Integer(10000, 15000),
}

# NOTE(review): n_iter is 76 here but 75 in the RF and SVR feature-selection
# searches; confirm whether the difference is intentional.
bo_mpr = BayesSearchCV(pipe, search_space, n_iter=76, n_jobs=-1, cv=cv_folds, random_state=random_state)
bo_mpr.fit(X_train, y_train)

# Boolean mask over X_train's columns marking the features the best selector kept.
selected_feat = bo_mpr.best_estimator_.named_steps["selector"].get_support()
# Unfitted copy of the best pipeline, refit on the selected columns only.
# NOTE(review): the clone still contains the selector with the same k, so the
# selection step runs a second time on the already-reduced column set.
best_mpr = clone_model(bo_mpr.best_estimator_)
best_mpr.fit(X_train.loc[:, selected_feat], y_train)

In [None]:
# Score the refit pipeline on the validation split, restricted to the selected columns.
y_pred = best_mpr.predict(X_val.loc[:, selected_feat])
print('Best params:\n', bo_mpr.best_params_)
print('Best features found:\n', X_train.columns[selected_feat])
for label, metric in (("RMSE: ", rmse), ("MAE: ", mae)):
    print(label, metric(y_val, y_pred))

## Evaluation

In [None]:
import joblib
# Make sure the output directory exists before anything is written into it.
os.makedirs("model_dumps", exist_ok=True)

# Fitted search objects from the feature-selection cells.
joblib.dump(bo_rfr, "model_dumps/rf_bosearch.pkl")
joblib.dump(bo_svr, "model_dumps/svm_bosearch.pkl")
joblib.dump(bo_mpr, "model_dumps/mlp_bosearch.pkl")

# Best pipelines refit on the selected feature columns.
joblib.dump(best_rfr, "model_dumps/rf.pkl")
joblib.dump(best_svr, "model_dumps/svm.pkl")
joblib.dump(best_mpr, "model_dumps/mlp.pkl")

#