<a href="https://colab.research.google.com/github/MrJangoBox/Team-13-Padelol-Kaggle-IFT-6578/blob/main/Detailed_Best_Public_Score_1_69530_IFT_6578_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to Team-13 Implementation
Link: https://colab.research.google.com/drive/1dY0KaHImIk1mJ65wyk5EFx2GSEfUdcJv?usp=sharing

In [None]:
!pip install geotext

Collecting geotext
[?25l  Downloading https://files.pythonhosted.org/packages/25/c5/36351193092cb4c1d7002d2a3babe5e72ae377868473933d6f63b41e5454/geotext-0.4.0-py2.py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 5.7MB/s 
[?25hInstalling collected packages: geotext
Successfully installed geotext-0.4.0


In [None]:
import gdown
import datetime
import numpy as np
import pandas as pd
import sklearn
from geotext import GeoText
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, MaxAbsScaler, FunctionTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.svm import NuSVR
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor

### Load data

In [None]:
# Load data
if 'google.colab' in str(get_ipython()):
    # On Colab the CSV files are not on disk yet: download them from Google Drive.
    files = [
        ('test.csv',
         'https://drive.google.com/uc?id=1gu6DXcEr4agiGttFj2ubToSICl5NDGKw'),
        ('train.csv',
         'https://drive.google.com/uc?id=1x-Baw1riR-7wKZTpAvXe9RCCC0zu1sXv'),
    ]

    for filename, url in files:
        gdown.download(url, filename, quiet=True)

    train_data, test_data = pd.read_csv('train.csv'), pd.read_csv('test.csv')
else:
    # Local run: the CSV files are expected one directory above the notebook.
    train_data, test_data = pd.read_csv('../train.csv'), pd.read_csv('../test.csv')

# Rename columns: lowercase, spaces -> underscores, question marks removed.
# regex=False makes both replacements literal on every pandas version
# ('?' is a regex metacharacter, and the default of `regex` changed between releases).
train_cols = (train_data.columns.str.lower()
              .str.replace(' ', '_', regex=False)
              .str.replace('?', '', regex=False))
# The test set is given the training column names minus the last one (the target).
test_cols = train_cols[:-1]
train_data.columns = train_cols
test_data.columns = test_cols
# Split train dataset to train and validation (70/30, fixed seed for reproducibility);
# the last column is used as the target.
X_train, X_valid, y_train, y_valid = train_test_split(train_data.iloc[:, :-1],
                                                      train_data.iloc[:, -1],
                                                      test_size=0.3,
                                                      shuffle=True,
                                                      random_state=76)

### Functions

In [None]:
def rmsle(y_true, y_pred):
    ''' Root mean squared logarithmic error between targets and predictions. '''
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)


def clean_profile_category(df):
    ''' Replace blank (empty or whitespace-only) values with "unknown".

    Every column is cast to str first. Only cells that consist entirely of
    whitespace are replaced; spaces inside a real value are left untouched.
    Returns a new DataFrame with the same columns.
    '''
    def clean_column(col):
        # Anchored pattern: matches the whole cell only when it is blank.
        return col.astype(str).str.replace(r'^\s*$', 'unknown', regex=True)

    return df.apply(clean_column)


def clean_user_language(df):
    ''' keep only the part of each language code before the first "-" (e.g. "en-GB" -> "en") '''
    def primary_tag(col):
        pieces = col.astype(str).str.split('-')
        return pieces.str.get(0)

    return df.apply(primary_tag)


def lower_location_visibility(df):
    ''' cast every column to str and lowercase it '''
    def lowercase_column(col):
        as_text = col.astype(str)
        return as_text.str.lower()

    return df.apply(lowercase_column)


def date2int(df):
    ''' Convert profile creation timestamps to an age in whole days.

    Reads df['profile_creation_timestamp'], parses it with pd.to_datetime and
    returns a one-column float DataFrame (same column name) holding the number
    of calendar days between each creation date and 2020-11-24.
    Missing or unparseable-as-NaT timestamps yield NaN instead of raising.
    '''
    end = pd.Timestamp(2020, 11, 24)  # when Kaggle competition started
    created = pd.to_datetime(df['profile_creation_timestamp'])
    if created.dt.tz is not None:
        # Drop the zone but keep the wall-clock time, so the calendar date used
        # below is the same one `.dt.date` would report.
        created = created.dt.tz_localize(None)
    # normalize() truncates to midnight, so the difference is a whole number of days.
    result = (end - created.dt.normalize()) / pd.Timedelta(days=1)
    return pd.DataFrame(result)


def date2ym(df):
    ''' Encode each profile creation timestamp as the integer YYYYMM (year * 100 + month). '''
    created = pd.to_datetime(df['profile_creation_timestamp']).dt
    result = created.year * 100 + created.month
    return pd.DataFrame(result)


def to_string(df):
    ''' Cast every column of df to str. '''
    def as_text(col):
        return col.astype(str)

    return df.apply(as_text)


def if_black(df):
    ''' Flag every cell: True when it differs from the hex string '000000', False when it equals it.

    Note the polarity: despite the function name, True means "not black".
    '''
    def not_black(value):
        return value != '000000'

    def flag_row(row):
        return row.apply(not_black)

    # Row-wise apply; each row comes back as a Series of flags.
    return df.apply(flag_row, axis=1)


def extract_language(df):
    ''' Reduce each language code to one of a few popular two-letter codes.

    Each cell is lowercased and cut to its first two characters; codes outside
    `popular_languages` become 'xx'. Non-string cells (e.g. NaN) also become
    'xx' instead of raising. Returns a DataFrame with the same columns.
    '''
    popular_languages = ['de', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'tr']

    def extract_language_helper(lang):
        if not isinstance(lang, str):
            # Missing values arrive as float NaN / None and have no .lower().
            return 'xx'
        lang = lang.lower()[:2]
        return lang if lang in popular_languages else 'xx'

    # Column-wise Series.map instead of the deprecated DataFrame.applymap.
    return df.apply(lambda col: col.map(extract_language_helper))


# Reference palette: RGB triple -> colour name. hex2col maps a colour to the
# nearest entry of this palette.
color_dict = {(255, 0, 0): 'red', (0, 255, 0): 'lime', (0, 0, 255): 'blue'}
# The palette keys as an array with one row per reference colour.
color_dict_ind = np.array(list(color_dict.keys()))


def transform_colors(df):
    ''' Apply hex2col to every cell of df and return the resulting DataFrame. '''
    # Column-wise Series.map instead of the deprecated DataFrame.applymap.
    return df.apply(lambda col: col.map(hex2col))


def hex2col(hex):
    ''' Map a hex colour string to the name of the closest palette colour.

    `hex` is read two characters at a time as base-16 channel values, so it is
    expected to hold six hex digits (three channels, matching the palette).
    Null input returns 'unknown'. Ties go to the first palette entry.
    '''
    if pd.isnull(hex):
        return 'unknown'
    rgb = [int(hex[i:i+2], 16) for i in range(0, len(hex), 2)]
    # Squared Euclidean distance to each palette colour (same argmin as the true distance).
    distances = ((color_dict_ind - rgb) ** 2).sum(axis=1)

    code = tuple(color_dict_ind[np.argmin(distances)])
    return color_dict[code]


def text2loc(text):
    ''' Summarise the countries GeoText finds in a free-text location.

    Returns 'MISSING' for null input, 'UNKNOWN' when no country is mentioned,
    'MULTIPLE' when more than one is, otherwise the single key of
    `country_mentions`.
    '''
    if pd.isnull(text):
        return 'MISSING'
    mentioned = GeoText(text).country_mentions
    if not mentioned:
        return 'UNKNOWN'
    if len(mentioned) > 1:
        return 'MULTIPLE'
    return next(iter(mentioned))


def transform_locations(df):
    ''' Apply text2loc to every cell of df and return the resulting DataFrame.

    The input frame is not modified.
    '''
    # Column-wise Series.map instead of the deprecated DataFrame.applymap;
    # it builds a new frame, so no defensive copy is needed.
    return df.apply(lambda col: col.map(text2loc))


def utc(df):
    ''' Fill missing values with the column mean of the frame passed in, then divide by 60. '''
    column_means = df.mean()
    filled = df.fillna(column_means)
    return filled / 60


def avg_features(df):
    ''' Divide every column except the last by the profile age in days.

    The age comes from date2int on 'profile_creation_timestamp'; the last
    column of df is dropped from the numerator, so the timestamp is expected
    to be the last column. Returns a NumPy array.
    '''
    age_in_days = date2int(df[['profile_creation_timestamp']]).to_numpy()
    totals = df.iloc[:, :-1].to_numpy()
    return totals / age_in_days


def follower_ratio(df):
    ''' Ratio (followers + 1) / (following + 1) as a one-column DataFrame. '''
    # The +1 on both sides avoids division by zero when following == 0.
    followers = df['num_of_followers'] + 1
    following = df['num_of_people_following'] + 1
    return pd.DataFrame(followers / following)


def avg_clicks_times_following(df):
    ''' Product of average daily profile clicks and number of people followed.

    Missing click values are filled with the median of the clicks column of
    the frame passed in. Returns a one-column DataFrame.
    '''
    clicks = df['avg_daily_profile_clicks']
    clicks_filled = clicks.fillna(clicks.median())
    return pd.DataFrame(clicks_filled * df['num_of_people_following'])


class DenseTransformer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):
    ''' Stateless pipeline step that converts a sparse matrix to a dense ndarray.

    Input that is already dense is passed through unchanged. Inheriting from
    BaseEstimator provides get_params/set_params so the step can be cloned
    like any other estimator.
    '''
# https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
    def fit(self, X, y=None, **fit_params):
        ''' Nothing to learn; returns self. '''
        return self

    def transform(self, X, y=None, **fit_params):
        ''' Return X as a dense array. '''
        if hasattr(X, 'toarray'):
            # toarray() yields an ndarray; todense() would yield np.matrix.
            return X.toarray()
        # Already dense (no sparse interface): nothing to convert.
        return X

In [None]:
# Scorer wrapping rmsle for GridSearchCV. greater_is_better=False marks it as a
# loss, so reported scores are negated (see the negative values in the RESULTS
# comments and the sign flip in `objective`).
score = make_scorer(rmsle, greater_is_better=False)

### Column transformers

In [None]:
# Each section below defines a list of input columns and the sub-pipeline applied
# to them. The step names ('imputer', 'scaler', 'extract', 'clean', ...) are
# addressed by the grid-search parameter keys further down, so keep them stable.

# Numeric skewed features: KNN-impute, log1p, then standardize.
numeric_exp = ['num_of_followers', 'num_of_people_following',
               'num_of_status_updates', 'num_of_direct_messages',
               'avg_daily_profile_clicks']
numeric_exp_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=2, weights='uniform')),
    ('log', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler())])

# Numeric normally distributed features: KNN-impute and standardize (no log).
numeric_norm = ['avg_daily_profile_visit_duration_in_seconds']
numeric_norm_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=1, weights='uniform')),
    ('scaler', StandardScaler())])

# utc_offset treated as a category (it is also used numerically in utc_transformer).
# NOTE(review): to_string runs before the imputer, so missing values have probably
# already become the string 'nan' when they reach it -- confirm.
categorical_features = ['utc_offset']
categorical_transformer = Pipeline(steps=[
    ('clean', FunctionTransformer(to_string)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# utc_offset as a number: mean-filled and divided by 60 (see utc), then standardized.
utc_feature = ['utc_offset']
utc_transformer = Pipeline(steps=[
    ('utc', FunctionTransformer(utc)),
    ('scaler', StandardScaler())])

# Flag-like columns: missing values become the category 'Not set', then one-hot.
boolean_features = ['is_profile_view_size_customized',
                    'profile_cover_image_status',
                    'profile_verification_status']
boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Not set')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Colour columns: reduced to a flag per column (if_black), then one-hot.
color_features = ['profile_text_color',
                  'profile_page_color',
                  'profile_theme_color']
color_transformer = Pipeline(steps=[
    ('extract', FunctionTransformer(if_black)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Profile category: cleaned by clean_profile_category, then one-hot.
profile_category = ['profile_category']
profile_category_transformer = Pipeline(steps=[
    ('clean', FunctionTransformer(clean_profile_category)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# User language: reduced by clean_user_language, then one-hot.
user_language = ['user_language']
user_language_transformer = Pipeline(steps=[
    ('clean', FunctionTransformer(clean_user_language)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Personal URL: only its presence is kept (pd.notna), then one-hot.
personal_url = ['personal_url']
personal_url_transformer = Pipeline(steps=[
    ('extract', FunctionTransformer(pd.notna)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Free-text location: mapped to a country label by transform_locations, then one-hot.
location_feature = ['location']
location_transformer = Pipeline(steps=[
    ('extract', FunctionTransformer(transform_locations)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Location visibility: lowercased, then one-hot.
loc_visib = ['location_public_visibility']
loc_visib_transformer = Pipeline(steps=[
    ('lower', FunctionTransformer(lower_location_visibility)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Profile age in days (date2int), log1p, standardized.
time_feature = ['profile_creation_timestamp']
time_transformer = Pipeline(steps=[
    ('extract', FunctionTransformer(date2int)),
    ('log', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler())])

# Count features divided by profile age in days (avg_features).
# The timestamp must stay the LAST entry: avg_features drops the last column
# from the numerator and reads the timestamp by name for the denominator.
average_features = ['num_of_followers', 'num_of_people_following',
                    'num_of_status_updates', 'num_of_direct_messages',
                    'profile_creation_timestamp']
average_transformer = Pipeline(steps=[
        ('avg', FunctionTransformer(avg_features)),
        ('log', FunctionTransformer(np.log1p)),
        ('scaler', MinMaxScaler())])

# (followers + 1) / (following + 1), log1p, standardized.
ratio_features = ['num_of_followers', 'num_of_people_following']
ratio_transformer = Pipeline(steps=[
        ('ratio', FunctionTransformer(follower_ratio)),
        ('log', FunctionTransformer(np.log1p)),
        ('scaler', StandardScaler())])

# Average daily clicks * number of people followed, log1p, min-max scaled.
product_features = ['avg_daily_profile_clicks', 'num_of_people_following']
product_transformer = Pipeline(steps=[
        ('product', FunctionTransformer(avg_clicks_times_following)),
        ('log', FunctionTransformer(np.log1p)),
        ('scaler', MinMaxScaler())])

#### Combine Transformers

In [None]:
# Combine every per-feature sub-pipeline into one ColumnTransformer.
# The transformer names given here ('numeric_exp', 'loc', ...) are the ones used
# in `transformer_weights` and in the 'preprocessor__<name>__...' search keys.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_exp', numeric_exp_transformer, numeric_exp),
        ('numeric_norm', numeric_norm_transformer, numeric_norm),
        ('categorical', categorical_transformer, categorical_features),
        ('boolean', boolean_transformer, boolean_features),
        ('color', color_transformer, color_features),
        ('profile_category', profile_category_transformer, profile_category),
        ('user_language', user_language_transformer, user_language),
        ('loc', location_transformer, location_feature),
        ('loc_visib', loc_visib_transformer, loc_visib),
        ('personal_url', personal_url_transformer, personal_url),
        ('time', time_transformer, time_feature),
        ('utc', utc_transformer, utc_feature),
        ('ratio', ratio_transformer, ratio_features),
        ('prod', product_transformer, product_features),
        ('avg', average_transformer, average_features)],
    
    # Hyperopt optimized weights: each transformer's output is multiplied by its weight.
    # NOTE(review): 'categorical' is not in `weights_space` nor in the recorded
    # hyperopt result further down -- confirm where its value came from.
    transformer_weights={
        'avg': 0.7429247838555282,
        'boolean': 1.0411784050247315,
        'categorical': 1.1692751076415553,
        'color': 1.3113898616879212,
        'loc': 0.9317583766578823,
        'loc_visib': 1.703423501322376,
        'numeric_exp': 1.028338292619148,
        'numeric_norm': 0.08119173992748974,
        'personal_url': 1.8144143269129127,
        'prod': 1.5566389246824337,
        'profile_category': 1.091936715275474,
        'ratio': 1.4571284181542004,
        'time': 0.6867805554464619,
        'user_language': 0.9337466050764849,
        'utc': 0.7839861950053606
        })

### Target transformer coupled with a voting regressor

In [None]:
# Hyperopt optimized hyperparameters (rounded from the RESULTS recorded next to
# `nusvr_space` and `ada_space` further down).
NuSVR_regressor = NuSVR(nu=0.718, C=0.634)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`;
# confirm against the installed version before upgrading.
AdaBoost_regressor = AdaBoostRegressor(random_state=0, n_estimators=724,
                              base_estimator=DecisionTreeRegressor(max_depth=17),
                              learning_rate=2.06, loss='exponential')
# Target transform applied by TransformedTargetRegressor: the regressors are
# fitted on the quantile-transformed target and predictions are mapped back.
transformer = QuantileTransformer(n_quantiles=11, output_distribution='normal')
# The two regressors are combined in a VotingRegressor (no weights given).
regressor_vote = VotingRegressor(
    estimators=[
        ('NuSVR', NuSVR_regressor),
        ('AdaBoost', AdaBoost_regressor),
        ])
regr = TransformedTargetRegressor(regressor=regressor_vote,
                                  transformer=transformer)

### Main pipeline

In [None]:
# Full model: feature preprocessing -> densify -> target-transformed voting regressor.
# The step names ('preprocessor', 'regressor') are the prefixes used by the
# grid-search / hyperopt parameter keys below.
estimators = [('preprocessor', preprocessor),
              ('to_dense', DenseTransformer()),
              ('regressor', regr)]
pipe = Pipeline(estimators)

### Train on the training split and evaluate on the validation split

In [None]:
# Fit on the training split only, then report RMSLE on both splits.
_ = pipe.fit(X_train, y_train)
predictions = pipe.predict(X_valid)
rmsle_train = rmsle(y_train, pipe.predict(X_train))
rmsle_valid = rmsle(y_valid, predictions)
for split_name, split_error in (('Train', rmsle_train), ('Valid', rmsle_valid)):
    print(f'{split_name} error: {split_error:.3f}')

Train error: 0.915
Valid error: 1.726


### Train on whole data

In [None]:
# Refit the same pipeline on training + validation rows, then predict the test set.
X_train_whole, y_train_whole = (pd.concat([X_train, X_valid]),
                                pd.concat([y_train, y_valid]))
# Pipeline.fit returns the pipeline itself, so predict can be chained.
pred_submission = pipe.fit(X_train_whole, y_train_whole).predict(test_data)

### Save Predictions

In [None]:
# Predictions are floored to integers before being written out.
predicted = np.floor(pred_submission).astype('int')
output = pd.DataFrame({'Id': test_data['id'], 'Predicted': predicted})
output.to_csv('submission.csv', index=False)

- - - 

# ================= Model End ====================

### Grid search for hyperparameters of feature transformations

In [None]:
# Imputer search over a transformer named 'numeric_features': mean/median
# SimpleImputer versus KNNImputer with 1-10 neighbours and both weightings.
# NOTE(review): the ColumnTransformer defined above has no 'numeric_features'
# entry (it has 'numeric_exp' and 'numeric_norm'), so this grid appears to
# predate the current pipeline; `imputer` is reassigned in the next cell.
imputer = [dict(
    preprocessor__numeric_features__imputer=[SimpleImputer()],
    preprocessor__numeric_features__imputer__strategy=['mean', 'median']),
           dict(
    preprocessor__numeric_features__imputer=[KNNImputer()],
    preprocessor__numeric_features__imputer__n_neighbors=np.arange(1, 11, 1, dtype=int),
    preprocessor__numeric_features__imputer__weights=['uniform', 'distance'])
]

# RESULTS
# best: {'preprocessor__numeric_features__imputer': KNNImputer(n_neighbors=3, weights='distance'),
#        'preprocessor__numeric_features__imputer__n_neighbors': 3,
#        'preprocessor__numeric_features__imputer__weights': 'distance'}

In [None]:
# One grid per transformer; each grid is searched on its own in the loop below.
imputer = [
    {'preprocessor__numeric_exp__imputer': [KNNImputer()],
     'preprocessor__numeric_exp__imputer__n_neighbors': np.arange(1, 6, 1, dtype=int),
     'preprocessor__numeric_exp__imputer__weights': ['uniform', 'distance']},
    {'preprocessor__numeric_norm__imputer': [KNNImputer()],
     'preprocessor__numeric_norm__imputer__n_neighbors': np.arange(1, 6, 1, dtype=int),
     'preprocessor__numeric_norm__imputer__weights': ['uniform', 'distance']},
    {'preprocessor__categorical__imputer__strategy': ['constant', 'most_frequent']},
    {'preprocessor__boolean__imputer__strategy': ['constant', 'most_frequent']},
]

for param_grid in imputer:
    print('Testing:', param_grid)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=score, n_jobs=-1)
    _ = grid_search.fit(X_train, y_train)
    print(grid_search.best_score_)
    print(grid_search.best_params_)

# RESULTS
# Testing: {'preprocessor__numeric_exp__imputer'}
# {'preprocessor__numeric_exp__imputer': KNNImputer(n_neighbors=2),
#  'preprocessor__numeric_exp__imputer__n_neighbors': 2,
#  'preprocessor__numeric_exp__imputer__weights': 'uniform'}
# Testing: {'preprocessor__numeric_norm__imputer': }
# {'preprocessor__numeric_norm__imputer': KNNImputer(n_neighbors=1),
#  'preprocessor__numeric_norm__imputer__n_neighbors': 1,
#  'preprocessor__numeric_norm__imputer__weights': 'uniform'}
# Testing: {'preprocessor__categorical__imputer__strategy'}
# {'preprocessor__categorical__imputer__strategy': 'constant'}
# Testing: {'preprocessor__boolean__imputer__strategy'}
# {'preprocessor__boolean__imputer__strategy': 'constant'}

In [None]:
# Scaler search: three candidate scalers for the 'scaler' step of each numeric
# sub-pipeline. These dicts are only defined here; pass one as `param_grid`
# to a grid search to run it.
# NOTE(review): the ColumnTransformer defined above has no transformer named
# 'numeric' (it has 'numeric_exp' and 'numeric_norm'), so the
# 'preprocessor__numeric__scaler' key looks stale -- confirm before re-running.
scaler = dict(
    preprocessor__numeric__scaler=[StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    preprocessor__time__scaler=[StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    preprocessor__utc__scaler=[StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    preprocessor__avg__scaler=[StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    preprocessor__prod__scaler=[StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    preprocessor__ratio__scaler=[StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
)
# RESULTS
# preprocessor__numeric__scaler=[StandardScaler()],
# preprocessor__time__scaler=[StandardScaler()],
# preprocessor__utc__scaler=[StandardScaler()],
# preprocessor__avg__scaler=[MinMaxScaler()],
# preprocessor__prod__scaler=[MinMaxScaler()],
# preprocessor__ratio__scaler=[StandardScaler()],

# Second round: the winner of the first round versus a normal-output
# QuantileTransformer, per transformer.
scaler_2 = dict(
    preprocessor__numeric__scaler=[StandardScaler(), QuantileTransformer(output_distribution='normal')],
    preprocessor__time__scaler=[StandardScaler(), QuantileTransformer(output_distribution='normal')],
    preprocessor__utc__scaler=[StandardScaler(), QuantileTransformer(output_distribution='normal')],
    preprocessor__avg__scaler=[MinMaxScaler(), QuantileTransformer(output_distribution='normal')],
    preprocessor__prod__scaler=[MinMaxScaler(), QuantileTransformer(output_distribution='normal')],
    preprocessor__ratio__scaler=[StandardScaler(), QuantileTransformer(output_distribution='normal')],
)
# RESULTS
# Quantile scaling is no better across the board

In [None]:
# Alternative feature-extraction functions per sub-pipeline step.
# Each entry is searched separately, holding the rest of the pipeline fixed.
preprocessing = {
    'preprocessor__time__extract': [FunctionTransformer(date2int),
                                    FunctionTransformer(date2ym)],
    'preprocessor__color__extract': [FunctionTransformer(if_black),
                                     FunctionTransformer(transform_colors)],
    'preprocessor__user_language__clean': ['passthrough',
                                           FunctionTransformer(clean_user_language),
                                           FunctionTransformer(extract_language)],
    'preprocessor__loc__extract': ['passthrough',
                                   FunctionTransformer(transform_locations),
                                   FunctionTransformer(pd.notna)],
}

for step_name, candidates in preprocessing.items():
    print('Testing:', step_name)
    grid_search = GridSearchCV(pipe, param_grid={step_name: candidates}, scoring=score, n_jobs=-1)
    _ = grid_search.fit(X_train, y_train)
    print(grid_search.best_score_)
    print(grid_search.best_params_)

# RESULTS
# -1.7491152606998064 {'preprocessor__time__extract': FunctionTransformer(date2int)}
# -1.7437539291750308 {'preprocessor__color__extract': FunctionTransformer(if_black)}
# -1.7486062782477316 {'preprocessor__user_language__clean': FunctionTransformer(clean_user_language)}
# -1.7491152606998064 {'preprocessor__loc__extract': FunctionTransformer(transform_locations)}

In [None]:
# Feature-selection grids: each transformer is either dropped ('drop') or kept
# as defined. A full grid over one dict tries every keep/drop combination
# (2^8 for the first dict, 2^7 for the second).
# NOTE(review): presumably split in two to keep each search tractable -- confirm.
f_selection_1 = dict(
    preprocessor__numeric_exp=['drop', numeric_exp_transformer],
    preprocessor__numeric_norm=['drop', numeric_norm_transformer],
    preprocessor__categorical=['drop', categorical_transformer],
    preprocessor__boolean=['drop', boolean_transformer],
    preprocessor__color=['drop', color_transformer],
    preprocessor__profile_category=['drop', profile_category_transformer],
    preprocessor__user_language=['drop', user_language_transformer],
    preprocessor__loc=['drop', location_transformer],
)
f_selection_2 = dict(
    preprocessor__loc_visib=['drop', loc_visib_transformer],
    preprocessor__personal_url=['drop', personal_url_transformer],
    preprocessor__time=['drop', time_transformer],
    preprocessor__utc=['drop', utc_transformer],
    preprocessor__avg=['drop', average_transformer],
    preprocessor__prod=['drop', product_transformer],
    preprocessor__ratio=['drop', ratio_transformer],
)
# RESULTS
# The best performing models contained all features in both cases.

In [None]:
# Generic grid-search runner: pass the right param_grid (one of the dicts or
# lists of dicts defined in the cells above) before executing this cell.
# Scores are negated RMSLE (see `score`), so the best score is the one closest to 0.
grid_search = GridSearchCV(pipe, param_grid=imputer, scoring=score, n_jobs=-1, cv=3, verbose=2)
_ = grid_search.fit(X_train, y_train)
print("Best score:", grid_search.best_score_)
print("Best params:", grid_search.best_params_)

### Bayesian optimization
Hyperopt's job is to find the best value of a scalar-valued, possibly-stochastic function over a set of possible arguments to that function. Whereas many optimization packages will assume that these inputs are drawn from a vector space, Hyperopt is different in that it encourages you to describe your search space in more detail. By providing more information about where your function is defined, and where you think the best values are, you allow algorithms in hyperopt to search more efficiently.

- [Parameter expressions](https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions)
- [Tutorial](https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb)

In [None]:
!pip install hyperopt

In [None]:
import hyperopt
from hyperopt import hp

In [None]:
# Search space for kernel, coef0, degree, C and epsilon of the inner regressor.
# Every value is wrapped in a one-element list because `objective` feeds the
# sampled dict to GridSearchCV as a param_grid.
# The 'regressor__regressor__*' keys address the estimator wrapped by the
# TransformedTargetRegressor, so that estimator must expose these parameters.
svr_space = {
    'regressor__regressor__kernel': [
        hp.choice('regressor__regressor__kernel',
                  ['rbf', 'sigmoid', 'linear', 'poly'])],
    'regressor__regressor__coef0': [
        hp.lognormal('regressor__regressor__coef0', 0, 1)],
    'regressor__regressor__degree': [
        hp.quniform('regressor__regressor__degree', 0, 5, 1)],
    'regressor__regressor__C': [
        hp.lognormal('regressor__regressor__C', 0, 1)],
    'regressor__regressor__epsilon': [
        hp.lognormal('regressor__regressor__epsilon', 0, 1)],
}
# RESULTS
# best loss: 1.7437062778333687
# {'regressor__regressor__C': (0.35392641089361354,),
#  'regressor__regressor__degree': (3.0,),
#  'regressor__regressor__kernel': ('rbf',)}

In [None]:
# Search space for loss, n_estimators, learning_rate and base-estimator depth
# of the inner regressor. Values are one-element lists (GridSearchCV param_grid
# format); quniform draws are floats.
ada_space = {
    'regressor__regressor__loss': [
        hp.choice('regressor__regressor__loss', ['linear', 'square', 'exponential'])],
    'regressor__regressor__n_estimators': [
        hp.quniform('regressor__regressor__n_estimators', 50, 800, 1)],
    'regressor__regressor__learning_rate': [
        hp.lognormal('regressor__regressor__learning_rate', 0, 1)],
    'regressor__regressor__base_estimator__max_depth': [
        hp.quniform('regressor__regressor__base_estimator__max_depth', 2, 20, 1)],
}
# RESULTS
# best loss: 1.758543277550394
# {'regressor__regressor__base_estimator__max_depth': (17.0,),
#  'regressor__regressor__learning_rate': (2.060304100430177,),
#  'regressor__regressor__loss': ('exponential',),
#  'regressor__regressor__n_estimators': (724.0,)}

In [None]:
# Search space for alpha and solver of the inner regressor
# (values are one-element lists, the GridSearchCV param_grid format).
ridge_space = {
    'regressor__regressor__alpha': [
        hp.lognormal('regressor__regressor__alpha', 0, 1)],
    'regressor__regressor__solver': [
        hp.choice('regressor__regressor__solver',
                  ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])],
}
# RESULTS
# best loss: 1.9026832250411743

In [None]:
# Search space for alpha of the inner regressor (eps is left out of the search).
lassolars_space = {
    'regressor__regressor__alpha': [
        hp.lognormal('regressor__regressor__alpha', 0, 1)],
    # 'regressor__regressor__eps': [hp.lognormal('regressor__regressor__eps', 0, 1)],
}
# RESULTS
# best loss: 1.902702357590335

In [None]:
# Search space for nu and C of the inner regressor plus n_quantiles of the
# target transformer. n_quantiles is sampled as a float and cast to int by
# `objective`.
nusvr_space = {
    'regressor__regressor__nu': [
        hp.uniform('regressor__regressor__nu', 0.0001, 1)],
    'regressor__regressor__C': [
        hp.lognormal('regressor__regressor__C', 0, 1)],
    'regressor__transformer__n_quantiles': [
        hp.loguniform('regressor__transformer__n_quantiles', 1, 8)],
}
# RESULTS
# best loss: 1.7244565055353362
# {'regressor__regressor__C': (0.6349626324485043,),
#  'regressor__regressor__nu': (0.71895745494701,),
#  'regressor__transformer__n_quantiles': (11.721076083470386,)}

In [None]:
# Search space for n_neighbors and weights of the inner regressor
# (metric and p are left out of the search).
knn_space = {
    # 'regressor__regressor__metric': [hp.choice('regressor__regressor__metric', ['cosine', 'l1', 'l2'])],
    'regressor__regressor__n_neighbors': [
        hp.quniform('regressor__regressor__n_neighbors', 1, 100, 1)],
    # 'regressor__regressor__p': 2,
    'regressor__regressor__weights': [
        hp.choice('regressor__regressor__weights', ['uniform', 'distance'])],
}
# RESULTS
# best loss: 1.8704194083689019
# {'regressor__regressor__n_neighbors': (43.0,),
#  'regressor__regressor__weights': ('distance',)}

In [None]:
# Search space for the ColumnTransformer's transformer_weights: one uniform
# weight in [0, 2] per listed transformer, labelled with the transformer name.
weighted_transformers = ['numeric_exp', 'numeric_norm', 'boolean',
                         'profile_category', 'color', 'user_language',
                         'loc', 'loc_visib', 'personal_url', 'time',
                         'utc', 'ratio', 'avg', 'prod']
weights_space = {
    'preprocessor__transformer_weights': [
        {name: hp.uniform(name, 0, 2) for name in weighted_transformers}],
}

# RESULTS
# best loss: 1.7295401465311135
# {'preprocessor__transformer_weights': ({
#    'avg': 0.7429247838555282,
#    'boolean': 1.0411784050247315,
#    'color': 1.3113898616879212,
#    'loc': 0.9317583766578823,
#    'loc_visib': 1.703423501322376,
#    'numeric_exp': 1.028338292619148,
#    'numeric_norm': 0.08119173992748974,
#    'personal_url': 1.8144143269129127,
#    'prod': 1.5566389246824337,
#    'profile_category': 1.091936715275474,
#    'ratio': 1.4571284181542004,
#    'time': 0.6867805554464619,
#    'user_language': 0.9337466050764849,
#    'utc': 0.7839861950053606},)}

In [None]:
# Run after each change in the pipeline
def objective(params, X=X_train, y=y_train, pipe=pipe, score=score, cv=3):
    '''
    A "black box" function calculating a score (=the loss to minimize) given
    a set of hyper parameters, using GridSearchCV with cv=3 by default.

    params: dict mapping pipeline parameter names to one-element sequences,
        as sampled by hyperopt from one of the search spaces.
    X, y, pipe, score: default to the notebook globals; the defaults are bound
        when this function is defined, so re-run this cell after changing them.
    Returns a dict with the 'loss' (negated best CV score, i.e. the RMSLE),
    the 'params' actually evaluated and hyperopt's STATUS_OK.

    It casts some floats to ints (based on a hardcoded list of hyper parameter
    names), because hyperopt's quniform/loguniform draws are floats.
    '''
    integer_params = (
        'regressor__regressor__degree',
        'regressor__regressor__n_estimators',
        'regressor__regressor__n_neighbors',
        'regressor__regressor__base_estimator__max_depth',
        'dimensions__k',
        'regressor__transformer__n_quantiles',
    )
    # Work on a shallow copy so the caller's dict is not modified.
    params = dict(params)
    for param in integer_params:
        if param in params:
            params[param] = [int(params[param][0])]

    grid_search = GridSearchCV(pipe, params, scoring=score, cv=cv, n_jobs=-1)
    _ = grid_search.fit(X, y)
    # `score` is a negated loss, so flip the sign back for hyperopt to minimize.
    loss = -grid_search.best_score_
    return {'loss': loss, 'params': params, 'status': hyperopt.STATUS_OK}

In [None]:
# Pass the appropriate space before running.
# Runs 50 TPE-guided evaluations of `objective`; every trial is recorded in
# `bayes_trials` and the best sampled point is returned in `best`.
bayes_trials = hyperopt.Trials()
best = hyperopt.fmin(fn=objective, space=nusvr_space, algo=hyperopt.tpe.suggest,
                     max_evals=50, trials=bayes_trials)

In [None]:
# See the best params (pass the appropriate space): the space given here must be
# the same one that was passed to fmin above, since `best` is evaluated against it.
hyperopt.space_eval(nusvr_space, best)