# Load modules and define custom functions 

In [20]:
import pandas as pd
import numpy as np
import os.path
import joblib
from decimal import Decimal
from IPython.core.debugger import set_trace
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.compose import make_column_transformer, ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, \
GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from category_encoders.hashing import HashingEncoder 

def get_list_features(my_string):
    """Return the feature names selected by a feature-set label.

    Reads the module-level sequences ``num_features``, ``ord_features`` and
    ``cat_features`` (presumably lists of column names -- they are combined
    with ``+``).

    Parameters
    ----------
    my_string : str
        One of 'num_features', 'num_ord_features' or 'num_ord_cat_features'.

    Returns
    -------
    ``num_features`` itself for 'num_features'; otherwise the concatenation
    of the numeric, ordinal and (for 'num_ord_cat_features') categorical
    feature sequences, in that order.

    Raises
    ------
    ValueError
        If ``my_string`` is not one of the three labels. Failing here gives a
        clear message instead of handing ``None`` to the caller.
    """
    if my_string == 'num_features':
        return num_features
    if my_string == 'num_ord_features':
        return num_features + ord_features
    if my_string == 'num_ord_cat_features':
        return num_features + ord_features + cat_features
    raise ValueError(
        "String incorrectly formatted: {!r}. Expected 'num_features', "
        "'num_ord_features' or 'num_ord_cat_features'.".format(my_string)
    )

def get_train_test_sets(X, set_features, random_state = 1):
    """Split the data 80/20 and keep only the columns of one feature set.

    The target is the module-level ``y``; it is not passed as an argument.

    Parameters
    ----------
    X : indexable by a list of column names (presumably a pandas DataFrame).
    set_features : str
        Feature-set label understood by ``get_list_features``.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where both ``X`` parts are
        restricted to the selected columns.
    """
    selected_cols = get_list_features(set_features)
    # 20% of the rows are held out as the test set.
    parts = train_test_split(X, y, test_size = 0.2, random_state = random_state)
    X_tr, X_te, y_tr, y_te = parts
    return X_tr[selected_cols], X_te[selected_cols], y_tr, y_te

def build_col_trans(set_features):
    """Build the imputing ``ColumnTransformer`` for a feature-set label.

    Reads the module-level ``num_features``, ``ord_features`` and
    ``cat_features`` column lists.

    Parameters
    ----------
    set_features : str
        One of 'num_features', 'num_ord_features' or 'num_ord_cat_features'.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with one ``SimpleImputer`` per feature group
        included in the requested set (numeric, then ordinal, then
        categorical).

    Raises
    ------
    ValueError
        If ``set_features`` is not one of the three labels (previously this
        surfaced as an ``UnboundLocalError`` on the return statement).
    """
    valid_sets = ('num_features', 'num_ord_features', 'num_ord_cat_features')
    if set_features not in valid_sets:
        raise ValueError(
            "Unknown feature set {!r}; expected one of {}.".format(set_features, valid_sets)
        )

    # Every valid set contains the numeric columns.
    transformers = [('imp_num_cols', SimpleImputer(), num_features)]

    if set_features != 'num_features':
        transformers.append(('imp_ord_cols', SimpleImputer(), ord_features))

    if set_features == 'num_ord_cat_features':
        # NOTE(review): SimpleImputer only uses ``fill_value`` when
        # strategy='constant'; with the default strategy it is ignored.
        # Confirm whether strategy='constant' was intended here.
        transformers.append(
            ('imp_cat_cols', SimpleImputer(fill_value = 'missing_value'), cat_features)
        )

    return ColumnTransformer(transformers)

def load_run_save_GSRSCV(params, key, search_type = 'GS', n_iter = 10000):
    """Run a hyper-parameter search (or reload a cached one) and report scores.

    The fitted search object is cached with joblib under ``save_pkl/``. The
    cache name is built from ``key``, ``search_type``, ``n_iter`` (randomized
    search only) and the module-level ``set_features`` / ``skipPosition``.
    If the cache file exists it is loaded instead of running the search.

    Besides its arguments the function reads the module-level ``ttr``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``set_features`` and
    ``skipPosition``, and writes the formatted scores into the module-level
    dicts ``test_scores`` and ``best_CV_scores`` under the cache name
    (without directory and extension).

    Parameters
    ----------
    params : dict
        Passed as ``param_grid`` (GS) or ``param_distributions`` (RS).
    key : str
        Label of the model; becomes the first part of the cache name.
    search_type : str
        'GS' for ``GridSearchCV`` or 'RS' for ``RandomizedSearchCV``.
    n_iter : int
        Number of sampled candidates; only used when ``search_type == 'RS'``.

    Returns
    -------
    The fitted (or loaded) search object.

    Raises
    ------
    ValueError
        If no cache file exists and ``search_type`` is neither 'GS' nor 'RS'.
    """
    # Build the cache name once; it doubles as the key of the score dicts.
    stem = key + '_' + search_type + '_'
    if search_type == 'RS':
        stem = stem + 'n_iter' + str(n_iter) + '_'
    stem = stem + set_features
    if skipPosition:
        stem = stem + '_skipPosition'
    filename = 'save_pkl/' + stem + '.pkl'

    if os.path.isfile(filename):
        # NOTE: joblib.load unpickles the file; only load caches you created.
        my_s = joblib.load(filename)
    else:
        if search_type == 'GS':
            my_s = GridSearchCV(ttr, param_grid = params, cv = 5,
                                scoring = 'neg_root_mean_squared_error',
                                n_jobs = -1, verbose = 10)
        elif search_type == 'RS':
            my_s = RandomizedSearchCV(ttr, param_distributions = params, n_iter = n_iter,
                                      cv = 5, scoring = 'neg_root_mean_squared_error',
                                      n_jobs = -1, verbose = 10, random_state = 1)
        else:
            raise ValueError(
                "Unknown search_type {!r}; expected 'GS' or 'RS'.".format(search_type)
            )
        # Create the cache directory before the (potentially long) search so
        # a missing folder cannot make the final dump fail.
        os.makedirs(os.path.dirname(filename), exist_ok = True)
        my_s = my_s.fit(X_train, y_train)
        joblib.dump(my_s, filename)

    # Refit the best estimator on the current training data before scoring;
    # this also applies when the search object came from the cache.
    best_est = my_s.best_estimator_
    best_est.fit(X_train, y_train)
    y_pred = best_est.predict(X_test)
    test_score = format(np.sqrt(mean_squared_error(y_test, y_pred)), 'E')
    # The scorer is the negated RMSE, so flip the sign back for reporting.
    best_CV_score = format(-my_s.best_score_, 'E')

    test_scores[stem] = test_score
    best_CV_scores[stem] = best_CV_score

    print('Type of search:', search_type)
    print(my_s.best_params_)
    print('Best CV score:', best_CV_score)
    print('Test score:', test_score)
    print('#################################################################################################')
    return my_s
    
def print_key_value_sorted(key_value):
    """Print the (key, value) pairs of a mapping, one pair per line.

    Pairs are ordered by the value converted to ``float`` (ascending), with
    ties broken by the key.

    Parameters
    ----------
    key_value : dict
        Mapping whose values can be converted with ``float``.
    """
    def _by_score_then_name(pair):
        name, score = pair
        return float(score), name

    ordered_pairs = sorted(key_value.items(), key = _by_score_then_name)
    print('\n'.join(str(pair) for pair in ordered_pairs))