In [34]:
# download data (-q is the quiet mode)
! wget -q https://github.com/CISC-372/Notebook/releases/download/a1/test.csv -O test.csv
! wget -q https://github.com/CISC-372/Notebook/releases/download/a1/train.csv -O train.csv
# you can tuning the model (search for the best hyper-parameters setting) automatically if we have a narrow range of hyper-parameter to be searched for


In [None]:
import pandas as pd
# The dataset contain rental house information, where each data sample (data row) represents a rental post
  
# we can do data pre-processing with pandas or build it into the pieline for hyper-parameter tuning 
Xy_train = pd.read_csv('train.csv', engine='python')
X_train = Xy_train.drop(columns=['price_rating'])
y_train = Xy_train[['price_rating']]

print('training', len(X_train))
#Xy_train.price_rating.hist()

X_test = pd.read_csv('test.csv', engine='python')
testing_ids = X_test.Id
print('testing', len(X_test))

# Note: The pre-processing steps are split, below are some basic pre-processing done through Pandas dataframe 

# In the original training and testing datasets, the attribute 'deposit' and 'extra_people' has '$' as prefix, ',' as infix , and 'host_response_rate' has '%' as suffix
# so we remove the '$', ',' , '%' and convert those string integers into float values 
X_train['security_deposit'] = Xy_train['security_deposit'].replace({'\$': '', ',': ''}, regex=True).astype(float)
X_train['extra_people'] = Xy_train['extra_people'].replace({'\$': '', ',': ''}, regex=True).astype(float)
X_train['host_response_rate'] = Xy_train['host_response_rate'].replace({'%': ''}, regex=True).astype(float) / 100 # divided by 100 to transform from '%' representation to ordinary numeric representation

# The attribute 'deposit', 'extra_people', and 'host_response_rate' values in X_test must also be converted into float type values
# So that our model can be applied to the testing set
X_test['security_deposit'] = X_test['security_deposit'].replace({'\$': '', ',': ''}, regex=True).astype(float)
X_test['extra_people'] = X_test['extra_people'].replace({'\$': '', ',': ''}, regex=True).astype(float)
X_test['host_response_rate'] = X_test['host_response_rate'].replace({'%': ''}, regex=True).astype(float) / 100

# For time-series attribute 'host_since' and 'last_scraped', we can combine them to create a new numeric feature to be added into the feature space
training_days_active = pd.to_datetime(Xy_train['last_scraped']) - pd.to_datetime(Xy_train['host_since']) # This is a pandas Seres about the number of days that a host has been on the platform
testing_days_active = pd.to_datetime(X_test['last_scraped']) - pd.to_datetime(X_test['host_since'])
# Create a new numeric feature, named 'host_days_active', based on the 'host_since' and 'last_scraped'
X_train['host_days_active'] = training_days_active.astype('timedelta64[D]')
X_test['host_days_active'] = testing_days_active.astype('timedelta64[D]')

# For time-series attribute 'first_review' and 'last_review', we can combine them to create a new numeric feature to be added into the feature space
training_review_active = pd.to_datetime(Xy_train['last_review']) - pd.to_datetime(Xy_train['first_review']) # This is a pandas Seres about the number of days that reviews are being written for the rental listings
testing_review_active = pd.to_datetime(X_test['last_review']) - pd.to_datetime(X_test['first_review'])
# Create a new numeric feature, named 'review_active', based on the 'first_review' and 'last_review'
X_train['review_active'] = training_review_active.astype('timedelta64[D]')
X_test['review_active'] = testing_review_active.astype('timedelta64[D]')


# Manual tuning approach: 
1. Split the training set into 2 subsets (as the training set have the known target attribute values), one subset for training/building the model, and the another one for evaluating the model as the 1st validation set
2. Then, based on the model's performance on the 1st validation set, we can do hyper-parameter tuning to adjust the model
3. Once a new model with good hyper-parameter settings is gained, we test the model on the entire training set (put the 2 subsets back togeter) namely adjusting the model again based on its performance on Xy_train (but we should not change the hyper-parameter settings as we have already optimized its)
4. Then, we can apply our model to the 2nd validation set (testing set in public leaderboard)

# Semi-auto tuning approach: (using hold-out method)
 1. split Xy_train into training set and 1st validation set
 2. pick a range of hyper-parameters (ex: regularization, learning rate,etc)
 2. training set -> build all the models based on the hyper-parameter range we choose
 3. validation set -> evaluate all the models we gained in step 3 -> adjust the hyper-parameter ranges and change the model (go back step 2 if needed)
 4. train a new model using the chosen hyper-parameters on Xy_train, and evaluate on X_test

# Semi-auto tuning approach: (using cross-validation method)
 1. pick a range of hyper-parameters (ex: pre-processing, data selections, regularization, learning rate,etc)
 2. train/evaluate models using CV on the training set, Xy_train
 3. Based on the results of CV, we adjust the hyper-parameter ranges or change the model (go back step 2 if needed)
 4. train a new model using the chosen/ideal hyper-parameters on Xy_train, and evaluate on X_test
 

In [None]:
# model training and tuning
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder,OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
#from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


np.random.seed(0) # set the randomizer to default, so we can get the same sequence of data rows in each fold of CV every time we re-run the code
# so we can better analyze the performance of the model as the impact brought by the randomization in CV is removed

# select needed data attributes for classification purpose (as the target attribute is categorical data [0,1,2])
# and we can select different pre-processing techniques for different types of attributes (numeric vs categorical)
# so we need to treat different types of attributes separately

# increase model performance by selecting more attributes 
numeric_features = ['bedrooms', 'review_scores_location','host_total_listings_count', 'availability_60','accommodates', 'beds', 'bathrooms',
                    'availability_90','guests_included', 'minimum_nights','maximum_nights', 'review_scores_rating',
                    'reviews_per_month','availability_365','availability_30','review_scores_accuracy','review_scores_value', 
                    'review_scores_cleanliness','security_deposit','extra_people','review_scores_communication', 
                    'review_scores_checkin', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
                   'number_of_reviews_ltm', 'host_response_rate',
                   'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count', 'number_of_reviews','calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
                    'minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights', 
                    # 'host_days_active','review_active', Note: these two attributes are commented out as adding them into the training set reduces the model's performance
                   ] # select needed numeric features from the training dataset

# Define a transformer/pre-processor for numeric attributes
# pipeline() means we can have different steps in pre-processing, and in each step, we can transform the features (we can have as many steps as we want)
# Also, we need to give a name to each step in pipeline as we need to specify the range of hyper-parameters by specifying which step we want to adjust later
numeric_transformer = Pipeline(steps=[
    ('iterative_imputer', IterativeImputer(max_iter=10, random_state=0)),
    #    ('imputer', SimpleImputer(strategy='median')), # the first step is called 'imputer', which replaces the missing value with the median attribute value, but it did not yield a better performance than 'iterative_imputer'
    ('scaler', StandardScaler())]) # the second step is called 'scaler', which transfers each entry in numeric data columns to have zero mean and unit variance with respect to the column it is in

# select categorical features
categorical_features = [
  'property_type', 'is_business_travel_ready', 'room_type', 'bed_type', 'is_location_exact','host_identity_verified',
  'host_response_time','require_guest_profile_picture','require_guest_phone_verification','has_availability', 
  'cancellation_policy','host_is_superhost','instant_bookable', 
    'calendar_updated', 'requires_license', 
    
] 
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # replace all the missing values with the constant string 'missing' 
    # note the step names in different pipelines can be the same
#    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)) # try ordinal encoder
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# ex of onehot encoder: if 'property_type' have value = [apt,house,room] 
# After encoding, we will get three different features with each having the value [0,1]:
# property_type_apt = [0,1], so 1 stands for the observation having 'apt' at property_type, 0 for other property_type
# property_type_house = [0,1]
# property_type_room = [0,1]

# ColumnTransformer transforms each of the selected data column (it will transform all the selected attributes in training and testing sets)
preprocessor = ColumnTransformer(  # apply categorical_transformer to categorical_features, and apply numeric_transformer to numeric_features
    transformers=[    # define list of transformer we want  
        ('num', numeric_transformer, numeric_features), # give a name 'num' to the numeric_transformer to pre-process the numeric_features
        ('cat', categorical_transformer, categorical_features)]) # give a name 'cat' to the categorical_transformer to pre-process the categorical_features
# note that each transformer requires an input list of features, and
# we have 2 different transformers because we have different pre-processing techniques for different types of attributes (categorical vs numeric)
# and you can have as many transformers as you want in the ColumnTransformer()

# define the whole pipeline of building/training the classifier/model by combining the pre-processor to the model building process
regr = Pipeline(steps=[('preprocessor', preprocessor), # the first step is do the pre-processing with using the pre-processor we defined above, and the pre-processor will process different features based on which type group it is in with using the corresponding transformer
                      #('standardscaler', StandardScaler(copy=False, with_mean=False)), # standardization on the training set, not needed as they are in the preprocessor already 
                       ('normalizer', Normalizer()), # the second step is to integrate regularization into the model building process to avoid overfitting 
                       ('classifier', LogisticRegression(random_state=123, multi_class='multinomial'))]) # The third step is choosing Logistic Regression model for the classification problem                    


# Feature selection: select the needed non-target attributes from the updated training and testing sets
X_train = X_train[[*numeric_features, *categorical_features]] # [*numeric_features, *categorical_features] merges two independent lists into one, instead of merging into a list of lists
X_test = X_test[[*numeric_features, *categorical_features]]

# `__` denotes attribute of the previous ONE term/name
# (e.g. regressor__n_estimators means the `n_estimators` parameter for `classifier`
#  which is our xgb)

# try RandomSearchCV
distributions = { # set the range of (hyper-)parameters we want to search,
    'preprocessor__num__iterative_imputer__max_iter': range(10,20), # The default value of 'max_iter' is 10, and there is no need to extend further the range(10,20) as the best setting for the 'max_iter' is always below 18
#    'preprocessor__num__imputer__strategy': ['mean','median','most_frequent'], # search the optimal hyper-parameter setting for the pre-processor 'imputer', but it did not yield a better model performance than 'iterative_imputer'
    'classifier__max_iter': range(200,400), # defautl value is 100, we set the range to be (200,400) as training longer can avoid underfitting
    'classifier__solver': ['newton-cg','lbfgs', 'sag', 'saga' ], # search which algorithm is the best to be used in the optimization problem
    'classifier__tol': [1e-4,1e-5,], # note that '1e-5' tends to not converge when the 'max_iter' is reached
    'classifier__class_weight': ['balanced',None], # the default value is None, but it turns out that 'balanced' is the best setting for this parameter in most of situations  
    'normalizer__norm': ['l1','l2','max'] # 'max' turns out to be the best settting for the regularization strategy
    
    #'classifier__penalty': ['l2','none','elasticnet'] #see if regularization needed, the default value is 'l2', and it turns out tuning this parameter is redundant as we already have a normalizer() in the search space    
} 

# Adjustment log (previous records/logs are lost due to some data storage problem):
# 9th tuning: remove Normalizer() from regr to see if it is redundant
# result: Performance decreases dramatically (need to add regularization back), and 'balanced' should be the best parameter setting for 'class_weight'

# 10th tuning: add Normalizer back, extend 'max_iter' to range(200,400)
# result: Performance did not improve, may need to change the model architecture or remove some attributes from the training set

# fit the model on the full training dataset with using CV, namely the step 4 in the Semi-auto tuning apporoach
random_search_log = RandomizedSearchCV(regr, distributions, n_iter=40,random_state=0,scoring = 'f1_micro', # f1 with 'micro-averaging' in a multiclass setting is chosen, and will return the total ratio of tp/(tp + fp)   
                                       n_jobs = -1, cv=5, verbose=1 )
# n_jobs = -1 means using all the CPU processors, random_state=0 to ensure we get the same result/performance each time we run this cell of code
random_search_log.fit(X_train, y_train)
print('best score {}'.format(random_search_log.best_score_))

In [11]:
# Get feedbacks, see what are the best (hyper-)parameter settings in the search space we specified above
random_search_log.best_params_

{'preprocessor__num__iterative_imputer__max_iter': 18,
 'normalizer__norm': 'l2',
 'classifier__tol': 1e-05,
 'classifier__solver': 'sag',
 'classifier__max_iter': 239,
 'classifier__class_weight': None}

In [28]:
# Get feedbacks, determine the performance of the model on the training set, and the evaluation metric is set to be 'accuracy' (as we want to check the model's accuracy first)
from sklearn.model_selection import cross_validate
scoring = ['accuracy'] 
scores = cross_validate(random_search_log.best_estimator_, X_train, y_train, scoring=scoring, cv=5, n_jobs=-1) 

In [29]:
scores # print the scores

{'fit_time': array([9.49279404, 6.90370774, 9.02439189, 9.41852403, 3.88465691]),
 'score_time': array([0.03730392, 0.05217409, 0.06174397, 0.03809118, 0.03446078]),
 'test_f1_micro': array([0.69679109, 0.71952818, 0.72083879, 0.7129751 , 0.70642202])}

In [33]:
# Prediction & generating the submission file
y_pred = random_search_log.predict(X_test) # generate the predictions on the testing set with using the model/classifier we trained/tuned above
pd.DataFrame( #construct a dataframe with the 'Id' value of the observations and the predictions of testing 'price_rating' values we gained above, then export the dataframe as a CSV file that can be submitted to the Kaggle leaderboard
    {'Id': testing_ids, 'price_rating':y_pred}).to_csv('sample_submission.csv', index=False)


# Define another pipeline of building/training a XGB classifier/model with using RandomSearchCV

In [14]:


regr = Pipeline(steps=[('preprocessor', preprocessor), # the first step is do the pre-processing with using the pre-processor we defined above, and the pre-processor will process different features based on which type group it is in with using the corresponding transformer
#                     ('normalizer', Normalizer()), # the second step is to integrate regularization into the model building process to avoid overfitting 
                       ('classifier', XGBClassifier( # the third step is choosing the model architecture we will use
                         seed=1, ))]) # set the number of classes, num_class=3
param_random = { # the range of hyper-parameters we want to search
    'preprocessor__num__iterative_imputer__max_iter': range(10,20),# The default value of 'max_iter' is 10, 
 #   'preprocessor__num__imputer__strategy': ['mean'],#'median','most_frequent'], # search the optimal hyper-parameter setting for the pre-processor 'imputer', but it failed to yield a better model performance than 'iterative_imputer'
 #   'preprocessor__cat__onehot__drop': ['first','if_binary',None], #Tuning the categorical pre-processor, the default value is 'None', and it turns out that 'None' is the one yielding the best model performance in most of the situations
    'classifier__objective': ['multi:softmax', 'multi:softprob'],#'rank:pairwise','rank:map' ], tuning the objective function, but it turns out 'multi:softmax' is the best parameter setting in most of the cases
    #'classifier__eval_metric': ['merror','map','mlogloss','aucpr'], # Tuning this hyperparameter does not affect the performance of the model at all
    'classifier__max_depth': range(6,30), # The default value is 6, so we make it more flexible by extending the max_depth from the default value 6 to 30
    'classifier__n_estimators': range(200,400), # The default value is 100, so we make it more flexible by extending the number of estimators from the default value 100 to the range(200,400)
    'classifier__colsample_bynode': np.arange(0.0, 1.1, 0.1), # the ranges of colsample_bynode, colsample_bylevel, and colsample_bytree are all (0,1], so we use np.arange() rather than range()
    'classifier__colsample_bytree': np.arange(0.0, 1.1, 0.1), # np.arange(0.0, 1.1, 0.1) gives a array of float values from 0.0 to 1.0 with incrementing each element by 0.1 
    'classifier__colsample_bylevel': np.arange(0.0, 1.1, 0.1), 
    # try np.arange(0.0,1.0,0.05) to see if it give better performance than (0.0, 1.1, 0.1), but it turns out that (0.0,1.0,0.05) does not give a better performance  
#    'classifier__colsample_bynode': np.arange(0.0,1.0,0.05), # the ranges of colsample_bynode, colsample_bylevel, and colsample_bytree are all (0,1], so we use np.arange() rather than range()
#    'classifier__colsample_bytree': np.arange(0.0,1.0,0.05), # np.arange(0.0,1.05,0.05) gives a array of float values from 0.0 to 1.0 with incrementing each element by 0.05 
#    'classifier__colsample_bylevel': np.arange(0.0,1.0,0.05),
#    'classifier__booster':['gbtree', 'gblinear', 'dart'], # check which booster performs better, and it turns out that the default 'gbtree' is always better than the other two boosters
    'classifier__min_child_weight': range(0,10), # the larger the min_child_weight(default value is 1) and max_delta_step(default value is 0) values are, the more conservative the algorithm will be
#    'classifier__max_delta_step': range(0,10), # However, tuning the 'max_delta_step' tends to deteriorate the model's performance
    'classifier__eta': np.arange(0.01,0.2,0.01), # adjust the learning rate, but the performance of the model is worse when we tuning the learning rate in a range that is above the default value 0.3, such as (0.4,1.0,0.1)
#    'classifier__scale_pos_weight': range(1,10), # we adjust the balance of positive and negative weights, the default value is 1, but tuning this parameter is not recommended by the system, and it did not improve the model's performance too
    'classifier__gamma': range(0,10), # default value of gamma is 0, and the larger gamma is, the more conservative the algorithm will be
#    'classifier__tree_method': ['auto','hist'], # the default value is 'auto', and it turns out that the default value will yield the best performance of the model in most of the situations 
#    'normalizer__norm': ['max'], # ,'l1','l2'], 'max' turns out to be the best settting for the regularization strategy, however, it later turns out that tuning the model's regularization parameter will be better than tuning this Normalizer in the model's pipeline
    'classifier__lambda': range(1,6), # L2 regularization term on weights, the default value is 1, and there is no need to extend further the range(1,6) as the best setting for the 'lambda' is always below 3
    'classifier__alpha': range(0,6), # L1 regularization term on weights, the default value is 0, and there is no need to extend further the range(0,6) as the best setting for the 'alpha' is always below 3
}
# Adjustment log (previous records/logs are lost due to some data storage problem): 
# 24th tuning: remove 'l1','l2' from normalizer__norm, remove classifier__colsample_bylevel, replace 'exact' with 'hist' in 'tree_method' to see if performance increase
# result: performance (accuracy score) did not improve (may because we did not tune 'colsample_bylevel'), and 'auto' is still the best setting for 'tree_method' parameter

# 25th tuning: put back classifier__colsample_bylevel for tuning, change 'eta' tuning range from (0.1,0.3,0.1) to (0.01,0.2,0.01)
# result: performance in both training set and the validation set improve to 73.09%

# 26th tuning: Same parameter setting with 25th tuning, but add 'host_days_active' and 'review_active' attributes into the feature space
# result: performance decreases

# 27th tuning: change 'eta' from (0.01,0.2,0.01) back to (0.1,0.3,0.1)
# result: performance did not improve (may need to remove the added 'host_days_active' and 'review_active' attributes)

# 28th: remove Normalizer() parameter and 'host_days_active' and 'review_active' attributes
# result: Performances in both training set and the validation set improve to 73.5%

# 29th: Tuning the model's regularization parameter 'lambda' and 'alpha'
# result: Performance on both training set and the validation set improve to 73.6% (The end of Kaggle competition)

# Originally, I used GridSearchCV() for XGBooster model training, but it takes 50 mins to train and can only search a few (hyper-)parameters at one run, so I shifted to randomSearchCV
random_search = RandomizedSearchCV( # pass the model pipeline and the ranges of (hyper-)parameters we want to search as arguments to RandomizedSearchCV()
    regr, param_random, cv=5, verbose=1, n_jobs=-1,  # cv=5 means we have 5 folds for the CV, n_jobs = -1 means using all CPU processors
    n_iter=35,random_state=1,  # 'n_iter'=35 means that 35 parameter settings are randomly sampled, and so we will have 35 models that will go through the 5-fold cross-validation
    scoring='f1_micro') 

random_search.fit(X_train, y_train)
print('best score {}'.format(random_search.best_score_))

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 175 out of 175 | elapsed: 33.6min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


best score 0.7307058358030764


In [15]:
# Get feedbacks, see what are the best (hyper-)parameter settings in the search space we specified above
random_search.best_params_

{'preprocessor__num__iterative_imputer__max_iter': 14,
 'classifier__objective': 'multi:softmax',
 'classifier__n_estimators': 307,
 'classifier__min_child_weight': 3,
 'classifier__max_depth': 8,
 'classifier__lambda': 1,
 'classifier__gamma': 4,
 'classifier__eta': 0.19,
 'classifier__colsample_bytree': 0.7000000000000001,
 'classifier__colsample_bynode': 1.0,
 'classifier__colsample_bylevel': 1.0,
 'classifier__alpha': 2}

In [16]:
# Get feedbacks, determine the performance of the model on the training set, and the evaluation metric is set to be 'accuracy' 
scoring = ['accuracy']  
scores = cross_validate(random_search.best_estimator_, X_train, y_train, scoring=scoring, cv=5, n_jobs=-1) 
scores

{'fit_time': array([116.01877117,  85.75852799, 116.25719094, 116.19856882,
         55.20052505]),
 'score_time': array([0.08368492, 0.16831899, 0.08954906, 0.08655715, 0.04163504]),
 'test_accuracy': array([0.71447282, 0.73328965, 0.73984273, 0.73525557, 0.73066841])}

In [21]:
# Prediction & generating the submission file
y_pred = random_search.best_estimator_.predict(X_test) # generate the predictions on the testing set with using the model/classifier we trained/tuned above
pd.DataFrame( #construct a dataframe with the 'Id' value of the observations and the predictions of testing 'price_rating' values we gained above, then export the dataframe as a CSV file that can be submitted to the Kaggle leaderboard
    {'Id': testing_ids, 'price_rating':y_pred}).to_csv('sample_submission.csv', index=False)


# Define another pipeline of building/training a XGB classifier/model with using GridSearchCV for comparison purpose


In [826]:

regr = Pipeline(steps=[('preprocessor', preprocessor), # the first step is do the pre-processing with using the pre-processor we defined above, and the pre-processor will process different features based on which type group it is in with using the corresponding transformer
                     ('normalizer', Normalizer()), # the second step is to integrate regularization into the model building process to avoid overfitting 
                       ('classifier', XGBClassifier( # the third step is choosing the model architecture we will use
                         seed=1, num_class=3 ))]) # set the number of classes, num_class=3
grid_para = {'preprocessor__num__iterative_imputer__max_iter':[15],
             'classifier__objective': ['multi:softmax', ],#'multi:softprob'],
             'classifier__max_depth': [6,20],
             'classifier__n_estimators': [200,300,],
             'classifier__colsample_bynode': [0.4,0.7,0.9], # the ranges of colsample_bynode, colsample_bylevel, and colsample_bytree are all (0,1], 
            'classifier__colsample_bytree': [0.4,0.7,0.9], 
#            'classifier__colsample_bylevel': [0.4,0.7,0.9], these parameters are commented out as tuning them will increase the training time significantly
#             'classifier__min_child_weight': [1,3,7],
#             'classifier__max_delta_step': [1,3,7],
#             'classifier__eta': [0.1,0.3,0.5]
            }
grid_search = GridSearchCV( 
    regr, grid_para, cv=5, verbose=3, n_jobs=-1, 
    scoring='f1')
grid_search.fit(X_train, y_train)
print('best score {}'.format(grid_search.best_score_))

# The disadvantage of gridSearch is apparent, 
# the number of (hyper-)parameters and the numbers of the optional values for the parameters we are tuning are much less than the parameters we are tuning for the RandomSearchCV at one run 
# And if we want to tuning more parameters, then the total number of fits will be much higher than that of fits required by RandomSearchCV with n_iter set reasonably
# Thus, the time taken to complete the GridSearchCV is much longer than RandomSearchCV if we want to tuning an adequate number of (hyper-)parameters
# and the performances of the model gained from both GridSearchCV and RandomSearchCV tend to be similar if sufficient amount of training time are given to both of the methods,
# while the performance gained from RandomSearchCV is likely to be better than the one gained from GridSearchCV if the training time is limited



Fitting 5 folds for each of 36 candidates, totalling 180 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
  return f(*args, **kwargs)


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


best score nan


In [827]:
scoring = ['accuracy'] # you can include as many scores as you want
scores = cross_validate(grid_search.best_estimator_, X_train, y_train, scoring=scoring, cv=5, n_jobs=-1)
scores

{'fit_time': array([29.28269911, 30.96060705, 30.16808128, 30.30623984,  9.86873031]),
 'score_time': array([0.17380285, 0.06898999, 0.11868501, 0.1181221 , 0.06486988]),
 'test_accuracy': array([0.69548134, 0.72018349, 0.73263434, 0.72804718, 0.72083879])}

# Define another pipeline of building/training a different classifier/model with using Bayesian optimization

In [None]:

from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer
from sklearn.svm import SVC

regr = Pipeline(steps=[('preprocessor', preprocessor), # the first step is do the pre-processing with using the pre-processor we defined above, and the pre-processor will process different features based on which type group it is in with using the corresponding transformer
                     ('normalizer', Normalizer()), # the second step is to integrate regularization into the model building process to avoid overfitting 
                       ('classifier', SVC( # the third step is choosing the search space (which model we will use)
                         max_iter=10000 ))])
param_grid = { # the range of hyper-parameters we want to search
   'preprocessor__num__iterative_imputer__max_iter': Integer (5,20),
    # 'preprocessor__num__imputer__strategy': ['mean','median','most_frequent'], # search the optimal hyper-parameter setting for the pre-processor
    'classifier__kernel': Categorical(['linear','poly', 'rbf', 'sigmoid','precomputed']), 
    'classifier__gamma': Categorical(['auto','scale']),
    # note that for Integer and Real, we only need to supply the lower and upper bounds (inclusive), and random values will be sampled uniformly accoridng to the range we set
    'classifier__degree': Integer(1,10), # set degree for poly kernel use only
    'classifier__coef0': Integer(0,5), #  only used in ‘poly’ and ‘sigmoid’
    'classifier__tol': Real(1e-4,1e-3),
    'classifier__decision_function_shape': Categorical(['ovr','ovo']), # default value is 'ovr',and 'ovo' cannot be used when is 'break_ties=True'
#    'classifier__break_ties': Categorical([True, False]), # This parameter can only be True when decision_function_shape='ovr', so the default value is False
    'classifier__cache_size': Integer(200,300), # Specify the size of the kernel cache size (MB), the default value is 200
    'normalizer__norm': Categorical(['l1','l2','max']),
    }

# adjustment log
# 1st Tuning Result: 72% in training set

# 2nd Tuning: Added parameter 'decision_function_shape', 'break_ties', 'cache_size' for tuning
# result: performance did not improve, 'False' is the best parameter setting for 'break_ties'

# 3rd Tuning: Remove 'break_ties',
# result: performance did not improve, may need to change the model architecture or remove/add some attributes from/to the training set, 'ovr' should be the best setting for 'decision_function_shape'

bayes_search = BayesSearchCV( # putting the model pipeline, the range of hyper-parameters we want to search, into the BayesSearchCV
    regr, param_grid, n_iter=50, n_points=10, 
    cv=5, random_state=0 ,verbose=1, n_jobs=-1, iid=True, # cv=5 means we have 5 folds for the CV, and n_jobs = 2 means number of CPU we want to use
    scoring = 'f1_micro')

bayes_search.fit(X_train, y_train)
print('best score {}'.format(bayes_search.best_score_))

In [31]:
bayes_search.best_params_

OrderedDict([('classifier__cache_size', 262),
             ('classifier__coef0', 1),
             ('classifier__decision_function_shape', 'ovr'),
             ('classifier__degree', 10),
             ('classifier__gamma', 'scale'),
             ('classifier__kernel', 'rbf'),
             ('classifier__tol', 0.00010000242082487538),
             ('normalizer__norm', 'l1'),
             ('preprocessor__num__iterative_imputer__max_iter', 6)])

In [32]:
scoring = ['accuracy'] 
scores = cross_validate(bayes_search.best_estimator_, X_train, y_train, scoring=scoring, cv=5, n_jobs=-1) 
scores

{'fit_time': array([17.98857689, 15.753829  , 17.69740081, 17.84777069,  9.77823997]),
 'score_time': array([2.3238101 , 1.81304884, 2.43443131, 2.42977118, 1.26624489]),
 'test_accuracy': array([0.69941061, 0.7293578 , 0.72346003, 0.72018349, 0.71428571])}

In [305]:
# Prediction & generating the submission file
y_pred = bayes_search.predict(X_test) # generate the predictions on the testing set with using the model/classifier we trained/tuned above
pd.DataFrame( #construct a dataframe with the 'Id' value of the observations and the predictions of testing 'price_rating' values we gained above, then export the dataframe as a CSV file that can be submitted to the Kaggle leaderboard
    {'Id': testing_ids, 'price_rating':y_pred}).to_csv('sample_submission.csv', index=False)

# Bayesian optimization is slightly better than Randomized Search in this problem, because Bayesian Search spend about 21 mins to finish the (hyper-)parameters optimization, while Randomized Search spend about 24 mins (3 mins more than Bayesian Search), and the score gained by Bayesian Search is similar to that gained by Randomized Search (both scores are around '72%')

In [22]:
# SVM with RandomizedSearchCV for comparison purpose
regr = Pipeline(steps=[('preprocessor', preprocessor), # the first step is do the pre-processing with using the pre-processor we defined above, and the pre-processor will process different features based on which type group it is in with using the corresponding transformer
                     ('normalizer', Normalizer()),
                       ('classifier', SVC( # the second step is choosing the search space (which model we will use)
                          random_state=1, max_iter=10000))])
param_svc = { # the range of hyper-parameters we want to search
   'preprocessor__num__iterative_imputer__max_iter': range(10,20),
    # 'preprocessor__num__imputer__strategy': ['mean','median','most_frequent'], # search the optimal hyper-parameter setting for the pre-processor
    'classifier__kernel':['linear','poly', 'rbf', 'sigmoid','precomputed'], 
    'classifier__gamma': ['auto','scale'],
    'classifier__degree': range(1,10), # set degree for poly kernel use only, the default value is 3
    'classifier__coef0': range(0,10), #  only used in ‘poly’ and ‘sigmoid’, default value is 0,
    'classifier__tol': [1e-4,1e-3], # default value is 1e-3
    'classifier__decision_function_shape': ['ovr','ovo'], # default value is 'ovr', and 'ovo' cannot be used when is 'break_ties=True'
#    'classifier__break_ties': [True, False], # This parameter can only be 'True' when decision_function_shape='ovr', so the default value is 'False'. However, tuning this parameter is redundant as 'False' turns out to be the best parameter setting for 'break_ties' 
    'classifier__cache_size': [200,300], # Specify the size of the kernel cache size (MB), the default value is 200
    'normalizer__norm': ['l1','l2','max'],
}
random_search_svc = RandomizedSearchCV( # pass the model pipeline and the ranges of (hyper-)parameters we want to search as arguments to RandomizedSearchCV()
    regr, param_svc, cv=5, verbose=3, n_jobs=-1,  # cv=5 means we have 5 folds for the CV, n_jobs = -1 means using all CPU processors
    n_iter=35,random_state=1,  # 'n_iter'=35 means that 35 parameter settings are randomly sampled, and so we have 35 models that will go through the 5-fold cross-validation
    scoring='f1_micro') 

random_search_svc.fit(X_train, y_train)
print('best score {}'.format(random_search_svc.best_score_))

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 175 out of 175 | elapsed: 13.2min finished
  y = column_or_1d(y, warn=True)


best score 0.7172082077004484


In [23]:
random_search_svc.best_params_

{'preprocessor__num__iterative_imputer__max_iter': 17,
 'normalizer__norm': 'l2',
 'classifier__tol': 0.001,
 'classifier__kernel': 'rbf',
 'classifier__gamma': 'scale',
 'classifier__degree': 2,
 'classifier__decision_function_shape': 'ovr',
 'classifier__coef0': 6,
 'classifier__cache_size': 300,
 'classifier__break_ties': False}

In [24]:
scoring = ['accuracy'] 
scores = cross_validate(random_search_svc.best_estimator_, X_train, y_train, scoring=scoring, cv=5, n_jobs=-1) 
scores

{'fit_time': array([22.85916519, 19.7175808 , 23.04716897, 24.0023489 , 11.26818204]),
 'score_time': array([3.07712007, 2.68952012, 3.0738728 , 2.7542901 , 1.40864897]),
 'test_accuracy': array([0.70137525, 0.72870249, 0.72411533, 0.71625164, 0.71559633])}

In [5]:
# Note: This is a patch copied from https://github.com/scikit-optimize/scikit-optimize/issues/978
# because in the newest skopt version, the parameter 'iid' is removed from BayesSearchCV(), and if we still want to run the code properly, this patch needs to be run first
def bayes_search_CV_init(self, estimator, search_spaces, optimizer_kwargs=None,
                         n_iter=50, scoring=None, fit_params=None, n_jobs=1,
                         n_points=1, iid=True, refit=True, cv=None, verbose=0,
                         pre_dispatch='2*n_jobs', random_state=None,
                         error_score='raise', return_train_score=False):

        self.search_spaces = search_spaces
        self.n_iter = n_iter
        self.n_points = n_points
        self.random_state = random_state
        self.optimizer_kwargs = optimizer_kwargs
        self._check_search_space(self.search_spaces)
        self.fit_params = fit_params

        super(BayesSearchCV, self).__init__(
             estimator=estimator, scoring=scoring,
             n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score)
        
BayesSearchCV.__init__ = bayes_search_CV_init


### Note: Answers to the word questions in this assignment are documented in 'A1 original_script' notebook
### Below is a Cell of Code trying to implement the LGBMClassifier, but I failed to make it running on my Macbook as I need to download and install multiple softwares to make it run and I do not have that mcuh time 
### (So you can ignore the below codes though I believe it can be run if you have the 'lightgbm' package)

In [732]:
# define another pipeline of building/training a different classifier/model
from lightgbm import LGBMClassifier
regr = Pipeline(steps=[('preprocessor', preprocessor), # the first step is do the pre-processing with using the pre-processor we defined above, and the pre-processor will process different features based on which type group it is in with using the corresponding transformer
                     ('normalizer', Normalizer()),
                       ('classifier', LGBMClassifier( # the second step is choosing the search space (which model we will use)
                        objective='multiclass'))]) # the objective is multiclass for LGBMClassifier as we have three types of label values
distribution = { # the range of hyper-parameters we want to search
   'preprocessor__num__iterative_imputer__max_iter': range(5,20),# use [5,10,15], in GridSearchCV 
 #   'preprocessor__num__imputer__strategy': ['mean'],#'median','most_frequent'], # search the optimal hyper-parameter setting for the pre-processor
    'classifier__boosting_type': ['gbdt', 'dart','goss','rf'],#'rank:pairwise','rank:map' ], 
    #'classifier__eval_metric': ['merror','map','mlogloss','aucpr'], # merror = Multiclass classification error rate  
    'classifier__max_depth': range(5,20), # use [6, 10], in GridSearchCV
    'classifier__n_estimator': range(100,250), # use [100,200], in GridSearchCV
    'classifier__colsample_bytree': range(0,1), # the range of colsample_bytree is (0,1] 
    'classifier__colsample_bylevel': range(0,1),
    'normalizer__norm': ['max'] # 'l1','l2',
}
random_search = RandomizedSearchCV( # putting the model pipeline, the range of hyper-parameters we want to search,
    regr, distribution, cv=5, verbose=1, n_jobs=-1,  # cv=5 means we have 5 folds for the CV, n_jobs = 2 means number of CPU we want to use
    n_iter=60,random_state=0,
    scoring='f1')

random_search.fit(X_train, y_train)
print('best score {}'.format(random_search.best_score_))

OSError: dlopen(/opt/anaconda3/envs/test372/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /opt/anaconda3/envs/test372/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

In [None]:
random_search.best_params_

In [None]:
scoring = ['accuracy'] # you can include as many scores as you want
scores = cross_validate(random_search.best_estimator_, X_train, y_train, scoring=scoring, cv=5, n_jobs=-1) 

In [None]:
scores