In [1]:
# Setting up dataset for hyperparameter optimization

import category_encoders as ce
from glob import glob
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [18]:
train = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')
tube = pd.read_csv('competition_data/tube.csv')
materials = pd.read_csv('competition_data/bill_of_materials.csv')
components = pd.read_csv('competition_data/components.csv')

comp = pd.concat((pd.read_csv(path) for path in glob('competition_data/comp_*.csv')), sort=False)

In [19]:
assembly_components = materials.melt(id_vars='tube_assembly_id',
                                    value_vars=[f'component_id_{n}' for n in range(1,9)])

assembly_components = (assembly_components
                      .sort_values(by='tube_assembly_id')
                      .dropna()
                      .rename(columns={'value': 'component_id'}))

assembly_component_types = assembly_components.merge(components, how='left')

In [20]:
table = pd.crosstab(assembly_component_types['tube_assembly_id'],
                   assembly_component_types['component_type_id'])
table=table.reset_index()
table.head()

component_type_id,tube_assembly_id,CP-001,CP-002,CP-003,CP-004,CP-005,CP-006,CP-007,CP-008,CP-009,...,CP-021,CP-022,CP-023,CP-024,CP-025,CP-026,CP-027,CP-028,CP-029,OTHER
0,TA-00001,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,TA-00002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,TA-00003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,TA-00004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,TA-00005,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [21]:
features = ['component_id', 'component_type_id', 'orientation', 'unique_feature', 'weight']
comp = comp[features]
comp['orientation'] = (comp['orientation']=='Yes').astype(int)
comp['unique_feature'] = (comp['unique_feature']=='Yes').astype(int)
comp['weight'] = comp['weight'].fillna(comp['weight'].median())

In [22]:
materials['components_total'] = sum(materials[f'quantity_{n}'].fillna(0) for n in range(1,9))
materials['components_distinct'] = sum(materials[f'component_id_{n}'].notnull().astype(int) for n in range(1,9))
materials['orientation'] = 0
materials['unique_feature'] = 0
materials['weight'] = 0

for n in range(1,9):
    materials = materials.merge(comp, how='left',
                               left_on=f'component_id_{n}',
                               right_on='component_id',
                               suffixes=('', f'_{n}'))
    
for col in materials:
    if 'orientation' in col or 'unique_feature' in col or 'weight' in col:
        materials[col] = materials[col].fillna(0)
        
materials['orientation'] = sum(materials[f'orientation_{n}'] for n in range(1,9))
materials['unique_feature'] = sum(materials[f'unique_feature_{n}'] for n in range(1,9))
materials['weight'] = sum(materials[f'weight_{n}'] for n in range(1,9))

features = ['tube_assembly_id', 'orientation', 'unique_feature', 'weight',
           'components_total', 'components_distinct', 'component_id_1']
materials = materials[features]

In [23]:
train['quote_date_year'] = pd.to_datetime(train['quote_date'], infer_datetime_format=True).dt.year
test['quote_date_year'] = pd.to_datetime(train['quote_date'], infer_datetime_format=True).dt.year

train = (train
        .merge(tube, how='left')
        .merge(materials, how='left')
        .merge(table, how='left')
        .fillna(0))

test = (test
       .merge(tube, how='left')
       .merge(materials, how='left')
       .merge(table, how='left')
       .fillna(0))

In [25]:
target = 'cost'
features = train.columns.drop([target, 'tube_assembly_id'])
X_train = train[features]
y_train = train[target]
X_test = test[features]

y_train_log = np.log1p(y_train)

pipeline = make_pipeline(ce.OrdinalEncoder(),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))

In [27]:
# Hyperparameter optimization with RandomizedSearchCV

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

groups = train['tube_assembly_id']

# Tinkering with some different parameters than used in lecture
param_distributions = {
    'randomforestregressor__n_estimators': randint(50, 200),
    'randomforestregressor__max_features': uniform(),
    'randomforestregressor__min_samples_leaf': [1, 5, 20]
}

search = RandomizedSearchCV(
    pipeline,
param_distributions = param_distributions,
n_iter=5,
cv=2,
scoring='neg_mean_squared_error',
verbose=10,
return_train_score=True,
n_jobs=-1)

search.fit(X_train, y_train_log, groups=groups);

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   26.4s remaining:   26.4s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   35.6s remaining:   15.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   41.6s finished


In [29]:
print('Best hyperparameters', search.best_params_)
print('cross validation rmsle', np.sqrt(-search.best_score_))

Best hyperparameters {'randomforestregressor__max_features': 0.7440854884206252, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 160}
cross validation rmsle 0.3280434026660631


In [33]:
# Choosing best hyperparamters with XGBoost
# Choosing better parameters based on first run in lecture

from xgboost import XGBRegressor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pipeline = make_pipeline(ce.OrdinalEncoder(),
                        XGBRegressor())

param_distributions = {'xgbregressor__n_estimators': randint(400, 600),
                      'xgbregressor__max_depth': randint(4,6)}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    cv=2,
    scoring='neg_mean_squared_error',
    verbose=10,
    return_train_score=True,
    n_jobs=-1)

search.fit(X_train, y_train_log, groups=groups);

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  1.1min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.4min remaining:   36.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.7min finished


In [34]:
print('best hyperparamters', search.best_params_)
print('cross val rmsle', np.sqrt(-search.best_score_))

best hyperparamters {'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 545}
cross val rmsle 0.31088637924275075


In [35]:
# Score only went down by a bit, but time to process decreased by over 50%

results = pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score')
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbregressor__max_depth,param_xgbregressor__n_estimators,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
4,37.121198,0.014253,0.593302,0.000787,5,545,"{'xgbregressor__max_depth': 5, 'xgbregressor__...",-0.086546,-0.106756,-0.09665,0.010105,1,-0.014337,-0.009484,-0.01191,0.002426
3,41.870789,0.094508,0.775366,0.004458,5,533,"{'xgbregressor__max_depth': 5, 'xgbregressor__...",-0.086629,-0.106983,-0.096805,0.010177,2,-0.01475,-0.009749,-0.01225,0.0025
1,38.009365,0.043996,0.687122,0.022871,4,577,"{'xgbregressor__max_depth': 4, 'xgbregressor__...",-0.083671,-0.118847,-0.101259,0.017588,3,-0.025331,-0.017705,-0.021518,0.003813
0,31.471606,0.196782,0.566351,0.004972,4,470,"{'xgbregressor__max_depth': 4, 'xgbregressor__...",-0.084457,-0.118737,-0.101596,0.01714,4,-0.029,-0.02063,-0.024815,0.004185
2,26.684282,0.177583,0.540586,0.011654,4,408,"{'xgbregressor__max_depth': 4, 'xgbregressor__...",-0.084988,-0.11938,-0.102183,0.017196,5,-0.031546,-0.023071,-0.027308,0.004237


In [44]:
%%time
# Trying GridSearch as stretch goal

from sklearn.model_selection import GridSearchCV

param_grid = {'xgbregressor__n_estimators': (540,570),
            'xgbregressor__max_depth': (5,6)}

gs = GridSearchCV(estimator=pipeline,
                     param_grid=param_grid,
                     scoring='neg_mean_squared_error',
                     n_jobs=-1,
                     cv=3,
                     verbose=3,
                      refit=True,
)

gs.fit(X_train, y_train_log);

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  3.6min remaining:   43.5s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.8min finished


CPU times: user 57.7 s, sys: 883 ms, total: 58.6 s
Wall time: 4min 45s


In [45]:
print('best hyperparamters', search.best_params_)
print('cross val rmsle', np.sqrt(-search.best_score_))

best hyperparamters {'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 545}
cross val rmsle 0.31088637924275075
