In [1]:
import category_encoders as ce
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# evaluation functions
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [3]:
# wrangle function

def wrangle(X):
    X = X.copy()
    
    # Engineer date features
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format=True)
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')
    
    # Merge tube data
    tube = pd.read_csv('competition_data/tube.csv')
    X = X.merge(tube, how='left')
    
    # Engineer features from bill_of_materials
    materials = pd.read_csv('competition_data/bill_of_materials.csv')
    
    materials['components_total'] = (materials['quantity_1'].fillna(0) + 
                                     materials['quantity_2'].fillna(0) + 
                                     materials['quantity_3'].fillna(0) + 
                                     materials['quantity_4'].fillna(0) + 
                                     materials['quantity_5'].fillna(0) + 
                                     materials['quantity_6'].fillna(0) + 
                                     materials['quantity_7'].fillna(0) + 
                                     materials['quantity_8'].fillna(0))

    materials['components_distinct'] = (materials['component_id_1'].notnull().astype(int) + 
                                        materials['component_id_2'].notnull().astype(int) + 
                                        materials['component_id_3'].notnull().astype(int) + 
                                        materials['component_id_4'].notnull().astype(int) + 
                                        materials['component_id_5'].notnull().astype(int) + 
                                        materials['component_id_6'].notnull().astype(int) + 
                                        materials['component_id_7'].notnull().astype(int) + 
                                        materials['component_id_8'].notnull().astype(int))
    
    # Engineer features from components and bill_of_materials
    components = pd.read_csv('competition_data/components.csv')
    
    # create dictionary to map component_id to component_id_type
    component_dict = components[['component_id', 'component_type_id']].set_index('component_id').to_dict(orient='dict')
    component_dict = component_dict['component_type_id']
    
    # use the dictionary to replace component_id in bill_of_materials with component_id_type
    materials_type = materials.replace(component_dict)
    
    # create a function to return the tally of components with a specified component type in a row
    def count(row):
        tally=0
        for num in range(1,6):
            name = 'component_id_' + str(num)
            quantity = 'quantity_' + str(num)
            if row[name]==item:
                tally += row[quantity]
            return tally
    
    # get a list of unique component types
    component_type_list = components.component_type_id.unique().tolist()
    
    # iterate over the list of component types and apply the function to 
    # create a feature with the row tallies
    for item in component_type_list:
        materials_type[item] = materials_type.apply(count, axis=1)
    
    # Merge selected features from bill_of_materials
    # Just use the first component_id, ignore the others for now!
    features = ['tube_assembly_id', 'component_id_1', 'components_total', 'components_distinct',
                'OTHER', 'CP-024', 'CP-026', 'CP-028', 'CP-014', 
                'CP-018', 'CP-001', 'CP-008', 'CP-009', 'CP-002', 
                'CP-010', 'CP-021', 'CP-011', 'CP-015', 'CP-027',
                'CP-003', 'CP-004', 'CP-005', 'CP-019', 'CP-025', 
                'CP-006', 'CP-016', 'CP-020', 'CP-012', 'CP-022', 
                'CP-007', 'CP-017', 'CP-023', 'CP-029']
    X = X.merge(materials_type[features], how='left')
    
    # Get component_type_id (has lower cardinality than component_id)
    components = pd.read_csv('competition_data/components.csv')
    components = components.rename(columns={'component_id': 'component_id_1'})
    features = ['component_id_1', 'component_type_id']
    X = X.merge(components[features], how='left')
    
    # Count the number of specs for the tube assembly
    specs = pd.read_csv('competition_data/specs.csv')
    specs['specs_total'] = specs.drop(columns=['tube_assembly_id']).count(axis=1)
    features = ['tube_assembly_id', 'specs_total', 'spec1']
    X = X.merge(specs[features], how='left')
    
    return X

In [4]:
# read data
train = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')

train.shape, test.shape

((30213, 8), (30235, 8))

In [5]:
# Wrangle train, validation, and test sets
train = wrangle(train)
test = wrangle(test)

In [6]:
# Arrange X matrix and y vector (log-transformed)
target = 'cost'
X_train = train.drop(columns=target)
X_test = test.drop(columns='id')
y_train = train[target]
y_train_log = np.log1p(y_train)

In [7]:
# prediction function
def generate_submission(estimator, X_test_param, filename):
    y_pred_log = estimator.predict(X_test_param)
    y_pred = np.expm1(y_pred_log)  # Convert from log-dollars to dollars
    submission = pd.read_csv('sample_submission.csv')
    submission['cost'] = y_pred
    submission.to_csv(filename, index=False)

## Now lets do a gradient boosting model

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
encoder = ce.OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train, cols=['material_id', 'supplier'])
X_test_encoded = encoder.fit_transform(X_test, cols=['material_id', 'supplier'])

In [10]:
encoder = ce.OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

In [11]:
# eval_set = [(X_train_encoded, y_train_log),
#             (X_val_encoded, y_val_log)]

# model = XGBRegressor(n_estimators=2000, n_jobs=-1, eta=0.085, max_depth=6)
# model.fit(X_train_encoded, y_train_log, eval_set=eval_set, eval_metric='rmse',
#          early_stopping_rounds=100)

In [16]:
model = XGBRegressor()

In [17]:
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

In [22]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

groups = train['tube_assembly_id']
param_distributions = {
    'n_estimators': randint(800, 1000), 
    'max_depth': [6,7,8],
    'learning_rate': [.08, .085, .09, .095, .1, .105, .11, .115, .12] 
}

search = RandomizedSearchCV(
    model, 
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=5, 
    scoring='neg_mean_squared_error', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train_encoded, y_train_log, groups=groups);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 34.9min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 49.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 59.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 67.7min finished


In [23]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation RMSLE', np.sqrt(-search.best_score_))

Best hyperparameters {'learning_rate': 0.095, 'max_depth': 6, 'n_estimators': 861}
Cross-validation RMSLE 0.30153316046805045


In [24]:
pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
3,256.735391,8.657783,1.071139,0.387696,0.095,6,861,"{'learning_rate': 0.095, 'max_depth': 6, 'n_es...",-0.09196,-0.067823,...,-0.090922,0.018437,1,-0.006405,-0.006211,-0.005512,-0.003662,-0.00664,-0.005686,0.00108
0,265.346564,12.202934,2.852531,0.534164,0.115,7,828,"{'learning_rate': 0.115, 'max_depth': 7, 'n_es...",-0.085974,-0.071797,...,-0.091376,0.018611,2,-0.001857,-0.002051,-0.001498,-0.001096,-0.00209,-0.001718,0.000375
4,251.339458,6.950165,1.30017,0.322335,0.085,6,937,"{'learning_rate': 0.085, 'max_depth': 6, 'n_es...",-0.095094,-0.068905,...,-0.091993,0.0173,3,-0.006748,-0.006425,-0.005929,-0.004069,-0.006758,-0.005986,0.001005
7,228.017617,8.407867,0.812825,0.123257,0.105,6,817,"{'learning_rate': 0.105, 'max_depth': 6, 'n_es...",-0.099423,-0.068314,...,-0.093957,0.018628,4,-0.005858,-0.005659,-0.005378,-0.003713,-0.006022,-0.005326,0.000835
9,245.289942,48.315894,1.09883,0.232231,0.09,7,956,"{'learning_rate': 0.09, 'max_depth': 7, 'n_est...",-0.103726,-0.0687,...,-0.09523,0.021781,5,-0.002304,-0.002364,-0.001991,-0.001257,-0.002507,-0.002085,0.000447
1,449.778756,43.845482,7.917541,0.777294,0.085,8,952,"{'learning_rate': 0.085, 'max_depth': 8, 'n_es...",-0.125415,-0.067273,...,-0.098107,0.024193,6,-0.000986,-0.000927,-0.000795,-0.000479,-0.001015,-0.00084,0.000196
6,338.902435,5.834281,2.701644,0.600175,0.11,7,997,"{'learning_rate': 0.11, 'max_depth': 7, 'n_est...",-0.122623,-0.069824,...,-0.099016,0.022039,7,-0.001513,-0.001443,-0.001174,-0.000837,-0.001517,-0.001297,0.000262
5,316.550304,17.831761,3.826401,0.969465,0.105,8,899,"{'learning_rate': 0.105, 'max_depth': 8, 'n_es...",-0.123425,-0.069869,...,-0.099054,0.022762,8,-0.000619,-0.000723,-0.000588,-0.000328,-0.000708,-0.000593,0.000142
8,314.887824,2.123291,2.298544,0.298158,0.11,8,897,"{'learning_rate': 0.11, 'max_depth': 8, 'n_est...",-0.121543,-0.068597,...,-0.10062,0.024269,9,-0.000568,-0.000619,-0.000471,-0.000259,-0.000586,-0.000501,0.00013
2,379.725099,51.441597,4.322969,2.439842,0.115,8,839,"{'learning_rate': 0.115, 'max_depth': 8, 'n_es...",-0.124219,-0.07147,...,-0.101469,0.022532,10,-0.000619,-0.000557,-0.00054,-0.000311,-0.000613,-0.000528,0.000113


In [27]:
pipeline = search.best_estimator_
y_pred_log = pipeline.predict(X_test_encoded)
y_pred = np.expm1(y_pred_log)  # Convert from log-dollars to dollars
submission = pd.read_csv('sample_submission.csv')
submission['cost'] = y_pred
submission.to_csv('submission-40.csv', index=False)