In [21]:
# conda install nomkl

In [22]:
# conda install -c conda-forge category_encoders

In [23]:
# conda install -c conda-forge xgboost

In [24]:
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [25]:
SOURCE = '/Users/RichT/Documents/Lambda/DS-Unit-2-Applied-Modeling-master/data/caterpillar/caterpillar-tube-pricing/competition_data/'

In [26]:
import pandas as pd
import glob as glob

In [27]:
# tidy list of the component id's
materials = pd.read_csv(SOURCE + 'bill_of_materials.csv')

assembly_components = materials.melt(id_vars='tube_assembly_id', 
                                     value_vars=[f'component_id_{n}' for n in range(1,9)])

assembly_components = (assembly_components
                       .sort_values(by='tube_assembly_id')
                       .dropna()
                       .rename(columns={'value': 'component_id'}))


In [28]:
# merge with component types
components = pd.read_csv(SOURCE + 'components.csv')
assembly_component_types = assembly_components.merge(components, how='left')


In [29]:
# create a crosstab (similar to one hot encoding)
table = pd.crosstab(assembly_component_types['tube_assembly_id'], 
                    assembly_component_types['component_type_id'])

table = table.reset_index()
table.columns.name = ''
print(table.shape)

(19149, 30)


In [30]:
from glob import glob

def search_column(name):
    for path in glob(SOURCE + '*.csv'):
        df = pd.read_csv(path)
        if name in df.columns:
            print(path, df.shape)
            print(df.columns.tolist(), '\n')

search_column('weight')

/Users/RichT/Documents/Lambda/DS-Unit-2-Applied-Modeling-master/data/caterpillar/caterpillar-tube-pricing/competition_data/comp_threaded.csv (194, 32)
['component_id', 'component_type_id', 'adaptor_angle', 'overall_length', 'hex_size', 'end_form_id_1', 'connection_type_id_1', 'length_1', 'thread_size_1', 'thread_pitch_1', 'nominal_size_1', 'end_form_id_2', 'connection_type_id_2', 'length_2', 'thread_size_2', 'thread_pitch_2', 'nominal_size_2', 'end_form_id_3', 'connection_type_id_3', 'length_3', 'thread_size_3', 'thread_pitch_3', 'nominal_size_3', 'end_form_id_4', 'connection_type_id_4', 'length_4', 'thread_size_4', 'thread_pitch_4', 'nominal_size_4', 'unique_feature', 'orientation', 'weight'] 

/Users/RichT/Documents/Lambda/DS-Unit-2-Applied-Modeling-master/data/caterpillar/caterpillar-tube-pricing/competition_data/comp_adaptor.csv (25, 20)
['component_id', 'component_type_id', 'adaptor_angle', 'overall_length', 'end_form_id_1', 'connection_type_id_1', 'length_1', 'thread_size_1', 'th

In [31]:
comp_threaded = pd.read_csv(SOURCE + 'comp_threaded.csv')
comp_threaded['orientation'].value_counts()

No     121
Yes     73
Name: orientation, dtype: int64

In [32]:
comp_threaded['unique_feature'].value_counts()

No     161
Yes     33
Name: unique_feature, dtype: int64

In [33]:
# read the component files and concat them together
comp = pd.concat((pd.read_csv(path) for path in glob(SOURCE + 'comp_*.csv')), sort=False)
columns = ['component_id', 'component_type_id', 'orientation', 'unique_feature', 'weight']
comp = comp[columns]
comp['orientation'] = (comp['orientation']=='Yes').astype(int)
comp['unique_feature'] = (comp['unique_feature']=='Yes').astype(int)
comp['weight'] = comp['weight'].fillna(comp['weight'].median())


In [34]:
# engineer features, aggregated for all components in a tube assembly

materials['components_total'] = sum(materials[f'quantity_{n}'].fillna(0) for n in range(1,9))
materials['components_distinct'] = sum(materials[f'component_id_{n}'].notnull().astype(int) for n in range(1,9))
materials['orientation'] = 0
materials['unique_feature'] = 0
materials['weight'] = 0

for n in range(1,9):
    materials = materials.merge(comp, left_on= f'component_id_{n}', right_on = 'component_id',
                               how = 'left', suffixes=('', f'_{n}'))

for col in materials:
    if 'orientation' in col or 'unique_feature' in col or 'weight' in col:
        materials[col] = materials[col].fillna(0)

materials['orientation'] = sum(materials[f'orientation_{n}'] for n in range(1,9))
materials['unique_feature'] = sum(materials[f'unique_feature_{n}'] for n in range(1,9))
materials['weight'] = sum(materials[f'weight_{n}'] for n in range(1,9))

In [35]:
features = ['tube_assembly_id', 'orientation', 'unique_feature', 'weight',
           'components_total', 'components_distinct', 'component_id_1']
materials = materials[features]
print(materials.shape)

(21198, 7)


In [36]:
# read tube data
tube = pd.read_csv(SOURCE+'tube.csv')

In [37]:
from sklearn.model_selection import train_test_split

# read data
trainval = pd.read_csv(SOURCE + 'train_set.csv')
test = pd.read_csv(SOURCE + 'test_set.csv')

# split into train/val
# with all rows for a given tube assembly going in either train or val
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()
train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies, random_state = 42)
train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)]
val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)]


In [38]:
# wrangle train, val and test sets
def wrangle(X):
    X = X.copy()
    
#     engineer date features
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format=True)
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')
    
#     merge data
    X = (X.merge(table, how='left')
         .merge(materials, how='left')
         .merge(tube, how='left')
         .fillna(0))
    
#     drop tube assembly id because we want to predict unknown assemblies
    X = X.drop(columns='tube_assembly_id')
    return X

train_wrangled = wrangle(train)
val_wrangled = wrangle(val)
test_wrangled = wrangle(test)



In [39]:
# arrange into matrix and vector (log transformed)
import numpy as np

target = 'cost'
X_train = train_wrangled.drop(columns=target)
X_val = val_wrangled.drop(columns=target)
X_test = test_wrangled.drop(columns='id')
y_train = train[target]
y_val = val[target]
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)



In [40]:
# Use xgboost to fit and evaluate the model
import category_encoders as ce
from xgboost import XGBRegressor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# encoder = ce.OrdinalEncoder()
# X_train_encoded = encoder.fit_transform(X_train)
# X_val_encoded = encoder.transform(X_val)
# eval_set = [(X_train_encoded, y_train_log),
#            (X_val_encoded, y_val_log)]
# model = XGBRegressor(n_estimators=2000, n_jobs=-1)
# model.fit(X_train_encoded, y_train_log,
#          eval_set=eval_set, eval_metric='rmse', early_stopping_rounds=50)


In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# pipeline = make_pipeline(
#     ce.OrdinalEncoder(), 
#     RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
# )

# k = 5
# groups = train['tube_assembly_id']
# scores = cross_val_score(pipeline, X_train, y_train_log, cv=k, 
#                          scoring='neg_mean_squared_error', groups=groups)
# print(f'RMSLE for {k} folds:', np.sqrt(-scores))

In [42]:
# THIS EXAMPLE FOR RANDOM FOREST HYPERPARAMATER OPTIMIZATION
# from scipy.stats import randint, uniform
# from sklearn.model_selection import RandomizedSearchCV

# pipeline = make_pipeline(
#     ce.OrdinalEncoder(), 
#     RandomForestRegressor(random_state=42)
# )

# param_distributions = {
#     'randomforestregressor__n_estimators': randint(50, 500), 
#     'randomforestregressor__max_features': uniform(0, 1), 
#     'randomforestregressor__max_features': [0.3, 0.5, 0.7]
# }

# search = RandomizedSearchCV(
#     pipeline, 
#     param_distributions=param_distributions, 
#     n_iter=20, 
#     cv=5, 
#     scoring='neg_mean_squared_error', 
#     verbose=10, 
#     return_train_score=True, 
#     n_jobs=-1
# )

# search.fit(X_train, y_train_log, groups=groups);

In [46]:
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
groups = train['tube_assembly_id']

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    XGBRegressor(random_state=42)
)

param_distributions = {
    'xgbregressor__n_estimators': randint(500, 1000),
    'xgbregressor__max_depth': randint(3,7)
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions= param_distributions,
    n_iter = 10,
    cv = 3,
    scoring = 'neg_mean_squared_error',
    verbose = 10,
    return_train_score=True,
    n_jobs=-1
)

search.fit(X_train, y_train_log, groups=groups);

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  4.1min remaining:   27.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.4min finished


In [48]:
print('Best Hyperparamaters:', search.best_params_)
print('Cross-Validation RMSLE:', np.sqrt(-search.best_score_))

Best Hyperparamaters: {'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 596}
Cross-Validation RMSLE: 0.2934695351655911


In [51]:
pipeline = search.best_estimator_

def generate_submission(estimator, X_test, filename):
    y_pred_log = estimator.predict(X_test)
    y_pred = np.expm1(y_pred_log) #converts from log-dollars to dollars
    submission = pd.read_csv(SOURCE + '../sample_submission.csv')
    submission['cost'] = y_pred
    submission.to_csv(filename, index=False)
    
    
generate_submission(pipeline, X_test, 'submission.csv')