In [1]:
import pandas as pd
import numpy as np
from glob import glob
import category_encoders as ce
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve
from xgboost import XGBRegressor
import warnings
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV


In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
source = 'C:/Users/Cactuar/Projects/DS-Unit-2-Applied-Modeling/data/caterpillar/caterpillar-tube-pricing/data/competition_data/'

In [13]:
# variables for csv
materials = pd.read_csv(source + 'bill_of_materials.csv')
components = pd.read_csv(source + 'components.csv')
tube = pd.read_csv(source + 'tube.csv')

In [4]:
# Get a tidy list of the component id's in each tube assembly
#materials = pd.read_csv(source + 'bill_of_materials.csv')

assembly_components = materials.melt(id_vars='tube_assembly_id',
                                     value_vars=[f'component_id_{n}' for n in range(1,9)])
assembly_components = (assembly_components
                       .sort_values(by='tube_assembly_id')
                       .dropna()
                       .rename(columns={'value': 'component_id'}))
assembly_components.head()

Unnamed: 0,tube_assembly_id,variable,component_id
0,TA-00001,component_id_1,C-1622
21198,TA-00001,component_id_2,C-1629
1,TA-00002,component_id_1,C-1312
2,TA-00003,component_id_1,C-1312
3,TA-00004,component_id_1,C-1312


In [5]:
# Merge with components
#components = pd.read_csv(source + 'components.csv')
assembly_component_types = assembly_components.merge(components, how='left')
assembly_component_types.head()

Unnamed: 0,tube_assembly_id,variable,component_id,name,component_type_id
0,TA-00001,component_id_1,C-1622,NUT-SWIVEL,CP-025
1,TA-00001,component_id_2,C-1629,SLEEVE-ORFS,CP-024
2,TA-00002,component_id_1,C-1312,NUT-FLARED,CP-028
3,TA-00003,component_id_1,C-1312,NUT-FLARED,CP-028
4,TA-00004,component_id_1,C-1312,NUT-FLARED,CP-028


In [6]:
# Make a crosstab of the component types for each assembly(one-hot-encoding)
table = pd.crosstab(assembly_component_types['tube_assembly_id'],
                   assembly_component_types['component_type_id'])

table = table.reset_index()
table.columns.name = ''
print(table.shape)
table.head()

(19149, 30)


Unnamed: 0,tube_assembly_id,CP-001,CP-002,CP-003,CP-004,CP-005,CP-006,CP-007,CP-008,CP-009,...,CP-021,CP-022,CP-023,CP-024,CP-025,CP-026,CP-027,CP-028,CP-029,OTHER
0,TA-00001,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,TA-00002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,TA-00003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,TA-00004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,TA-00005,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [7]:
# function to filter for specific column in multiple csv's
def search_column(name):
    for path in glob(source + '*.csv'):
        df = pd.read_csv(path)
        if name in df.columns:
            print(df.shape)
            print(df.columns.tolist(), '\n')
            
#search_column('weight')

In [8]:
# Concat comp_ files that share features
comp = pd.concat((pd.read_csv(path) for path in glob(source + 'comp_*.csv')), sort=False)
columns = ['component_id', 'component_type_id', 'orientation', 'unique_feature', 'weight']
comp = comp[columns]
comp['orientation'] = (comp['orientation']=='Yes').astype(int)
comp['unique_feature'] = (comp['unique_feature']=='Yes').astype(int)
comp['weight'] = comp['weight'].fillna(comp['weight'].median())
comp.head()

Unnamed: 0,component_id,component_type_id,orientation,unique_feature,weight
0,C-0005,CP-028,0,0,0.206
1,C-0006,CP-028,0,0,0.083
2,C-1435,CP-028,0,0,0.023
3,C-1546,CP-028,0,0,0.026
4,C-1583,CP-028,0,0,0.256


In [9]:
# Engineer features, aggregate for all components in a tube assembly
materials['components_total'] = sum(materials[f'quantity_{n}'].notnull().astype(int) for n in range(1,9))
materials['components_distinct'] = sum(materials[f'component_id_{n}'].notnull().astype(int) for n in range(1,9))
materials['orientation'] = 0
materials['unique_feature'] = 0
materials['weight'] = 0

for n in range(1,9):
    materials = materials.merge(comp, left_on=f'component_id_{n}', right_on='component_id',
                                how='left', suffixes=('', f'_{n}'))
for col in materials:
    if 'orientation' in col or 'unique_feature' in col or 'weight' in col:
        materials[col] = materials[col].fillna(0)
        
materials['orientation'] = sum(materials[f'orientation_{n}'] for n in range(1,9))
materials['unique_feature'] = sum(materials[f'unique_feature_{n}'] for n in range(1,9))
materials['weight'] = sum(materials[f'weight_{n}'] for n in range(1,9))

materials.head()
    

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,...,component_id_7,component_type_id_7,orientation_7,unique_feature_7,weight_7,component_id_8,component_type_id_8,orientation_8,unique_feature_8,weight_8
0,TA-00001,C-1622,2.0,C-1629,2.0,,,,,,...,,,0.0,0.0,0.0,,,0.0,0.0,0.0
1,TA-00002,C-1312,2.0,,,,,,,,...,,,0.0,0.0,0.0,,,0.0,0.0,0.0
2,TA-00003,C-1312,2.0,,,,,,,,...,,,0.0,0.0,0.0,,,0.0,0.0,0.0
3,TA-00004,C-1312,2.0,,,,,,,,...,,,0.0,0.0,0.0,,,0.0,0.0,0.0
4,TA-00005,C-1624,1.0,C-1631,1.0,C-1641,1.0,,,,...,,,0.0,0.0,0.0,,,0.0,0.0,0.0


In [12]:
features = ['tube_assembly_id', 'orientation', 'unique_feature', 'weight',
            'components_total', 'components_distinct', 'component_id_1']
materials = materials[features]
print(materials.shape)
materials.head()

(21198, 7)


Unnamed: 0,tube_assembly_id,orientation,unique_feature,weight,components_total,components_distinct,component_id_1
0,TA-00001,0.0,1.0,0.048,2,2,C-1622
1,TA-00002,0.0,0.0,0.009,1,1,C-1312
2,TA-00003,0.0,0.0,0.009,1,1,C-1312
3,TA-00004,0.0,0.0,0.009,1,1,C-1312
4,TA-00005,0.0,1.0,0.21,3,3,C-1624


In [16]:
# Merge with train/val, test sets
trainval = pd.read_csv(source + 'train_set.csv')
test = pd.read_csv(source + 'test_set.csv')

# split trainval _ all rows- either or
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()

train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies, random_state=42)

train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)]
val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)]

In [18]:
# Wrangle function
def wrangle(X):
    X = X.copy()
    
    # make date features
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format=True)
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')
    
    # Merge data
    X = (X.merge(table, how='left')
        .merge(materials, how='left')
        .merge(tube, how='left')
        .fillna(0))
    
    # drop id
    X = X.drop(columns='tube_assembly_id')
    return X

train_wrangled = wrangle(train)
val_wrangled = wrangle(val)
test_wrangled = wrangle(test)

In [23]:
# Feature Matrix & Target Vector
target = 'cost'
X_train = train_wrangled.drop(columns=target)
X_val = val_wrangled.drop(columns=target)
X_test = test_wrangled.drop(columns='id')
y_train = train[target]
y_val = val[target]
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)
groups = train['tube_assembly_id']

In [25]:
pipeline = make_pipeline(ce.OrdinalEncoder(), XGBRegressor(random_state=42))

param_distributions = {'xgbregressor__n_estimators': randint(1500, 2500),
                       'xgbregressor__max_depth': randint(3,7)}

search = RandomizedSearchCV(pipeline, param_distributions=param_distributions,
                            n_iter=10,
                            cv=5,
                            scoring='neg_mean_squared_error',
                            verbose=10,
                            return_train_score=True,
                            n_jobs=-1)
search.fit(X_train, y_train_log, groups=groups);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 31.9min finished




In [26]:
print('best hyperparameters', search.best_params_)
print('cross-validation RMSLE', np.sqrt(-search.best_score_))

best hyperparameters {'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 1966}
cross-validation RMSLE 0.3059785586530158


In [30]:
def generate_submission(estimator, X_Test, filename):
    y_pred_log = estimator.predict(X_Test)
    y_pred = np.expm1(y_pred_log)
    submission = pd.read_csv('C:/Users/Cactuar/Projects/DS-Unit-2-Applied-Modeling/data/caterpillar/caterpillar-tube-pricing/sample_submission.csv')
    submission['cost'] = y_pred
    submission.to_csv(filename, index=False)

In [31]:
pipeline = search.best_estimator_
    
generate_submission(pipeline, X_test, 'randomizedsearch.csv')