In [39]:
# imports

from glob import glob
import category_encoders as ce
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

## Get the Caterpillar data from Kaggle

In [1]:
# competition data from Kaggle

for path in glob('competition_data/*.csv'):
    df = pd.read_csv(path)
    print(path, df.shape)

competition_data/comp_threaded.csv (194, 32)
competition_data/comp_adaptor.csv (25, 20)
competition_data/tube_end_form.csv (27, 2)
competition_data/comp_straight.csv (361, 12)
competition_data/comp_tee.csv (4, 14)
competition_data/comp_boss.csv (147, 15)
competition_data/components.csv (2048, 3)
competition_data/comp_float.csv (16, 7)
competition_data/bill_of_materials.csv (21198, 17)
competition_data/comp_elbow.csv (178, 16)
competition_data/type_connection.csv (14, 2)
competition_data/train_set.csv (30213, 8)
competition_data/comp_sleeve.csv (50, 10)
competition_data/test_set.csv (30235, 8)
competition_data/tube.csv (21198, 16)
competition_data/comp_hfl.csv (6, 9)
competition_data/type_end_form.csv (8, 2)
competition_data/comp_other.csv (1001, 3)
competition_data/type_component.csv (29, 2)
competition_data/specs.csv (21198, 11)
competition_data/comp_nut.csv (65, 11)


In [3]:
# function for root mean squared log error

def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [16]:
# read csvs into dataframes

trainval = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')
tube = pd.read_csv('competition_data/tube.csv')
bill_of_materials = pd.read_csv('competition_data/bill_of_materials.csv')
specs = pd.read_csv('competition_data/specs.csv')
components = pd.read_csv('competition_data/components.csv')
comp_sleeve = pd.read_csv('competition_data/comp_sleeve.csv')

trainval.shape, test.shape, tube.shape, bill_of_materials.shape, specs.shape, components.shape, comp_sleeve.shape

((30213, 8),
 (30235, 8),
 (21198, 16),
 (21198, 17),
 (21198, 11),
 (2048, 3),
 (50, 10))

In [5]:
trainval.head()

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,cost
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,21.905933
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,12.341214
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,6.601826
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,4.68777
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,3.541561


In [6]:
tube.head()

Unnamed: 0,tube_assembly_id,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other
0,TA-00001,SP-0035,12.7,1.65,164.0,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0
1,TA-00002,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0
2,TA-00003,SP-0019,6.35,0.71,127.0,7,19.05,N,N,N,N,EF-008,EF-008,0,0,0
3,TA-00004,SP-0019,6.35,0.71,137.0,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0
4,TA-00005,SP-0029,19.05,1.24,109.0,4,50.8,N,N,N,N,EF-003,EF-003,0,0,0


In [7]:
# for each tube_assembly_id:
# list of components (by id) & quantity used
# name and type_id of components listed in components.csv

bill_of_materials.head()

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,TA-00001,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,
1,TA-00002,C-1312,2.0,,,,,,,,,,,,,,
2,TA-00003,C-1312,2.0,,,,,,,,,,,,,,
3,TA-00004,C-1312,2.0,,,,,,,,,,,,,,
4,TA-00005,C-1624,1.0,C-1631,1.0,C-1641,1.0,,,,,,,,,,


In [19]:
bill_of_materials['component_id_1'].value_counts().head()

C-1621    2043
C-1622    1833
C-1624    1481
C-1623    1414
C-1312    1240
Name: component_id_1, dtype: int64

In [20]:
bill_of_materials['component_id_1'].nunique()

1079

In [8]:
# list of 'specs' for each tube_assembly id
# specs appear to be 6 digits, two letters dash four numbers

specs.head()

Unnamed: 0,tube_assembly_id,spec1,spec2,spec3,spec4,spec5,spec6,spec7,spec8,spec9,spec10
0,TA-00001,,,,,,,,,,
1,TA-00002,,,,,,,,,,
2,TA-00003,,,,,,,,,,
3,TA-00004,,,,,,,,,,
4,TA-00005,,,,,,,,,,


In [10]:
specs.describe()

Unnamed: 0,tube_assembly_id,spec1,spec2,spec3,spec4,spec5,spec6,spec7,spec8,spec9,spec10
count,21198,7129,6844,5840,4154,2921,2071,535,106,20,1
unique,21198,45,51,49,38,37,24,13,4,3,1
top,TA-20566,SP-0007,SP-0012,SP-0080,SP-0026,SP-0080,SP-0082,SP-0082,SP-0082,SP-0088,SP-0080
freq,1,3002,1736,1798,1492,1780,1167,322,73,18,1


In [12]:
specs.isnull().sum()

tube_assembly_id        0
spec1               14069
spec2               14354
spec3               15358
spec4               17044
spec5               18277
spec6               19127
spec7               20663
spec8               21092
spec9               21178
spec10              21197
dtype: int64

In [11]:
# name and type_id for each component id
# information for each component is organized into different csv files based on type
# for example, sleeve-type components are in comp_sleeve.csv

components.head()

Unnamed: 0,component_id,name,component_type_id
0,9999,OTHER,OTHER
1,C-0001,SLEEVE,CP-024
2,C-0002,SLEEVE,CP-024
3,C-0003,SLEEVE-FLARED,CP-024
4,C-0004,NUT,CP-026


## Train/val/test split

In [23]:
# create numpy array with unique tube_assembly_id

trainval_tube_assemblies = trainval['tube_assembly_id'].unique()
test_tube_assemblies = test['tube_assembly_id'].unique()
len(trainval_tube_assemblies), len(test_tube_assemblies)

(8855, 8856)

In [25]:
# use train/test split to split the array

train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies, random_state=42)

len(train_tube_assemblies), len(val_tube_assemblies)

(6641, 2214)

In [26]:
# getting dataframes such that their tube assembly ids are in the respective array created above

train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)]
val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)]
train.shape, val.shape, trainval.shape

((22628, 8), (7585, 8), (30213, 8))

## Wrangle function

In [27]:
def wrangle(X):
    X = X.copy()
    
    # Engineer date features
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format=True)
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')
        
    # Merge tube and bill_of_materials data
    X = X.merge(tube, how='left')
    X = X.merge(bill_of_materials, how='left')
    
    # Drop tube assembly_id because our goal is to predict unknown assemblies
    X = X.drop(columns='tube_assembly_id')
    
    return X

train_wrangled = wrangle(train)
val_wrangled = wrangle(val)

In [28]:
train_wrangled.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,cost,quote_date_year,quote_date_month,material_id,diameter,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,S-0066,0,0,Yes,1,21.972702,2013,7,SP-0019,6.35,...,,,,,,,,,,
1,S-0066,0,0,Yes,2,12.407983,2013,7,SP-0019,6.35,...,,,,,,,,,,
2,S-0066,0,0,Yes,5,6.668596,2013,7,SP-0019,6.35,...,,,,,,,,,,
3,S-0066,0,0,Yes,10,4.754539,2013,7,SP-0019,6.35,...,,,,,,,,,,
4,S-0066,0,0,Yes,25,3.608331,2013,7,SP-0019,6.35,...,,,,,,,,,,


## Select features

In [30]:
train_wrangled.columns.values

array(['supplier', 'annual_usage', 'min_order_quantity',
       'bracket_pricing', 'quantity', 'cost', 'quote_date_year',
       'quote_date_month', 'material_id', 'diameter', 'wall', 'length',
       'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x',
       'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other',
       'component_id_1', 'quantity_1', 'component_id_2', 'quantity_2',
       'component_id_3', 'quantity_3', 'component_id_4', 'quantity_4',
       'component_id_5', 'quantity_5', 'component_id_6', 'quantity_6',
       'component_id_7', 'quantity_7', 'component_id_8', 'quantity_8'],
      dtype=object)

In [31]:
target = 'cost'
features = train_wrangled.columns.values

In [32]:
y_train = train[target]
y_val = val[target]

In [62]:
X_train = train_wrangled[features].drop(columns='cost')
X_val = val_wrangled[features].drop(columns='cost')

In [63]:
X_train.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,quote_date_year,quote_date_month,material_id,diameter,wall,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,S-0066,0,0,Yes,1,2013,7,SP-0019,6.35,0.71,...,,,,,,,,,,
1,S-0066,0,0,Yes,2,2013,7,SP-0019,6.35,0.71,...,,,,,,,,,,
2,S-0066,0,0,Yes,5,2013,7,SP-0019,6.35,0.71,...,,,,,,,,,,
3,S-0066,0,0,Yes,10,2013,7,SP-0019,6.35,0.71,...,,,,,,,,,,
4,S-0066,0,0,Yes,25,2013,7,SP-0019,6.35,0.71,...,,,,,,,,,,


## Use NumPy to get the log of the target vector

In [41]:
y_train_log = np.log1p(y_train)

In [67]:
X_train_filled = X_train.fillna(0)
X_val_filled = X_val.fillna(0)

## Fit a model

In [42]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
)

In [68]:
pipeline.fit(X_train_filled, y_train_log)

Pipeline(memory=None,
     steps=[('ordinalencoder', OrdinalEncoder(cols=['supplier', 'bracket_pricing', 'material_id', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'component_id_1', 'component_id_2', 'component_id_3', 'component_id_4', 'component_id_5', 'component_id_6', 'component_id_7', 'component_id_8...timators=100, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False))])

## Get validation RMSLE

In [69]:
y_val_log = np.log1p(y_val)
y_pred_log = pipeline.predict(X_val_filled)

In [70]:
print(f'Validation Root Mean Squared Log Error: {rmsle(y_val_log, y_pred_log)}')

Validation Root Mean Squared Log Error: 0.08999155394778512


## Create Kaggle submission

In [71]:
test_wrangled = wrangle(test)
X_test = test_wrangled.drop(columns='id')
all(X_test.columns==X_train.columns)

True

In [75]:
# need to fix this
y_pred_log = pipeline.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
y_pred = np.expm1(y_pred_log)

In [None]:

sample_submission = pd.read_csv('sample_submission.csv')
submission['cost'] = y_pred

In [None]:
submission.to_csv('submission-01.csv', index=False)