## Download Data

In [3]:
!kaggle competitions download -c caterpillar-tube-pricing
!unzip data.zip

sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
data.zip: Skipping, found more recently modified local copy (use --force to force download)


## Load Dataframes

In [12]:
from glob import glob
import pandas as pd
data = {}
for path in glob('competition_data/*.csv'):
    name = path.split('/')[-1].split('.')[0]
    data[name] = pd.read_csv(path)
    print(path, data[name].shape)

competition_data/tube_end_form.csv (27, 2)
competition_data/components.csv (2048, 3)
competition_data/type_end_form.csv (8, 2)
competition_data/comp_threaded.csv (194, 32)
competition_data/type_connection.csv (14, 2)
competition_data/comp_hfl.csv (6, 9)
competition_data/bill_of_materials.csv (21198, 17)
competition_data/comp_float.csv (16, 7)
competition_data/type_component.csv (29, 2)
competition_data/comp_straight.csv (361, 12)
competition_data/test_set.csv (30235, 8)
competition_data/comp_other.csv (1001, 3)
competition_data/comp_sleeve.csv (50, 10)
competition_data/tube.csv (21198, 16)
competition_data/comp_tee.csv (4, 14)
competition_data/comp_nut.csv (65, 11)
competition_data/comp_boss.csv (147, 15)
competition_data/specs.csv (21198, 11)
competition_data/train_set.csv (30213, 8)
competition_data/comp_adaptor.csv (25, 20)
competition_data/comp_elbow.csv (178, 16)


In [39]:
list(data.keys())

['tube_end_form',
 'components',
 'type_end_form',
 'comp_threaded',
 'type_connection',
 'comp_hfl',
 'bill_of_materials',
 'comp_float',
 'type_component',
 'comp_straight',
 'test_set',
 'comp_other',
 'comp_sleeve',
 'tube',
 'comp_tee',
 'comp_nut',
 'comp_boss',
 'specs',
 'train_set',
 'comp_adaptor',
 'comp_elbow']

## Train/Val Split

In [11]:
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [167]:
trainval = data['train_set']
test = data['test_set']
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()
test_tube_assemblies = test['tube_assembly_id'].unique()
train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies, random_state=42)
train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)]
val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)]

## New Feature engineering

### specs 

In [177]:
#
from functools import reduce


def mergelist(s1, s2):
    if s1 is np.nan:
        s1 = ''
    if s2 is np.nan:
        s2 = ''
    return s1+s2


pd.DataFrame(data['specs'][data['specs'].columns[1:]].agg(
    lambda x: reduce(mergelist, x), axis=1).rename('specs')).head(15)

Unnamed: 0,specs
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


In [185]:
specs = data['specs'].join(data['specs'].notna().sum(axis=1).rename('specs_ct')-1)
specs

Unnamed: 0,tube_assembly_id,spec1,spec2,spec3,spec4,spec5,spec6,spec7,spec8,spec9,spec10,specs,specs_ct
0,TA-00001,,,,,,,,,,,,1
1,TA-00002,,,,,,,,,,,,1
2,TA-00003,,,,,,,,,,,,1
3,TA-00004,,,,,,,,,,,,1
4,TA-00005,,,,,,,,,,,,1
5,TA-00006,,,,,,,,,,,,1
6,TA-00007,,,,,,,,,,,,1
7,TA-00008,,,,,,,,,,,,1
8,TA-00009,,,,,,,,,,,,1
9,TA-00010,,,,,,,,,,,,1


### bill_of_materials

In [320]:
data['bill_of_materials'].head()

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,TA-00001,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,
1,TA-00002,C-1312,2.0,,,,,,,,,,,,,,
2,TA-00003,C-1312,2.0,,,,,,,,,,,,,,
3,TA-00004,C-1312,2.0,,,,,,,,,,,,,,
4,TA-00005,C-1624,1.0,C-1631,1.0,C-1641,1.0,,,,,,,,,,


In [323]:
def bom_aggregate(comp, x):
    components_cols
    total = 0
    for com in components_cols:
        if x[com]==comp:
            total +=x['quantity_'+ com[-1]]
    return total
     
        
        
data['bill_of_materials'].head()

0    0.0
1    2.0
2    2.0
3    2.0
4    0.0
dtype: float64

In [302]:
bom = data['bill_of_materials']
components_cols = [col for col in bom.columns if col.startswith('component')]
qty_cols = [col for col in bom.columns if col.startswith('quantity')]
comps = set()
for col in components_cols:
    comps.update(bom[col].dropna().unique())


In [341]:
def bom_aggregate(comp, x):
    components_cols
    total = 0
    for com in components_cols:
        if x[com]==comp:
            total +=x['quantity_'+ com[-1]]
    return total
bom_agg = bom[['tube_assembly_id']].copy()
for comp in comps:
    bom_agg[comp] = bom.apply(lambda x: bom_aggregate(comp, x), axis=1)
bom_agg

Unnamed: 0,tube_assembly_id,C-0711,C-1422,C-0318,C-1051,C-1164,C-0208,C-1151,C-0145,C-1187,...,C-0042,C-0695,C-2038,C-0422,C-0133,C-0450,C-0107,C-1776,C-0320,C-1140
0,TA-00001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TA-00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TA-00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TA-00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TA-00005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,TA-00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,TA-00007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,TA-00008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,TA-00009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,TA-00010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [371]:
bom_agg['unique'] =  bom_agg[comps].ne(0).sum(axis=1)
bom_agg['total'] =  bom_agg[comps].sum(axis=1)
bom_agg[['unique', 'total']]

Unnamed: 0,unique,total
0,2,4.0
1,1,2.0
2,1,2.0
3,1,2.0
4,3,3.0
5,3,3.0
6,2,4.0
7,1,2.0
8,2,4.0
9,1,2.0


## Wrangle Data

In [372]:
train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies, random_state=42)

In [390]:
def wrangle(X):
    X = X.copy()
    
    # Engineer date features
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format=True)
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')
    specs = data['specs'].join(data['specs'].notna().sum(axis=1).rename('specs_ct')-1)
    # Merge data
    tube =data['tube']
    specs = data['specs'].join(data['specs'].notna().sum(axis=1).rename('specs_ct')-1)
    specs['specs'] = data['specs'][data['specs'].columns[1:]].agg(lambda x: reduce(mergelist,x),axis=1)
    specs = specs[['tube_assembly_id', 'specs', 'specs_ct']]
    bill_of_materials = data['bill_of_materials']
    
    X = X.merge(tube, how='left')
    #X = X.merge(specs, how='left')
    X = X.merge(bill_of_materials, how='left')
    #X = X.merge(specs, on='tube_assembly_id')
    #X = X.merge(bom_agg, how='left')
    # Drop tube_assembly_id because our goal is to predict unknown assemblies
    X = X.drop(columns='tube_assembly_id')
    #Fill NaNs
    X.fillna(0, inplace=True)
    
    return X


In [391]:
train_wrangled = wrangle(train)
val_wrangled = wrangle(val)
test_wrangled = wrangle(test)

## Create and Evaluate Model

In [392]:
target = 'cost'
y_train = train[target]
y_val = val[target]

y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

features = train_wrangled.columns.drop('cost')
X_train = train_wrangled[features]
X_val = val_wrangled[features]

X_test = test_wrangled[features]

In [393]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor



pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    XGBRegressor(n_estimators=1000, n_jobs=-1)
)

pipeline.fit(X_train, y_train_log)
y_pred_log = pipeline.predict(X_val)
rmse(y_val_log, y_pred_log)

0.27310656926362875

In [225]:
0.27310656926362875

0.27310656926362875

In [398]:
y_test_pred_log = pipeline.predict(X_test)
y_test_pred = np.expm1(y_test_pred_log)
sample = pd.read_csv('sample_submission.csv')
sub = sample
sub['cost'] = y_test_pred
sub = sub.set_index('id')
sub.to_csv('pred.csv')

In [399]:
!kaggle competitions submit caterpillar-tube-pricing -f pred.csv -m "XGB"

100%|█████████████████████████████████████████| 451k/451k [00:02<00:00, 211kB/s]
Successfully submitted to Caterpillar Tube Pricing