In [170]:
from glob import glob
import pandas as pd
dframes = {}
for path in glob('competition_data/*.csv'):
    name = path.split('/')[1].split('.')[0]
    dframes[name] = pd.read_csv(path)
    print(path, dframes[name].shape)

competition_data/tube_end_form.csv (27, 2)
competition_data/components.csv (2048, 3)
competition_data/type_end_form.csv (8, 2)
competition_data/comp_threaded.csv (194, 32)
competition_data/type_connection.csv (14, 2)
competition_data/comp_hfl.csv (6, 9)
competition_data/bill_of_materials.csv (21198, 17)
competition_data/comp_float.csv (16, 7)
competition_data/type_component.csv (29, 2)
competition_data/comp_straight.csv (361, 12)
competition_data/test_set.csv (30235, 8)
competition_data/comp_other.csv (1001, 3)
competition_data/comp_sleeve.csv (50, 10)
competition_data/tube.csv (21198, 16)
competition_data/comp_tee.csv (4, 14)
competition_data/comp_nut.csv (65, 11)
competition_data/comp_boss.csv (147, 15)
competition_data/specs.csv (21198, 11)
competition_data/train_set.csv (30213, 8)
competition_data/comp_adaptor.csv (25, 20)
competition_data/comp_elbow.csv (178, 16)


In [171]:
dframes.keys()

dict_keys(['tube_end_form', 'components', 'type_end_form', 'comp_threaded', 'type_connection', 'comp_hfl', 'bill_of_materials', 'comp_float', 'type_component', 'comp_straight', 'test_set', 'comp_other', 'comp_sleeve', 'tube', 'comp_tee', 'comp_nut', 'comp_boss', 'specs', 'train_set', 'comp_adaptor', 'comp_elbow'])

In [172]:
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split


def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [173]:
trainval = dframes['train_set']
test = dframes['test_set']
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()
test_tube_assemblies = test['tube_assembly_id'].unique()
train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)]
val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)]

In [174]:
train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies, random_state=42)

In [175]:
train.head()

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,cost
8,TA-00004,S-0066,2013-07-07,0,0,Yes,1,21.972702
9,TA-00004,S-0066,2013-07-07,0,0,Yes,2,12.407983
10,TA-00004,S-0066,2013-07-07,0,0,Yes,5,6.668596
11,TA-00004,S-0066,2013-07-07,0,0,Yes,10,4.754539
12,TA-00004,S-0066,2013-07-07,0,0,Yes,25,3.608331


In [176]:
for frame in dframes.keys():
    df = dframes[frame]
    shared_cols = set(df.columns) & set(train.columns)
    
    if shared_cols:
        print(frame, df.shape)
        print(list(df.columns),'\n')

bill_of_materials (21198, 17)
['tube_assembly_id', 'component_id_1', 'quantity_1', 'component_id_2', 'quantity_2', 'component_id_3', 'quantity_3', 'component_id_4', 'quantity_4', 'component_id_5', 'quantity_5', 'component_id_6', 'quantity_6', 'component_id_7', 'quantity_7', 'component_id_8', 'quantity_8'] 

test_set (30235, 8)
['id', 'tube_assembly_id', 'supplier', 'quote_date', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity'] 

tube (21198, 16)
['tube_assembly_id', 'material_id', 'diameter', 'wall', 'length', 'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other'] 

specs (21198, 11)
['tube_assembly_id', 'spec1', 'spec2', 'spec3', 'spec4', 'spec5', 'spec6', 'spec7', 'spec8', 'spec9', 'spec10'] 

train_set (30213, 8)
['tube_assembly_id', 'supplier', 'quote_date', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'cost'] 



In [190]:
def wrangle(X):
    X = X.copy()
    
    # Engineer date features
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format=True)
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')
    
    # Merge data
    #tube =dframes['tube']
    specs = dframes['specs']
    bill_of_materials = dframes['bill_of_materials']
    
    X = X.merge(tube, how='left')
    #X = X.merge(specs, how='left')
    X = X.merge(bill_of_materials, how='left')
    
    
    # Drop tube_assembly_id because our goal is to predict unknown assemblies
    X = X.drop(columns='tube_assembly_id')
    #Fill NaNs
    X.fillna(0, inplace=True)
    
    return X


In [191]:
train_wrangled = wrangle(train)
val_wrangled = wrangle(val)
test_wrangled = wrangle(test)

In [192]:
target = 'cost'
y_train = train[target]
y_val = val[target]

y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

features = train_wrangled.columns.drop('cost')
X_train = train_wrangled[features]
X_val = val_wrangled[features]

X_test = test_wrangled[features]

In [194]:
X_train.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,quote_date_year,quote_date_month,material_id,diameter,wall,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,S-0066,0,0,Yes,1,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,S-0066,0,0,Yes,2,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,S-0066,0,0,Yes,5,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,S-0066,0,0,Yes,10,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,S-0066,0,0,Yes,25,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [183]:
X_test.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,quote_date_year,quote_date_month,material_id,diameter,wall,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,S-0066,0,0,Yes,1,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,S-0066,0,0,Yes,2,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,S-0066,0,0,Yes,5,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,S-0066,0,0,Yes,10,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,S-0066,0,0,Yes,25,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [195]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.pipeline import make_pipeline



pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    RandomForestRegressor(n_estimators=1000, random_state=55, n_jobs=-1)
)

pipeline.fit(X_train, y_train_log)
y_pred_log = pipeline.predict(X_val)
rmse(y_val_log, y_pred_log)

0.29942907867755203

In [196]:
X_train.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,quote_date_year,quote_date_month,material_id,diameter,wall,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,S-0066,0,0,Yes,1,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,S-0066,0,0,Yes,2,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,S-0066,0,0,Yes,5,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,S-0066,0,0,Yes,10,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,S-0066,0,0,Yes,25,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [197]:
X_test.head(10)

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,quote_date_year,quote_date_month,material_id,diameter,wall,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,S-0066,0,0,Yes,1,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,S-0066,0,0,Yes,2,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,S-0066,0,0,Yes,5,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,S-0066,0,0,Yes,10,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,S-0066,0,0,Yes,25,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
5,S-0066,0,0,Yes,50,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
6,S-0066,0,0,Yes,100,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
7,S-0066,0,0,Yes,250,2013,6,SP-0035,12.7,1.65,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
8,S-0066,0,0,Yes,1,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
9,S-0066,0,0,Yes,2,2013,7,SP-0019,6.35,0.71,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [198]:
y_test_pred_log = pipeline.predict(X_test)
y_test_pred = np.expm1(y_test_pred_log)

In [199]:
y_test_pred

array([22.73460289, 13.06083921,  7.42161774, ...,  4.52565056,
       29.4958345 , 29.79298188])

In [201]:
sample = pd.read_csv('sample_submission.csv')
sample

Unnamed: 0,id,cost
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0


In [203]:
sub = sample
sub['cost'] = y_test_pred
sub = sub.set_index('id')
sub.to_csv('001.csv')

sub

Unnamed: 0_level_0,cost
id,Unnamed: 1_level_1
1,22.734603
2,13.060839
3,7.421618
4,5.511299
5,4.310038
6,4.000121
7,3.871886
8,3.803323
9,21.734607
10,12.172521


In [204]:
!head 001.csv

id,cost
1,22.734602894207462
2,13.060839207460601
3,7.421617735867778
4,5.511299076142976
5,4.310038113788265
6,4.000121413445502
7,3.8718863185822396
8,3.8033231956183116
9,21.73460653725134


In [205]:
!head sample_submission.csv

id,cost
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
