In [17]:
from glob import glob
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error

import category_encoders as ce
from sklearn.pipeline import make_pipeline

from sklearn.impute import SimpleImputer

def rmsle(y_true, y_pred):
    """Return the root of scikit-learn's mean squared log error.

    Both arguments are passed straight through to
    ``mean_squared_log_error``; the square root of its result is returned.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

def rmse(y_true, y_pred):
    """Return the root of scikit-learn's mean squared error.

    Both arguments are passed straight through to ``mean_squared_error``;
    the square root of its result is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


trainval = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')
tube = pd.read_csv('tube.csv')
specs = pd.read_csv('specs.csv')
bill = pd.read_csv('bill_of_materials.csv')
sample = pd.read_csv('../sample_submission.csv')
%cd ../data/caterpillar/competition_data
%ls

[Errno 2] No such file or directory: '../data/caterpillar/competition_data'
/home/gunther/Downloads/github/DS-Unit-2-Applied-Modeling/data/caterpillar/competition_data
bill_of_materials.csv  components.csv     test_set.csv
comp_adaptor.csv       comp_other.csv     train_set.csv
comp_boss.csv          comp_sleeve.csv    tube.csv
comp_elbow.csv         comp_straight.csv  tube_end_form.csv
comp_float.csv         comp_tee.csv       type_component.csv
comp_hfl.csv           comp_threaded.csv  type_connection.csv
comp_nut.csv           specs.csv          type_end_form.csv


In [18]:
# Peek at the first five rows of the raw training quotes.
trainval.head(n=5)

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,cost
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,21.905933
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,12.341214
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,6.601826
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,4.68777
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,3.541561


In [19]:
# Build train and validation sets from trainval.

# Parse the quote dates in both the labelled data and the test data.
# (`infer_datetime_format` is deprecated in pandas 2.0 and has no effect
# there, so it is no longer passed.)
trainval['quote_date'] = pd.to_datetime(trainval['quote_date'])

test['quote_date'] = pd.to_datetime(test['quote_date'])

# Unique tube assembly ids in trainval and in test.
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()

test_tube_assemblies = test['tube_assembly_id'].unique()

# Split the *ids* (not the rows) so that every quote for a given assembly
# lands entirely in train or entirely in validation.
train_tube_assemblies, val_tube_assemblies = train_test_split(trainval_tube_assemblies, random_state=42)

# Select the rows belonging to each id list. The test set is already separate.
train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)]

val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)]



In [20]:
# Shapes of two lookup tables next to the training split.
(bill.shape, specs.shape, train.shape)

((21198, 17), (21198, 11), (22628, 8))

In [21]:
# Data wrangling



def wrangle(X, lookups=None):
    """Return a modelling-ready copy of a quotes frame.

    Steps, applied to a copy so the caller's frame is left untouched:

    1. Parse ``quote_date`` and replace it with ``quote_date_year`` and
       ``quote_date_month``.
    2. Left-join each lookup table on ``tube_assembly_id``.
    3. Drop ``tube_assembly_id``, because the goal is to predict unknown
       assemblies.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``quote_date`` and ``tube_assembly_id`` columns.
    lookups : iterable of pandas.DataFrame, optional
        Tables to left-join, each containing ``tube_assembly_id``. When
        omitted, the module-level ``tube``, ``specs`` and ``bill`` frames are
        used, in that order.

    Returns
    -------
    pandas.DataFrame
        The wrangled copy.
    """
    if lookups is None:
        # Resolved at call time so the tables loaded earlier in the notebook are used.
        lookups = (tube, specs, bill)

    X = X.copy()

    # Engineer date features. `infer_datetime_format` is deprecated in
    # pandas 2.0 and has no effect there, so it is not passed.
    X['quote_date'] = pd.to_datetime(X['quote_date'])
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')

    # Join on the assembly id explicitly rather than on whatever columns the
    # two frames happen to share.
    for lookup in lookups:
        X = X.merge(lookup, how='left', on='tube_assembly_id')

    # Drop tube_assembly_id because our goal is to predict unknown assemblies
    X = X.drop(columns='tube_assembly_id')

    return X

# Apply the same wrangling to every split.
train_w = wrangle(train)
val_w = wrangle(val)
test_w = wrangle(test)

target = 'cost'

# Every wrangled training column except the target is a feature.
features = train_w.columns.drop(target)

x_train = train_w[features]
x_val = val_w[features]
x_test = test_w[features]

# Take the target from the same wrangled frames as the features, so X and y
# always share the same rows and index. The pre-merge `train` / `val` frames
# keep their original index, which no longer matches the merged frames.
y_train = train_w[target]
y_val = val_w[target]

x_test.head()



Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,quote_date_year,quote_date_month,material_id,diameter,wall,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,S-0066,0,0,Yes,1,2013,6,SP-0035,12.7,1.65,...,,,,,,,,,,
1,S-0066,0,0,Yes,2,2013,6,SP-0035,12.7,1.65,...,,,,,,,,,,
2,S-0066,0,0,Yes,5,2013,6,SP-0035,12.7,1.65,...,,,,,,,,,,
3,S-0066,0,0,Yes,10,2013,6,SP-0035,12.7,1.65,...,,,,,,,,,,
4,S-0066,0,0,Yes,25,2013,6,SP-0035,12.7,1.65,...,,,,,,,,,,


In [22]:
# Mean baseline: predict the training-set mean cost for every validation row.
train_mean_cost = y_train.mean()

# An array shaped like y_val in which every entry is the training mean.
y_pred = np.full_like(y_val, fill_value=train_mean_cost)

print('RMSLE, Mean Baseline:', rmsle(y_val, y_pred))
print('R squared Mean Baseline:', r2_score(y_val, y_pred))

RMSLE, Mean Baseline: 0.9418101276064408
R squared Mean Baseline: -4.701447715138585e-06


In [23]:
%%time

# Take the log of the target

y_train_log = np.log1p(y_train)

# Make a pipeline
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestRegressor(n_estimators=2000, random_state=42, n_jobs=-1))

pipeline.fit(x_train, y_train_log)


y_pred_log = pipeline.predict(x_val)

y_val_log = np.log1p(y_val)


print('RMSE :',rmse(y_val_log, y_pred_log))
print('RMSLE',rmsle(y_val_log, y_pred_log))

RMSE : 0.303262248771066
RMSLE 0.08531506608807536
CPU times: user 6min 4s, sys: 1.93 s, total: 6min 6s
Wall time: 24.8 s


In [24]:
# List every CSV in the working directory that shares at least one column
# name with the training split, with its shape and full column list.
for csv_path in glob('*.csv'):
    frame = pd.read_csv(csv_path)
    if set(frame.columns).intersection(train.columns):
        print(csv_path, frame.shape)
        print(frame.columns.tolist(), '\n')

'''
bill_of_materials.csv
specs.csv'''


test_set.csv (30235, 8)
['id', 'tube_assembly_id', 'supplier', 'quote_date', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity'] 

specs.csv (21198, 11)
['tube_assembly_id', 'spec1', 'spec2', 'spec3', 'spec4', 'spec5', 'spec6', 'spec7', 'spec8', 'spec9', 'spec10'] 

bill_of_materials.csv (21198, 17)
['tube_assembly_id', 'component_id_1', 'quantity_1', 'component_id_2', 'quantity_2', 'component_id_3', 'quantity_3', 'component_id_4', 'quantity_4', 'component_id_5', 'quantity_5', 'component_id_6', 'quantity_6', 'component_id_7', 'quantity_7', 'component_id_8', 'quantity_8'] 

train_set.csv (30213, 8)
['tube_assembly_id', 'supplier', 'quote_date', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'cost'] 

tube.csv (21198, 16)
['tube_assembly_id', 'material_id', 'diameter', 'wall', 'length', 'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other'] 



'\nbill_of_materials.csv\nspecs.csv'

In [27]:
##export csv to kaggle
# The pipeline predicts on the log1p scale; expm1 maps back to cost.
y_pred_log = pipeline.predict(x_test)
y_pred = np.expm1(y_pred_log)

# Copy of the sample submission with its `cost` column set to the predictions.
submission = sample.assign(cost=y_pred)

submission.to_csv('cat_submission.csv', index=False)

In [28]:
#!kaggle competitions submit -c caterpillar-tube-pricing -f cat_submission.csv -m "Test ..Mic check 123"

100%|█████████████████████████████████████████| 708k/708k [00:02<00:00, 329kB/s]
Successfully submitted to Caterpillar Tube Pricing