In [1]:
#conda install -c anaconda py-xgboost

In [2]:
#conda install -c conda-forge category_encoders

In [3]:
import os

import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

In [4]:
# Directory that holds the competition CSV files. Other cells build file paths
# with plain string concatenation (source + 'train_set.csv'), so the trailing
# '/' is required.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
source = 'C:/Users/Cactuar/Projects/DS-Unit-2-Applied-Modeling/data/caterpillar/caterpillar-tube-pricing/data/competition_data/'

In [5]:
#materials = pd.read_csv(source + 'bill_of_materials.csv')
#materials.head()

In [6]:
#components = pd.read_csv(source + 'components.csv')
#components.describe()

In [7]:
#components.head()

In [8]:
#data = {}
#for path in glob(source + '*.csv'):
#    df = pd.read_csv(path)
#    filename = path.split('/')[-1]
#    name = filename.split('.')[0]
#    data[name] = df

In [9]:
#data['bill_of_materials'].head()

In [10]:
# function for root mean squared (log) error
def rmse(y_true, y_pred):
    """Return the square root of scikit-learn's mean squared error.

    Both arguments are forwarded unchanged to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def rmsle(y_true, y_pred):
    """Return the square root of scikit-learn's mean squared log error.

    Both arguments are forwarded unchanged to ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [11]:
# Wrangle function 
def wrangle(X, data_dir=None):
    """Add date, tube, bill-of-materials, component and spec features to quotes.

    Parameters
    ----------
    X : pandas.DataFrame
        Quote rows. Must contain ``tube_assembly_id`` and ``quote_date``; all
        other columns are passed through. The input frame is not modified.
    data_dir : str or path-like, optional
        Directory holding ``tube.csv``, ``bill_of_materials.csv``,
        ``components.csv`` and ``specs.csv``. When omitted, the notebook-level
        ``source`` directory is used.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``quote_date`` is replaced by
        ``quote_date_year`` / ``quote_date_month``, the lookup-table features
        are left-joined on, and ``tube_assembly_id`` is dropped.

    Raises
    ------
    pandas.errors.MergeError
        If a lookup table contains duplicate join keys. A duplicate would
        otherwise silently multiply quote rows during the left join.
    """
    if data_dir is None:
        data_dir = source  # notebook-level default data directory

    def _read(filename):
        # Load one lookup table from the data directory.
        return pd.read_csv(os.path.join(data_dir, filename))

    X = X.copy()

    # Replace the raw quote date with numeric year / month features.
    # `infer_datetime_format` is deprecated since pandas 2.0 (consistent format
    # inference is now the default), so it is no longer passed.
    quote_date = pd.to_datetime(X['quote_date'])
    X['quote_date_year'] = quote_date.dt.year
    X['quote_date_month'] = quote_date.dt.month
    X = X.drop(columns='quote_date')

    # Tube attributes, joined on the assembly id.
    X = X.merge(_read('tube.csv'), how='left',
                on='tube_assembly_id', validate='many_to_one')

    # Bill of materials: eight (component_id_N, quantity_N) slots per assembly.
    materials = _read('bill_of_materials.csv')
    slots = range(1, 9)
    quantity_cols = ['quantity_{}'.format(i) for i in slots]
    component_cols = ['component_id_{}'.format(i) for i in slots]

    # Total quantity over all slots; empty slots count as 0.
    materials['components_total'] = materials[quantity_cols].fillna(0).sum(axis=1)
    # Number of slots that actually name a component.
    materials['components_distinct'] = materials[component_cols].notnull().sum(axis=1)

    # Use only the first component_id, ignoring the others for now.
    material_features = ['tube_assembly_id', 'component_id_1',
                         'components_total', 'components_distinct']
    X = X.merge(materials[material_features], how='left',
                on='tube_assembly_id', validate='many_to_one')

    # component_type_id has lower cardinality than component_id.
    components = _read('components.csv').rename(
        columns={'component_id': 'component_id_1'})
    X = X.merge(components[['component_id_1', 'component_type_id']], how='left',
                on='component_id_1', validate='many_to_one')

    # Count the non-null spec entries for each tube assembly.
    specs = _read('specs.csv')
    specs['specs_total'] = specs.drop(columns=['tube_assembly_id']).count(axis=1)
    X = X.merge(specs[['tube_assembly_id', 'specs_total', 'spec1']], how='left',
                on='tube_assembly_id', validate='many_to_one')

    # The id was only needed as a join key.
    return X.drop(columns='tube_assembly_id')

In [12]:
# Load the training and test quote tables from the competition data directory.
trainval, test = [pd.read_csv(source + csv_name)
                  for csv_name in ('train_set.csv', 'test_set.csv')]

In [13]:
# Split on unique tube assembly ids rather than on rows, so that every quote
# for a given assembly lands on the same side of the train/validation split.
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()

train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies,
    random_state=42,
)

in_train = trainval['tube_assembly_id'].isin(train_tube_assemblies)
in_val = trainval['tube_assembly_id'].isin(val_tube_assemblies)
train = trainval[in_train]
val = trainval[in_val]

In [14]:
# Run the same feature engineering over every split, in train/val/test order.
train, val, test = [wrangle(split) for split in (train, val, test)]

In [15]:
# Separate the predictors from the target column for the labelled splits.
target = 'cost'
X_train, y_train = train.drop(columns=target), train[target]
X_val, y_val = val.drop(columns=target), val[target]

# Only the `id` column is removed from the test split.
X_test = test.drop(columns='id')

# log1p-transformed targets, used for fitting and for scoring.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

In [27]:
# Build the model: ordinal-encode the categorical columns, then boosted trees.
encoder = ce.OrdinalEncoder()
regressor = XGBRegressor(n_estimators=10000, n_jobs=-1)
pipeline = make_pipeline(encoder, regressor)

In [28]:
# Fit the encoder + regressor pipeline on the log1p-transformed training target.
# As the last expression in the cell, the fitted Pipeline repr is displayed.
pipeline.fit(X_train, y_train_log)

  if getattr(data, 'base', None) is not None and \




Pipeline(memory=None,
     steps=[('ordinalencoder', OrdinalEncoder(cols=['supplier', 'bracket_pricing', 'material_id', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'component_id_1', 'component_type_id', 'spec1'],
        drop_invariant=False, handle_unknown='impute', impute_missing=True,
        mapping=...lpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1))])

In [22]:
# Predict on the validation features and score the predictions. Both arrays
# are on the log1p scale, so this RMSE is the RMSLE of the untransformed costs.
y_pred_log = pipeline.predict(X_val)
validation_error = rmse(y_val_log, y_pred_log)
print('validation error:', validation_error)

validation error: 0.26458936630175744


In [32]:
# Folder from which 'sample_submission.csv' is read when the submission file is
# built; file names are appended by string concatenation, so keep the trailing '/'.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
newsource = 'C:/Users/Cactuar/Projects/DS-Unit-2-Applied-Modeling/data/caterpillar/caterpillar-tube-pricing/'

In [33]:
%%time

# make submission csv
def generate_submission(estimator, X_test, filename, sample_submission_path=None):
    """Predict on ``X_test`` and write the predictions to a submission CSV.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict``; its predictions are taken to be on
        the log1p scale and are mapped back with ``np.expm1``.
    X_test : pandas.DataFrame
        Feature matrix passed to ``estimator.predict``.
    filename : str
        Path of the CSV file to write (no index column is written).
    sample_submission_path : str or path-like, optional
        Template CSV whose ``cost`` column is overwritten with the
        predictions. Defaults to ``newsource + 'sample_submission.csv'``.

    Returns
    -------
    None
    """
    if sample_submission_path is None:
        # Fall back to the notebook-level directory used originally.
        sample_submission_path = newsource + 'sample_submission.csv'

    y_pred_log = estimator.predict(X_test)
    y_pred = np.expm1(y_pred_log)  # Convert from log-dollars to dollars

    # The template supplies the remaining columns and the row order.
    submission = pd.read_csv(sample_submission_path)
    submission['cost'] = y_pred
    submission.to_csv(filename, index=False)
    
# Predict on the test features with the fitted pipeline and write the
# submission CSV; the relative file name resolves against the working directory.
generate_submission(pipeline, X_test, 'catterpillar02.csv')

Wall time: 15.4 s


In [30]:
# Display the data directory currently in use.
source

'C:/Users/Cactuar/Projects/DS-Unit-2-Applied-Modeling/data/caterpillar/caterpillar-tube-pricing/data/competition_data/'