In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Glob in my dataframes
SOURCE = '../competition_data/'

from glob import glob

data = {}
for path in glob(SOURCE + '*.csv'):
    df = pd.read_csv(path)
    filename = path.split('/')[-1]
    filename2 = filename.split('\\')[1]
    name = filename2.split('.')[0]
    data[name] = df

In [20]:
#Check our work
data.keys()

dict_keys(['bill_of_materials', 'components', 'comp_adaptor', 'comp_boss', 'comp_elbow', 'comp_float', 'comp_hfl', 'comp_nut', 'comp_other', 'comp_sleeve', 'comp_straight', 'comp_tee', 'comp_threaded', 'sample_submission', 'specs', 'test_set', 'train_set', 'tube', 'tube_end_form', 'type_component', 'type_connection', 'type_end_form'])

In [4]:
trainval = data['train_set']
test = data['test_set']

In [5]:
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()

In [6]:
#split trainval array based on unique ids
from sklearn.model_selection import train_test_split
train_tube_assemblies, val_tube_assemblies = train_test_split(trainval_tube_assemblies, random_state =42)

In [7]:
#Ensure no matching ids
set(train_tube_assemblies)&set(val_tube_assemblies)

set()

In [8]:
#Establish train and val dataframes
train = trainval[trainval['tube_assembly_id'].isin(train_tube_assemblies)]
val = trainval[trainval['tube_assembly_id'].isin(val_tube_assemblies)]
#Ensure shapes equal to trainval shape
train.shape, val.shape, trainval.shape

((22628, 8), (7585, 8), (30213, 8))

In [9]:
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

def clean_it_up(X):
    X = X.copy()
    
    #Convert quote date to datetime create month and year feature
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format = True)
    X['quote_year'] = X['quote_date'].dt.year
    X['quote_month'] = X['quote_date'].dt.month
    X = X.drop(columns = 'quote_date')
    
    #Merge in tube
    tube = data['tube']
    X = X.merge(tube, how = 'left')
    
    #Engineer features from bill_of_materials
    materials = data['bill_of_materials']
    
    materials['components_total'] = (materials['quantity_1'].fillna(0) + 
                                     materials['quantity_2'].fillna(0) + 
                                     materials['quantity_3'].fillna(0) + 
                                     materials['quantity_4'].fillna(0) + 
                                     materials['quantity_5'].fillna(0) + 
                                     materials['quantity_6'].fillna(0) + 
                                     materials['quantity_7'].fillna(0) + 
                                     materials['quantity_8'].fillna(0))
    
    materials['components_distinct'] = (materials['component_id_1'].notnull().astype(int) + 
                                        materials['component_id_2'].notnull().astype(int) + 
                                        materials['component_id_3'].notnull().astype(int) + 
                                        materials['component_id_4'].notnull().astype(int) + 
                                        materials['component_id_5'].notnull().astype(int) + 
                                        materials['component_id_6'].notnull().astype(int) + 
                                        materials['component_id_7'].notnull().astype(int) + 
                                        materials['component_id_8'].notnull().astype(int))
    
    #Merge in just the new features
    features = ['tube_assembly_id','component_id_1', 'components_total', 'components_distinct']
    X = X.merge(materials[features], how = 'left')
    
    #Get component_type_id, lower cardinality than component_type
    components = data['components']
    components = components.rename(columns={'component_id':'component_id_1'})
    features = ['component_id_1', 'component_type_id']
    X = X.merge(components[features], how = 'left')
    
    #Get total of specs for tubes
    specs = data['specs']
    specs['specs_total'] = specs.drop(columns = ['tube_assembly_id']).count(axis = 1)
    features = ['tube_assembly_id', 'specs_total', 'spec1']
    X = X.merge(specs[features], how = 'left')
    
    #Drop tube_assembly_id because goal is to predict unknown assemblies
    X = X.drop(columns = 'tube_assembly_id')
    
    return X

train = clean_it_up(train)
val = clean_it_up(val)
test = clean_it_up(test)

In [10]:
#Arrange into X features and y target
target = 'cost'
X_train = train.drop(columns = target)
X_val = val.drop(columns =target)
X_test = test.drop(columns = 'id')

y_train = train[target]
y_val = val[target]
#Log transformed for possible later use
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [21]:
#Make pipeline
from xgboost import XGBRegressor
pipeline = make_pipeline(ce.OrdinalEncoder(),
                         XGBRegressor(n_estimators = 1000, 
                                               n_jobs = -1))
#Fit pipeline
pipeline.fit(X_train, y_train_log)

from sklearn.metrics import mean_squared_log_error

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

#Predict
y_pred = pipeline.predict(X_val)
print(f'Validation Error : {rmse(y_val_log, y_pred)}')

def generate_submission(estimator, X_test, filename):
    y_pred_log = estimator.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    submission = data['sample_submission']
    submission['cost'] = y_pred
    submission.to_csv(filename, index=False)

  if getattr(data, 'base', None) is not None and \


Validation Error : 0.2719603739172016


In [22]:
generate_submission(pipeline, X_test, 'submission1')