In [1]:
import category_encoders as ce
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# evaluation functions
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [3]:
# wrangle function

def wrangle(X):
    X = X.copy()
    
    # Engineer date features
    X['quote_date'] = pd.to_datetime(X['quote_date'], infer_datetime_format=True)
    X['quote_date_year'] = X['quote_date'].dt.year
    X['quote_date_month'] = X['quote_date'].dt.month
    X = X.drop(columns='quote_date')
    
    # Merge tube data
    tube = pd.read_csv('competition_data/tube.csv')
    X = X.merge(tube, how='left')
    
    # Engineer features from bill_of_materials
    materials = pd.read_csv('competition_data/bill_of_materials.csv')
    
    materials['components_total'] = (materials['quantity_1'].fillna(0) + 
                                     materials['quantity_2'].fillna(0) + 
                                     materials['quantity_3'].fillna(0) + 
                                     materials['quantity_4'].fillna(0) + 
                                     materials['quantity_5'].fillna(0) + 
                                     materials['quantity_6'].fillna(0) + 
                                     materials['quantity_7'].fillna(0) + 
                                     materials['quantity_8'].fillna(0))

    materials['components_distinct'] = (materials['component_id_1'].notnull().astype(int) + 
                                        materials['component_id_2'].notnull().astype(int) + 
                                        materials['component_id_3'].notnull().astype(int) + 
                                        materials['component_id_4'].notnull().astype(int) + 
                                        materials['component_id_5'].notnull().astype(int) + 
                                        materials['component_id_6'].notnull().astype(int) + 
                                        materials['component_id_7'].notnull().astype(int) + 
                                        materials['component_id_8'].notnull().astype(int))
    
    # Engineer features from components and bill_of_materials
    components = pd.read_csv('competition_data/components.csv')
    
    # create dictionary to map component_id to component_id_type
    component_dict = components[['component_id', 'component_type_id']].set_index('component_id').to_dict(orient='dict')
    component_dict = component_dict['component_type_id']
    
    # use the dictionary to replace component_id in bill_of_materials with component_id_type
    materials_type = materials.replace(component_dict)
    
    # create a function to return the tally of components with a specified component type in a row
    def count(row):
        tally=0
        for num in range(1,6):
            name = 'component_id_' + str(num)
            quantity = 'quantity_' + str(num)
            if row[name]==item:
                tally += row[quantity]
            return tally
    
    # get a list of unique component types
    component_type_list = components.component_type_id.unique().tolist()
    
    # iterate over the list of component types and apply the function to 
    # create a feature with the row tallies
    for item in component_type_list:
        materials_type[item] = materials_type.apply(count, axis=1)
    
    # Merge selected features from bill_of_materials
    # Just use the first component_id, ignore the others for now!
    features = ['tube_assembly_id', 'component_id_1', 'components_total', 'components_distinct',
                'OTHER', 'CP-024', 'CP-026', 'CP-028', 'CP-014', 
                'CP-018', 'CP-001', 'CP-008', 'CP-009', 'CP-002', 
                'CP-010', 'CP-021', 'CP-011', 'CP-015', 'CP-027',
                'CP-003', 'CP-004', 'CP-005', 'CP-019', 'CP-025', 
                'CP-006', 'CP-016', 'CP-020', 'CP-012', 'CP-022', 
                'CP-007', 'CP-017', 'CP-023', 'CP-029']
    X = X.merge(materials_type[features], how='left')
    
    # Get component_type_id (has lower cardinality than component_id)
    components = pd.read_csv('competition_data/components.csv')
    components = components.rename(columns={'component_id': 'component_id_1'})
    features = ['component_id_1', 'component_type_id']
    X = X.merge(components[features], how='left')
    
    # Count the number of specs for the tube assembly
    specs = pd.read_csv('competition_data/specs.csv')
    specs['specs_total'] = specs.drop(columns=['tube_assembly_id']).count(axis=1)
    features = ['tube_assembly_id', 'specs_total', 'spec1']
    X = X.merge(specs[features], how='left')
    
    return X

In [4]:
# read data
train = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')

train.shape, test.shape

((30213, 8), (30235, 8))

In [5]:
# Wrangle train, validation, and test sets
train = wrangle(train)
test = wrangle(test)

In [6]:
# Arrange X matrix and y vector (log-transformed)
target = 'cost'
X_train = train.drop(columns=target)
X_test = test.drop(columns='id')
y_train = train[target]
y_train_log = np.log1p(y_train)

In [7]:
# prediction function
def generate_submission(estimator, X_test_param, filename):
    y_pred_log = estimator.predict(X_test_param)
    y_pred = np.expm1(y_pred_log)  # Convert from log-dollars to dollars
    submission = pd.read_csv('sample_submission.csv')
    submission['cost'] = y_pred
    submission.to_csv(filename, index=False)

## Now lets do a gradient boosting model

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
encoder = ce.OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train, cols=['material_id', 'supplier'])
X_test_encoded = encoder.fit_transform(X_test, cols=['material_id', 'supplier'])

In [10]:
encoder = ce.OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

In [11]:
# eval_set = [(X_train_encoded, y_train_log),
#             (X_val_encoded, y_val_log)]

# model = XGBRegressor(n_estimators=2000, n_jobs=-1, eta=0.085, max_depth=6)
# model.fit(X_train_encoded, y_train_log, eval_set=eval_set, eval_metric='rmse',
#          early_stopping_rounds=100)

In [16]:
model = XGBRegressor()

In [17]:
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

In [21]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

groups = train['tube_assembly_id']
param_distributions = {
    'n_estimators': randint(800, 1000), 
    'max_depth': [6,7,8],
    'learning_rate': [.08, .085, .09, .095, .1, .105, .11, .115, .12] 
}

search = RandomizedSearchCV(
    model, 
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=5, 
    scoring='neg_mean_squared_error', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train, y_train_log, groups=groups);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields tube_assembly_id, supplier, bracket_pricing, material_id, end_a_1x, end_a_2x, end_x_1x, end_x_2x, end_a, end_x, component_id_1, component_type_id, spec1

In [None]:
# generate submission
# generate_submission(model, X_test_encoded, 'submission-30.csv')