In [2]:
import pandas as pd
from pathlib import Path
from zipfile import ZipFile

# Location of the downloaded Kaggle archive. Built from the current user's
# home directory instead of a hard-coded /Users/<name>/ prefix so the notebook
# is not tied to one machine account.
file_name = str(Path.home() / 'desktop' / 'caterpillar-tube-pricing.zip')

In [3]:
# Extract the outer competition archive into the current working directory.
# The handle is called `archive` (not `zip`) so the builtin zip() is not
# shadowed for every later cell in the kernel.
with ZipFile(file_name, 'r') as archive:
    archive.printdir()
    print('extracting all files now...')
    archive.extractall()
    print('Done')

File Name                                             Modified             Size
sample_submission.csv                          2018-05-12 22:28:12       230782
data.zip                                       2018-05-12 22:28:12       812858
extracting all files now...
Done


In [4]:
# Unpack the nested data.zip with ZipFile (same approach as the cell above)
# instead of shelling out to `unzip`: the shell tool asks for confirmation
# when the files already exist, which stalls a re-run of the notebook, and it
# is not available on every platform. extractall() overwrites silently.
with ZipFile('data.zip', 'r') as data_archive:
    data_archive.printdir()
    data_archive.extractall()

Archive:  data.zip
   creating: competition_data/
  inflating: competition_data/bill_of_materials.csv  
  inflating: competition_data/comp_adaptor.csv  
  inflating: competition_data/comp_boss.csv  
  inflating: competition_data/comp_elbow.csv  
  inflating: competition_data/comp_float.csv  
  inflating: competition_data/comp_hfl.csv  
  inflating: competition_data/comp_nut.csv  
  inflating: competition_data/comp_other.csv  
  inflating: competition_data/comp_sleeve.csv  
  inflating: competition_data/comp_straight.csv  
  inflating: competition_data/comp_tee.csv  
  inflating: competition_data/comp_threaded.csv  
  inflating: competition_data/components.csv  
  inflating: competition_data/specs.csv  
  inflating: competition_data/test_set.csv  
  inflating: competition_data/train_set.csv  
  inflating: competition_data/tube.csv  
  inflating: competition_data/tube_end_form.csv  
  inflating: competition_data/type_component.csv  
  inflating: competition_data/type_connection.csv  
  i

In [65]:
# Doing the work from class before looking at adding new features

import category_encoders as ce
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

def rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error between y_true and y_pred."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

def clean(df, data_dir='competition_data'):
    """Turn a raw quote frame into the modelling frame.

    Works on a copy of ``df``; the caller's frame is left untouched.

    Steps:
      * replace ``quote_date`` with ``quote_year_date`` / ``quote_date_month``
      * left-join ``tube.csv`` on ``tube_assembly_id``
      * left-join bill-of-materials aggregates (``component_id_1``,
        ``components_total``, ``components_distinct``) on ``tube_assembly_id``
      * left-join ``component_type_id`` of the first component
      * left-join ``specs_total`` (number of non-null spec columns)
      * drop ``tube_assembly_id``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tube_assembly_id`` and ``quote_date`` columns.
    data_dir : str, default 'competition_data'
        Directory holding ``tube.csv``, ``bill_of_materials.csv``,
        ``components.csv`` and ``specs.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (given unique keys in the lookup files), without
        ``quote_date`` and ``tube_assembly_id``.
    """
    df = df.copy()

    # Date -> numeric year / month features. `infer_datetime_format` is
    # deprecated in recent pandas and is not needed for parsing.
    quote_date = pd.to_datetime(df['quote_date'])
    df['quote_year_date'] = quote_date.dt.year
    df['quote_date_month'] = quote_date.dt.month
    df = df.drop(columns='quote_date')

    # Every merge names its key explicitly so the join cannot silently change
    # if two tables ever gain another column with the same name.
    tube = pd.read_csv(f'{data_dir}/tube.csv')
    df = df.merge(tube, how='left', on='tube_assembly_id')

    materials = pd.read_csv(f'{data_dir}/bill_of_materials.csv')
    slots = range(1, 9)  # the bill of materials has component slots 1..8
    quantity_cols = [f'quantity_{i}' for i in slots]
    component_cols = [f'component_id_{i}' for i in slots]

    # Row-wise sum skips NaN, i.e. an empty slot contributes 0.
    materials['components_total'] = materials[quantity_cols].sum(axis=1)
    # Number of slots that actually hold a component id.
    materials['components_distinct'] = materials[component_cols].notnull().sum(axis=1)

    features = ['tube_assembly_id', 'component_id_1', 'components_total', 'components_distinct']
    df = df.merge(materials[features], how='left', on='tube_assembly_id')

    # Look up the type of the first component only.
    components = pd.read_csv(f'{data_dir}/components.csv')
    components = components.rename(columns={'component_id': 'component_id_1'})
    features = ['component_id_1', 'component_type_id']
    df = df.merge(components[features], how='left', on='component_id_1')

    specs = pd.read_csv(f'{data_dir}/specs.csv')
    specs['specs_total'] = specs.drop(columns=['tube_assembly_id']).count(axis=1)
    # Experiment: spec1 is deliberately left out of the merged features.
    features = ['tube_assembly_id', 'specs_total']
    df = df.merge(specs[features], how='left', on='tube_assembly_id')

    # The assembly id was only needed as a join key.
    df = df.drop(columns='tube_assembly_id')

    return df

In [66]:
# Raw competition tables: `trainval` is split into train/validation in the
# next cell; `test` is only used for the submission.
trainval = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')

In [67]:
# Split on the unique tube_assembly_id values rather than on rows, so all
# rows of a given assembly end up on the same side of the split.
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()
# random_state is fixed so the id split is the same on every run.
train_tube_assemblies, val_tube_assemblies = train_test_split(trainval_tube_assemblies, random_state=42)
train = trainval[trainval['tube_assembly_id'].isin(train_tube_assemblies)]
val = trainval[trainval['tube_assembly_id'].isin(val_tube_assemblies)]

In [68]:
# Feature engineering for every split.
# NOTE: not idempotent -- clean() reads and then drops `quote_date` and
# `tube_assembly_id`, so re-running this cell on its own output fails.
# Re-run the load and split cells first.
train = clean(train)
val = clean(val)
test = clean(test)

In [73]:
# Feature matrices and targets.
target = 'cost'
X_train = train.drop(columns=target)
X_val = val.drop(columns=target)
# The test features drop the `id` column instead of the target.
X_test = test.drop(columns='id')
y_train = train[target]
y_val = val[target]
# Models are fit on log1p(cost); generate_submission() inverts this with expm1.
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [19]:
def generate_submission(estimator, X_test, filename):
    """Write a submission file from a model trained on log1p(cost).

    The estimator's predictions are mapped back to the cost scale with expm1,
    stored in the `cost` column of 'sample_submission.csv' (read from the
    working directory), and saved to `filename` without the index.
    """
    predicted_cost = np.expm1(estimator.predict(X_test))
    template = pd.read_csv('sample_submission.csv')
    template.assign(cost=predicted_cost).to_csv(filename, index=False)

In [21]:
# %%time  (cell magic currently disabled; the saved output below still shows timings)
# XGBoost baseline: ordinal-encode the categorical columns, fit on the
# log-transformed target, and score on the validation split.
from xgboost import XGBRegressor
pipeline = make_pipeline(ce.OrdinalEncoder(),
                        XGBRegressor(n_estimators=1000, n_jobs=-1))
pipeline.fit(X_train, y_train_log)
y_pred_log = pipeline.predict(X_val)
# Both arguments are on the log1p scale.
print('RMSE error for Xgbr', rmse(y_val_log, y_pred_log))

RMSE error for Xgbr 0.2700581596577784
CPU times: user 23.8 s, sys: 195 ms, total: 24 s
Wall time: 24.3 s


In [22]:
# Random-forest baseline on the same features and log-transformed target.
# random_state is pinned (same seed as the train/validation split) so the
# validation score is reproducible from run to run.
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42),
)
pipeline.fit(X_train, y_train_log)
y_pred_log = pipeline.predict(X_val)
print('validation error for RFR', rmse(y_val_log, y_pred_log))

validation error for RFR 0.29451953970560635


In [25]:
# Quick look at the cleaned training frame.
# NOTE(review): the saved output below still lists a `spec1` column, but
# clean() currently leaves spec1 out of the merge -- the output predates that
# change and will differ after a fresh run.
print(train.shape)
train.head()

(22628, 29)


Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,cost,quote_year_date,quote_date_month,material_id,diameter,...,end_x,num_boss,num_bracket,other,component_id_1,components_total,components_distinct,component_type_id,specs_total,spec1
0,S-0066,0,0,Yes,1,21.972702,2013,7,SP-0019,6.35,...,EF-008,0,0,0,C-1312,2.0,1,CP-028,0,
1,S-0066,0,0,Yes,2,12.407983,2013,7,SP-0019,6.35,...,EF-008,0,0,0,C-1312,2.0,1,CP-028,0,
2,S-0066,0,0,Yes,5,6.668596,2013,7,SP-0019,6.35,...,EF-008,0,0,0,C-1312,2.0,1,CP-028,0,
3,S-0066,0,0,Yes,10,4.754539,2013,7,SP-0019,6.35,...,EF-008,0,0,0,C-1312,2.0,1,CP-028,0,
4,S-0066,0,0,Yes,25,3.608331,2013,7,SP-0019,6.35,...,EF-008,0,0,0,C-1312,2.0,1,CP-028,0,


In [51]:
# Trying XGBRegressorEval from Richmond
# # Hasn't worked yet
# from xgboost import XGBRegressor

# def XGRegressorEval(XGBRegressor):
#     def fit(self, *args, **kwargs):
#         return super().fit(*args, eval_set=eval_set, eval_metric='rmse', early_stopping_rounds=10,
#                            **kwargs)
   

In [53]:
# pipeline = make_pipeline(ce.OrdinalEncoder(), XGBRegressorEval(n_estimators=1000, n_jobs=-1))
# pipeline.fit(X_train, y_train_log)
# y_pred_log = pipeline.predict(X_val)
# print('validation error for RFR', rmse(y_val_log, y_pred_log))

In [71]:
def clean1(df):
    """Return a copy of df with missing material_id values filled.

    Nulls become 'SP-0029' (the most frequent material_id in the training
    summary) and the column is cast to str. The input frame is not modified.
    """
    # Experiment kept for reference: dropping spec1 because it is mostly null.
    # df = df.drop(columns='spec1')
    filled_material = df['material_id'].fillna('SP-0029').astype(str)
    return df.assign(material_id=filled_material)

In [72]:
# Apply the extra cleaning step to every split.
train = clean1(train)
val = clean1(val)
test = clean1(test)

# The feature matrices were sliced from these frames before clean1 ran, so
# rebuild them here; otherwise the models fitted below never see the filled
# material_id values when the notebook is run top to bottom.
X_train = train.drop(columns=target)
X_val = val.drop(columns=target)
X_test = test.drop(columns='id')

In [74]:
# Re-running XGBoost after the extra cleaning function (clean1).
# Author's note: lower leaving out spec1
# Fits on whatever X_train / X_val currently hold in the kernel; same model
# settings as the first XGBoost cell so the two scores are comparable.

from xgboost import XGBRegressor
pipeline = make_pipeline(ce.OrdinalEncoder(),
                        XGBRegressor(n_estimators=1000, n_jobs=-1))
pipeline.fit(X_train, y_train_log)
y_pred_log = pipeline.predict(X_val)
print('RMSE error for Xgbr v2', rmse(y_val_log, y_pred_log))

RMSE error for Xgbr v2 0.2737656410080418


In [48]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

Unnamed: 0,annual_usage,min_order_quantity,quantity,cost,quote_year_date,quote_date_month,diameter,wall,length,num_bends,bend_radius,num_boss,num_bracket,other,components_total,components_distinct,specs_total
count,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0
mean,116.90883,2.130193,38.267677,13.4505,2012.061119,7.075438,17.29447,1.381985,97.291156,3.810191,43.706186,0.026118,0.003668,0.014142,2.926153,1.910332,0.776737
std,1678.532065,13.262875,71.313125,27.622252,2.662399,2.667107,18.351639,0.630776,61.454507,2.193815,230.985857,0.190317,0.073035,0.158365,1.172669,0.881423,1.762919
min,0.0,0.0,1.0,0.523027,1987.0,1.0,3.18,0.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,3.855934,2012.0,6.0,9.52,0.89,49.0,2.0,19.05,0.0,0.0,0.0,2.0,1.0,0.0
50%,0.0,0.0,10.0,6.507236,2013.0,8.0,12.7,1.24,86.0,3.0,31.75,0.0,0.0,0.0,3.0,2.0,0.0
75%,2.0,0.0,40.0,13.451951,2013.0,9.0,19.05,1.65,132.0,5.0,50.8,0.0,0.0,0.0,4.0,2.0,0.0
max,150000.0,535.0,2500.0,1000.0,2017.0,12.0,203.2,7.9,507.0,17.0,9999.0,5.0,3.0,8.0,13.0,8.0,9.0


In [49]:
# Summary (count / unique / top / freq) of the non-numeric columns.
train.describe(exclude='number')

Unnamed: 0,supplier,bracket_pricing,material_id,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,component_id_1,component_type_id,spec1
count,22628,22628,22461,22628,22628,22628,22628,22628,22628,21537,21537,4306
unique,52,2,16,2,2,2,2,24,24,555,23,35
top,S-0066,Yes,SP-0029,N,N,N,N,EF-003,EF-003,C-1621,CP-025,SP-0007
freq,15391,19699,12066,22024,20757,22194,20939,13182,12731,2823,10565,1901


In [50]:
# Null count per column; the material_id gaps listed here are the ones clean1 fills.
train.isnull().sum()

supplier                   0
annual_usage               0
min_order_quantity         0
bracket_pricing            0
quantity                   0
cost                       0
quote_year_date            0
quote_date_month           0
material_id              167
diameter                   0
wall                       0
length                     0
num_bends                  0
bend_radius                0
end_a_1x                   0
end_a_2x                   0
end_x_1x                   0
end_x_2x                   0
end_a                      0
end_x                      0
num_boss                   0
num_bracket                0
other                      0
component_id_1          1091
components_total           0
components_distinct        0
component_type_id       1091
specs_total                0
spec1                  18322
dtype: int64

In [54]:
# (rows, columns) of the training frame.
train.shape

(22628, 29)

In [None]:
# WIP: trying to use a random forest to impute the missing material_id values.
# Open question: should only the training set be passed through here?
# Unfinished -- got bogged down; the function below does not run yet.

def clean2(df):
    df = df.copy
    
    dfWithMaterial = df[pd.isnull(df['material_id']) == False]
    dfWithoutMaterial = df[pd.isnull(df['material_id'])]
    
    target = 'material_id'
    X_train = train.drop(columns=target)
    y_train = 
    X_pred = 
    y_pred = 
    
    pipeline = make_pipeline(ce.OrdinalEncoder(), 
                             RandomForestRegressor(n_estimators=1000, n_jobs=-1))
    pipeline.fit(X_train, y_train_log)
    y_pred_log = pipeline.predict(X_val)