In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.model_selection import train_test_split
from zipfile import ZipFile
from glob import glob
from xgboost import XGBRegressor
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot
import eli5
from eli5.sklearn import PermutationImportance

In [2]:
#read zip files, print directories, extract all files
with ZipFile('data.zip', 'r') as zip:
    zip.printdir()
    zip.extractall() 

File Name                                             Modified             Size
competition_data/                              2015-06-29 21:06:12            0
competition_data/bill_of_materials.csv         2015-06-24 16:21:48      1326876
competition_data/comp_adaptor.csv              2015-06-24 16:21:48         2544
competition_data/comp_boss.csv                 2015-06-24 16:21:48        10695
competition_data/comp_elbow.csv                2015-06-24 16:21:48        12830
competition_data/comp_float.csv                2015-06-24 16:21:48          703
competition_data/comp_hfl.csv                  2015-06-24 16:21:48          438
competition_data/comp_nut.csv                  2015-06-24 16:21:48         3324
competition_data/comp_other.csv                2015-06-29 21:05:42        20941
competition_data/comp_sleeve.csv               2015-06-24 16:21:48         2565
competition_data/comp_straight.csv             2015-06-24 16:21:48        19721
competition_data/comp_tee.csv           

In [3]:
#Loaded all data
bill_data= pd.read_csv('competition_data/bill_of_materials.csv')
adaptors= pd.read_csv('competition_data\comp_adaptor.csv')
boss= pd.read_csv('competition_data\comp_boss.csv')
elbow= pd.read_csv('competition_data\comp_elbow.csv')
comp_float= pd.read_csv('competition_data\comp_float.csv')
hfl= pd.read_csv('competition_data\comp_hfl.csv')
nut= pd.read_csv('competition_data\comp_nut.csv')
other= pd.read_csv('competition_data\comp_other.csv')
sleeve= pd.read_csv('competition_data\comp_sleeve.csv')
straight= pd.read_csv('competition_data\comp_straight.csv')
tee= pd.read_csv('competition_data\comp_tee.csv')
threaded= pd.read_csv('competition_data\comp_threaded.csv')
components= pd.read_csv('competition_data\components.csv')
specs= pd.read_csv('competition_data/specs.csv')
test_data= pd.read_csv('competition_data/test_set.csv')
train_data= pd.read_csv('competition_data/train_set.csv')
tube= pd.read_csv('competition_data/tube.csv')
type_component= pd.read_csv('competition_data/type_component.csv')
type_connection= pd.read_csv('competition_data/type_connection.csv')
type_end_form= pd.read_csv('competition_data/type_end_form.csv')

In [4]:
#setting display option so i can see all columns
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [5]:
#sample submission
sample_sub= open('sample_submission.csv', 'r')
sample_sub= pd.read_csv(sample_sub)

In [6]:
#Make copies of train and test data
test_set= test_data.copy()
train_set= train_data.copy()

In [7]:
train_set.shape,test_set.shape, tube.shape

((30213, 8), (30235, 8), (21198, 16))

In [8]:
#Merge train and test with tube
train_set= pd.merge(train_set, tube)
test_set= pd.merge(test_set, tube)

In [9]:
train_set.shape,test_set.shape, bill_data.shape

((30213, 23), (30235, 23), (21198, 17))

In [10]:
#Merge train and test data with bill
train_set= pd.merge(train_set, bill_data)
test_set= pd.merge(test_set, bill_data)

In [11]:
train_set.shape, test_set.shape, specs.shape

((30213, 39), (30235, 39), (21198, 11))

In [12]:
train_set= pd.merge(train_set, specs)
test_set= pd.merge(test_set, specs)

In [13]:
train_set.shape, test_set.shape

((30213, 49), (30235, 49))

# Start

In [14]:
#merging all components data into a final components dataframe
adapt_merge= pd.merge(components, adaptors, how= 'outer', on='component_id')
boss_merge= pd.merge(adapt_merge, boss, how= 'outer', on= 'component_id')
elbow_merge= pd.merge(boss_merge, elbow, how= 'outer', on= 'component_id')
float_merge= pd.merge(elbow_merge, comp_float, how= 'outer', on= 'component_id')
hfl_merge= pd.merge(float_merge, hfl, how= 'outer', on= 'component_id')
nut_merge= pd.merge(hfl_merge, nut, how= 'outer', on= 'component_id')
other_merge= pd.merge(nut_merge, other, how= 'outer', on= 'component_id')
sleeve_merge= pd.merge(other_merge, sleeve, how= 'outer', on= 'component_id')
straight_merge= pd.merge(sleeve_merge, straight, how= 'outer', on= 'component_id')
tee_merge= pd.merge(straight_merge, tee, how= 'outer', on= 'component_id')
final_merge= pd.merge(tee_merge, threaded, how= 'outer', on= 'component_id')

In [15]:
#getting rid of duplicate columns
components_data = final_merge.loc[:,~final_merge.columns.duplicated()]

In [16]:
components_data.shape

(2048, 103)

In [17]:
train_component_1= train_set['component_id_1'].unique()
train_component_2= train_set['component_id_2'].unique()
train_component_3= train_set['component_id_3'].unique()
train_component_4= train_set['component_id_4'].unique()
train_component_5= train_set['component_id_5'].unique()
train_component_6= train_set['component_id_6'].unique()
train_component_7= train_set['component_id_7'].unique()
train_component_8= train_set['component_id_8'].unique()

In [18]:
test_component_1= test_set['component_id_1'].unique()
test_component_2= test_set['component_id_2'].unique()
test_component_3= test_set['component_id_3'].unique()
test_component_4= test_set['component_id_4'].unique()
test_component_5= test_set['component_id_5'].unique()
test_component_6= test_set['component_id_6'].unique()
test_component_7= test_set['component_id_7'].unique()
test_component_8= test_set['component_id_8'].unique()

In [19]:
train_comp_one_data= components_data[components_data['component_id'].isin(train_component_1)]
train_comp_one_data= train_comp_one_data.rename(columns= {'component_id': 'component_id_1'})
train_comp_two_data= components_data[components_data['component_id'].isin(train_component_2)]
train_comp_two_data= train_comp_two_data.rename(columns={'component_id': 'component_id_2'})
train_comp_three_data= components_data[components_data['component_id'].isin(train_component_3)]
train_comp_three_data= train_comp_three_data.rename(columns={'component_id': 'component_id_3'})
train_comp_four_data= components_data[components_data['component_id'].isin(train_component_4)]
train_comp_four_data= train_comp_four_data.rename(columns={'component_id': 'component_id_4'})
train_comp_five_data= components_data[components_data['component_id'].isin(train_component_5)]
train_comp_five_data= train_comp_five_data.rename(columns={'component_id': 'component_id_5'})
train_comp_six_data= components_data[components_data['component_id'].isin(train_component_6)]
train_comp_six_data= train_comp_six_data.rename(columns={'component_id': 'component_id_6'})
train_comp_seven_data= components_data[components_data['component_id'].isin(train_component_7)]
train_comp_seven_data= train_comp_seven_data.rename(columns={'component_id': 'component_id_7'})
train_comp_eight_data= components_data[components_data['component_id'].isin(train_component_8)]
train_comp_eight_data= train_comp_eight_data.rename(columns={'component_id': 'component_id_8'})

In [20]:
test_comp_one_data= components_data[components_data['component_id'].isin(test_component_1)]
test_comp_one_data= test_comp_one_data.rename(columns= {'component_id': 'component_id_1'})
test_comp_two_data= components_data[components_data['component_id'].isin(test_component_2)]
test_comp_two_data= test_comp_two_data.rename(columns={'component_id': 'component_id_2'})
test_comp_three_data= components_data[components_data['component_id'].isin(test_component_3)]
test_comp_three_data= test_comp_three_data.rename(columns={'component_id': 'component_id_3'})
test_comp_four_data= components_data[components_data['component_id'].isin(test_component_4)]
test_comp_four_data= test_comp_four_data.rename(columns={'component_id': 'component_id_4'})
test_comp_five_data= components_data[components_data['component_id'].isin(test_component_5)]
test_comp_five_data= test_comp_five_data.rename(columns={'component_id': 'component_id_5'})
test_comp_six_data= components_data[components_data['component_id'].isin(test_component_6)]
test_comp_six_data= test_comp_six_data.rename(columns={'component_id': 'component_id_6'})
test_comp_seven_data= components_data[components_data['component_id'].isin(test_component_7)]
test_comp_seven_data= test_comp_seven_data.rename(columns={'component_id': 'component_id_7'})
test_comp_eight_data= components_data[components_data['component_id'].isin(test_component_8)]
test_comp_eight_data= test_comp_eight_data.rename(columns={'component_id': 'component_id_8'})

In [21]:
train_set= pd.merge(train_set, train_comp_one_data, how='outer', on='component_id_1')
test_set= pd.merge(test_set, test_comp_one_data, how='outer', on='component_id_1')

In [22]:
train_set.shape, test_set.shape

((30213, 151), (30235, 151))

In [23]:
train_set= pd.merge(train_set, train_comp_two_data, how='outer', on='component_id_2')
test_set= pd.merge(test_set, test_comp_two_data, how='outer', on='component_id_2')

In [24]:
train_set.shape, test_set.shape

((30213, 253), (30235, 253))

In [25]:
train_set= pd.merge(train_set, train_comp_three_data, how='outer', on='component_id_3')
test_set= pd.merge(test_set, test_comp_three_data, how='outer', on='component_id_3')

In [26]:
train_set.shape, test_set.shape

((30213, 355), (30235, 355))

In [27]:
train_set= pd.merge(train_set, train_comp_four_data, how='outer', on='component_id_4')
test_set= pd.merge(test_set, test_comp_four_data, how='outer', on='component_id_4')

In [28]:
train_set.shape, test_set.shape

((30213, 457), (30235, 457))

In [29]:
train_set= pd.merge(train_set, train_comp_five_data, how='outer', on='component_id_5')
test_set= pd.merge(test_set, test_comp_five_data, how='outer', on='component_id_5')

In [30]:
train_set.shape, test_set.shape

((30213, 559), (30235, 559))

In [31]:
train_set= pd.merge(train_set, train_comp_six_data, how='outer', on='component_id_6')
test_set= pd.merge(test_set, test_comp_six_data, how='outer', on='component_id_6')

In [32]:
train_set.shape, test_set.shape

((30213, 661), (30235, 661))

In [33]:
train_set= pd.merge(train_set, train_comp_seven_data, how='outer', on='component_id_7')
test_set= pd.merge(test_set, test_comp_seven_data, how='outer', on='component_id_7')

In [34]:
train_set.shape, test_set.shape

((30213, 763), (30235, 763))

In [35]:
train_set= pd.merge(train_set, train_comp_eight_data, how='outer', on='component_id_8')
test_set= pd.merge(test_set, test_comp_eight_data, how='outer', on='component_id_8')

In [36]:
train_set.shape, test_set.shape

((30213, 865), (30235, 865))

In [37]:
train_set= train_set.loc[:,~train_set.columns.duplicated()]
test_set= test_set.loc[:,~test_set.columns.duplicated()]

In [38]:
train_set.shape, test_set.shape

((30213, 253), (30235, 253))

In [None]:
columns= ['end_form_id_3_x','connection_type_id_3_x', 'mj_plug_class_code_y_y']
train_set= train_set.drop(columns=columns)
test_set= test_set.drop(columns=columns)

# Stop

In [39]:
train_set= train_set.replace([np.inf, -np.inf], np.nan)
test_set= test_set.replace([np.inf, -np.inf], np.nan)

In [40]:
train_set= train_set.fillna(0)
test_set= test_set.fillna(0)

In [41]:
submission_set= test_set.copy()

In [42]:
#grabbed the unique tube assembly ids for train and test data
train_assembly_unique= train_set['tube_assembly_id'].unique()

In [43]:
#split the values into train and test assemblies
train_assemblies, test_assemblies= train_test_split(train_assembly_unique, train_size= 0.80, test_size=0.2, random_state= 42)

In [44]:
#looked at shape of both train and test assemblies
train_assemblies.shape, test_assemblies.shape

((7084,), (1771,))

In [45]:
#split train assemblies into train and val assemblies
train_assemblies, val_assemblies= train_test_split(train_assemblies, train_size= 0.80, test_size= 0.2, random_state=42)

In [46]:
#created train and val datasets by grabbing only data associated with the unique assemblies for that dataset
train= train_set[train_set['tube_assembly_id'].isin(train_assemblies)]
val= train_set[train_set['tube_assembly_id'].isin(val_assemblies)]
test= train_set[train_set['tube_assembly_id'].isin(test_assemblies)]

In [47]:
#checked the shape of train and validate
train.shape, test.shape, val.shape

((19312, 253), (6067, 253), (4834, 253))

In [48]:
#created a wrangle function
def wrangle(df):
    df= df.copy()
    df['quote_date']= pd.to_datetime(df['quote_date'], infer_datetime_format= True)
    df['quote_date_year'] = df['quote_date'].dt.year
    df['quote_date_month'] = df['quote_date'].dt.month
    df= df.drop(columns=['quote_date'])
    return df

In [49]:
train= wrangle(train)
val= wrangle(val)
test= wrangle(test)
submission_set= wrangle(submission_set)

In [50]:
train.shape, val.shape, test.shape, submission_set.shape

((19312, 254), (4834, 254), (6067, 254), (30235, 254))

In [None]:
columns= ['spec8',
'component_id_5',
'spec7',
'spec10',
'component_id_6',
'quantity_5',
'component_id_7',
'quantity_8',
'quantity_4',
'quantity_7',
'component_id_8',
'quantity_6',
'spec9',
'spec4',
'end_a_1x',
'num_bracket', 'cost', 'tube_assembly_id']

In [51]:
#log target
train_target= np.log1p(train['cost'])
val_target= np.log1p(val['cost'])
test_target= np.log1p(test['cost'])

# columns= ['spec8',
# 'component_id_5',
# 'spec7',
# 'spec10',
# 'component_id_6',
# 'quantity_5',
# 'component_id_7',
# 'quantity_8',
# 'quantity_4',
# 'quantity_7',
# 'component_id_8',
# 'quantity_6',
# 'spec9',
# 'spec4',
# 'end_a_1x',
# 'num_bracket']
columns= ['cost', 'tube_assembly_id']

#drop target and tube id
train_features= train.drop(columns=columns)
val_features= val.drop(columns= columns)
test_features= test.drop(columns=columns)

#dropping columns in submission set
submission_features= submission_set.drop(columns= ['id', 'tube_assembly_id'])

In [52]:
train_features.shape, train_target.shape, val_features.shape, val_target.shape, test_features.shape, test_target.shape, submission_features.shape

((19312, 252),
 (19312,),
 (4834, 252),
 (4834,),
 (6067, 252),
 (6067,),
 (30235, 252))

In [53]:
#created a pipeline with an ordinal encoder and randomforestregressor
pipeline= make_pipeline(ce.OrdinalEncoder(),
                       RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1))

In [None]:
#fit training data to pipleline
pipeline.fit(train_features, train_target)

In [None]:
#predictions, root mean squared log error, and r2 score of validation set
y_pred= pipeline.predict(val_features)
print('RMSLE: ' + str(np.sqrt(mean_squared_error(val_target, y_pred))) + '\nR2_Score: '+ str(r2_score(val_target, y_pred)))

In [None]:
#prediction on test set with RMSLE and R2 score
y_pred= pipeline.predict(test_features)
print('RMSLE: ' + str(np.sqrt(mean_squared_error(test_target, y_pred))) + '\nR2_Score: '+ str(r2_score(test_target, y_pred)))

In [54]:
#ordinal encoder on train, test, and val sets
encoder= ce.OrdinalEncoder()
train_encoded= encoder.fit_transform(train_features)
val_encoded= encoder.transform(val_features)
test_encoded= encoder.transform(test_features)
# submission_encoded= encoder.transform(submission_features)

In [55]:
#Gradient boosting model
model= XGBRegressor(n_estimators= 2000,eval_metric='rmse', early_stopping_rounds=10, random_state=42, n_jobs=-1)

In [56]:
#fit train data to model
model.fit(train_encoded, train_target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=10, eval_metric='rmse',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=2000, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [57]:
#made predictions with model on val set
y_pred= model.predict(val_encoded)

In [67]:
#RMSLE and R2 score on val set
print('RMSLE: ' + str(np.sqrt(mean_squared_error(val_target, y_pred))) + '\nR2_Score: '+ str(r2_score(val_target, y_pred)))

RMSLE: 0.271852107564588
R2_Score: 0.8937679832374168


In [59]:
#predictions on test set
y_pred= model.predict(test_encoded)

In [66]:
#RSMLE and r2 score on test set
print('RMSLE: ' + str(np.sqrt(mean_squared_error(test_target, y_pred))) + '\nR2_Score: '+ str(r2_score(test_target, y_pred)))

ValueError: Found input variables with inconsistent numbers of samples: [6067, 4834]

In [61]:
permuter = PermutationImportance(model, scoring='neg_mean_squared_error', 
                                 cv='prefit', n_iter=2, random_state=42)
permuter.fit(val_encoded, val_target)

PermutationImportance(cv='prefit',
           estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=10, eval_metric='rmse',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=2000, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
           n_iter=2, random_state=42, refit=True,
           scoring='neg_mean_squared_error')

In [62]:
feature_names = val_encoded.columns.tolist()
eli5.show_weights(permuter, top=None, feature_names=feature_names)

Weight,Feature
0.7318  ± 0.0276,quantity
0.2657  ± 0.0021,min_order_quantity
0.0939  ± 0.0028,diameter_x
0.0627  ± 0.0018,supplier
0.0558  ± 0.0023,annual_usage
0.0327  ± 0.0012,quote_date_year
0.0207  ± 0.0005,length
0.0134  ± 0.0009,component_id_3
0.0122  ± 0.0011,end_a
0.0105  ± 0.0025,wall


In [63]:
print('Shape before removing features:', train_encoded.shape)
mask = permuter.feature_importances_ > 0
features = train_encoded.columns[mask]
train_encoded = train_encoded[features]
print('Shape after removing features:', train_encoded.shape)
val_encoded = val_encoded[features]
test_encoded= test_encoded[features]

Shape before removing features: (19312, 252)
Shape after removing features: (19312, 97)


In [64]:
model.fit(train_encoded, train_target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=10, eval_metric='rmse',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=2000, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [65]:
y_pred= model.predict(val_encoded)

In [None]:
feature = 'spec3'

isolated = pdp_isolate(
    model=model, 
    dataset=val_encoded, 
    model_features=val_encoded.columns, 
    feature=feature
)

pdp_plot(isolated, feature_name=feature);

In [None]:
sub_pred= model.predict(submission_encoded)

In [None]:
sub_pred= np.expm1(sub_pred)

In [None]:
sample_sub['cost']= sub_pred

In [None]:
sample_sub.to_csv('sample_submission.csv', index=False)