# Assignment
- Continue to participate in the [Kaggle Caterpillar competition](https://www.kaggle.com/c/caterpillar-tube-pricing).
- Do more feature engineering. 
- Use xgboost for gradient boosting.
- Submit new predictions.
- Commit your notebook to your fork of the GitHub repo.

## Stretch Goals
- Improve your scores on Kaggle.
- Make visualizations and share on Slack.
- Look at [Kaggle Kernels](https://www.kaggle.com/c/caterpillar-tube-pricing/kernels) for ideas about feature engineering and visualization.
- Read more about gradient boosting:
  - [A Gentle Introduction to the Gradient Boosting Algorithm for Machine Learning](https://machinelearningmastery.com/gentle-introduction-gradient-boosting-algorithm-machine-learning/)
  - [A Kaggle Master Explains Gradient Boosting](http://blog.kaggle.com/2017/01/23/a-kaggle-master-explains-gradient-boosting/)
  - [_An Introduction to Statistical Learning_](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Seventh%20Printing.pdf) Chapter 8
  - [Gradient Boosting Explained](http://arogozhnikov.github.io/2016/06/24/gradient_boosting_explained.html)

In [95]:
import pandas as pd
import numpy as np
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.metrics import mean_squared_error
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
trainval = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')

In [7]:
# Create dict of all dataframes
dfs = {}
for path in glob('competition_data/*.csv'):
    folder, file = path.split('/')
    name, extension = file.split('.')
    if name != 'train_set' and name != 'test_set':
        dfs[name] = pd.read_csv(path)

In [10]:
for df in dfs:
    display(dfs[df])
# dfs['comp_threaded']

Unnamed: 0,component_id,component_type_id,adaptor_angle,overall_length,hex_size,end_form_id_1,connection_type_id_1,length_1,thread_size_1,thread_pitch_1,nominal_size_1,end_form_id_2,connection_type_id_2,length_2,thread_size_2,thread_pitch_2,nominal_size_2,end_form_id_3,connection_type_id_3,length_3,thread_size_3,thread_pitch_3,nominal_size_3,end_form_id_4,connection_type_id_4,length_4,thread_size_4,thread_pitch_4,nominal_size_4,unique_feature,orientation,weight
0,C-0007,CP-014,,24.4,12.7,A-001,B-001,,0.44,20.0,,A-004,,,,,6.35,,,,,,,,,,,,,No,No,0.03
1,C-0030,CP-015,90.0,,,A-003,,47.0,,,42,A-003,,47.0,,,42.0,,,,,,,,,,,,,No,Yes,0.91
2,C-0041,CP-014,,28.0,32.0,A-004,,,,,21.7,A-001,B-002,,0.81,16.0,,,,,,,,,,,,,,No,No,0.12
3,C-0043,CP-014,,38.0,27.0,A-003,,,,,21.7,A-001,B-002,,0.81,16.0,,,,,,,,,,,,,,No,No,0.09
4,C-0044,CP-014,,30.5,41.28,A-004,,,,,34,A-001,B-002,,1.44,12.0,,,,,,,,,,,,,,No,No,0.18
5,C-0069,CP-015,90.0,,,A-001,B-002,29.0,0.81,16.0,,A-004,,16.0,,,17.3,,,,,,,,,,,,,Yes,Yes,0.22
6,C-0070,CP-015,90.0,,,A-003,,30.0,,,21.7,A-001,B-002,27.0,0.81,16.0,,,,,,,,,,,,,,No,Yes,0.18
7,C-0072,CP-015,90.0,,,A-003,,30.0,,,21.7,A-001,B-002,33.5,1.0,14.0,,,,,,,,,,,,,,No,Yes,0.29
8,C-0073,CP-015,90.0,,,A-004,,21.0,,,22.22,A-001,B-002,35.0,1.0,14.0,,,,,,,,,,,,,,No,Yes,0.54
9,C-0074,CP-014,,40.5,27.0,A-003,,,,,21.7,A-001,B-002,,1.0,14.0,,,,,,,,,,,,,,No,No,0.13


Unnamed: 0,component_id,component_type_id,adaptor_angle,overall_length,end_form_id_1,connection_type_id_1,length_1,thread_size_1,thread_pitch_1,nominal_size_1,end_form_id_2,connection_type_id_2,length_2,thread_size_2,thread_pitch_2,nominal_size_2,hex_size,unique_feature,orientation,weight
0,C-0005,CP-028,,58.4,A-001,B-001,,1.31,12.0,,A-001,B-004,,1.0,11.5,,34.93,No,No,0.21
1,C-0006,CP-028,,34.8,A-001,B-001,,0.44,20.0,,A-001,B-005,,0.75,16.0,,22.2,No,No,0.08
2,C-1435,CP-028,,20.3,A-007,B-004,,,,15.88,A-001,B-007,,0.88,18.0,,22.22,No,No,0.02
3,C-1546,CP-028,,26.4,A-007,B-004,,0.12,27.0,,A-001,B-004,,0.12,27.0,,15.88,No,No,0.03
4,C-1583,CP-028,,44.5,A-001,B-005,,1.31,12.0,,A-007,B-005,,1.06,12.0,,38.1,No,No,0.26
5,C-1634,CP-028,,34.5,A-001,B-005,,0.75,16.0,,A-001,B-002,,0.69,16.0,,22.23,No,No,0.06
6,C-1975,CP-028,,13.2,A-007,B-007,,,,3.18,A-001,B-007,,0.31,28.0,,,No,No,0.01
7,C-0428,CP-028,,26.99,A-001,B-004,,0.25,18.0,,A-007,,,,,9.52,17.46,No,No,0.03
8,C-0443,CP-028,,22.35,A-007,B-007,,,,19.05,9999,9999,,1.06,16.0,,26.97,No,No,
9,C-0823,CP-028,,16.8,A-007,B-007,,,,9.52,A-001,9999,,0.62,18.0,9.52,15.75,No,No,0.01


Unnamed: 0,end_form_id,forming
0,EF-001,Yes
1,EF-002,No
2,EF-003,No
3,EF-004,No
4,EF-005,Yes
5,EF-006,Yes
6,EF-007,Yes
7,EF-008,Yes
8,EF-009,Yes
9,EF-010,Yes


Unnamed: 0,component_id,component_type_id,bolt_pattern_long,bolt_pattern_wide,head_diameter,overall_length,thickness,mj_class_code,groove,unique_feature,orientation,weight
0,C-0012,CP-001,66.68,31.75,,40.0,20.0,,No,No,Yes,0.79
1,C-0014,CP-001,47.6,22.2,,38.0,15.0,,Yes,No,Yes,0.34
2,C-0015,CP-001,66.7,31.8,,40.0,20.0,,Yes,No,Yes,0.79
3,C-0019,CP-002,77.8,42.9,,,36.5,MJ-003,No,No,Yes,1.53
4,C-0029,CP-001,47.63,22.23,,,16.0,,Yes,No,Yes,0.29
5,C-0036,CP-003,96.82,44.45,,,36.0,,Yes,No,Yes,1.85
6,C-0038,CP-004,52.38,,,,16.0,,Yes,No,Yes,0.17
7,C-0046,CP-005,76.2,,,,36.5,MJ-003,Yes,No,Yes,0.38
8,C-0062,CP-003,96.82,44.45,,,36.0,,No,No,Yes,1.85
9,C-0079,CP-002,130.18,77.77,,,28.0,MJ-001,Yes,No,Yes,3.09


Unnamed: 0,component_id,component_type_id,bolt_pattern_long,bolt_pattern_wide,extension_length,overall_length,thickness,drop_length,mj_class_code,mj_plug_class_code,groove,unique_feature,orientation,weight
0,C-0271,OTHER,58.7,30.2,57.1,93.0,57,28.5,MJ-003,Threaded,No,No,Yes,1.53
1,C-1809,OTHER,58.72,30.18,57.09,108.0,57,28.5,MJ-003,MJ-005,No,No,Yes,2.18
2,C-1830,OTHER,52.4,26.2,43.5,78.5,51,25.5,MJ-003,Threaded,No,Yes,Yes,1.14
3,C-1865,OTHER,58.7,30.2,57.1,107.0,57,28.5,MJ-003,MJ-005,No,No,Yes,1.95


Unnamed: 0,component_id,component_type_id,type,connection_type_id,outside_shape,base_type,height_over_tube,bolt_pattern_long,bolt_pattern_wide,groove,base_diameter,shoulder_diameter,unique_feature,orientation,weight
0,C-0008,CP-018,Boss,B-005,Round,Flat Bottom,17.0,,,No,22.0,,Yes,Yes,0.03
1,C-0009,CP-018,Boss,B-004,Round,Flat Bottom,13.0,,,No,25.0,,No,Yes,0.03
2,C-0020,CP-018,Boss,B-005,Round,Saddle,28.4,,,No,35.0,,Yes,Yes,0.07
3,C-0054,CP-018,Boss,B-005,Round,Saddle,27.1,,,No,,,Yes,Yes,0.18
4,C-0071,CP-018,Boss,B-005,Round,Shoulder,20.0,,,No,30.0,23.0,Yes,Yes,0.08
5,C-0082,CP-018,Boss,B-002,Round,Saddle,15.65,,,No,,,Yes,Yes,0.04
6,C-0083,CP-019,,B-012,,,36.5,77.8,42.9,No,,,No,Yes,2.09
7,C-0084,CP-019,,B-012,,,36.5,88.9,50.8,No,,,No,Yes,2.6
8,C-0111,CP-018,Boss,B-009,Round,Shoulder,19.0,,,No,35.0,30.0,Yes,Yes,0.1
9,C-0117,CP-018,Boss,B-005,Round,Saddle,22.0,,,No,25.4,,No,Yes,0.08


Unnamed: 0,component_id,name,component_type_id
0,9999,OTHER,OTHER
1,C-0001,SLEEVE,CP-024
2,C-0002,SLEEVE,CP-024
3,C-0003,SLEEVE-FLARED,CP-024
4,C-0004,NUT,CP-026
5,C-0005,ADAPTER-STR,CP-028
6,C-0006,ADAPTER-STR,CP-028
7,C-0007,CONNECTOR-FLARE,CP-014
8,C-0008,BOSS,CP-018
9,C-0009,BOSS,CP-018


Unnamed: 0,component_id,component_type_id,bolt_pattern_long,bolt_pattern_wide,thickness,orientation,weight
0,C-0027,CP-021,148.0,96.0,18.0,Yes,2.23
1,C-0454,CP-022,58.72,30.18,28.0,No,0.59
2,C-0455,CP-022,58.72,30.18,28.0,No,0.53
3,C-0494,CP-022,52.4,26.2,15.85,No,0.23
4,C-0496,CP-022,58.8,30.2,14.2,No,0.28
5,C-0508,CP-021,77.76,42.88,25.4,Yes,1.14
6,C-0572,CP-022,69.85,35.71,15.7,No,0.45
7,C-0797,CP-021,120.65,69.85,20.0,Yes,1.98
8,C-0891,CP-021,47.62,22.22,28.0,Yes,0.47
9,C-1096,CP-022,76.4,69.9,25.0,No,2.56


Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,TA-00001,C-1622,2.00,C-1629,2.00,,,,,,,,,,,,
1,TA-00002,C-1312,2.00,,,,,,,,,,,,,,
2,TA-00003,C-1312,2.00,,,,,,,,,,,,,,
3,TA-00004,C-1312,2.00,,,,,,,,,,,,,,
4,TA-00005,C-1624,1.00,C-1631,1.00,C-1641,1.00,,,,,,,,,,
5,TA-00006,C-1624,1.00,C-1631,1.00,C-1641,1.00,,,,,,,,,,
6,TA-00007,C-1622,2.00,C-1629,2.00,,,,,,,,,,,,
7,TA-00008,C-1312,2.00,,,,,,,,,,,,,,
8,TA-00009,C-1625,2.00,C-1632,2.00,,,,,,,,,,,,
9,TA-00010,C-1768,2.00,,,,,,,,,,,,,,


Unnamed: 0,component_id,component_type_id,bolt_pattern_long,bolt_pattern_wide,extension_length,overall_length,thickness,drop_length,elbow_angle,mj_class_code,mj_plug_class_code,plug_diameter,groove,unique_feature,orientation,weight
0,C-0013,CP-008,152.4,92.08,105.0,185.0,113.0,75.0,90.0,,,,Yes,No,Yes,8.89
1,C-0016,CP-009,57.2,27.8,42.0,69.0,44.0,24.0,90.0,,,,No,No,Yes,1.17
2,C-0017,CP-009,57.2,27.8,42.0,69.0,47.0,26.0,90.0,,,,Yes,No,Yes,1.25
3,C-0018,CP-009,66.6,31.8,50.0,80.0,57.0,31.5,90.0,,,,Yes,No,Yes,1.86
4,C-0021,CP-010,75.0,,31.5,70.0,25.0,12.5,90.0,,,,No,Yes,Yes,0.9
5,C-0022,CP-010,39.41,,24.0,39.0,7.0,9.0,90.0,,,,Yes,Yes,Yes,0.1
6,C-0023,CP-008,52.4,26.2,20.4,58.5,61.3,44.4,125.0,,,,Yes,No,Yes,1.3
7,C-0028,CP-011,58.7,30.2,53.1,100.0,53.0,25.5,,MJ-003,MJ-005,,No,No,Yes,1.63
8,C-0042,CP-008,38.1,17.48,16.5,33.0,34.0,17.5,90.0,,,,Yes,No,Yes,0.36
9,C-0061,CP-010,38.07,,37.0,66.3,20.3,9.5,90.0,,,,Yes,Yes,Yes,0.2


Unnamed: 0,connection_type_id,name
0,B-001,37 deg Flare-SAE J514
1,B-002,ORFS-SAE J1453
2,B-003,Hi-Duty
3,B-004,NPTF-SAE J476/J514
4,B-005,SAE STOR-SAE J1926
5,B-006,45 deg Flare-SAE J512
6,B-007,45 deg Inv Flare-SAE J512
7,B-008,A-C
8,B-009,Metric STOR-ISO 6149
9,B-010,Plain


Unnamed: 0,component_id,component_type_id,connection_type_id,length,intended_nut_thread,intended_nut_pitch,unique_feature,plating,orientation,weight
0,C-0001,CP-024,B-001,17.3,1.06,12,No,No,No,0.01
1,C-0002,CP-024,B-001,11.2,0.5,20,No,No,No,0.01
2,C-0003,CP-024,B-001,19.3,1.19,12,No,No,No,0.01
3,C-0048,CP-024,B-002,9.5,0.56,18,No,No,No,0.01
4,C-0049,CP-024,B-002,9.5,0.81,16,No,No,No,0.01
5,C-0050,CP-024,B-002,10.5,1.0,14,No,No,No,0.01
6,C-0051,CP-024,B-002,14.0,1.19,12,No,No,No,0.03
7,C-0052,CP-024,B-002,15.5,1.44,12,No,No,No,0.04
8,C-0053,CP-024,B-002,15.5,2.0,12,No,No,No,0.08
9,C-0058,CP-024,B-002,8.6,0.69,16,No,No,No,0.01


Unnamed: 0,tube_assembly_id,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other
0,TA-00001,SP-0035,12.70,1.65,164.00,5,38.10,N,N,N,N,EF-003,EF-003,0,0,0
1,TA-00002,SP-0019,6.35,0.71,137.00,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0
2,TA-00003,SP-0019,6.35,0.71,127.00,7,19.05,N,N,N,N,EF-008,EF-008,0,0,0
3,TA-00004,SP-0019,6.35,0.71,137.00,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0
4,TA-00005,SP-0029,19.05,1.24,109.00,4,50.80,N,N,N,N,EF-003,EF-003,0,0,0
5,TA-00006,SP-0029,19.05,1.24,79.00,4,50.80,N,N,N,N,EF-003,EF-003,0,0,0
6,TA-00007,SP-0035,12.70,1.65,202.00,5,38.10,N,N,N,N,EF-003,EF-003,0,0,0
7,TA-00008,SP-0039,6.35,0.71,174.00,6,19.05,N,N,N,N,EF-008,EF-008,0,0,0
8,TA-00009,SP-0029,25.40,1.65,135.00,4,63.50,N,N,N,N,EF-003,EF-003,0,0,0
9,TA-00010,SP-0046,42.70,4.80,290.00,4,110.00,N,N,N,N,EF-021,EF-021,0,0,0


Unnamed: 0,component_id,component_type_id,hose_diameter,corresponding_shell,coupling_class,material,plating,orientation,weight
0,C-0872,CP-023,4.8,C-0855,SP-0098,SP-0016,Yes,No,0.01
1,C-0873,CP-023,4.8,C-0856,SP-0098,SP-0016,Yes,No,0.01
2,C-0874,CP-023,4.8,C-0857,SP-0098,SP-0038,Yes,No,0.0
3,C-1039,CP-023,15.9,C-1040,SP-0097,SP-0095,No,No,0.05
4,C-1041,CP-023,15.9,C-1042,SP-0099,SP-0095,No,No,0.07
5,C-1043,CP-023,25.4,C-1044,SP-0099,SP-0095,No,No,0.2


Unnamed: 0,end_form_id,name
0,A-001,Male (Stud)
1,A-002,Male (Swivel)
2,A-003,Braze-Weld Boss
3,A-004,Braze-Weld Socket
4,A-005,Swivel Nut
5,A-006,Bulkhead Male
6,A-007,Port
7,9999,Other


Unnamed: 0,component_id,part_name,weight
0,C-1385,NUT-FLARED,0.01
1,C-1386,SLEEVE-FLARED,0.01
2,C-1369,COLLAR,0.00
3,C-0422,WASHER-FUEL LIN,0.00
4,C-1817,FITTING-NUT,0.01
5,C-1374,NUT,0.02
6,C-1375,NUT,0.04
7,C-1439,NUT-FUEL LINE,0.04
8,C-1355,ADAPTER,0.03
9,C-0958,ADAPTER,0.10


Unnamed: 0,component_type_id,name
0,CP-001,4-bolt Tig Straight
1,CP-002,4-bolt MJ Straight
2,CP-003,4-bolt Braze/Weld Straight
3,CP-004,2-bolt Braze/Weld Straight
4,CP-005,2-bolt MJ Straight
5,CP-006,Braze/Weld Flange Head
6,CP-007,MJ Flange Head
7,CP-008,4-bolt Braze/Weld Elbow
8,CP-009,4-bolt Tig Elbow
9,CP-010,2-bolt Braze/Weld Elbow


Unnamed: 0,tube_assembly_id,spec1,spec2,spec3,spec4,spec5,spec6,spec7,spec8,spec9,spec10
0,TA-00001,,,,,,,,,,
1,TA-00002,,,,,,,,,,
2,TA-00003,,,,,,,,,,
3,TA-00004,,,,,,,,,,
4,TA-00005,,,,,,,,,,
5,TA-00006,,,,,,,,,,
6,TA-00007,,,,,,,,,,
7,TA-00008,,,,,,,,,,
8,TA-00009,,,,,,,,,,
9,TA-00010,,,,,,,,,,


Unnamed: 0,component_id,component_type_id,hex_nut_size,seat_angle,length,thread_size,thread_pitch,diameter,blind_hole,orientation,weight
0,C-1621,CP-025,20.64,,17.0,.687,16.0,,,No,0.01
1,C-1624,CP-025,34.92,,26.5,1.187,12.0,,,No,0.04
2,C-1623,CP-025,28.58,,23.5,1.000,14.0,,,No,0.04
3,C-1622,CP-025,23.81,,20.0,.812,16.0,,,No,0.04
4,C-1625,CP-025,41.28,,27.5,1.437,12.0,,,No,0.13
5,C-1626,CP-025,47.63,,27.5,1.687,12.0,,,No,0.15
6,C-1620,CP-025,17.46,,15.0,.562,18.0,,,No,0.02
7,C-1867,CP-025,57.15,,27.5,2.000,12.0,,,No,0.22
8,C-2029,CP-025,23.81,,20.0,.812,16.0,,,No,0.03
9,C-0579,CP-025,24.0,,20.0,.812,16.0,,,No,0.16


In [9]:
dfs['test_set']

KeyError: 'test_set'

In [35]:
def return_shared_columns(df1, df2):
    shared_column = set(df1.columns) & set(df2.columns)
    if shared_column:
        return shared_column
    else:
        return None
shared_columns= {}
for df in dfs:
    shared_columns[df] = {}
    for df2 in dfs:
        if df != df2:
            shared_column = return_shared_columns(dfs[df], dfs[df2])
            if shared_column:
                shared_columns[df][df2] = shared_column


In [36]:
shared_columns['comp_threaded']

{'comp_adaptor': {'adaptor_angle',
  'component_id',
  'component_type_id',
  'connection_type_id_1',
  'connection_type_id_2',
  'end_form_id_1',
  'end_form_id_2',
  'hex_size',
  'length_1',
  'length_2',
  'nominal_size_1',
  'nominal_size_2',
  'orientation',
  'overall_length',
  'thread_pitch_1',
  'thread_pitch_2',
  'thread_size_1',
  'thread_size_2',
  'unique_feature',
  'weight'},
 'comp_straight': {'component_id',
  'component_type_id',
  'orientation',
  'overall_length',
  'unique_feature',
  'weight'},
 'comp_tee': {'component_id',
  'component_type_id',
  'orientation',
  'overall_length',
  'unique_feature',
  'weight'},
 'comp_boss': {'component_id',
  'component_type_id',
  'orientation',
  'unique_feature',
  'weight'},
 'components': {'component_id', 'component_type_id'},
 'comp_float': {'component_id', 'component_type_id', 'orientation', 'weight'},
 'comp_elbow': {'component_id',
  'component_type_id',
  'orientation',
  'overall_length',
  'unique_feature',
  'w

In [23]:
type(shared_columns)

dict

In [43]:
for d1 in shared_columns:
    print(d1 + '\n')
    for d2 in shared_columns[df]:
        print(d2 + '\n')
        print(shared_columns[df][d2])
    

comp_threaded

comp_threaded

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_adaptor

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_straight

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_tee

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_boss

{'weight', 'component_id', 'orientation', 'component_type_id'}
components

{'component_id', 'component_type_id'}
comp_float

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_elbow

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_sleeve

{'component_id', 'component_type_id', 'weight', 'orientation', 'length'}
tube

{'length', 'diameter'}
comp_hfl

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_other

{'weight', 'component_id'}
type_component

{'component_type_id'}
comp_adaptor

comp_threaded

{'weight', 'component_id', 'orientation', 'component_type_id'}
comp_adaptor

{'weight', 'component_id', 'orient

In [44]:
train_shared = {}
for df in dfs:
    shared_column = return_shared_columns(trainval, dfs[df])
    if shared_column:
        train_shared[df] = shared_column

In [46]:
for df in train_shared:
    temp_df = dfs[df]
    categoricals = temp_df.select_dtypes(exclude='number').columns.tolist()
    for col in categoricals:
        temp_df[col] = temp_df[col].fillna('MISSING')
    trainval = trainval.merge(temp_df, how='left')

In [75]:
for df in train_shared:
    temp_df = dfs[df]
    categoricals = temp_df.select_dtypes(exclude='number').columns.tolist()
    for col in categoricals:
        temp_df[col] = temp_df[col].fillna('MISSING')
    test = test.merge(temp_df, how='left')

In [47]:
trainval.columns.tolist()

['tube_assembly_id',
 'supplier',
 'quote_date',
 'annual_usage',
 'min_order_quantity',
 'bracket_pricing',
 'quantity',
 'cost',
 'component_id_1',
 'quantity_1',
 'component_id_2',
 'quantity_2',
 'component_id_3',
 'quantity_3',
 'component_id_4',
 'quantity_4',
 'component_id_5',
 'quantity_5',
 'component_id_6',
 'quantity_6',
 'component_id_7',
 'quantity_7',
 'component_id_8',
 'quantity_8',
 'material_id',
 'diameter',
 'wall',
 'length',
 'num_bends',
 'bend_radius',
 'end_a_1x',
 'end_a_2x',
 'end_x_1x',
 'end_x_2x',
 'end_a',
 'end_x',
 'num_boss',
 'num_bracket',
 'other',
 'spec1',
 'spec2',
 'spec3',
 'spec4',
 'spec5',
 'spec6',
 'spec7',
 'spec8',
 'spec9',
 'spec10']

In [63]:
shared_list = []
train_list = []
for i in train_shared:
    train_list.append(i)
for i in shared_columns:
    shared_list.append(i)
train_list

['bill_of_materials', 'tube', 'specs']

In [67]:
remaining_difference = set(shared_list).difference(train_list)

In [70]:
remaining_difference

{'comp_adaptor',
 'comp_boss',
 'comp_elbow',
 'comp_float',
 'comp_hfl',
 'comp_nut',
 'comp_other',
 'comp_sleeve',
 'comp_straight',
 'comp_tee',
 'comp_threaded',
 'components',
 'tube_end_form',
 'type_component',
 'type_connection',
 'type_end_form'}

In [74]:
for df in remaining_difference:
    if df != 'components':
        temp_df = dfs[df]
        display(temp_df)
        categoricals = temp_df.select_dtypes(exclude='number').columns.tolist()
        for col in categoricals:
            temp_df[col] = temp_df[col].fillna('MISSING')
        print(df)
        trainval = trainval.merge(temp_df, how='left', on='component_id_1')

Unnamed: 0,component_id,component_type_id,adaptor_angle,overall_length,end_form_id_1,connection_type_id_1,length_1,thread_size_1,thread_pitch_1,nominal_size_1,end_form_id_2,connection_type_id_2,length_2,thread_size_2,thread_pitch_2,nominal_size_2,hex_size,unique_feature,orientation,weight
0,C-0005,CP-028,,58.4,A-001,B-001,,1.31,12.0,,A-001,B-004,,1.0,11.5,,34.93,No,No,0.21
1,C-0006,CP-028,,34.8,A-001,B-001,,0.44,20.0,,A-001,B-005,,0.75,16.0,,22.2,No,No,0.08
2,C-1435,CP-028,,20.3,A-007,B-004,,,,15.88,A-001,B-007,,0.88,18.0,,22.22,No,No,0.02
3,C-1546,CP-028,,26.4,A-007,B-004,,0.12,27.0,,A-001,B-004,,0.12,27.0,,15.88,No,No,0.03
4,C-1583,CP-028,,44.5,A-001,B-005,,1.31,12.0,,A-007,B-005,,1.06,12.0,,38.1,No,No,0.26
5,C-1634,CP-028,,34.5,A-001,B-005,,0.75,16.0,,A-001,B-002,,0.69,16.0,,22.23,No,No,0.06
6,C-1975,CP-028,,13.2,A-007,B-007,,,,3.18,A-001,B-007,,0.31,28.0,,,No,No,0.01
7,C-0428,CP-028,,26.99,A-001,B-004,,0.25,18.0,,A-007,MISSING,,,,9.52,17.46,No,No,0.03
8,C-0443,CP-028,,22.35,A-007,B-007,,,,19.05,9999,9999,,1.06,16.0,,26.97,No,No,
9,C-0823,CP-028,,16.8,A-007,B-007,,,,9.52,A-001,9999,,0.62,18.0,9.52,15.75,No,No,0.01


comp_adaptor


MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [76]:
trainval.columns

Index(['tube_assembly_id', 'supplier', 'quote_date', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'cost', 'component_id_1', 'quantity_1', 'component_id_2', 'quantity_2', 'component_id_3', 'quantity_3', 'component_id_4', 'quantity_4', 'component_id_5', 'quantity_5', 'component_id_6', 'quantity_6', 'component_id_7', 'quantity_7', 'component_id_8', 'quantity_8', 'material_id', 'diameter', 'wall', 'length', 'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other', 'spec1', 'spec2', 'spec3', 'spec4', 'spec5', 'spec6', 'spec7', 'spec8', 'spec9', 'spec10'], dtype='object')

In [77]:
test.columns

Index(['id', 'tube_assembly_id', 'supplier', 'quote_date', 'annual_usage', 'min_order_quantity', 'bracket_pricing', 'quantity', 'component_id_1', 'quantity_1', 'component_id_2', 'quantity_2', 'component_id_3', 'quantity_3', 'component_id_4', 'quantity_4', 'component_id_5', 'quantity_5', 'component_id_6', 'quantity_6', 'component_id_7', 'quantity_7', 'component_id_8', 'quantity_8', 'material_id', 'diameter', 'wall', 'length', 'num_bends', 'bend_radius', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other', 'spec1', 'spec2', 'spec3', 'spec4', 'spec5', 'spec6', 'spec7', 'spec8', 'spec9', 'spec10'], dtype='object')

In [79]:
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()
test_tube_assemblies = test['tube_assembly_id'].unique()
len(trainval_tube_assemblies), len(test_tube_assemblies)

(8855, 8856)

In [80]:
train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies
)

In [81]:
train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)].drop(columns='tube_assembly_id')
val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)].drop(columns='tube_assembly_id')
train.shape, val.shape, trainval.shape

((22822, 48), (7391, 48), (30213, 49))

In [82]:
train.isnull().sum()

supplier                  0
quote_date                0
annual_usage              0
min_order_quantity        0
bracket_pricing           0
quantity                  0
cost                      0
component_id_1            0
quantity_1             1044
component_id_2            0
quantity_2             6732
component_id_3            0
quantity_3            17514
component_id_4            0
quantity_4            22308
component_id_5            0
quantity_5            22777
component_id_6            0
quantity_6            22801
component_id_7            0
quantity_7            22814
component_id_8            0
quantity_8            22819
material_id               0
diameter                  0
wall                      0
length                    0
num_bends                 0
bend_radius               0
end_a_1x                  0
end_a_2x                  0
end_x_1x                  0
end_x_2x                  0
end_a                     0
end_x                     0
num_boss            

In [97]:
from xgboost import XGBRegressor

target = 'cost'
y_train_log = np.log1p(train[target])
y_val_log = np.log1p(val[target])
X_train = train.drop(columns=[target])
X_val = val.drop(columns=[target])

encoder = ce.OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)

eval_set = [(X_train_encoded, y_train_log),
            (X_val_encoded, y_val_log)]

model = XGBRegressor(n_estimators=1000, n_jobs=-1)

# class XGBRegressorEval(XGBRegressor):
#     def fit(self, *args, **kwargs):
#         return super().fit(*args, eval_set=eval_set, eval_metrics='rmse',
#                            early_stopping_rounds=10, **kwargs)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
# pipeline = make_pipeline(ce.OrdinalEncoder(),
#                          XGBRegressor(n_estimators=1000, n_jobs=-1)
#                         )

In [None]:
model.fit(X_train_encoded, y_train_log, eval_set=eval_set, eval_metric='rmse',
          early_stopping_rounds=12)

In [92]:
pipeline.fit(X_train, y_train_log)

NameError: name 'eval_set' is not defined