In [110]:
import pandas as pd
from glob import glob
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [2]:
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Applied-Modeling/master/data/caterpillar/caterpillar-tube-pricing.zip

--2019-07-22 14:10:34--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Applied-Modeling/master/data/caterpillar/caterpillar-tube-pricing.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.196.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.196.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 823789 (804K) [application/zip]
Saving to: ‘caterpillar-tube-pricing.zip’


2019-07-22 14:10:35 (2.79 MB/s) - ‘caterpillar-tube-pricing.zip’ saved [823789/823789]



In [2]:
!ls *.zip

caterpillar-tube-pricing.zip


In [4]:
!unzip caterpillar-tube-pricing.zip

Archive:  caterpillar-tube-pricing.zip
  inflating: sample_submission.csv   
  inflating: data.zip                


In [5]:
!unzip data.zip

Archive:  data.zip
   creating: competition_data/
  inflating: competition_data/bill_of_materials.csv  
  inflating: competition_data/comp_adaptor.csv  
  inflating: competition_data/comp_boss.csv  
  inflating: competition_data/comp_elbow.csv  
  inflating: competition_data/comp_float.csv  
  inflating: competition_data/comp_hfl.csv  
  inflating: competition_data/comp_nut.csv  
  inflating: competition_data/comp_other.csv  
  inflating: competition_data/comp_sleeve.csv  
  inflating: competition_data/comp_straight.csv  
  inflating: competition_data/comp_tee.csv  
  inflating: competition_data/comp_threaded.csv  
  inflating: competition_data/components.csv  
  inflating: competition_data/specs.csv  
  inflating: competition_data/test_set.csv  
  inflating: competition_data/train_set.csv  
  inflating: competition_data/tube.csv  
  inflating: competition_data/tube_end_form.csv  
  inflating: competition_data/type_component.csv  
  inflating: competition_data/type

In [8]:
for path in glob('competition_data/*.csv'):
    df = pd.read_csv(path)
    print(path, df.shape)

competition_data/comp_threaded.csv (194, 32)
competition_data/comp_adaptor.csv (25, 20)
competition_data/tube_end_form.csv (27, 2)
competition_data/comp_straight.csv (361, 12)
competition_data/comp_tee.csv (4, 14)
competition_data/comp_boss.csv (147, 15)
competition_data/components.csv (2048, 3)
competition_data/comp_float.csv (16, 7)
competition_data/bill_of_materials.csv (21198, 17)
competition_data/comp_elbow.csv (178, 16)
competition_data/type_connection.csv (14, 2)
competition_data/train_set.csv (30213, 8)
competition_data/comp_sleeve.csv (50, 10)
competition_data/test_set.csv (30235, 8)
competition_data/tube.csv (21198, 16)
competition_data/comp_hfl.csv (6, 9)
competition_data/type_end_form.csv (8, 2)
competition_data/comp_other.csv (1001, 3)
competition_data/type_component.csv (29, 2)
competition_data/specs.csv (21198, 11)
competition_data/comp_nut.csv (65, 11)


In [11]:
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [134]:
train = pd.read_csv('competition_data/train_set.csv', parse_dates=[2,])

test = pd.read_csv('competition_data/test_set.csv', parse_dates=[3,])

tube = pd.read_csv('competition_data/tube.csv')

materials = pd.read_csv('competition_data/bill_of_materials.csv')

specs = pd.read_csv('competition_data/specs.csv')

In [135]:
train = pd.merge(train, tube, on='tube_assembly_id')
train = pd.merge(train, materials, on ='tube_assembly_id')
test = pd.merge(test, tube, on='tube_assembly_id')
test = pd.merge(test, materials, on ='tube_assembly_id')

In [147]:
train = train.fillna(0)

test = test.fillna(0)

In [137]:
train_df, val_df = train_test_split(
    train, random_state=42
)

In [138]:
train_target = train_df.cost.tolist()
val_target = val_df.cost.tolist()

In [139]:
train_df = train_df.drop(columns=['quote_date', 'cost', 'tube_assembly_id'])

val_df = val_df.drop(columns=['quote_date', 'cost', 'tube_assembly_id'])

test = test.drop(columns=['quote_date', 'tube_assembly_id'])

In [141]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
)

pipeline.fit(train_df, train_target)

Pipeline(memory=None,
     steps=[('ordinalencoder', OrdinalEncoder(cols=['supplier', 'bracket_pricing', 'material_id', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'component_id_1', 'component_id_2', 'component_id_3', 'component_id_4', 'component_id_5', 'component_id_6', 'component_id_7', 'component_id_8...timators=100, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [142]:
y_pred = pipeline.predict(val_df)

rmsle(val_target, y_pred)

0.27537677313417197

In [153]:
train_df.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall,length,num_bends,...,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
2028,S-0066,0,0,Yes,2,SP-0019,6.35,0.71,119.0,5,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
7743,S-0066,0,0,Yes,50,SP-0029,25.4,3.05,278.0,4,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
20154,S-0013,150,15,No,15,SP-0035,15.88,1.65,93.0,4,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
28073,S-0066,1,0,Yes,50,SP-0035,12.7,1.24,134.0,5,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
19527,S-0041,1,0,Yes,1,SP-0029,19.05,1.65,94.0,5,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [156]:
test = test.drop(columns='id')

In [157]:
y_pred = pipeline.predict(test)

In [143]:
sample_submission = pd.read_csv('sample_submission.csv')

In [158]:
sample_submission['cost'] = y_pred

In [163]:
sample_submission.to_csv('submission.csv', index=False)

In [164]:
pd.read_csv('submission.csv')

Unnamed: 0,id,cost
0,1,22.690919
1,2,13.089504
2,3,7.419529
3,4,5.539497
4,5,4.291382
5,6,3.918280
6,7,3.804076
7,8,3.738920
8,9,21.693523
9,10,12.167613


In [36]:
trainval['quote_date'] = pd.to_datetime(train['quote_date'])

test['quote_date'] = pd.to_datetime(train['quote_date'])

In [40]:
trainval_tube_assemblies = trainval['tube_assembly_id'].unique()
test_tube_assemblies = test['tube_assembly_id'].unique()

len(trainval_tube_assemblies), len(test_tube_assemblies)

(8855, 8856)

In [38]:
set(trainval_tube_assemblies) & set(test_tube_assemblies)

set()

In [41]:
trainval_tube_assemblies

array(['TA-00002', 'TA-00004', 'TA-00005', ..., 'TA-21195', 'TA-21196',
       'TA-21197'], dtype=object)

In [42]:
train_tube_assemblies, val_tube_assemblies = train_test_split(
    trainval_tube_assemblies, random_state=42
)

In [43]:
len(train_tube_assemblies), len(val_tube_assemblies)

(6641, 2214)

In [44]:
train = trainval[trainval.tube_assembly_id.isin(train_tube_assemblies)]
val = trainval[trainval.tube_assembly_id.isin(val_tube_assemblies)]

In [45]:
target = 'cost'

features = 

y_train = train[target]
y_val = val[target]