In [1]:
from glob import glob
import pandas as pd

for path in glob('competition_data/*.csv'):
    df = pd.read_csv(path)
    print(path, df.shape)

competition_data/comp_threaded.csv (194, 32)
competition_data/comp_adaptor.csv (25, 20)
competition_data/tube_end_form.csv (27, 2)
competition_data/comp_straight.csv (361, 12)
competition_data/comp_tee.csv (4, 14)
competition_data/comp_boss.csv (147, 15)
competition_data/components.csv (2048, 3)
competition_data/comp_float.csv (16, 7)
competition_data/bill_of_materials.csv (21198, 17)
competition_data/comp_elbow.csv (178, 16)
competition_data/type_connection.csv (14, 2)
competition_data/train_set.csv (30213, 8)
competition_data/comp_sleeve.csv (50, 10)
competition_data/test_set.csv (30235, 8)
competition_data/tube.csv (21198, 16)
competition_data/comp_hfl.csv (6, 9)
competition_data/type_end_form.csv (8, 2)
competition_data/comp_other.csv (1001, 3)
competition_data/type_component.csv (29, 2)
competition_data/specs.csv (21198, 11)
competition_data/comp_nut.csv (65, 11)


In [57]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [58]:
train = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')
tube = pd.read_csv('competition_data/tube.csv')
mats = pd.read_csv('competition_data/bill_of_materials.csv')
comps = pd.read_csv('competition_data/components.csv')
specs = pd.read_csv('competition_data/specs.csv')
end_form = pd.read_csv('competition_data/tube_end_form.csv')

In [59]:
# merging tube df with end_form df
tube = tube.merge(end_form,how='left',left_on='end_a',right_on='end_form_id').merge(end_form,how='left',left_on='end_x',right_on='end_form_id')

In [60]:
# drop duplicate ids
tube = tube.drop(['end_form_id_x','end_form_id_y'],axis=1)
# rename forming columns to match end_a and end_x
tube = tube.rename({'forming_x':'forming_a','forming_y':'forming_x'},axis=1)

#### merging comps onto mats.
We are only going to use the first component

In [61]:
# merging comps on mats but only on the first component
mats = mats.merge(comps,left_on='component_id_1',right_on='component_id',how='left')

In [62]:
# dropping redundant columns
mats = mats.drop(['component_id','component_type_id'],axis=1)

In [63]:
train = train.merge(tube, left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')
test = test.merge(tube, left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')

In [64]:
train = train.merge(mats,left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')
test = test.merge(mats,left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')

## Feature Engineering

In [65]:
import seaborn as sns
import numpy as np

In [66]:
# transform skewed cols
def transform_skewed_cols(df, skew_level=4):
    skew_cols = list(train.skew()[train.skew() > skew_level].index)
    for col in skew_cols:
        df[col] = np.log1p(df[col])
    return df

In [67]:
train = transform_skewed_cols(train)
test = transform_skewed_cols(test)

  """


In [68]:
# remove quote date and replace by year and month cols
def convert_add_dates(df):
    df['quote_date'] = pd.to_datetime(df['quote_date'],infer_datetime_format=True)
    df['month'] = df['quote_date'].dt.month
    df['year'] = df['quote_date'].dt.year
    df = df.drop('quote_date',axis=1)
    return df

In [69]:
test = convert_add_dates(test)
train = convert_add_dates(train)

### SVM for missing material Id

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce

In [71]:
train_nas = train[train['material_id'].isna()]
test_nas = test[test['material_id'].isna()]
train_not_null = train[~train['material_id'].isna()]
test_not_null = test[~test['material_id'].isna()]

In [72]:
train_not_null.fillna(0,inplace=True)
test_not_null.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [73]:
target = 'material_id'
features = list(train.columns)
features.remove('material_id')
features.remove('cost')

In [74]:
X = train_not_null[features]
y = train_not_null[target]
X_test = test_not_null[features]
y_test = test_not_null[target]

In [75]:
encode_cols = list(X.describe(exclude='number').columns)
encoder = ce.OrdinalEncoder(cols=encode_cols)

In [76]:
X_encode = encoder.fit_transform(X)
X_test_encode = encoder.transform(X_test)

In [77]:
model = RandomForestClassifier(n_estimators=100,max_depth=20,n_jobs=-1)

In [78]:
model.fit(X_encode,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [79]:
model.score(X_encode,y)

0.9925960512273212

In [80]:
model.score(X_test_encode, y_test)

0.7937951947749009

In [81]:
# lets input the missing values on our null set

In [82]:
train_nas = train_nas[features].fillna(0)
test_nas = test_nas[features].fillna(0)

In [83]:
test_preds = model.predict(encoder.transform(test_nas))
train_preds = model.predict(encoder.transform(train_nas))

In [84]:
test_nas['material_id'] = test_preds
train_nas['material_id'] = train_preds

In [85]:
#putting predictions back into test and train
test = pd.concat([test_nas,test.loc[~test['material_id'].isna()]])
train = pd.concat([train_nas,train.loc[~train['material_id'].isna()]])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


## Xgboost

In [86]:
import xgboost as xgb
from sklearn.pipeline import make_pipeline

In [87]:
test = test.fillna(0)
train = train.fillna(0)

In [88]:
# train test splitting based on tube id

In [89]:
unique_tubes = train['tube_assembly_id'].unique()

In [90]:
train_tubes, val_tubes = train_test_split(unique_tubes)

In [91]:
set(train_tubes) & set(val_tubes)

set()

In [92]:
X_train = train.loc[train['tube_assembly_id'].isin(train_tubes)]
X_val = train.loc[train['tube_assembly_id'].isin(val_tubes)]

In [93]:
# drop tube_id
X_train = X_train.drop('tube_assembly_id',axis=1)
X_val = X_val.drop('tube_assembly_id',axis=1)
test = test.drop('tube_assembly_id',axis=1)

#### define ordinal and one hot columns

In [94]:
card = X_train.describe(exclude='number').T.sort_values(by='unique')
numerics = X_train.describe()
numerics = list(numerics.columns)
numerics.remove('cost')

In [95]:
hot_encode_cols = list(card.loc[card['unique'] < 150].index)
ordinal_encode_cols = list(card.loc[card['unique'] >= 150].index)

#### pipeline

In [96]:
pipeline =  make_pipeline(ce.OrdinalEncoder(cols=ordinal_encode_cols),
                   ce.OneHotEncoder(cols=hot_encode_cols,use_cat_names=True))

#### feature defining

In [97]:
target = 'cost'
features = numerics + hot_encode_cols + ordinal_encode_cols

In [98]:
X_train_encode = pipeline.fit_transform(X_train[features])

In [99]:
model = xgb.XGBRegressor(n_estimators=100, n_jobs=-1,max_depth=20)

In [106]:
X_val_encode = pipeline.transform(X_val[features])

In [109]:
eval_set = [(X_train_encode, X_train[target]), 
            (X_val_encode, X_val[target])]

In [110]:
model.fit(X_train_encode, X_train[target],eval_set=eval_set, eval_metric='rmse', 
          early_stopping_rounds=10)

[0]	validation_0-rmse:1.70087	validation_1-rmse:1.69425
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.53777	validation_1-rmse:1.53845
[2]	validation_0-rmse:1.39101	validation_1-rmse:1.39818
[3]	validation_0-rmse:1.25892	validation_1-rmse:1.2739
[4]	validation_0-rmse:1.14016	validation_1-rmse:1.16592
[5]	validation_0-rmse:1.03296	validation_1-rmse:1.06674
[6]	validation_0-rmse:0.93639	validation_1-rmse:0.981015
[7]	validation_0-rmse:0.849353	validation_1-rmse:0.903975
[8]	validation_0-rmse:0.770925	validation_1-rmse:0.835305
[9]	validation_0-rmse:0.700135	validation_1-rmse:0.774581
[10]	validation_0-rmse:0.636478	validation_1-rmse:0.722979
[11]	validation_0-rmse:0.578815	validation_1-rmse:0.676585
[12]	validation_0-rmse:0.526898	validation_1-rmse:0.63464
[13]	validation_0-rmse:0.480107	validation_1-rmse:0.600204
[14]	validation_0-rmse:0.437705	validati

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=20, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)