In [19]:
from glob import glob
import pandas as pd

for path in glob('competition_data/*.csv'):
    df = pd.read_csv(path)
    print(path, df.shape)

competition_data/comp_threaded.csv (194, 32)
competition_data/comp_adaptor.csv (25, 20)
competition_data/tube_end_form.csv (27, 2)
competition_data/comp_straight.csv (361, 12)
competition_data/comp_tee.csv (4, 14)
competition_data/comp_boss.csv (147, 15)
competition_data/components.csv (2048, 3)
competition_data/comp_float.csv (16, 7)
competition_data/bill_of_materials.csv (21198, 17)
competition_data/comp_elbow.csv (178, 16)
competition_data/type_connection.csv (14, 2)
competition_data/train_set.csv (30213, 8)
competition_data/comp_sleeve.csv (50, 10)
competition_data/test_set.csv (30235, 8)
competition_data/tube.csv (21198, 16)
competition_data/comp_hfl.csv (6, 9)
competition_data/type_end_form.csv (8, 2)
competition_data/comp_other.csv (1001, 3)
competition_data/type_component.csv (29, 2)
competition_data/specs.csv (21198, 11)
competition_data/comp_nut.csv (65, 11)


In [20]:
train = pd.read_csv('competition_data/train_set.csv')
test = pd.read_csv('competition_data/test_set.csv')
tube = pd.read_csv('competition_data/tube.csv')
mats = pd.read_csv('competition_data/bill_of_materials.csv')
comps = pd.read_csv('competition_data/components.csv')
specs = pd.read_csv('competition_data/specs.csv')
end_form = pd.read_csv('competition_data/tube_end_form.csv')

In [21]:
# merging tube df with end_form df
tube = tube.merge(end_form,how='left',left_on='end_a',right_on='end_form_id').merge(end_form,how='left',left_on='end_x',right_on='end_form_id')

In [22]:
# drop duplicate ids
tube = tube.drop(['end_form_id_x','end_form_id_y'],axis=1)
# rename forming columns to match end_a and end_x
tube = tube.rename({'forming_x':'forming_a','forming_y':'forming_x'},axis=1)

#### merging comps onto mats.
We are only going to use the first component

In [23]:
# merging comps on mats but only on the first component
mats = mats.merge(comps,left_on='component_id_1',right_on='component_id',how='left')

In [24]:
# dropping redundant columns
mats = mats.drop(['component_id','component_type_id'],axis=1)

In [25]:
train = train.merge(tube, left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')
test = test.merge(tube, left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')

In [26]:
train = train.merge(mats,left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')
test = test.merge(mats,left_on='tube_assembly_id',right_on='tube_assembly_id',how='left')

## Feature Engineering

In [27]:
import seaborn as sns
import numpy as np

In [28]:
# transform skewed cols
def transform_skewed_cols(df, skew_level=4):
    skew_cols = list(train.skew()[train.skew() > skew_level].index)
    for col in skew_cols:
        df[col] = np.log1p(df[col])
    return df

In [29]:
train = transform_skewed_cols(train)
test = transform_skewed_cols(test)

  """


In [30]:
# remove quote date and replace by year and month cols
def convert_add_dates(df):
    df['quote_date'] = pd.to_datetime(df['quote_date'],infer_datetime_format=True)
    df['month'] = df['quote_date'].dt.month
    df['year'] = df['quote_date'].dt.year
    df = df.drop('quote_date',axis=1)
    return df

In [31]:
test = convert_add_dates(test)
train = convert_add_dates(train)

### SVM for missing material Id

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce

In [33]:
train_nas = train[train['material_id'].isna()]
test_nas = test[test['material_id'].isna()]
train_not_null = train[~train['material_id'].isna()]
test_not_null = test[~test['material_id'].isna()]

In [34]:
train_not_null.fillna(0,inplace=True)
test_not_null.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [35]:
target = 'material_id'
features = list(train.columns)
features.remove('material_id')
features.remove('cost')

In [36]:
X = train_not_null[features]
y = train_not_null[target]
X_test = test_not_null[features]
y_test = test_not_null[target]

In [37]:
encode_cols = list(X.describe(exclude='number').columns)
encoder = ce.OrdinalEncoder(cols=encode_cols)

In [38]:
X_encode = encoder.fit_transform(X)
X_test_encode = encoder.transform(X_test)

In [39]:
model = RandomForestClassifier(n_estimators=100,max_depth=20,n_jobs=-1)

In [40]:
model.fit(X_encode,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
model.score(X_encode,y)

0.9931630202774814

In [42]:
model.score(X_test_encode, y_test)

0.8014595621313606

In [43]:
# lets input the missing values on our null set

In [44]:
train_nas = train_nas[features].fillna(0)
test_nas = test_nas[features].fillna(0)

In [45]:
test_preds = model.predict(encoder.transform(test_nas))
train_preds = model.predict(encoder.transform(train_nas))

In [46]:
test_nas['material_id'] = test_preds
train_nas['material_id'] = train_preds

In [47]:
#putting predictions back into test and train
test = pd.concat([test_nas,test.loc[~test['material_id'].isna()]])
train = pd.concat([train_nas,train.loc[~train['material_id'].isna()]])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


## Xgboost

In [48]:
import xgboost
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline

In [None]:
xgboost.__version__

'0.81'

In [None]:
test = test.fillna(0)
train = train.fillna(0)

In [None]:
# train test splitting based on tube id

In [None]:
unique_tubes = train['tube_assembly_id'].unique()

In [None]:
train_tubes, val_tubes = train_test_split(unique_tubes)

In [None]:
set(train_tubes) & set(val_tubes)

In [None]:
X_train = train.loc[train['tube_assembly_id'].isin(train_tubes)]
X_val = train.loc[train['tube_assembly_id'].isin(val_tubes)]

In [None]:
# drop tube_id
X_train = X_train.drop('tube_assembly_id',axis=1)
X_val = X_val.drop('tube_assembly_id',axis=1)
test = test.drop('tube_assembly_id',axis=1)

#### define ordinal and one hot columns

In [None]:
card = X_train.describe(exclude='number').T.sort_values(by='unique')
numerics = X_train.describe()
numerics = list(numerics.columns)
numerics.remove('cost')

In [None]:
hot_encode_cols = list(card.loc[card['unique'] < 150].index)
ordinal_encode_cols = list(card.loc[card['unique'] >= 150].index)

#### pipeline

In [None]:
pipeline =  make_pipeline(ce.OrdinalEncoder(cols=ordinal_encode_cols),
                   ce.OneHotEncoder(cols=hot_encode_cols,use_cat_names=True),
                                   XGBRegressor(n_estimators=100, n_jobs=-1))

#### feature defining

In [1]:
target = 'cost'
features = numerics + hot_encode_cols + ordinal_encode_cols

NameError: name 'numerics' is not defined

In [None]:
pipeline.fit(X_train[features],X_train[target])