In [117]:
# Notebook setup.
# `%pylab inline` populates the interactive namespace from numpy and matplotlib;
# later cells rely on it for the bare `np` name, which is never imported explicitly.
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

# Project-local helpers for building the data set and evaluating models.
from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

# Do not truncate the column list when displaying wide DataFrames.
pd.set_option('display.max_columns', None)

Populating the interactive namespace from numpy and matplotlib


In [118]:
# Build the two data sets returned by get_augmented_train_and_test_set(); %time reports the cost.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 14 s, sys: 24 ms, total: 14.1 s
Wall time: 14.4 s


In [119]:
# Use only the first item yielded by generate_xv_splits as the train/test partition.
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))

CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 130 ms


In [120]:
# Keep only the test examples that have unknown components.

from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components
comp_types, group_dfs, cluster_dfs = load_raw_components()
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)

from soln.utils import count_components
train_counts = count_components(X_train, cinfo_df)
train_counts.rename(columns={'count': 'train_count'}, inplace=True)
test_counts = count_components(X_test, cinfo_df)
test_counts.rename(columns={'count': 'test_count'}, inplace=True)
all_counts = cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
all_counts = all_counts.merge(train_counts, on='component_id')
all_counts = all_counts.merge(test_counts, on='component_id')

known_cids = set(all_counts.component_id[all_counts.train_count > 0].values)
print len(all_counts), len(known_cids)

has_unk = []
for cids in X_test.components:
    has_unk.append(any([cid not in known_cids for cid in cids]))
print len(X_test), len(has_unk)

X_test['has_unk'] = has_unk
print X_test.has_unk.value_counts()
print X_test.has_unk.value_counts(normalize=True)
tmp_df = X_test[['tube_assembly_id', 'has_unk']].drop_duplicates()
print len(X_test), len(tmp_df)
print tmp_df.has_unk.value_counts()
print tmp_df.has_unk.value_counts(normalize=True)

X_test_orig = X_test
y_test_orig = y_test
print X_train.shape, y_train.shape
print X_test_orig.shape, y_test_orig.shape
X_test = X_test_orig[X_test_orig.has_unk == True].reset_index(drop=True)
X_test.pop('has_unk')
y_test = y_test_orig[X_test_orig.has_unk == True].reset_index(drop=True)
print X_test.shape, y_test.shape

2047 1141
2943 2943
False    2791
True      152
dtype: int64
False    0.948352
True     0.051648
dtype: float64
2943 895
False    828
True      67
dtype: int64
False    0.92514
True     0.07486
dtype: float64
(27270, 53) (27270,)
(2943, 54) (2943,)
(152, 53) (152,)


In [121]:
# Fit the featurizer on the training rows only, then apply it to both splits.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
# List every generated feature column together with its dtype.
X_train_feats.info(verbose=True)

CPU times: user 1.44 s, sys: 0 ns, total: 1.44 s
Wall time: 1.46 s
CPU times: user 1.42 s, sys: 264 ms, total: 1.68 s
Wall time: 1.73 s
CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 42 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27270 entries, 0 to 27269
Data columns (total 599 columns):
annual_usage                                           int64
min_order_quantity                                     int64
bracket_pricing                                        bool
quantity                                               int64
diameter                                               float64
wall_thickness                                         float64
length                                                 float64
num_bends                                              int64
bend_radius                                            float64
end_a_1x                                               bool
end_a_2x                                               bool
end_x_1x      

In [None]:
# Experiment: remove components in the 'straight' group.

assert False

cids_to_remove = set(cinfo_df.component_id[cinfo_df.component_group_id == 'straight'])
print len(cids_to_remove)

for col in list(X_train_feats.columns):
    if col.startswith('components '):
        cid = col[len('components '):]
        if cid in cids_to_remove:
            print "popping", col
            X_train_feats.pop(col)
            X_test_feats.pop(col)

X_train_feats.info(verbose=True)

X_train_feats.shape, X_test_feats.shape

In [122]:
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
print X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape

(27270, 599) (152, 599) (27270,) (152,)


In [123]:
import xgboost as xgb

# Booster configuration passed to xgb.train (as params.items()) by later cells.
params = {
    # Regression objective.
    'objective': 'reg:linear',
    # Learning rate (step-size shrinkage applied per boosting round).
    'eta': 0.02,
    'min_child_weight': 6,
    # Fraction of training rows sampled for each tree.
    'subsample': 0.7,
    # Fraction of feature columns sampled for each tree.
    'colsample_bytree': 0.6,
    # Suppress xgboost's training log output.
    'silent': 1,
    # Maximum depth of each tree.
    'max_depth': 8,
}

# Only the training matrix carries labels; the test matrix is used for prediction only.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

In [124]:
# Train the booster for a fixed number of rounds, then score it on both splits.
# NOTE(review): the *_rmsle values are the plain RMSE of the targets as given; they are
# an RMSLE only if y_train / y_test already hold log-transformed costs — TODO confirm.
num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 2min 39s, sys: 380 ms, total: 2min 39s
Wall time: 1min 37s
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 4.96 ms
CPU times: user 44 ms, sys: 0 ns, total: 44 ms
Wall time: 27.6 ms
0.124960740984 0.445949340695


In [62]:
in_test_not_train = all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)]
print len(in_test_not_train)
print in_test_not_train.component_group_id.value_counts()

78
other       50
straight    10
boss         8
threaded     6
elbow        2
adaptor      2
dtype: int64


In [127]:
df = X_test.copy()
df['true_log_cost'] = y_test
df['pred_log_cost'] = y_test_pred
df['err2'] = (df.true_log_cost - df.pred_log_cost) ** 2
print np.sqrt(df.err2.mean())
df.sort('err2', ascending=False, inplace=True)
# df[:10]

0.445949340695


In [131]:
print np.sqrt(df.err2[df.supplier == 'S-0066'].mean())

0.415543115089


In [133]:
print np.sqrt(df.err2[df.bracketing_pattern == (1, 2, 5, 10, 25, 50, 100, 250)].mean())

0.237514483043


In [134]:
print np.sqrt(df.err2[df.bracketing_pattern == ()].mean())

0.57245976886


In [55]:
# Approach 1: Replace unknown 'straight' components with their nearest known neighbor.

straight = pd.read_csv('straight_vecs.csv')
straight.set_index('component_id', drop=True, inplace=True)
straight_np = straight.astype(np.float).values
print straight.shape
print straight_np.shape

from scipy.cluster.vq import whiten
straight_np_wh = whiten(straight_np)
cid_to_row = {}
for i, cid in enumerate(straight.index):
    cid_to_row[cid] = straight_np_wh[i, :]

unknown_cids = set(in_test_not_train.component_id[in_test_not_train.component_group_id == 'straight'].values)
print unknown_cids

from scipy.spatial.distance import euclidean

cid_to_subst = {}
for cid in unknown_cids:
    cid_row = cid_to_row[cid]
    best_target_cid = None
    best_dist = np.inf
    for target_cid, target_cid_row in cid_to_row.iteritems():
        if target_cid in unknown_cids:
            continue
        dist = euclidean(cid_row, target_cid_row)
        if dist < best_dist:
            best_target_cid = target_cid
            best_dist = dist
    cid_to_subst[cid] = best_target_cid
    print "unknown cid {} mapped to known cid {} with dist {}".format(cid, best_target_cid, best_dist)

cid_to_subst

(361, 26)
(361, 26)
set(['C-0334', 'C-1494', 'C-1999', 'C-0141', 'C-1549', 'C-0621', 'C-1897', 'C-0466', 'C-1785', 'C-0362'])
unknown cid C-0334 mapped to known cid C-0741 with dist 0.424655790434
unknown cid C-1494 mapped to known cid C-1495 with dist 0.0148741682958
unknown cid C-1999 mapped to known cid C-0457 with dist 1.30878282013
unknown cid C-0141 mapped to known cid C-1996 with dist 0.459905470358
unknown cid C-1549 mapped to known cid C-1740 with dist 0.0863014648659
unknown cid C-0621 mapped to known cid C-1900 with dist 1.48704650335
unknown cid C-1897 mapped to known cid C-1344 with dist 0.887588960897
unknown cid C-0466 mapped to known cid C-1433 with dist 0.151038244132
unknown cid C-1785 mapped to known cid C-1329 with dist 0.23612852287
unknown cid C-0362 mapped to known cid C-0038 with dist 0.41829806389


{'C-0141': 'C-1996',
 'C-0334': 'C-0741',
 'C-0362': 'C-0038',
 'C-0466': 'C-1433',
 'C-0621': 'C-1900',
 'C-1494': 'C-1495',
 'C-1549': 'C-1740',
 'C-1785': 'C-1329',
 'C-1897': 'C-1344',
 'C-1999': 'C-0457'}

In [46]:
# Show the raw feature rows of one unknown component next to its chosen substitute.
cids = ('C-0334', 'C-0741')
is_selected = straight.index.isin(cids)
straight[is_selected]

Unnamed: 0_level_0,bolt_pattern_long,bolt_pattern_wide,head_diameter,overall_length,thickness,groove,unique_feature,orientation,weight,MJ-001,MJ-002,MJ-003,MJ-007,MJ-other,CP-001,CP-002,CP-003,CP-004,CP-005,CP-006,CP-007,bolt_pattern_long_missing,bolt_pattern_wide_missing,head_diameter_missing,overall_length_missing,weight_missing
component_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
C-0334,71.77567,40.841225,47.63,27.906098,9.53,True,False,False,0.104,0,0,0,0,1,0,0,0,0,0,1,0,True,True,False,True,False
C-0741,71.77567,40.841225,50.8,27.906098,8.24,True,False,False,0.14,0,0,0,0,1,0,0,0,0,0,1,0,True,True,False,True,False


In [60]:
X_test_mangled = X_test.copy()
orig_components = X_test_mangled.pop('components')
subst_components = []
for cids in orig_components.values:
    subst_cids = []
    for cid in cids:
        if cid in unknown_cids:
            target_cid = cid_to_subst[cid]
        else:
            target_cid = cid
        subst_cids.append(target_cid)
    subst_components.append(subst_cids)
X_test_mangled['components'] = subst_components

X_test_mangled['orig_components'] = orig_components
print X_test_mangled[['tube_assembly_id', 'components', 'orig_components']][:10]
X_test_mangled.pop('orig_components')
None

# Note that we only make substitutions for 'straight' at the moment,
# so some component lists will remain unchanged...

  tube_assembly_id        components   orig_components
0         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
1         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
2         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
3         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
4         TA-00566  [C-1329, C-1329]  [C-1785, C-1785]
5         TA-00968          [C-1764]          [C-1764]
6         TA-01243          [C-1996]          [C-0141]
7         TA-01243          [C-1996]          [C-0141]
8         TA-01243          [C-1996]          [C-0141]
9         TA-01243          [C-1996]          [C-0141]


In [61]:
print X_test.shape, X_test_mangled.shape
X_test_mangled_feats = featurizer.transform(X_test_mangled)
X_test_mangled_np = X_test_mangled_feats.astype(np.float).values
xgtest_mangled = xgb.DMatrix(X_test_mangled_np)
y_test_mangled_pred = model.predict(xgtest_mangled)
test_mangled_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_mangled_pred))
print test_mangled_rmsle

(152, 50) (152, 50)
0.432077643867


In [110]:
# Experiment: Train on test set, to see if our model can even represent this shit.

num_rounds = 1000
hack_train = xgb.DMatrix(X_test_np, label=y_test_np)
%time model = xgb.train(params.items(), hack_train, num_rounds)
%time y_hack_train_pred = model.predict(hack_train)
hack_train_rmsle = np.sqrt(mean_squared_error(y_test_np, y_hack_train_pred))
print train_rmsle

CPU times: user 1.64 s, sys: 24 ms, total: 1.66 s
Wall time: 1.04 s
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 185 µs
0.123782731596


In [136]:
# How many of the retained test rows fall under each bracketing pattern.
X_test.bracketing_pattern.value_counts()

()                                      51
(1, 2, 5, 10, 25, 50, 100, 250)         40
(15, 30, 50, 60, 100, 200, 300, 400)     8
(1, 6, 20)                               6
(1, 2, 5, 10, 25, 50)                    6
(1, 3, 5, 10, 20, 50)                    6
(2, 5, 10, 15, 20)                       5
(1, 2, 3, 5, 7)                          5
(1, 3, 5, 7, 9)                          5
(1, 2, 3, 5)                             4
(3, 5, 10, 15)                           4
(1, 2, 5)                                3
(10, 20, 50)                             3
(50, 100)                                2
(5, 20)                                  2
(1, 2)                                   2
dtype: int64