It is solution [public ranking #16, private ranking #18][1] by [Willie Liao][2].

[1]: https://www.kaggle.com/willieliao/et1-ridge3-med-adj
[2]: https://www.kaggle.com/willieliao

There are several things we can learn from it:

1. data preprocessing: fillna, clip
2. feature contruction: moving average, lag, 
3. choice of model and ensemble: extraTreeRegressor, Ridge, weighted sum

In [1]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion, make_pipeline
from gc import collect
from collections import deque

In [2]:
#display settings
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.4f}'.format)

In [3]:
N_THREADS = -1
RANDOM_SEED = 20170228

In [4]:
#Kaggle data is usually noisy
#It is common to clip response variable y

Y_CLIP_LO, Y_CLIP_HI = -0.075, 0.075
Y_LAG_CLIP_LO, Y_LAG_CLIP_HI = -0.1, 0.1
TS_ADJ_CLIP_LO, TS_ADJ_CLIP_HI = 0.05, 2.0
CUMMED_ADJ_RATIO = 0.04
MIN_ADJ_DATA = 100

In [5]:
cols_na = ['technical_' + str(i) for i in [0, 9, 13, 16, 20, 30, 32, 38, 44]]
cols_diff = ['technical_' + str(i) for i in [11, 13, 20, 22, 27, 30, 34, 44]] + ['derived_0']
cols_backshift = cols_diff + ['ma', 'fundamental_11']
cols_ts = ['ma', 'y_lag', 'sign_change']

In [6]:
env = kagglegym.make()
o = env.reset()

In [7]:
###################################
### transform and fill missing with medians
###################################
class CountFillMissing(TransformerMixin):
    def __init__(self, cols_orig, cols_na, cols_medians):
        self.cols_orig = cols_orig
        self.cols_na = cols_na
        self.cols_medians = cols_medians

    def fit(self, X=None):
        return self

    def transform(self, X):
        # see physical_meaning_tech_20_30.ipynb for definition of moving average
        X['ma'] = X['technical_20'] + X['technical_13'] - X['technical_30']
        X = X.assign(nas=0, nas1=0)
        for c in self.cols_orig:
            X[c + '_nan'] = pd.isnull(X[c])
            X['nas'] += X[c + '_nan']
            if c in self.cols_na:
                X['nas1'] += X[c + '_nan']
        X.fillna(self.cols_medians, inplace=True)
        return X

    def fit_transform(self, X, y=None, **fit_params):
        return self.transform(X)



In [8]:
###################################
### transform and fill missing with
### medians
###################################

class MovingAverage(TransformerMixin):
    def __init__(self, cols_backshift, cols_diff, cols_medians):
        self.cols_backshift = cols_backshift
        self.cols_diff = cols_diff
        self.cols_medians = cols_medians
        self.cols_keep = list({'id', 'ma', 'y_lag', 'y_lag_prod', 'sign_change_sum', 'ti'}
                              | set(self.cols_backshift) | set(self.cols_diff))
        # Store latest features for differences and cumulative columns
        self.previous = None

    def fit(self, X=None):
        return self

    def transform(self, X):
        # Previous values
        X = pd.merge(X, self.previous, how='left', on='id', suffixes=['', '_B'], sort=False)
        for c in self.cols_backshift:
            X[c + '_B'].fillna(self.cols_medians[c], inplace=True)
            if c in self.cols_diff:
                X[c + '_D'] = X[c] - X[c + '_B']

        # Fill if no previous value
        X.ti.fillna(-1, inplace=True)
        X.loc[X.y_lag.isnull(), 'y_lag'] = X.loc[X.y_lag.isnull(), 'ma']
        X.loc[X.y_lag_prod.isnull(), 'y_lag_prod'] = X.y_lag.loc[X.y_lag_prod.isnull()] + 1.0
        X.sign_change_sum.fillna(0, inplace=True)

        #############################
        ### feature construction
        #############################
        # Moving Averages
        X['ti'] += 1
        X.rename(columns={'y_lag_prod': 'y_lag_prod_B', 'y_lag': 'y_lag_B'}, inplace=True)
        X['y_lag'] = (15.0 * X['ma'] - 14.0 * X['ma_B']).clip(Y_LAG_CLIP_LO, Y_LAG_CLIP_HI)
        X['y_lag_prod'] = X['y_lag_prod_B'] * (1.0 + X['y_lag'])
        X['y_lag_diff'] = X['y_lag_prod'] - X['y_lag_prod_B']
        X['sign_change'] = X['y_lag'] == X['y_lag_B']
        X['sign_change_sum'] += X['sign_change']
        X['sign_change_cum'] = X['sign_change_sum'] / X['ti']
        X.loc[X.ti < 10, 'sign_change_cum'] = 0.5
        X.drop(['y_lag_prod_B', 'y_lag_B'], axis=1, inplace=True)

        # Need to keep previous ids not present in current timestamp
        self.previous = pd.concat([X[self.cols_keep], self.previous.loc[~self.previous.id.isin(X.id)]])
        return X

    def fit_transform(self, X, y=None, **fit_params):
        # Previous values
        X.sort_values(['id', 'timestamp'], inplace=True)
        X.reset_index(drop=True, inplace=True)
        g = X.groupby('id')
        X['ti'] = g.cumcount()
        for c in self.cols_backshift:
            X[c + '_B'] = g[c].shift(1)
            X[c + '_B'].fillna(self.cols_medians[c], inplace=True)
            if c in self.cols_diff:
                X[c + '_D'] = X[c] - X[c + '_B']
        del g

        # Lagged target
        X['y_lag'] = (15.0 * X['ma'] - 14.0 * X['ma_B']).clip(Y_LAG_CLIP_LO, Y_LAG_CLIP_HI)

        # Cumulative Values
        X['y_lag_prod'] = X['y_lag'] + 1.0
        X['y_lag_prod'] = X.groupby('id')['y_lag_prod'].cumprod()
        X['y_lag_diff'] = X['y_lag_prod'] - X.groupby('id')['y_lag_prod'].shift(1)
        X['y_lag_diff'].fillna(0.0, inplace=True)

        # Sign Change
        g = X.groupby('id')['y_lag']
        X['sign_change'] = np.sign(X.y_lag) != np.sign(g.shift(1).fillna(0.0))
        g = X.groupby('id')
        X['sign_change_sum'] = g['sign_change'].cumsum()
        X['sign_change_cum'] = X['sign_change_sum'] / X['ti']
        X.loc[X.ti < 10, 'sign_change_cum'] = 0.5

        self.previous = g[self.cols_keep].last().reset_index(drop=True)
        del g
        return X

In [9]:
###################################
### Add column 'extreme0', 
### number of extreme values
###################################

class ExtremeValues(TransformerMixin):
    def fit(self):
        return self

    def transform(self, X):
        X['extreme0'] = (
            (X.technical_21 < -1.6).astype(int)
            + (X.technical_35 < -1.0).astype(int)
            + (X.technical_36 < -1.0).astype(int)
            + (X.technical_21 > 2.0).astype(int)
            + (X.technical_27 < -1.3).astype(int)
            + (X.fundamental_53 < -1.0).astype(int))
        return X

    def fit_transform(self, X, y=None, **fit_params):
        return self.transform(X)

In [10]:
###################################
### fit model for selected columns
### on specified rows
###################################

class ModelTransformer(TransformerMixin):
    # Hack to use row and column filters
    def __init__(self, model, cols, rows):
        self.model = model
        self.cols = cols
        self.rows = rows

    def fit(self, X, y):
        self.model.fit(X.loc[self.rows, self.cols], y.loc[self.rows])
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict(X.loc[:, self.cols]))

In [11]:
###################################
### Preprocess
###################################
print('### Preprocess')
# train = pd.read_hdf('../input/train.h5')
train = o.train
print('train before preprocess:', train.shape)
print('timestamps:', train["timestamp"].nunique())

# c_B: lag(1) term of column c
# ti: # appearance of the corresponding id
# y_lag_prod: cummulative production return upto y_lag
# sign_change_sum: # sign changes
cols_excl = ([env.ID_COL_NAME, env.SAMPLE_COL_NAME, env.TARGET_COL_NAME, env.TIME_COL_NAME]
             + [c + '_B' for c in cols_backshift] + ['ti', 'y_lag_prod', 'sign_change_sum'])
cols_orig = [c for c in o.train.columns if c not in cols_excl] + ['ma']
cols_na_count = [c + '_nan' for c in cols_orig if c not in cols_excl]

train['ma'] = train['technical_20'] + train['technical_13'] - train['technical_30']
cols_medians = train[cols_orig].median(axis=0).to_dict()

####################################
### Preprocessing piped classes
####################################
preprocess_pipe = make_pipeline(
    CountFillMissing(cols_orig, cols_na, cols_medians)
    , MovingAverage(cols_backshift, cols_diff, cols_medians)
    , ExtremeValues()
)
train = preprocess_pipe.fit_transform(train)
print('train after preprocess:', train.shape)

print('Store previous targets for cumulative median')
y_lag_meds = train.loc[:, ['id', 'y_lag']]

### Preprocess
('train before preprocess:', (806298, 111))
('timestamps:', 906)
('train after preprocess:', (806298, 251))
Store previous targets for cumulative median


In [12]:
###################################
### Models
###################################


###################################
### Use all the features for tree model
### And selected features for LR
###################################

cols_et = [c for c in train.columns if c not in cols_excl]
cols_lr0 = ['y_lag', 'ma', 'technical_11', 'fundamental_11', 'technical_11_B', 'fundamental_11_B']
cols_lr1 = ['y_lag', 'technical_22', 'technical_34', 'technical_22_B', 'technical_34_B']
cols_lr2 = ['ma', 'y_lag_prod', 'y_lag_diff']

post_ts10 = (train.timestamp > 10)
y_is_within_cut = (post_ts10) & (Y_CLIP_LO < train.y) & (train.y < Y_CLIP_HI)


In [13]:
print('MODEL: Extra Trees')
print('Features:', len(cols_et))
rfr = ExtraTreesRegressor(n_estimators=75, max_depth=5, min_samples_split=30, min_samples_leaf=16, n_jobs=N_THREADS, random_state=20170214)
model_et = rfr.fit(train.loc[post_ts10, cols_et], train.loc[post_ts10, 'y'])

###################################
### Linear Models are susceptible
### to outliers. try truncated one.
###################################
print('Linear Regression')
model_lr0 = Ridge(fit_intercept=False)
model_lr0.fit(train.loc[y_is_within_cut, cols_lr0], train.loc[y_is_within_cut, 'y'])

model_lr1 = Ridge(fit_intercept=False)
model_lr1.fit(train.loc[y_is_within_cut, cols_lr1], train.loc[y_is_within_cut, 'y'])

model_lr2 = Ridge(fit_intercept=False)
model_lr2.fit(train.loc[y_is_within_cut, cols_lr2], train.loc[y_is_within_cut, 'y'])

models = {'et': model_et, 'lr0': model_lr0, 'lr1': model_lr1, 'lr2': model_lr2}
model_cols = {'et': cols_et, 'lr0': cols_lr0, 'lr1': cols_lr1, 'lr2': cols_lr2}
model_weights = {'et': 0.6, 'lr0': 0.22, 'lr1': 0.03, 'lr2': 0.15}

MODEL: Extra Trees
('Features:', 234)
Linear Regression


In [14]:
### Rolling Timestamp adjustment
test = train.loc[train.timestamp > train.timestamp.max() - 14, :]
test['y_hat'] = 0.0
for n, m in models.items():
    test['y_hat'] += m.predict(test[model_cols[n]]) * model_weights[n]
med_ts = test.groupby('timestamp')[['y_lag', 'y_hat']].apply(np.std)
med_ts = deque(med_ts['y_lag'] / med_ts['y_hat'])

# Clean up
train.drop([c for c in train.columns if c not in ['id', 'timestamp', 'y']], axis=1, inplace=True)
del train, test, post_ts10, y_is_within_cut
collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


137

In [None]:
###################################
### rolling fitting
###################################

while True:
    # Preprocess
    test = o.features
    test = preprocess_pipe.transform(test)

    # Predict
    test['y_hat'] = 0.0
    for n, m in models.items():
        test['y_hat'] += m.predict(test[model_cols[n]]) * model_weights[n]

    # Adjust y_hat by timestamp variability
    if len(test) > MIN_ADJ_DATA:
        y_lag_sd_ts = test.y_lag.std()
        y_hat_sd_ts = test.y_hat.std()

        if y_lag_sd_ts > 1e-8 and y_hat_sd_ts > 1e-8:
            a = y_lag_sd_ts / y_hat_sd_ts
            m = np.median(med_ts)
            if a > 1e-8 and m > 1e-8:
                adj = np.clip(a / m, TS_ADJ_CLIP_LO, TS_ADJ_CLIP_HI)
                test['y_hat'] *= adj
                med_ts.popleft()
                med_ts.append(a)

    # Adjust y_hat by cumulative median
    y_lag_meds = pd.concat([y_lag_meds, test[['id', 'y_lag']]])
    y_lag_med = y_lag_meds.groupby('id').median().reset_index(drop=False)
    test = pd.merge(test, y_lag_med, how='left', on='id', suffixes=['', '_med'])
    test.loc[test.ti<10, 'y_lag_med'] = 0.0
    test['y_hat'] = test['y_hat'] * (1 - CUMMED_ADJ_RATIO) + test['y_lag_med'] * (CUMMED_ADJ_RATIO)

    # Clip
    test['y_hat'] = test['y_hat'].clip(Y_CLIP_LO, Y_CLIP_HI)

    # Cleanup
    pred = o.target
    pred['y'] = test['y_hat']
    test.drop([c for c in test.columns if c not in ['id', 'timestamp', 'y_hat']], axis=1, inplace=True)
    del y_lag_med
    collect()

    o, reward, done, info = env.step(pred)

    if done:
        print("el fin ...", info["public_score"])
        break
    if o.features.timestamp[0] % 100 == 0:
        print('{} {:.2f} {:.0f} {:.2f}'.format(o.features.timestamp[0], reward, a, adj))


1000 -0.31 64 1.15
1100 -0.18 64 1.03
1200 -0.17 54 0.95
1300 0.00 61 1.01
1400 -0.13 64 1.04
1500 -0.17 52 1.01
1600 -0.20 52 0.93
1700 -0.30 53 0.91
