In [5]:
import kagglegym
import numpy as np
import pandas as pd
import random
from sklearn import ensemble, linear_model, metrics

env = kagglegym.make()
o = env.reset()
train = o.train
print(train.shape)
d_mean= train.median(axis=0)
train["nbnulls"]=train.isnull().sum(axis=1)
col=[x for x in train.columns if x not in ['id', 'timestamp', 'y']]

rnd=17

#keeping na information on some columns (best selected by the tree algorithms)
add_nas_ft=True
nas_cols=['technical_9', 'technical_0', 'technical_32', 'technical_16', 'technical_38', 
'technical_44', 'technical_20', 'technical_30', 'technical_13']
#columns kept for evolution from one month to another (best selected by the tree algorithms)
add_diff_ft=True
diff_cols=['technical_22','technical_20', 'technical_30', 'technical_13', 'technical_34']

if add_nas_ft:
    for elt in nas_cols:
        train[elt + '_na'] = pd.isnull(train[elt]).apply(lambda x: 1 if x else 0)
        #no need to keep columns with no information
        if len(train[elt + '_na'].unique())==1:
            print("removed:", elt, '_na')
            del train[elt + '_na']
            nas_cols.remove(elt)


if add_diff_ft:
    train=train.sort_values(by=['id','timestamp'])
    for elt in diff_cols:
        #a quick way to obtain deltas from one month to another but it is false on the first
        #month of each id
        train[elt+"_d"]= train[elt].rolling(2).apply(lambda x:x[1]-x[0]).fillna(0)
    #removing month 0 to reduce the impact of erroneous deltas
    train=train[train.timestamp!=0]

print(train.shape)
cols=[x for x in train.columns if x not in ['id', 'timestamp', 'y']]


(806298, 111)
(805548, 126)


In [3]:
#homemade class used to infer randomly on the way the model learns
class createLinearFeatures:
    
    def __init__(self, n_neighbours=1, max_elts=None, verbose=True, random_state=None):
        self.rnd=random_state
        self.n=n_neighbours
        self.max_elts=max_elts
        self.verbose=verbose
        self.neighbours=[]
        self.clfs=[]
        
    def fit(self,train,y):
        if self.rnd!=None:
            random.seed(self.rnd)
        if self.max_elts==None:
            self.max_elts=len(train.columns)
        list_vars=list(train.columns)
        random.shuffle(list_vars)
        
        lastscores=np.zeros(self.n)+1e15

        for elt in list_vars[:self.n]:
            self.neighbours.append([elt])
        list_vars=list_vars[self.n:]
        
        for elt in list_vars:
            indice=0
            scores=[]
            for elt2 in self.neighbours:
                if len(elt2)<self.max_elts:
                    clf=linear_model.LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
                    clf.fit(train[elt2+[elt]], y)
                    scores.append(metrics.mean_squared_error(y,clf.predict(train[elt2 + [elt]])))
                    indice=indice+1
                else:
                    scores.append(lastscores[indice])
                    indice=indice+1
            gains=lastscores-scores
            if gains.max()>0:
                temp=gains.argmax()
                lastscores[temp]=scores[temp]
                self.neighbours[temp].append(elt)

        indice=0
        for elt in self.neighbours:
            clf=linear_model.LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
            clf.fit(train[elt], y)
            self.clfs.append(clf)
            if self.verbose:
                print(indice, lastscores[indice], elt)
            indice=indice+1
                    
    def transform(self, train):
        indice=0
        for elt in self.neighbours:
            #this line generates a warning. Could be avoided by working and returning
            #with a copy of train.
            #kept this way for memory management
            train['neighbour'+str(indice)]=self.clfs[indice].predict(train[elt])
            indice=indice+1
        return train
    
    def fit_transform(self, train, y):
        self.fit(train, y)
        return self.transform(train)



In [4]:
#a home-made class attempt to remove outliers by successive quantization on residuals
class recurrent_linear_approx():
    def __init__(self, quant=.999, limit_size_train=.9):
        self.quant=quant
        self.limit_size_train=limit_size_train
        self.bestmodel=[]
       
    def fit(self, train, y):
        internal_model=linear_model.RidgeCV(fit_intercept=False)
        bestscore=1e15
        better=True
        indextrain=train.dropna().index
        limitlen=len(train)*self.limit_size_train
        while better:
            internal_model.fit(train.ix[indextrain], y.ix[indextrain])
            score=metrics.mean_squared_error(internal_model.predict(train.ix[indextrain]), y.ix[indextrain])
            if score < bestscore:
                bestscore=score
                self.bestmodel=internal_model
                residual=y.ix[indextrain]-internal_model.predict(train.ix[indextrain])
                indextrain=residual[abs(residual)<=abs(residual).quantile(self.quant)].index
                if len(indextrain)<limitlen:
                    better=False
            else:
                better=False
                self.bestmodel=internal_model

    def predict(self, test):
        return self.bestmodel.predict(test)



In [6]:
#generation of linear models
cols2fit=['technical_22','technical_20', 'technical_30_d', 'technical_20_d', 'technical_30', 
'technical_13', 'technical_34']
models=[]
columns=[]
residuals=[]
for elt in cols2fit:
    print("fitting linear model on ", elt)
    model=recurrent_linear_approx(quant=.99, limit_size_train=.9)
    model.fit(train.loc[:,[elt]],train.loc[:, 'y'])
    models.append(model)
    columns.append([elt])
    residuals.append(abs(model.predict(train[[elt]].fillna(d_mean))-train.y))

train=train.fillna(d_mean)
    


('fitting linear model on ', 'technical_22')
('fitting linear model on ', 'technical_20')
('fitting linear model on ', 'technical_30_d')
('fitting linear model on ', 'technical_20_d')
('fitting linear model on ', 'technical_30')
('fitting linear model on ', 'technical_13')
('fitting linear model on ', 'technical_34')


In [7]:
#adding all trees generated by a tree regressor
print("adding new features")
featureexpander=createLinearFeatures(n_neighbours=30, max_elts=2, verbose=True, random_state=rnd)
index2use=train[abs(train.y)<0.086].index
featureexpander.fit(train.ix[index2use,cols],train.ix[index2use,'y'])
trainer=featureexpander.transform(train[cols])
treecols=trainer.columns

print("training trees")
model = ensemble.ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=rnd, verbose=0)
model.fit(trainer,train.y)
print(pd.DataFrame(model.feature_importances_,index=treecols).sort_values(by=[0]).tail(30))
for elt in model.estimators_:
    models.append(elt)
    columns.append(treecols)
    residuals.append(abs(elt.predict(trainer)-train.y))




adding new features
(0, 0.00037784202140755951, [u'fundamental_63', u'fundamental_20'])
(1, 0.00037783532752655447, [u'technical_33', u'fundamental_42'])
(2, 0.00037783311563543975, [u'fundamental_29', u'fundamental_23'])
(3, 0.00037783940206281841, [u'fundamental_41', u'fundamental_46'])
(4, 0.0003778421669267118, [u'technical_41', u'fundamental_44'])
(5, 0.00037783652078360319, [u'fundamental_7', u'fundamental_0'])
(6, 0.00037780959974043071, [u'fundamental_54', u'fundamental_5'])
(7, 0.00037784167216159403, [u'fundamental_17', 'nbnulls'])
(8, 0.00037783363950438797, [u'technical_6', u'fundamental_45'])
(9, 0.00037784193409606814, [u'fundamental_6', u'fundamental_19'])
(10, 0.00037783838342875242, [u'technical_0', u'fundamental_35'])
(11, 0.00037783297011628747, [u'fundamental_31', u'technical_44'])
(12, 0.00037781347054988146, [u'fundamental_12', u'fundamental_13'])
(13, 0.00037781929131597281, [u'fundamental_18', u'fundamental_26'])
(14, 0.00037784263258799911, [u'fundamental_22', 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


training trees
                        0
fundamental_23   0.007772
neighbour13      0.007933
technical_0_na   0.008082
technical_21     0.008340
technical_40     0.008478
neighbour19      0.009486
fundamental_53   0.009672
fundamental_18   0.010207
technical_9_na   0.010329
technical_38_na  0.011448
technical_19     0.011714
fundamental_8    0.012363
technical_32_na  0.012939
fundamental_58   0.013495
technical_7      0.013629
technical_17     0.014164
technical_30_d   0.014676
neighbour23      0.016480
technical_44_na  0.023595
technical_6      0.024417
neighbour28      0.026658
technical_20_d   0.027523
technical_2      0.033885
neighbour8       0.037184
technical_43     0.041851
neighbour20      0.052079
fundamental_11   0.067610
technical_11     0.073563
technical_20     0.092986
technical_30     0.121845


In [10]:
#model selection : create a new target selecting models with lowest asolute residual for each line
#the objective at this step is to keep only the few best elements which should
#lead to a better generalization
num_to_keep=10
targetselector=np.array(residuals).T
targetselector=np.argmin(targetselector, axis=1)
print("selecting best models:")
print(pd.Series(targetselector).value_counts().head(num_to_keep))

tokeep=pd.Series(targetselector).value_counts().head(num_to_keep).index
tokeepmodels=[]
tokeepcolumns=[]
tokeepresiduals=[]
for elt in tokeep:
    tokeepmodels.append(models[elt])
    tokeepcolumns.append(columns[elt])
    tokeepresiduals.append(residuals[elt])



selecting best models:
0     177555
1     119791
18     75122
2      65190
6      45465
3      27463
15     21027
41     18533
37     18134
52     15273
dtype: int64


In [11]:
#creating a new target for a model in charge of predicting which model is best for the current line
targetselector=np.array(tokeepresiduals).T
targetselector=np.argmin(targetselector, axis=1)

print("training selection model")
modelselector = ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=4, n_jobs=-1, random_state=rnd, verbose=0)
modelselector.fit(trainer, targetselector)
print(pd.DataFrame(modelselector.feature_importances_,index=treecols).sort_values(by=[0]).tail(30))

# original: lastvalues=train[train.timestamp==905][['id']+diff_cols].copy()
lastvalues=train[train.timestamp==train.timestamp.iloc[-1]][['id']+diff_cols].copy()



training selection model
                        0
technical_6      0.008497
fundamental_21   0.008564
technical_11     0.009069
technical_16_na  0.009074
technical_27     0.009353
technical_9_na   0.010328
technical_35     0.010585
technical_20_d   0.011106
technical_30_d   0.011864
technical_13     0.012851
neighbour28      0.013220
technical_2      0.013555
technical_36     0.015371
technical_17     0.015576
technical_43     0.015749
technical_29     0.017248
technical_0_na   0.017879
technical_32_na  0.019099
neighbour23      0.020424
technical_38_na  0.021112
technical_30     0.021985
neighbour20      0.024403
technical_14     0.031557
technical_7      0.031869
technical_40     0.038140
technical_20     0.044078
technical_34     0.077328
neighbour18      0.102321
technical_44_na  0.115022
technical_22     0.131173
end of trainind, now predicting


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(100, 6, 0.022921731779680134, -0.1275476969156128)
(200, 19, -0.16869656636732488, -0.11998823422526599)
(300, 28, -0.050756281559958698, -0.11969185541748409)
(400, 36, -0.14310864159303302, -0.12002827888008366)
(500, 44, -0.12920172381612383, -0.12107404327716705)
(600, 55, -0.13926399577265822, -0.11999842810919668)
(700, 64, -0.23252552884455999, -0.12707330697343383)
(800, 73, -0.080823434599645819, -0.12987168531543578)
(900, 80, -0.14540602738078329, -0.12976604214373177)
0.0191473798751


In [None]:
print("end of trainind, now predicting")
indice=0
countplus=0
rewards=[]
while True:
    indice+=1
    test = o.features
    test["nbnulls"]=test.isnull().sum(axis=1)
    if add_nas_ft:
        for elt in nas_cols:
            test[elt + '_na'] = pd.isnull(test[elt]).apply(lambda x: 1 if x else 0)
    test=test.fillna(d_mean)
    d_mean=test.median(axis=0)

    pred = o.target
    if add_diff_ft:
        #creating deltas from lastvalues
        indexcommun=list(set(lastvalues.id) & set(test.id))
        lastvalues=pd.concat([test[test.id.isin(indexcommun)]['id'],
            pd.DataFrame(test[diff_cols][test.id.isin(indexcommun)].values-lastvalues[diff_cols][lastvalues.id.isin(indexcommun)].values,
            columns=diff_cols, index=test[test.id.isin(indexcommun)].index)],
            axis=1)
        #adding them to test data    
        test=test.merge(right=lastvalues, how='left', on='id', suffixes=('','_d')).fillna(0)
        #storing new lastvalues
        lastvalues=test[['id']+diff_cols].copy()
    
    testid=test.id
    test=featureexpander.transform(test[cols])
    #prediction using modelselector and models list
    selected_prediction = modelselector.predict_proba(test.loc[: ,treecols])
    for ind,elt in enumerate(tokeepmodels):
        pred['y']+=selected_prediction[:,ind]*elt.predict(test[tokeepcolumns[ind]])

    indexbase=pred.index
    pred.index=testid    
    oldpred=pred['y']
    pred.index=indexbase
    
    o, reward, done, info = env.step(pred)
    rewards.append(reward)
    if reward>0:
        countplus+=1
    
    if indice%100==0:
        print(indice, countplus, reward, np.mean(rewards))
        
    if done:
        print(info["public_score"])
        break

In [18]:
targetselector2=np.array(residuals).T


In [23]:
tmp=(np.mean(targetselector2,axis=0))

In [20]:
tokeep

Int64Index([0, 1, 18, 2, 6, 3, 15, 41, 37, 52], dtype='int64')

In [26]:
tmp.argsort()[:(num_to_keep)]

array([37, 22, 86, 72, 33, 97, 85, 57, 84, 12])

In [30]:
targetselector=np.array(residuals).T
targetselector=np.argmin(targetselector, axis=1)
print("selecting best models:")
print(pd.Series(targetselector).value_counts())

selecting best models:
0      177555
1      119791
18      75122
2       65190
6       45465
3       27463
15      21027
41      18533
37      18134
52      15273
104     12419
22      11741
63      11710
64      11572
17      10208
97       8289
102      8281
23       8001
84       6467
44       6434
29       6124
12       6115
30       5435
95       5205
99       4916
48       4663
87       4578
67       4576
68       4229
106      4110
        ...  
49        574
72        568
77        566
83        566
25        550
92        542
60        532
56        523
61        509
27        499
7         461
4         432
78        373
35        353
21        338
100       324
28        300
59        266
76        240
82        191
90        187
45        180
75        176
31        144
74        143
43        138
71        128
42        124
9         110
91         65
dtype: int64


In [31]:
modelselector.predict_proba

<bound method ExtraTreesClassifier.predict_proba of ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=4, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
           verbose=0, warm_start=False)>

In [37]:
(np.array(tokeepresiduals).T).shape

(805548, 10)

In [39]:
targetselector.shape

(805548,)