In [1]:
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata

In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

features = [x for x in train_df.columns if x.startswith("var")]


In [3]:
#Reverse features
for var in features:
    if np.corrcoef( train_df['target'], train_df[var] )[1][0] < 0:
        train_df[var] = train_df[var] * -1
        test_df[var]  = test_df[var]  * -1

In [4]:
#count all values
var_stats = {}
hist_df = pd.DataFrame()
for var in features:
    var_stats = train_df[var].append(test_df[var]).value_counts()
    hist_df[var] = pd.Series(test_df[var]).map(var_stats)
    hist_df[var] = hist_df[var] > 1
#remove fake test rows
ind = hist_df.sum(axis=1) != 200

In [5]:
#recount values without fake rows
var_stats = {}
for var in features:
    var_stats[var] = train_df[var].append(test_df[ind][var]).value_counts()

In [6]:
def logit(p):
    return np.log(p) - np.log(1 - p)

def var_to_feat(vr, var_stats, feat_id ):
    new_df = pd.DataFrame()
    new_df["var"] = vr.values
    new_df["hist"] = pd.Series(vr).map(var_stats)
    new_df["feature_id"] = feat_id
    new_df["var_rank"] = new_df["var"].rank()/200000.
    return new_df.values

In [7]:
TARGET = np.array( list(train_df['target'].values) * 200 )

TRAIN = []
var_mean = {}
var_var  = {}
for var in features:
    tmp = var_to_feat(train_df[var], var_stats[var], int(var[4:]) )
    var_mean[var] = np.mean(tmp[:,0]) 
    var_var[var]  = np.var(tmp[:,0])
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    TRAIN.append( tmp )
TRAIN = np.vstack( TRAIN )

del train_df
_=gc.collect()

print( TRAIN.shape, len( TARGET ) )

(40000000, 4) 40000000


In [8]:
model = lgb.LGBMClassifier(**{
     'learning_rate': 0.04,
     'num_leaves': 31,
     'max_bin': 1023,
     'min_child_samples': 1000,
     'reg_alpha': 0.1,
     'reg_lambda': 0.2,
     'feature_fraction': 1.0,
     'bagging_freq': 1,
     'bagging_fraction': 0.85,
     'objective': 'binary',
     'n_jobs': -1,
     'n_estimators':200,})

MODELS = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=11111)
for fold_, (train_indexes, valid_indexes) in enumerate(skf.split(TRAIN, TARGET)):
    print('Fold:', fold_ )
    model = model.fit( TRAIN[train_indexes], TARGET[train_indexes],
                      eval_set = (TRAIN[valid_indexes], TARGET[valid_indexes]),
                      verbose = 10,
                      eval_metric='auc',
                      early_stopping_rounds=25,
                      categorical_feature = [2] )
    MODELS.append( model )

del TRAIN, TARGET
_=gc.collect()

Fold: 0


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528068	valid_0's binary_logloss: 0.325585
[20]	valid_0's auc: 0.528253	valid_0's binary_logloss: 0.325354
[30]	valid_0's auc: 0.5283	valid_0's binary_logloss: 0.325251
[40]	valid_0's auc: 0.528371	valid_0's binary_logloss: 0.325201
[50]	valid_0's auc: 0.528432	valid_0's binary_logloss: 0.325176
[60]	valid_0's auc: 0.528445	valid_0's binary_logloss: 0.325162
[70]	valid_0's auc: 0.52843	valid_0's binary_logloss: 0.325154
[80]	valid_0's auc: 0.528391	valid_0's binary_logloss: 0.32515
[90]	valid_0's auc: 0.528347	valid_0's binary_logloss: 0.325148
Early stopping, best iteration is:
[65]	valid_0's auc: 0.528454	valid_0's binary_logloss: 0.325157
Fold: 1


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.526741	valid_0's binary_logloss: 0.325592
[20]	valid_0's auc: 0.527081	valid_0's binary_logloss: 0.325365
[30]	valid_0's auc: 0.527221	valid_0's binary_logloss: 0.325263
[40]	valid_0's auc: 0.527306	valid_0's binary_logloss: 0.325215
[50]	valid_0's auc: 0.527396	valid_0's binary_logloss: 0.32519
[60]	valid_0's auc: 0.527444	valid_0's binary_logloss: 0.325175
[70]	valid_0's auc: 0.527445	valid_0's binary_logloss: 0.325168
[80]	valid_0's auc: 0.527465	valid_0's binary_logloss: 0.325163
[90]	valid_0's auc: 0.52746	valid_0's binary_logloss: 0.32516
[100]	valid_0's auc: 0.52746	valid_0's binary_logloss: 0.325158
[110]	valid_0's auc: 0.527498	valid_0's binary_logloss: 0.325156
[120]	valid_0's auc: 0.527492	valid_0's binary_logloss: 0.325155
[130]	valid_0's auc: 0.527458	valid_0's binary_logloss: 0.325155
Early stopping, best iteration is:
[110]	valid_0's auc: 0.527498	valid_0's binary_logloss: 0.325156
Fold: 

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528186	valid_0's binary_logloss: 0.325583
[20]	valid_0's auc: 0.528469	valid_0's binary_logloss: 0.325348
[30]	valid_0's auc: 0.528523	valid_0's binary_logloss: 0.325244
[40]	valid_0's auc: 0.528568	valid_0's binary_logloss: 0.325193
[50]	valid_0's auc: 0.528572	valid_0's binary_logloss: 0.325168
[60]	valid_0's auc: 0.528543	valid_0's binary_logloss: 0.325153
[70]	valid_0's auc: 0.528526	valid_0's binary_logloss: 0.325145
Early stopping, best iteration is:
[49]	valid_0's auc: 0.528584	valid_0's binary_logloss: 0.32517
Fold: 3


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.527267	valid_0's binary_logloss: 0.325586
[20]	valid_0's auc: 0.527657	valid_0's binary_logloss: 0.325357
[30]	valid_0's auc: 0.527799	valid_0's binary_logloss: 0.325255
[40]	valid_0's auc: 0.52782	valid_0's binary_logloss: 0.325207
[50]	valid_0's auc: 0.527854	valid_0's binary_logloss: 0.325183
[60]	valid_0's auc: 0.527898	valid_0's binary_logloss: 0.325169
[70]	valid_0's auc: 0.5279	valid_0's binary_logloss: 0.325162
[80]	valid_0's auc: 0.527903	valid_0's binary_logloss: 0.325158
[90]	valid_0's auc: 0.527917	valid_0's binary_logloss: 0.325155
[100]	valid_0's auc: 0.52792	valid_0's binary_logloss: 0.325153
[110]	valid_0's auc: 0.527915	valid_0's binary_logloss: 0.325152
Early stopping, best iteration is:
[93]	valid_0's auc: 0.527927	valid_0's binary_logloss: 0.325154
Fold: 4


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528537	valid_0's binary_logloss: 0.325578
[20]	valid_0's auc: 0.528778	valid_0's binary_logloss: 0.325341
[30]	valid_0's auc: 0.528977	valid_0's binary_logloss: 0.325233
[40]	valid_0's auc: 0.529028	valid_0's binary_logloss: 0.325181
[50]	valid_0's auc: 0.529068	valid_0's binary_logloss: 0.325154
[60]	valid_0's auc: 0.529078	valid_0's binary_logloss: 0.32514
[70]	valid_0's auc: 0.529078	valid_0's binary_logloss: 0.32513
[80]	valid_0's auc: 0.52911	valid_0's binary_logloss: 0.325125
[90]	valid_0's auc: 0.529115	valid_0's binary_logloss: 0.325122
[100]	valid_0's auc: 0.529111	valid_0's binary_logloss: 0.325121
[110]	valid_0's auc: 0.529135	valid_0's binary_logloss: 0.325119
[120]	valid_0's auc: 0.529138	valid_0's binary_logloss: 0.325118
[130]	valid_0's auc: 0.529135	valid_0's binary_logloss: 0.325117
[140]	valid_0's auc: 0.529135	valid_0's binary_logloss: 0.325117
[150]	valid_0's auc: 0.529121	valid_0's 

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528025	valid_0's binary_logloss: 0.325597
[20]	valid_0's auc: 0.528384	valid_0's binary_logloss: 0.325369
[30]	valid_0's auc: 0.528583	valid_0's binary_logloss: 0.325267
[40]	valid_0's auc: 0.528685	valid_0's binary_logloss: 0.325218
[50]	valid_0's auc: 0.52874	valid_0's binary_logloss: 0.325194
[60]	valid_0's auc: 0.528744	valid_0's binary_logloss: 0.325181
[70]	valid_0's auc: 0.528765	valid_0's binary_logloss: 0.325174
[80]	valid_0's auc: 0.528755	valid_0's binary_logloss: 0.325169
[90]	valid_0's auc: 0.528732	valid_0's binary_logloss: 0.325167
Early stopping, best iteration is:
[68]	valid_0's auc: 0.528768	valid_0's binary_logloss: 0.325174
Fold: 6


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528632	valid_0's binary_logloss: 0.325578
[20]	valid_0's auc: 0.528933	valid_0's binary_logloss: 0.325339
[30]	valid_0's auc: 0.528982	valid_0's binary_logloss: 0.325232
[40]	valid_0's auc: 0.529017	valid_0's binary_logloss: 0.325181
[50]	valid_0's auc: 0.52904	valid_0's binary_logloss: 0.325154
[60]	valid_0's auc: 0.529058	valid_0's binary_logloss: 0.325138
[70]	valid_0's auc: 0.529054	valid_0's binary_logloss: 0.32513
[80]	valid_0's auc: 0.52905	valid_0's binary_logloss: 0.325124
[90]	valid_0's auc: 0.529059	valid_0's binary_logloss: 0.32512
[100]	valid_0's auc: 0.529046	valid_0's binary_logloss: 0.325119
[110]	valid_0's auc: 0.529038	valid_0's binary_logloss: 0.325117
Early stopping, best iteration is:
[86]	valid_0's auc: 0.529077	valid_0's binary_logloss: 0.325122
Fold: 7


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528481	valid_0's binary_logloss: 0.325589
[20]	valid_0's auc: 0.528741	valid_0's binary_logloss: 0.325359
[30]	valid_0's auc: 0.528838	valid_0's binary_logloss: 0.325256
[40]	valid_0's auc: 0.528863	valid_0's binary_logloss: 0.325208
[50]	valid_0's auc: 0.528913	valid_0's binary_logloss: 0.325183
[60]	valid_0's auc: 0.528915	valid_0's binary_logloss: 0.325169
[70]	valid_0's auc: 0.528884	valid_0's binary_logloss: 0.325161
Early stopping, best iteration is:
[54]	valid_0's auc: 0.528921	valid_0's binary_logloss: 0.325177
Fold: 8


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.527734	valid_0's binary_logloss: 0.325589
[20]	valid_0's auc: 0.528047	valid_0's binary_logloss: 0.325358
[30]	valid_0's auc: 0.528117	valid_0's binary_logloss: 0.325254
[40]	valid_0's auc: 0.528177	valid_0's binary_logloss: 0.325205
[50]	valid_0's auc: 0.528263	valid_0's binary_logloss: 0.32518
[60]	valid_0's auc: 0.528307	valid_0's binary_logloss: 0.325167
[70]	valid_0's auc: 0.528368	valid_0's binary_logloss: 0.325159
[80]	valid_0's auc: 0.528382	valid_0's binary_logloss: 0.325154
[90]	valid_0's auc: 0.528381	valid_0's binary_logloss: 0.325151
[100]	valid_0's auc: 0.528374	valid_0's binary_logloss: 0.32515
[110]	valid_0's auc: 0.528369	valid_0's binary_logloss: 0.325149
[120]	valid_0's auc: 0.528372	valid_0's binary_logloss: 0.325148
Early stopping, best iteration is:
[95]	valid_0's auc: 0.528393	valid_0's binary_logloss: 0.32515
Fold: 9


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's auc: 0.528183	valid_0's binary_logloss: 0.325586
[20]	valid_0's auc: 0.528359	valid_0's binary_logloss: 0.325356
[30]	valid_0's auc: 0.528532	valid_0's binary_logloss: 0.325251
[40]	valid_0's auc: 0.528611	valid_0's binary_logloss: 0.325202
[50]	valid_0's auc: 0.52865	valid_0's binary_logloss: 0.325176
[60]	valid_0's auc: 0.528679	valid_0's binary_logloss: 0.325162
[70]	valid_0's auc: 0.528729	valid_0's binary_logloss: 0.325154
[80]	valid_0's auc: 0.528725	valid_0's binary_logloss: 0.32515
[90]	valid_0's auc: 0.528738	valid_0's binary_logloss: 0.325147
[100]	valid_0's auc: 0.528741	valid_0's binary_logloss: 0.325145
[110]	valid_0's auc: 0.528721	valid_0's binary_logloss: 0.325145
[120]	valid_0's auc: 0.528751	valid_0's binary_logloss: 0.325144
[130]	valid_0's auc: 0.52875	valid_0's binary_logloss: 0.325143
[140]	valid_0's auc: 0.528749	valid_0's binary_logloss: 0.325143
[150]	valid_0's auc: 0.528759	valid_0's 

In [9]:
ypred = np.zeros( (200000,200) )
for feat,var in enumerate(features):
    tmp = var_to_feat(test_df[var], var_stats[var], int(var[4:]) )
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    for model_id in range(10):
        model = MODELS[model_id]
        ypred[:,feat] += model.predict_proba( tmp )[:,1] / 10.
ypred = np.mean( logit(ypred), axis=1 )

sub = test_df[['ID_code']]
sub['target'] = ypred
sub['target'] = sub['target'].rank() / 200000.
sub.to_csv('golden_sub.csv', index=False)
print( sub.head(10) )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


  ID_code    target
0  test_0  0.795210
1  test_1  0.875305
2  test_2  0.845110
3  test_3  0.868790
4  test_4  0.742180
5  test_5  0.068705
6  test_6  0.109855
7  test_7  0.640410
8  test_8  0.055695
9  test_9  0.164060
