In [1]:
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score

In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df  = pd.read_csv("../input/test.csv")
features = [x for x in train_df.columns if x.startswith("var")]

In [3]:
#Reverse some features.
#Not really necessary for LGB, but helps a little
for var in features:
    if np.corrcoef( train_df['target'], train_df[var] )[1][0] < 0:
        train_df[var] = train_df[var] * -1
        test_df[var]  = test_df[var]  * -1

In [4]:
#count train values to split Rare/NonRare values
var_stats = {}
for var in features:
    var_stats[var] = train_df[var].value_counts()

In [5]:
def logit(p):
    return np.log(p) - np.log(1 - p)

def var_to_feat(vr, var_stats, feat_id ):
    new_df = pd.DataFrame()
    new_df["var"] = vr.values
    new_df["hist"] = pd.Series(vr).map(var_stats)
    new_df["feature_id"] = feat_id
    new_df["var_rank"] = new_df["var"].rank()/200000.
    return new_df.values

In [6]:
TARGET = np.array( list(train_df['target'].values) * 200 )

TRAIN = []
var_mean = {}
var_var  = {}
for var in features:
    tmp = var_to_feat(train_df[var], var_stats[var], int(var[4:]) )
    var_mean[var] = np.mean(tmp[:,0]) 
    var_var[var]  = np.var(tmp[:,0])
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    TRAIN.append( tmp )
TRAIN = np.vstack( TRAIN )

target = train_df['target'].values
del train_df
_=gc.collect()

print( TRAIN.shape, len( TARGET ) )

(40000000, 4) 40000000


In [7]:
model = lgb.LGBMClassifier(**{
     'learning_rate': 0.03,
     'num_leaves': 31,
     'max_bin': 1023,
     'min_child_samples': 1000,
     'feature_fraction': 1.0,
     'bagging_freq': 1,
     'bagging_fraction': 0.85,
     'objective': 'binary',
     'n_jobs': -1,
     'n_estimators':200,})

NFOLDS = 10
predtrain = np.zeros( len(TARGET) )
MODELS = []
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=11111)
for fold_, (train_indexes, valid_indexes) in enumerate(skf.split(TRAIN, TARGET)):
    print('Fold:', fold_ )
    model = model.fit( TRAIN[train_indexes], TARGET[train_indexes],
                      eval_set = (TRAIN[valid_indexes], TARGET[valid_indexes]),
                      verbose = 100,
                      eval_metric='auc',
                      early_stopping_rounds=20,
                      categorical_feature = [2] )
    MODELS.append( model )
    predtrain[valid_indexes] = model.predict_proba( TRAIN[valid_indexes] )[:,1] 

#Reshape to original format 200k x 200
pred = np.reshape( predtrain , (200000,200) , order='F' )
#Use logit for better performance
print( NFOLDS,'-Fold CV AUC:',roc_auc_score( target, np.mean( logit(pred),axis=1)  ) )
_=gc.collect()

Fold: 0


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325236	valid_0's auc: 0.52819
Early stopping, best iteration is:
[103]	valid_0's binary_logloss: 0.325235	valid_0's auc: 0.528192
Fold: 1


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325246	valid_0's auc: 0.527266
Early stopping, best iteration is:
[141]	valid_0's binary_logloss: 0.32524	valid_0's auc: 0.5273
Fold: 2


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.325262	valid_0's auc: 0.528359
Fold: 3


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325234	valid_0's auc: 0.527697
Early stopping, best iteration is:
[104]	valid_0's binary_logloss: 0.325233	valid_0's auc: 0.527699
Fold: 4


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325206	valid_0's auc: 0.52901
Early stopping, best iteration is:
[121]	valid_0's binary_logloss: 0.325201	valid_0's auc: 0.529025
Fold: 5


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325254	valid_0's auc: 0.528546
[200]	valid_0's binary_logloss: 0.325246	valid_0's auc: 0.528622
Did not meet early stopping. Best iteration is:
[196]	valid_0's binary_logloss: 0.325246	valid_0's auc: 0.528626
Fold: 6


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325208	valid_0's auc: 0.52891
Early stopping, best iteration is:
[134]	valid_0's binary_logloss: 0.325201	valid_0's auc: 0.528932
Fold: 7


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[50]	valid_0's binary_logloss: 0.325298	valid_0's auc: 0.528562
Fold: 8


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325245	valid_0's auc: 0.528089
Early stopping, best iteration is:
[98]	valid_0's binary_logloss: 0.325245	valid_0's auc: 0.528099
Fold: 9


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.325246	valid_0's auc: 0.528503
Early stopping, best iteration is:
[100]	valid_0's binary_logloss: 0.325246	valid_0's auc: 0.528503
10 -Fold CV AUC: 0.9165690729625898


In [8]:
ypred = np.zeros( (200000,200) )
for feat,var in enumerate(features):
    #build dataset
    tmp = var_to_feat(test_df[var], var_stats[var], int(var[4:]) )
    #Standard Scale feature according train statistics
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    tmp[:,1] = tmp[:,1] + 1
    #Write 1 to frequency of values not seem in trainset
    tmp[ np.isnan(tmp) ] = 1
    #Predict testset for N folds
    for model_id in range(NFOLDS):
        model = MODELS[model_id]
        ypred[:,feat] += model.predict_proba( tmp )[:,1] / NFOLDS
ypred = np.mean( logit(ypred), axis=1 )

sub = test_df[['ID_code']]
sub['target'] = ypred
sub['target'] = sub['target'].rank() / 200000.
sub.to_csv('golden_sub.csv', index=False)
print( sub.head(20) )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


    ID_code    target
0    test_0  0.857510
1    test_1  0.902815
2    test_2  0.883070
3    test_3  0.880810
4    test_4  0.665730
5    test_5  0.029380
6    test_6  0.096035
7    test_7  0.753030
8    test_8  0.042735
9    test_9  0.066690
10  test_10  0.905125
11  test_11  0.742660
12  test_12  0.531325
13  test_13  0.600985
14  test_14  0.162855
15  test_15  0.445255
16  test_16  0.957105
17  test_17  0.460795
18  test_18  0.771680
19  test_19  0.193745
