In [56]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score as cv_scoring
from sklearn.metrics import log_loss
from tqdm import tqdm_notebook
import matplotlib.pylab as plt
import copy
from catboost import CatBoostClassifier, CatBoost

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier 

plt.style.use('ggplot')

In [57]:
train = pd.read_csv('train.csv', delimiter=';', na_values='None')
test  = pd.read_csv('test.csv', delimiter=';', na_values='None')

N_train = train.shape[0]
data = pd.concat([train, test], 0).reset_index(drop=True)

In [58]:
data = np.abs(data)

while np.any(data['ap_hi'] > 800):
    data.loc[data['ap_hi'] > 800, 'ap_hi'] /= 10

for i in xrange(3):
    data.loc[data['ap_hi'] < 22, 'ap_hi'] *= 10

while np.any(data['ap_lo'] > 600):
    data.loc[data['ap_lo'] > 600, 'ap_lo'] /= 10

for i in xrange(3):
    data.loc[data['ap_lo'] < 11, 'ap_lo'] *= 10
    
data['bmi'] = data['weight'] / (data['height'] / 100) ** 2

In [59]:
x_train = data[:N_train].drop(['id', 'cardio'], 1)
y_train = data[:N_train]['cardio']

x_test = data[N_train:].drop(['id', 'cardio'], 1)

In [60]:
data_nn = copy.deepcopy(data)
features = data_nn.columns.drop('cardio')

data_nn[features] = (data_nn[features] - data_nn[features].mean()) / data_nn[features].std()
data_nn = data_nn.fillna(data_nn.mean())

In [61]:
x_train_nn = data_nn[:N_train].drop(['id', 'cardio'], 1).values
y_train_nn = data_nn[:N_train]['cardio'].values

x_test_nn = data_nn[N_train:].drop(['id', 'cardio'], 1).values

#x_train_nn = data_nn.loc[:int(0.8*N_train)-1].drop(['id', 'cardio'], 1).values
#y_train_nn = data_nn.loc[:int(0.8*N_train)-1]['cardio'].values

#x_test_nn = data_nn.loc[int(0.8*N_train):N_train-1].drop(['id', 'cardio'], 1).values


#y_test_nn = data_nn.loc[int(0.8*N_train):N_train-1]['cardio'].values

In [73]:
np.random.randint(0,1,1)

array([0])

In [78]:
y_pred_cb = np.zeros(len(x_test))

n = 15

for i in tqdm_notebook(xrange(n)):
    model = CatBoostClassifier(iterations = 500 + np.random.randint(-50,50,1)[0], 
                               l2_leaf_reg = 3 + np.random.uniform(-1, 1, 1)[0],
                               depth = 6 + np.random.randint(-1, 2, 1)[0],
                               learning_rate=0.03 + np.random.uniform(-0.01, 0.01, 1)[0],
                           )
    
    model.fit(x_train, y_train)
    
    y_pred_cb += model.predict_proba(x_test)[:,1]

y_pred_cb /= n




In [79]:
y_pred_cb

array([ 0.50442863,  0.53199236,  0.40463755, ...,  0.43266525,
        0.31372551,  0.6790423 ])

In [53]:
ltrain = lgb.Dataset(x_train, y_train)

In [54]:
def cv_score(params):
    score = 0
    for i in xrange(1):
        
        lgb_params = {
                        'objective'        : 'binary',
                        'metric'           : 'binary_logloss',
                        'num_leaves'       : max(5,int(params['num_leaves'])),
                        'max_depth'        : max(1, int(params['max_depth'])),
                        'learning_rate'    : max(0.0001, params['learning_rate']),
                        'feature_fraction' : max(0.5, min(1, params['feature_fraction'])),
                        'bagging_fraction' : max(0.5, min(1, params['bagging_fraction'])),
                        'bagging_freq'     : max(1, int(params['bagging_freq'])),
                        'colsample_bytree' : max(0.5, min(1, params['colsample_bytree'])),
                        'max_bin'          : max(100, int(params['max_bin'])),
                        'reg_alpha'        : max(0, params['reg_alpha']),
                        'reg_lambda'       : max(0, params['reg_lambda']),
                        'seed'             : np.random.randint(0, 64101, 1)[0],
                    }
        
        ltrain = lgb.Dataset(x_train, y_train)
        
        cv_result = lgb.cv(lgb_params, ltrain, num_boost_round = 1000, nfold = 5, 
                   early_stopping_rounds=20, verbose_eval = 0)
        
        score = score + min(cv_result['binary_logloss-mean']) 
        #print score
        
    return score / 1

lgb_params = {
                'objective'        : 'binary',
                'metric'           : 'binary_logloss',
                'num_leaves'       : 22,
                'max_depth'        : 6,
                'learning_rate'    : 0.02,
                'feature_fraction' : 0.7,
                'bagging_fraction' : 0.9,
                'bagging_freq'     : 5,
                'colsample_bytree' : 0.9,
                'max_bin'          : 10,
                'reg_alpha'        : 10,
                'reg_lambda'       : 10,
            }

#cv_score(lgb_params)

In [59]:
def diff_evol(func, params_bounds, N_samples, N_epochs, p_mut, w):
    
    param_names = params_bounds.keys()
    bounds      = params_bounds.values()
    
    scores = np.array([])
    
    n_coord = len(bounds)
    
    samples = np.array([np.random.uniform(x[0], x[1], size = N_samples) for x in bounds]).T
    
    for epoch in tqdm_notebook(range(N_epochs)):
        for it in xrange(N_samples):
            N_best = min(N_samples, len(samples))
            i, j, k = np.random.randint(0, N_best, 3)
            new_sample = np.copy(samples[it])
            coord_to_mut = np.random.binomial(1, p_mut, n_coord).astype(bool)
            if coord_to_mut.any():
                new_sample[coord_to_mut] = samples[i][coord_to_mut] +\
                    w*(samples[j][coord_to_mut] - samples[k][coord_to_mut])
                
                samples = np.vstack([samples, new_sample])
        
        #n_center = samples.mean(axis=0)
        
        rand_samples = np.array([np.random.uniform(x[0], x[1], 
                                                   size = int((0.2)*N_samples)) for x in bounds]).T
        
        samples = np.vstack([samples, rand_samples])
        
        
        print 'epoch = %d'%epoch
        for sample in samples[N_samples:]:
            q = dict([(name, param) for name, param in zip(param_names, sample)])
            #print q
            score = cv_score(q)
            scores = np.append(scores, score)
            print score
        
        indices = np.argsort(scores)[:N_samples]
        scores = scores[indices]
        samples = samples[indices]
        samples = samples[:N_samples]
        scores = scores[:N_samples]
        
        print scores
        
    report = [{}, {}, {}, {}, {}]
    for j, mini_report in enumerate(report):
        for i, name in enumerate(param_names):
            mini_report[name] = samples[j][i]
        
    return report

In [56]:
params_bounds = {
                'num_leaves'       : (5, 50),
                'max_depth'        : (2, 25),
                'learning_rate'    : (0.002, 0.02),
                'feature_fraction' : (0.5, 1.0),
                'bagging_fraction' : (0.5, 1.0),
                'bagging_freq'     : (1, 25),
                'colsample_bytree' : (0.5, 1.0),
                'max_bin'          : (200, 55000),
                'reg_alpha'        : (0, 32),
                'reg_lambda'       : (0, 32),
}

In [57]:
full_report = diff_evol(cv_score, params_bounds,  N_samples = 10, N_epochs = 5, p_mut = 0.2, w = 0.4)

epoch = 0
0.541397228598
0.541961140359
0.540260749861
0.540252902329
0.541699204923
0.542287132574
0.540383534767
0.540498330095
0.547198306746
0.540538729342
0.540212749653
0.540392407453
[ 0.54021275  0.5402529   0.54026075  0.54038353  0.54039241  0.54049833
  0.54053873  0.54139723  0.5416992   0.54196114]
epoch = 1
0.541415345463
0.540634775656
0.540291533399
0.540193309441
0.550440607059
0.540467308942
0.540242793827
0.553476431231
0.543062386149
0.541396141805
[ 0.54019331  0.54021275  0.54024279  0.5402529   0.54026075  0.54029153
  0.54038353  0.54039241  0.54046731  0.54049833]
epoch = 2
0.54015759961
0.541204547579
0.540301759198
0.540604920493
0.540239592199
0.540271411584
0.542207999615
0.540399130402
0.54034004976
0.540270084183
0.54576279329
[ 0.5401576   0.54019331  0.54021275  0.54023959  0.54024279  0.5402529
  0.54026075  0.54027008  0.54027141  0.54029153]
epoch = 3
0.540280948972
0.540228919309
0.5412419207
0.540304051589
0.540144007819
0.54129015852
0.55092917884

In [58]:
full_report

[{'bagging_fraction': 0.77548149688865875,
  'bagging_freq': 8.8876714538782728,
  'colsample_bytree': 0.52574186877213946,
  'feature_fraction': 0.82592550425630784,
  'learning_rate': 0.0069055875316154335,
  'max_bin': 10645.670344488455,
  'max_depth': 12.892658634731252,
  'num_leaves': 29.460005659570587,
  'reg_alpha': 7.5323235299173739,
  'reg_lambda': 28.252406887595278},
 {'bagging_fraction': 0.77548149688865875,
  'bagging_freq': 8.8876714538782728,
  'colsample_bytree': 0.52574186877213946,
  'feature_fraction': 0.82592550425630784,
  'learning_rate': 0.0069055875316154335,
  'max_bin': 10645.670344488455,
  'max_depth': 12.892658634731252,
  'num_leaves': 29.460005659570587,
  'reg_alpha': 7.5323235299173739,
  'reg_lambda': 28.252406887595278},
 {'bagging_fraction': 0.77548149688865875,
  'bagging_freq': 8.8876714538782728,
  'colsample_bytree': 0.52574186877213946,
  'feature_fraction': 0.82592550425630784,
  'learning_rate': 0.0069055875316154335,
  'max_bin': 10645.67

In [60]:
report = full_report[0]

report

{'bagging_fraction': 0.77548149688865875,
 'bagging_freq': 8.8876714538782728,
 'colsample_bytree': 0.52574186877213946,
 'feature_fraction': 0.82592550425630784,
 'learning_rate': 0.0069055875316154335,
 'max_bin': 10645.670344488455,
 'max_depth': 12.892658634731252,
 'num_leaves': 29.460005659570587,
 'reg_alpha': 7.5323235299173739,
 'reg_lambda': 28.252406887595278}

In [61]:
lgb_params = {
                'objective'        : 'binary',
                'metric'           : 'binary_logloss',
                'num_leaves'       : max(5,int(report['num_leaves'])),
                'max_depth'        : max(1, int(report['max_depth'])),
                'learning_rate'    : max(0.0001, report['learning_rate']),
                'feature_fraction' : max(0.5, min(1, report['feature_fraction'])),
                'bagging_fraction' : max(0.5, min(1, report['bagging_fraction'])),
                'bagging_freq'     : max(1, int(report['bagging_freq'])),
                'colsample_bytree' : max(0.5, min(1, report['colsample_bytree'])),
                'max_bin'          : max(100, int(report['max_bin'])),
                'reg_alpha'        : max(0, report['reg_alpha']),
                'reg_lambda'       : max(0, report['reg_lambda']),
            }


cv_result = lgb.cv(lgb_params, ltrain, num_boost_round=1000, nfold=5, 
                   early_stopping_rounds=20, verbose_eval=10)


[10]	cv_agg's binary_logloss: 0.677354 + 0.00012138
[20]	cv_agg's binary_logloss: 0.6643 + 0.000208064
[30]	cv_agg's binary_logloss: 0.651362 + 0.000315803
[40]	cv_agg's binary_logloss: 0.6412 + 0.000433937
[50]	cv_agg's binary_logloss: 0.632391 + 0.000540133
[60]	cv_agg's binary_logloss: 0.624272 + 0.00062243
[70]	cv_agg's binary_logloss: 0.616198 + 0.000705429
[80]	cv_agg's binary_logloss: 0.60881 + 0.000775931
[90]	cv_agg's binary_logloss: 0.60308 + 0.000858969
[100]	cv_agg's binary_logloss: 0.597474 + 0.000930741
[110]	cv_agg's binary_logloss: 0.591895 + 0.000984853
[120]	cv_agg's binary_logloss: 0.586858 + 0.00103035
[130]	cv_agg's binary_logloss: 0.582847 + 0.00107089
[140]	cv_agg's binary_logloss: 0.578652 + 0.00111339
[150]	cv_agg's binary_logloss: 0.575285 + 0.00114515
[160]	cv_agg's binary_logloss: 0.572057 + 0.00120843
[170]	cv_agg's binary_logloss: 0.569622 + 0.00123892
[180]	cv_agg's binary_logloss: 0.567021 + 0.00128101
[190]	cv_agg's binary_logloss: 0.564505 + 0.0013236


In [62]:
sign = lambda x: 1 if x >= 0 else -1

In [65]:
n = 100

n_trees = len(cv_result['binary_logloss-mean'])

y_preds = np.empty(shape = (n, len(x_test)))

for i in tqdm_notebook(xrange(n)):
    
    lgb_params = {
            'objective' : 'binary',
            'metric' : 'binary_logloss',
            'num_leaves'       : max(5,int(report['num_leaves'])),
            'max_depth'        : max(1, int(report['max_depth'])),
            'learning_rate'    : max(0.0001, report['learning_rate']),
            'feature_fraction' : max(0.5, min(1, report['feature_fraction'])),
            'bagging_fraction' : max(0.5, min(1, report['bagging_fraction'])),
            'bagging_freq'     : max(1, int(report['bagging_freq'])),
            'colsample_bytree' : max(0.5, min(1, report['colsample_bytree'])),
            'max_bin'          : max(100, int(report['max_bin'])),
            'reg_alpha'        : max(0, report['reg_alpha']),
            'reg_lambda'       : max(0, report['reg_lambda']),
            'seed' : i*7 - 1 + (i**2)//973, }
        
    ltrain = lgb.Dataset(x_train, y_train) 
    
    model = lgb.train(lgb_params, ltrain, num_boost_round = n_trees )
    
    y_preds[i] = model.predict(x_test)
    
    del model
print 'oK'

oK


In [66]:
y_pred_lgb = y_preds.mean(0)
y_pred_lgb

array([ 0.48927958,  0.55688665,  0.41157235, ...,  0.47417598,
        0.30176907,  0.68695352])

In [32]:
n = 64
y_nn_preds1 = np.empty(shape=(n, len(x_test_nn)))
for i in tqdm_notebook(xrange(n)):
    model = Sequential()
    opt = optimizers.adam(lr = 0.01)
    model.add(Dense(12, activation='relu', input_dim=x_train_nn.shape[1]))
    model.add(Dense(24, activation='tanh'))
    model.add(Dense(48, activation='relu'))
    model.add(Dense(24, activation='tanh'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer = opt,
                  loss = 'binary_crossentropy',
                  metrics = ['binary_crossentropy'])
    
    
    inds = (np.random.randint(0, len(x_train_nn), len(x_train_nn)))
    
    model.fit(x_train_nn[inds], y_train_nn[inds], batch_size = 1000, epochs = 15, verbose=0)
    y_nn = model.predict_proba(x_test_nn).T[0]
    y_nn_preds1[i] = y_nn
    print 
#    print log_loss(y_test_nn, y_nn)
    del model




In [34]:
#log_loss(y_test_nn, y_nn_preds1.mean(0))
y_nn_preds1.mean(0)

array([ 0.47340662,  0.50431734,  0.41439968, ...,  0.43550645,
        0.40926907,  0.73371202])

In [80]:
output = pd.DataFrame(y_pred_cb)
output.to_csv('cb.csv', header=None, index=None)
output.head(7)

Unnamed: 0,0
0,0.504429
1,0.531992
2,0.404638
3,0.543625
4,0.212363
5,0.27507
6,0.218567


In [67]:
output = pd.DataFrame(y_pred_lgb)
output.to_csv('lgb.csv', header=None, index=None)
output.head(7)

Unnamed: 0,0
0,0.48928
1,0.556887
2,0.411572
3,0.546964
4,0.207609
5,0.276292
6,0.20987


In [33]:
output = pd.DataFrame( y_nn_preds1.mean(0) )
output.to_csv('nn.csv', header=None, index=None)
output.head(7)

NameError: name 'y_nn_preds1' is not defined