In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, cross_val_score as cv_scoring
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook
from scipy.optimize import minimize
import matplotlib.pylab as plt
from bayes_opt import BayesianOptimization
from tqdm import tqdm
plt.style.use('ggplot')

# Read_Data

In [None]:
# Load the raw files; both are ';'-separated.
train = pd.read_csv('train.csv', delimiter=';')
test  = pd.read_csv('test.csv', delimiter=';')

# The three lifestyle flags in the test file use the literal string 'None' for
# a missing answer. Map it straight to NaN in those columns only: routing it
# through a 999 sentinel and then replacing 999 across the whole frame would
# also blank out any genuine 999 elsewhere (for example an `id` of 999).
# NOTE(review): assumes 'None' occurs only in these three columns - confirm.
test[['smoke', 'alco', 'active']] = (
    test[['smoke', 'alco', 'active']].replace('None', np.nan).astype(float)
)
test[['gluc', 'cholesterol']] = test[['gluc', 'cholesterol']].astype(int)

# Stack train on top of test so the feature cells below process both at once;
# the first N_train rows of `data` are the training rows.
N_train = train.shape[0]
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# Data_Cleaning

In [None]:
# Master switch for the hand-written outlier repairs below. With False this
# cell is a no-op and the raw values go straight into feature engineering.
clean = False

if clean:
    # ---- height / weight ----
    # Exchange height and weight on rows with height < 110 and weight > 150.
    bad_index = (data.height < 110) & (data.weight > 150)
    buf = data.loc[bad_index, 'height']
    data.loc[bad_index, 'height'] = data.loc[bad_index, 'weight']
    data.loc[bad_index, 'weight'] = buf

    # Shift small heights upward by fixed offsets.
    data.loc[data.height < 90, 'height'] += 100

    data.loc[(data.height >= 100) & (data.height < 110), 'height'] += 60

    data.loc[(data.height >= 110) & (data.height < 120) & (data.weight < 110), 'height'] += 60

    # Blank height AND weight where they merely repeat the (ap_hi, ap_lo) pair.
    # The mask is computed once: re-evaluating it after height has been set to
    # NaN matches nothing (NaN == x is False) and would leave weight untouched.
    same_as_pressure = (data.height == data.ap_hi) & (data.weight == data.ap_lo)
    data.loc[same_as_pressure, ['height', 'weight']] = np.nan

    # Remaining height values outside the accepted ranges become missing.
    data.loc[data.height <= 130, ['weight', 'height']] = [np.nan, np.nan]
    data.loc[(data.height <= 135) & (data.weight > 60), 'height'] = np.nan
    data.loc[(data.height < 140) & (data.weight > 80), 'height'] = np.nan
    data.loc[data.height > 210, 'height'] = np.nan

    # Weights below 20 are scaled by 10; what is still out of range is dropped.
    data.loc[data.weight < 20, 'weight'] *= 10

    data.loc[data.weight <= 30, 'weight'] = np.nan
    data.loc[(data.weight < 36) & (data.height > 160), 'weight'] = np.nan

    data.loc[(data.weight > 140) & (data.height == data.weight), 'weight'] = np.nan

    data.loc[data.weight > 125, 'weight'] = np.nan

    data.replace({'weight': {120.0: np.nan}}, inplace=True)

    # ---- blood pressure ----
    # Remove sign errors, then rescale values that are off by a factor of
    # 100 or 10.
    data['ap_hi'] = np.abs(data['ap_hi'])
    data['ap_lo'] = np.abs(data['ap_lo'])

    data.loc[data.ap_lo > 5000, 'ap_lo'] /= 100
    data.loc[data.ap_hi > 5000, 'ap_hi'] /= 100
    data.loc[data.ap_hi > 250, 'ap_hi'] /= 10
    data.loc[data.ap_lo > 250, 'ap_lo'] /= 10

    # Row-specific repairs, addressed by index label in the stacked frame.
    data.loc[[12494, 60477, 51749], ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[[75399], 'ap_lo'] = 200
    data.loc[6580, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi < 10, 'ap_hi'] = np.nan

    # Specific ap_hi values whose pressure pair is discarded (701 is instead
    # rewritten to 110/70).
    data.loc[data.ap_hi == 906, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi == 90.6, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi == 701, ['ap_hi', 'ap_lo']] = [110, 70]
    data.loc[data.ap_hi == 309, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi == 30.9, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi == 806, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi == 509, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi == 50.9, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[data.ap_hi == 40.1, ['ap_hi', 'ap_lo']] = [np.nan, np.nan]

    # Values with a leading 4 are rewritten with a leading 1.
    data.loc[data.ap_hi == 400, 'ap_hi'] = 100
    data.loc[data.ap_hi == 401, 'ap_hi'] = 101
    data.loc[data.ap_hi == 410, 'ap_hi'] = 110
    data.loc[data.ap_hi == 470, 'ap_hi'] = 170
    data.loc[data.ap_lo == 410, 'ap_lo'] = 110
    data.loc[data.ap_lo == 470, 'ap_lo'] = 170

    # Pairs with ap_lo == 0 and one of these ap_hi values are discarded whole.
    data.loc[(data.ap_hi == 138) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 149) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 90.7) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 148) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 80.6) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 108) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 121) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 117) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]
    data.loc[(data.ap_hi == 118) & (data.ap_lo == 0), ['ap_hi', 'ap_lo']] = [np.nan, np.nan]

    # Any other zero diastolic reading loses only ap_lo.
    data.loc[data.ap_lo == 0, 'ap_lo'] = np.nan

    data.replace({'ap_lo': {1: 100}}, inplace=True)

    # Remaining very small readings are scaled by 10.
    data.loc[data.ap_hi < 30, 'ap_hi'] *= 10
    data.loc[data.ap_lo < 20, 'ap_lo'] *= 10

    data.replace({'ap_lo': {20: np.nan, 30: 80}}, inplace=True)

    # Finally, exchange ap_hi and ap_lo wherever ap_hi is the smaller one.
    bad_index = data.ap_hi < data.ap_lo
    buf = data.loc[bad_index, 'ap_hi']
    data.loc[bad_index, 'ap_hi'] = data.loc[bad_index, 'ap_lo']
    data.loc[bad_index, 'ap_lo'] = buf

# Add_Features

In [None]:
# Joint cholesterol/glucose category: glue the two codes together as text and
# replace every distinct combination with an integer id.
data['chol+gluc'] = pd.factorize(
    data.cholesterol.astype(str) + data.gluc.astype(str)
)[0]

In [None]:
# cholesterol code followed by the integer ap_hi, turned into an integer id.
# Rows with a missing ap_hi are dropped from the right-hand operand, so their
# concatenation is NaN after index alignment.
data['ap+chol'] = pd.factorize(
    data.cholesterol.astype(str) + data.ap_hi.dropna().astype(int).astype(str)
)[0]

# age floor-divided by 365 followed by ap_hi, turned into an integer id.
data['age+ap'] = pd.factorize(
    (data.age // 365).astype(str) + data.ap_hi.astype(str)
)[0]

In [None]:
# weight divided by the square of height/100.
data['imt'] = data.weight / (data.height / 100) ** 2

# Arithmetic combinations of the two pressure readings.
data['ap-'] = data.ap_hi - data.ap_lo
data['ap*'] = data.ap_hi * data.ap_lo / 10000
data['ap+'] = data.ap_hi + data.ap_lo
data['ap/'] = data.ap_hi / data.ap_lo

# Height/weight interactions.
data['h/w'] = data.height / data.weight
data['h*w'] = data.height * data.weight / 10000

# Ratios of age against pressure, weight and height.
data['age/ap'] = data.age / data.ap_hi
data['ap/age'] = data.ap_hi / data.age
data['age/w'] = data.age / data.weight
data['age/h'] = data.age / data.height

# Products of age with pressure, weight and the two lab codes.
data['age*ap'] = data.age * data.ap_hi / 1000
data['age*w']  = data.age * data.weight / 1000
data['age*chol'] = data.age * data.cholesterol
data['age*gluc'] = data.age * data.gluc

In [None]:
# For each grouping column, attach the mean of `cardio` within the group as a
# new column (a left merge on the grouping key keeps every row of `data`).
for group_col, mean_col in [('cholesterol', 'p_chol'),
                            ('gluc', 'p_gluc'),
                            ('chol+gluc', 'p_ch+gl')]:
    mean_y = data.groupby([group_col])['cardio'].mean().reset_index(name=mean_col)
    data = data.merge(mean_y, on=group_col, how='left')

# Selected_Features 

In [None]:
# Columns fed to the models, in the order they are passed.
use_feat = [
    # columns present in the input files
    u'active', u'age', u'alco', u'ap_hi', u'ap_lo', u'gender', u'height', u'smoke',
    # factorized combinations
    u'chol+gluc', u'ap+chol',
    # arithmetic features
    u'imt', u'ap*', u'ap+', u'h/w', u'h*w',
    u'ap/age', u'age*ap', u'age*w', u'age*gluc',
    # per-group means of the target
    u'p_chol', u'p_gluc', u'p_ch+gl',
]

# Train_Test

In [None]:
# Split the stacked frame back apart: the first N_train rows are the training
# set, the rest is the test set. `id` and the target are not model inputs.
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
x_train = data[:N_train].drop(['id', 'cardio'], axis=1)
y_train = data[:N_train]['cardio']

x_test = data[N_train:].drop(['id', 'cardio'], axis=1)

#x_train = data.loc[:int(0.8*N_train)-1].drop(['id', 'cardio'], 1)
#y_train = data.loc[:int(0.8*N_train)-1]['cardio']

#x_test = data.loc[int(0.8*N_train):N_train-1].drop(['id', 'cardio'], 1)
#y_test = data.loc[int(0.8*N_train):N_train-1]['cardio']

In [None]:
# Preview the first rows of the training feature matrix.
x_train.head()

# CV_and_Train

In [None]:
# LightGBM settings shared by lgb.cv and every lgb.train call in this notebook.
lgb_params = dict(
    objective='binary',
    metric='binary_logloss',
    num_leaves=16,
    max_depth=5,
    learning_rate=0.03,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=3,
    max_bin=500,
)

In [None]:
# LightGBM dataset over the selected training columns and the target.
ltrain = lgb.Dataset(data=x_train[use_feat], label=y_train)

In [None]:
# 5-fold LightGBM cross-validation: at most 1000 rounds, stopping after 20
# rounds without improvement and logging every 10th round.
cv_result = lgb.cv(
    params=lgb_params,
    train_set=ltrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=20,
    verbose_eval=10,
)

In [None]:
# XGBoost matrices over the selected columns; only the training one has labels.
dtrain = xgb.DMatrix(data=x_train[use_feat], label=y_train)
dtest  = xgb.DMatrix(data=x_test[use_feat])

In [None]:
# XGBoost settings shared by xgb.cv and the fold-wise xgb.train calls.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=5,
    eta=0.02,
    subsample=0.9,
    colsample_bytree=0.9,
)

In [None]:
# 5-fold XGBoost cross-validation: at most 1000 rounds, stopping after 20
# rounds without improvement and logging every 10th round.
# Note: this rebinds `cv_result`, replacing the LightGBM result of the same name.
cv_result = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=20,
    verbose_eval=10,
)

# Feature_Selection

In [None]:
n_splits = 10

def get_score(x_train, y_train, n_splits):
    """Return the out-of-fold log-loss of LightGBM on the given feature frame.

    The rows are split into `n_splits` consecutive folds (KFold without
    shuffling). For every fold a model is trained with the module-level
    `lgb_params` on the remaining rows, early-stopped on the held-out fold,
    and used to predict that fold.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature columns to evaluate.
    y_train : array-like
        Binary target, one value per row of `x_train`.
    n_splits : int
        Number of folds.

    Returns
    -------
    float
        `log_loss` between `y_train` and the out-of-fold predictions.
    """
    kfold = KFold(n_splits=n_splits)
    y_oof_lgb = np.empty(len(x_train))
    # KFold yields row POSITIONS, so rows are selected positionally (.iloc and
    # a plain array) instead of by label; label lookup is only equivalent when
    # the index happens to be 0..n-1.
    y_values = np.asarray(y_train)

    for train_idx, test_idx in kfold.split(x_train):
        x_fold = x_train.iloc[test_idx]

        ltrain_cv = lgb.Dataset(x_train.iloc[train_idx], y_values[train_idx])
        ltest_cv = lgb.Dataset(x_fold, y_values[test_idx])

        lgbm = lgb.train(lgb_params, ltrain_cv, num_boost_round=1000,
                         valid_sets=[ltest_cv], early_stopping_rounds=20,
                         verbose_eval=0)

        y_oof_lgb[test_idx] = lgbm.predict(x_fold)
    return log_loss(y_train, y_oof_lgb)

In [None]:
# Set to True to run the (slow) genetic search over feature subsets.
gen_alg_start = False

if gen_alg_start:
    # number of epochs
    epochs = 20
    # initial probability that a feature is switched on
    p0 = 0.8
    # crossover probability (per feature)
    p_cross = 0.5
    # mutation probability (per feature)
    p_mut = 0.015
    # number of best samples kept after every epoch
    N_best = 30
    N_features = x_train.shape[1]
    # initialization: every row is a boolean mask over the columns of x_train
    samples = np.random.binomial(1, p0, (N_best, N_features)).astype(bool)

    all_features = x_train.columns
    # scores[k] is always the score of samples[k]
    scores = np.array([])

    for epoch in tqdm(range(epochs)):
        # Crossover: copy a random subset of genes from a random partner.
        for i in range(N_best):
            j = np.random.randint(0, N_best)
            cross = np.random.binomial(1, p_cross, N_features).astype(bool)
            if (samples[i][cross] != samples[j][cross]).any():
                new_sample = np.copy(samples[i])
                new_sample[cross] = samples[j][cross]
                if new_sample.any():
                    samples = np.vstack([samples, new_sample])
        # Mutation of the children only: flip the selected genes, but never
        # keep a mask that selects no feature at all.
        for i in range(N_best, len(samples)):
            mutate = np.random.binomial(1, p_mut, N_features).astype(bool)
            mutated = samples[i] ^ mutate
            if mutated.any():
                samples[i] = mutated
        # Fresh random masks.
        for i in range(int(N_best*0.2)):
            new_sample = np.random.binomial(1, p0, N_features).astype(bool)
            if new_sample.any():
                samples = np.vstack([samples, new_sample])
        # Score every mask that has no score yet. In the first epoch that
        # includes the initial population, which keeps `scores` aligned with
        # `samples`; scoring only samples[N_best:] left the two arrays offset
        # by N_best, so the argsort below selected the wrong masks.
        for sample in samples[len(scores):]:

            score = get_score(x_train[all_features[sample]], y_train, 5)

            print(score)

            scores = np.append(scores, score)
        ind_best = scores.argsort()
        print('epoch =  %d' % (epoch + 1))
        print(scores[ind_best][:5])
        # Keep the N_best lowest-loss masks together with their scores.
        scores = scores[ind_best][:N_best]
        samples = samples[ind_best][:N_best]

In [None]:
# Best feature subset found by the genetic search. The search only defines
# `all_features` and `samples` when it is enabled, so show nothing otherwise
# instead of raising NameError on a fresh Run All.
all_features[samples[0]] if gen_alg_start else None

# Train_models

In [None]:
n_splits = 10
kfold = KFold(n_splits=n_splits)

# Select the model columns once instead of on every fold.
x_train_sel = x_train[use_feat]
x_test_sel = x_test[use_feat]

# Test predictions are accumulated over the folds and averaged at the end;
# every out-of-fold slot is filled exactly once by the fold that holds it out.
y_pred_lgb = np.zeros(len(x_test))
y_oof_lgb  = np.empty(len(x_train))

for train_idx, test_idx in kfold.split(x_train_sel):
    # KFold yields row positions, hence .iloc rather than label-based .loc.
    ltrain_cv = lgb.Dataset(x_train_sel.iloc[train_idx], y_train.iloc[train_idx])
    ltest_cv  = lgb.Dataset(x_train_sel.iloc[test_idx], y_train.iloc[test_idx])

    lgbm = lgb.train(lgb_params, ltrain_cv, num_boost_round=1000,
                     valid_sets=[ltest_cv], early_stopping_rounds=20, verbose_eval=50)

    y_oof_lgb[test_idx] = lgbm.predict(x_train_sel.iloc[test_idx])

    y_pred_lgb += lgbm.predict(x_test_sel)
y_pred_lgb /= n_splits

In [None]:
n_splits = 10
kfold = KFold(n_splits=n_splits)

# Select the model columns once instead of on every fold.
x_train_sel = x_train[use_feat]

# Test predictions are accumulated over the folds and averaged at the end;
# every out-of-fold slot is filled exactly once by the fold that holds it out.
y_pred_xgb = np.zeros(len(x_test))
y_oof_xgb  = np.empty(len(x_train))
dtest  = xgb.DMatrix(x_test[use_feat])

for train_idx, test_idx in kfold.split(x_train_sel):
    # KFold yields row positions, hence .iloc rather than label-based .loc.
    dtrain_cv = xgb.DMatrix(x_train_sel.iloc[train_idx], y_train.iloc[train_idx])
    dtest_cv  = xgb.DMatrix(x_train_sel.iloc[test_idx], y_train.iloc[test_idx])
    eval_list = [(dtest_cv, 'test')]

    xgbst = xgb.train(xgb_params, dtrain_cv, num_boost_round=1000,
                      evals=eval_list, early_stopping_rounds=20, verbose_eval=50)

    y_oof_xgb[test_idx] = xgbst.predict(dtest_cv)

    y_pred_xgb += xgbst.predict(dtest)
# Average over the actual number of folds; a literal 10 silently goes wrong as
# soon as n_splits is changed.
y_pred_xgb /= n_splits

In [None]:
# Out-of-fold log-loss of the equal-weight LightGBM/XGBoost blend.
# print() with a single argument gives the same output on Python 2 and 3.
print(log_loss(y_train, y_oof_lgb*0.5 + y_oof_xgb*0.5))

In [None]:
# Final test prediction: equal-weight average of the two models.
y_res = 0.5 * (y_pred_lgb + y_pred_xgb)

In [None]:
# Display the blended test-set predictions.
y_res

In [None]:
# Feature importance of `lgbm`, i.e. the LightGBM model of the last fold.
ax = plt.figure(figsize=(12, 10)).add_subplot(111)
lgb.plot_importance(lgbm, ax=ax)
plt.show()

# Write_Result

In [None]:
# Write one prediction per line, without header row or index column, then
# preview the first rows.
output = pd.DataFrame(y_res)
output.to_csv('lgb+xgb_new_feat.csv', index=False, header=False)
output.head(7)