In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from sklearn.cross_validation import KFold

from lightgbm.sklearn import LGBMRegressor
from sklearn import datasets, metrics, cross_validation
from sklearn.linear_model import LogisticRegression



In [1]:
class XgbWrapper(object):
    """Adapter giving the native xgboost API a ``train``/``predict`` interface.

    Parameters
    ----------
    seed : int
        Stored under the ``'seed'`` key of the booster parameters.
    params : dict or None
        Booster parameters handed to ``xgb.train``. An optional ``'nrounds'``
        entry is extracted as the number of boosting rounds (default 400) and
        is not forwarded to xgboost. ``None`` means "no extra parameters".
    """

    def __init__(self, seed=2017, params=None):
        # Work on a private copy: the caller's dict must not gain a 'seed'
        # key or lose its 'nrounds' key, and params=None must not crash.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 400)

    def train(self, xtra, ytra, xte, yte):
        """Fit a booster on (xtra, ytra), monitoring (xte, yte).

        Both sets are put on the watchlist and training is started with
        ``early_stopping_rounds=10``. The fitted booster is kept in
        ``self.gbdt``.
        """
        dtrain = xgb.DMatrix(xtra, label=ytra)
        dvalid = xgb.DMatrix(xte, label=yte)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds,
            watchlist, early_stopping_rounds=10)

    def predict(self, x):
        """Return the booster's predictions for the rows of ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))
    
class LgbWrapper(object):
    """Adapter giving the native lightgbm API a ``train``/``predict`` interface.

    Parameters
    ----------
    seed : int
        Stored under the ``'seed'`` key of the training parameters.
    params : dict or None
        Parameters handed to ``lgb.train``. An optional ``'nrounds'`` entry is
        extracted as the number of boosting rounds (default 400) and is not
        forwarded to lightgbm. ``None`` means "no extra parameters".
    """

    def __init__(self, seed=2017, params=None):
        # Work on a private copy: the caller's dict must not gain a 'seed'
        # key or lose its 'nrounds' key, and params=None must not crash.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 400)

    def train(self, xtra, ytra, xte, yte):
        """Fit a booster on (xtra, ytra) for ``self.nrounds`` rounds.

        ``xte`` and ``yte`` are accepted so the signature matches the other
        wrappers; no validation set is passed to ``lgb.train``.
        """
        # The label is flattened before being handed to lgb.Dataset.
        dtrain = lgb.Dataset(xtra, label=ytra.ravel())
        self.gbdt = lgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the booster's predictions for the rows of ``x``."""
        return self.gbdt.predict(x)

class CtbWrapper(object):
    """Adapter exposing ``train``/``predict`` over a CatBoost regressor.

    ``params`` is accepted so the constructor matches the other wrappers,
    but it is not consulted: the iteration count is fixed at 300.
    """

    def __init__(self, seed=2017, params=None):
        self.nrounds = 300
        self.seed = seed

    def train(self, xtra, ytra, xte, yte):
        """Fit on (xtra, ytra) with (xte, yte) as the evaluation set.

        All four inputs are wrapped in DataFrames before fitting; the fitted
        regressor is kept in ``self.gbdt``.
        """
        fit_x, fit_y, eval_x, eval_y = [
            pd.DataFrame(part) for part in (xtra, ytra, xte, yte)
        ]

        settings = dict(
            depth=10,
            iterations=self.nrounds,
            random_seed=self.seed,
            use_best_model=True,
            loss_function='RMSE',
            thread_count=8,
            eval_metric='RMSE',
        )
        self.gbdt = ctb.CatBoostRegressor(**settings)
        self.gbdt.fit(X=fit_x, y=fit_y, eval_set=(eval_x, eval_y),
                      use_best_model=True)

    def predict(self, x):
        """Return the regressor's predictions for ``x``."""
        model = self.gbdt
        return model.predict(x)

class lrWrapper(object):
    """Adapter exposing ``train``/``predict`` over sklearn LogisticRegression.

    Parameters
    ----------
    seed : int
        Forwarded to the estimator as ``random_state``.
    params : dict or None
        Accepted so the constructor matches the other wrappers; not consulted.
    """

    def __init__(self, seed=2017, params=None):
        self.seed = seed

    def train(self, xtra, ytra, xte, yte):
        """Fit a fresh classifier on (xtra, ytra).

        ``xte`` and ``yte`` are accepted for interface parity and are unused.
        """
        self.clf = LogisticRegression(random_state=self.seed)
        # The target is flattened to 1-D: handing sklearn a single-column
        # frame triggers its column_or_1d DataConversionWarning on every fit.
        self.clf.fit(X=pd.DataFrame(xtra), y=np.ravel(ytra))

    def predict(self, x):
        """Return the predicted class label for each row of ``x``."""
        return self.clf.predict(x)

In [2]:
def get_oof(clf, ntrain, ntest, kf, train, labels, test):
    """Compute out-of-fold predictions for one model.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x_tr, y_tr, x_te, y_te)`` and ``predict(x)``.
    ntrain, ntest : int
        Number of rows in ``train`` and ``test``.
    kf : iterable of (train_index, test_index)
        Cross-validation splits over the rows of ``train``. Any number of
        folds is supported, and a one-shot generator is accepted.
    train, labels, test : array-like
        Training features, training targets and test features; ``train`` and
        ``labels`` must support indexing with the index arrays from ``kf``.

    Returns
    -------
    (oof_train, oof_test)
        Column vectors of shape ``(ntrain, 1)`` and ``(ntest, 1)``. Each row of
        ``oof_train`` is predicted by the fold model that did not see it;
        ``oof_test`` is the mean of the per-fold test predictions.

    Raises
    ------
    ValueError
        If ``kf`` yields no splits.
    """
    # Materialise the splits so the per-fold buffer can be sized from the
    # actual fold count instead of assuming five folds.
    folds = list(kf)
    if not folds:
        raise ValueError('kf must yield at least one (train_index, test_index) split')

    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((len(folds), ntest))

    for i, (train_index, test_index) in enumerate(folds):
        x_tr = train[train_index]
        y_tr = labels[train_index]
        x_te = train[test_index]
        y_te = labels[test_index]

        clf.train(x_tr, y_tr, x_te, y_te)
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [13]:
def model_1(train, labels, test):
    """Level-1 stage: build out-of-fold features from four base models.

    LightGBM, XGBoost, CatBoost and logistic regression are each run through
    ``get_oof`` on the same 5-fold shuffled split. Their out-of-fold and test
    predictions are stacked column-wise (order: lr, catboost, xgboost,
    lightgbm) and written to ``x_concat_train.npy`` / ``x_concat_test.npy``;
    ``labels`` is written to ``y_labels.npy``. Nothing is returned.
    """
    n_rows_train, n_rows_test = train.shape[0], test.shape[0]

    folds = KFold(n_rows_train, n_folds=5,
                  shuffle=True, random_state=2017)

    # NOTE(review): 'auc' is paired with a regression objective here, and
    # LgbWrapper.train passes no validation set -- confirm the metric matters.
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'auc',
        'num_leaves': 96,
        'max_depth': 7,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'learning_rate': 0.1,
    }

    # NOTE(review): XgbWrapper reads its round count from an 'nrounds' key
    # (default 400); 'n_estimators' is passed through to xgb.train as a plain
    # parameter -- confirm whether 720 rounds were intended.
    xgb_params = {
        'booster': 'gbtree',
        'silent': 1,
        'colsample_bytree': 0.15,
        'eta': 0.09,
        'eval_metric': 'auc',
        'gamma': 0.7,
        'max_depth': 12,
        'min_child_weight': 5.0,
        'n_estimators': 720,
        'objective': 'binary:logistic',
        'subsample': 0.9,
    }

    cg = CtbWrapper()
    xg = XgbWrapper(seed=2017, params=xgb_params)
    lg = LgbWrapper(seed=2017, params=lgb_params)
    lr = lrWrapper()

    def run_oof(model):
        # Every base model sees the same folds and the same data.
        return get_oof(model, n_rows_train, n_rows_test, folds, train, labels, test)

    # Execution order is kept: lightgbm, xgboost, catboost, logistic regression.
    lg_tr, lg_te = run_oof(lg)
    xg_tr, xg_te = run_oof(xg)
    cg_tr, cg_te = run_oof(cg)
    lr_tr, lr_te = run_oof(lr)

    # get_oof returns (n, 1) columns, so horizontal stacking yields (n, 4).
    stacked_train = np.hstack([lr_tr, cg_tr, xg_tr, lg_tr])
    stacked_test = np.hstack([lr_te, cg_te, xg_te, lg_te])

    np.save('x_concat_train.npy', stacked_train)
    np.save('x_concat_test.npy', stacked_test)
    np.save('y_labels.npy', labels)

In [14]:
def model_2():
    """Level-2 stage: fit an xgboost meta-model on the stacked features.

    Loads ``x_concat_train.npy``, ``y_labels.npy`` and ``x_concat_test.npy``
    from the working directory, picks the number of boosting rounds with
    5-fold ``xgb.cv`` (early stopping after 25 rounds), prints the CV score,
    refits on all training rows and returns the predictions for the test rows.
    """
    train = np.load('x_concat_train.npy')
    labels = np.load('y_labels.npy')
    test = np.load('x_concat_test.npy')

    dtrain = xgb.DMatrix(train, label=labels)
    dtest = xgb.DMatrix(test)

    xgb_params = {}
    xgb_params["objective"] = "binary:logistic"
    xgb_params["eta"] = 0.1
    xgb_params["subsample"] = 0.9
    xgb_params["silent"] = 1
    xgb_params["max_depth"] = 6
    xgb_params['eval_metric'] = 'auc'
    xgb_params['min_child_weight'] = 10
    xgb_params['seed'] = 2017

    res = xgb.cv(xgb_params, dtrain, num_boost_round=500, nfold=5, seed=2017, stratified=False,
                 early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

    # With early stopping the history ends at the best iteration, so the row
    # count is the number of rounds to train. Subtracting one would drop a
    # round, and train zero rounds for a one-row history.
    best_nrounds = res.shape[0]

    # Read the held-out score by column name: positional lookup depends on
    # the column order, which is not the same across xgboost versions.
    metric = xgb_params['eval_metric']
    cv_mean = res['test-{0}-mean'.format(metric)].iloc[-1]
    cv_std = res['test-{0}-std'.format(metric)].iloc[-1]

    print('')
    print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
    bst = xgb.train(xgb_params, dtrain, best_nrounds)

    return bst.predict(dtest)

In [17]:
def main():
    """Run both stacking stages end to end and return the final predictions.

    Reads ``train_fe_finish.txt`` and ``test_fe_finish.txt`` (first column as
    index), separates the ``rating`` column as the target, hands plain numpy
    arrays to ``model_1`` and returns whatever ``model_2`` produces.
    """
    train_frame = pd.read_csv("train_fe_finish.txt", index_col=0)
    test_frame = pd.read_csv("test_fe_finish.txt", index_col=0)

    # Target and feature matrix both come from the training frame.
    target = np.array(train_frame['rating'])
    features = np.array(train_frame.drop(['rating'], axis=1))
    holdout = np.array(test_frame)

    # model_1 writes its stacked features to disk; model_2 reads them back.
    model_1(features, target, holdout)
    return model_2()

In [18]:
# Run the whole pipeline; `pred` holds the value returned by main(), i.e. the
# level-2 predictions for the test rows. The submission cell below reads it.
pred = main()

[0]	train-auc:0.773953	eval-auc:0.681023
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 10 rounds.
[1]	train-auc:0.823458	eval-auc:0.746869
[2]	train-auc:0.836904	eval-auc:0.770937
[3]	train-auc:0.836931	eval-auc:0.772371
[4]	train-auc:0.839805	eval-auc:0.77748
[5]	train-auc:0.842228	eval-auc:0.782408
[6]	train-auc:0.844687	eval-auc:0.786435
[7]	train-auc:0.859585	eval-auc:0.790421
[8]	train-auc:0.866368	eval-auc:0.790291
[9]	train-auc:0.865201	eval-auc:0.792469
[10]	train-auc:0.864752	eval-auc:0.79298
[11]	train-auc:0.872151	eval-auc:0.793583
[12]	train-auc:0.870927	eval-auc:0.795379
[13]	train-auc:0.868912	eval-auc:0.796225
[14]	train-auc:0.876362	eval-auc:0.798741
[15]	train-auc:0.881358	eval-auc:0.798086
[16]	train-auc:0.880248	eval-auc:0.79892
[17]	train-auc:0.883859	eval-auc:0.798202
[18]	train-auc:0.882974	eval-auc:0.799433
[19]	train-auc:0.887497	eval-auc:0.799061
[20]	train-auc:0.887022	eval-auc

[57]	train-auc:0.925513	eval-auc:0.822098
[58]	train-auc:0.925524	eval-auc:0.822032
[59]	train-auc:0.927389	eval-auc:0.822708
[60]	train-auc:0.928343	eval-auc:0.823112
[61]	train-auc:0.930269	eval-auc:0.82354
[62]	train-auc:0.931701	eval-auc:0.823757
[63]	train-auc:0.931591	eval-auc:0.823774
[64]	train-auc:0.933034	eval-auc:0.82367
[65]	train-auc:0.932902	eval-auc:0.823625
[66]	train-auc:0.933065	eval-auc:0.823595
[67]	train-auc:0.9332	eval-auc:0.823701
[68]	train-auc:0.934811	eval-auc:0.823548
[69]	train-auc:0.936959	eval-auc:0.824354
[70]	train-auc:0.936951	eval-auc:0.824391
[71]	train-auc:0.938699	eval-auc:0.824521
[72]	train-auc:0.938649	eval-auc:0.824536
[73]	train-auc:0.938717	eval-auc:0.824652
[74]	train-auc:0.938771	eval-auc:0.824624
[75]	train-auc:0.938778	eval-auc:0.824831
[76]	train-auc:0.940517	eval-auc:0.825358
[77]	train-auc:0.940591	eval-auc:0.825363
[78]	train-auc:0.940513	eval-auc:0.825172
[79]	train-auc:0.940476	eval-auc:0.82506
[80]	train-auc:0.940449	eval-auc:0.8253

[15]	train-auc:0.875287	eval-auc:0.78791
[16]	train-auc:0.875509	eval-auc:0.78972
[17]	train-auc:0.87596	eval-auc:0.791015
[18]	train-auc:0.875717	eval-auc:0.791357
[19]	train-auc:0.88	eval-auc:0.791968
[20]	train-auc:0.884628	eval-auc:0.791323
[21]	train-auc:0.889043	eval-auc:0.791591
[22]	train-auc:0.888538	eval-auc:0.791445
[23]	train-auc:0.892176	eval-auc:0.791819
[24]	train-auc:0.896604	eval-auc:0.792496
[25]	train-auc:0.896054	eval-auc:0.793957
[26]	train-auc:0.899604	eval-auc:0.79362
[27]	train-auc:0.899613	eval-auc:0.795007
[28]	train-auc:0.904241	eval-auc:0.795301
[29]	train-auc:0.904124	eval-auc:0.795487
[30]	train-auc:0.903757	eval-auc:0.796382
[31]	train-auc:0.907588	eval-auc:0.796393
[32]	train-auc:0.907412	eval-auc:0.797142
[33]	train-auc:0.910359	eval-auc:0.797314
[34]	train-auc:0.913049	eval-auc:0.797949
[35]	train-auc:0.912736	eval-auc:0.797838
[36]	train-auc:0.914612	eval-auc:0.797895
[37]	train-auc:0.914332	eval-auc:0.798509
[38]	train-auc:0.914142	eval-auc:0.799007


  y = column_or_1d(y, warn=True)


[0]	train-auc:0.828569+0.00148645	test-auc:0.807946+0.00626533
[10]	train-auc:0.841363+0.00150472	test-auc:0.816252+0.00564041
[20]	train-auc:0.845631+0.00131938	test-auc:0.8165+0.0055205
[30]	train-auc:0.848648+0.00150153	test-auc:0.816616+0.00546625
[40]	train-auc:0.851106+0.0015426	test-auc:0.816109+0.00568057

Ensemble-CV: 0.8168388+0.0057118418185380555


In [19]:
# Build the submission file: attach the predictions in `pred` (produced by
# the earlier `pred = main()` cell) to the rows of the challenge's test.csv.
import os

# expanduser('~') resolves the home directory even when HOME is not set in
# the environment, where os.environ['HOME'] would raise KeyError.
homedir = os.path.expanduser('~')
path_test = os.path.join(homedir, 'data', 'mangaki-data-challenge', 'test.csv')
test = pd.read_csv(path_test)

df_pred_sub = pd.DataFrame(pred, columns=['prob_willsee'], index=test.index)

# Clamp the scores into [0, 1] in one vectorised step.
df_pred_sub['prob_willsee'] = df_pred_sub['prob_willsee'].clip(lower=0.0, upper=1.0)

pd.concat([test, df_pred_sub], axis=1).to_csv("mangaki_sub.csv", index=False)