In [1]:
import random
import pickle
import itertools
import xgboost
import lightgbm
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

%matplotlib inline

## read data

In [4]:
data = np.load("data.npy")

with open("rec_sim.pickle","rb") as f:
    rec_sim = pickle.load(f)

# append recommondation cosine similarity
data = data.tolist()
for i in range(len(data)):
    data[i].insert(-1,rec_sim[data[i][0]])


with open("1W.pickle","rb") as f:
    pr1 = pickle.load(f)

for i in range(len(data)):
    idx = data[i][0]
    if idx in pr1:
        data[i].insert(-1, pr1[idx][0])

with open("2W.pickle","rb") as f:
    pr2 = pickle.load(f)

for i in range(len(data)):
    idx = data[i][0]
    if idx in pr2:
        data[i].insert(-1, pr2[idx][0])


with open("amp_train.pickle","rb") as f:
    amp_train = pickle.load(f)

for i in range(len(data)):
    data[i].insert(-1, amp_train[data[i][0]])

data = np.array(data)
train, test = train_test_split(data[:,1:], train_size=0.8)
X_train, y_train = train[:,:-1], train[:,-1]
X_test, y_test = test[:,:-1], test[:,-1]

mean = np.mean(X_train, axis=0, keepdims=True)
std = np.std(X_train, axis=0, keepdims=True)
Xn_train = (X_train - mean) / std
Xn_test = (X_test - mean) / std



In [5]:
X_t = np.load("Xt_.npy")

with open("rec_sim_test.pickle", "rb") as f:
    rec_sim_test = pickle.load(f)

with open("test.pickle","rb") as f:
    pr_t = pickle.load(f)

with open("amp_test.pickle","rb") as f:
    amp_t = pickle.load(f)
    
X_t = X_t[:, :-1].tolist()

for i in range(len(X_t)):
    X_t[i].append(rec_sim_test[i+1])

for i in range(len(X_t)):
    X_t[i].append(pr_t[i])

for i in range(len(X_t)):
    X_t[i].append(amp_t[i])

X_t = np.array(X_t)
Xn_t = (X_t - mean) / std
print(X_t.shape)

(2000, 19)


In [6]:
class Ensemble:
    
    def __init__(self, k):
        
        self.fold = k
        
        self.clf = []
        
        clf = []
        for i in range(k):
            clf.append(xgboost.XGBClassifier(booster='dart',colsample_bylevel=0.85,
                                             learning_rate=0.05,max_depth=8,
                                             n_estimators=200,subsample=0.75))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(xgboost.XGBClassifier(booster='gbtree',colsample_bylevel=0.75,
                                             learning_rate=0.05,max_depth=6,
                                             n_estimators=200,subsample=0.6))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(lightgbm.LGBMClassifier(boosting_type='rf',bagging_freq=1,
                                               bagging_fraction=0.75,feature_fraction=0.75,
                                               num_leaves=20))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(lightgbm.LGBMClassifier(boosting_type='dart',learning_rate=0.15,
                                               subsample=0.5,
                                               num_leaves=20))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(lightgbm.LGBMClassifier(boosting_type='gbdt',learning_rate=0.1,
                                               subsample=0.5,max_depth=4,
                                               num_leaves=20))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(RandomForestClassifier(n_estimators=1000))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(AdaBoostClassifier(n_estimators=200, learning_rate=0.5))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(LogisticRegression(C=100))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(SVC(kernel="poly",C=7,coef0=1,degree=4))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(SVC(kernel="rbf",C=9,gamma=0.75))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(KNeighborsClassifier(n_neighbors=25))
        self.clf.append(clf)
        
        clf = []
        for i in range(k):
            clf.append(ExtraTreeClassifier(splitter="random",min_samples_split=26))
        self.clf.append(clf)
        
    def fit(self, X, y):
        
        skf = StratifiedKFold(self.fold)
        
        count = 0
        features = []
        for clf in self.clf:
            
            feature = np.zeros(len(X))
            for i, (train_index, test_index) in enumerate(skf.split(X, y)):
                
                Xtrain, ytrain = X[train_index], y[train_index]
                clf[i].fit(Xtrain, ytrain)
                feature[test_index] = clf[i].predict(X[test_index])
                
            feature = feature[:, np.newaxis]
            features.append(feature)
            count += 1
            print(count)
    
        features = np.concatenate(features, axis=1)
        features = np.concatenate([X, features], axis=1)
        
        params = {"learning_rate":[0.01,0.03,0.05,0.1],
                  "max_depth":[2,4,6,8,10],
                  "subsample":[0.6,0.75,0.85,1],
                  "colsample_bylevel":[0.6,0.75,0.85,1]}

        base = xgboost.XGBClassifier(n_estimators=200,booster="gbtree")
        self.meta_learner = GridSearchCV(base,params,n_jobs=-1,scoring='roc_auc',verbose=1)
        self.meta_learner.fit(features, y)
        return features
    
    def predict_proba(self, X):
        
        features = []
        for clfs in self.clf:
            
            preds = np.zeros(len(X))
            for clf in clfs:
                preds += clf.predict(X)
                
            feature = (preds / self.fold) > 0.5
            feature = feature[:, np.newaxis]
            features.append(feature)
            
        features = np.concatenate(features, axis=1)
        features = np.concatenate([X,features], axis=1)
        
        return self.meta_learner.predict_proba(features), features

In [7]:
model = Ensemble(3)

In [8]:
features = model.fit(Xn_train, y_train)

  if diff:
  if diff:
  if diff:


1


  if diff:
  if diff:
  if diff:


2


  if diff:
  if diff:
  if diff:


3


  if diff:
  if diff:
  if diff:


4


  if diff:
  if diff:
  if diff:


5
6
7
8
9
10
11
12
Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed: 17.8min finished


In [9]:
prob, tfeatures = model.predict_proba(Xn_test)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [10]:
print(roc_auc_score(y_test, np.squeeze(prob[:,1])))

0.968822868080134


In [11]:
model.meta_learner.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.85,
       colsample_bytree=1, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

In [12]:
model.meta_learner.best_estimator_.feature_importances_

array([1.14255175e-01, 1.08910270e-01, 1.08784504e-01, 1.81097910e-02,
       4.40168520e-03, 3.20694200e-03, 6.28812195e-05, 1.74180977e-02,
       3.20694200e-03, 6.16235938e-03, 1.09413322e-02, 8.61472636e-03,
       5.72219072e-03, 2.20084260e-03, 5.63415699e-02, 8.02364349e-02,
       1.18405335e-01, 1.03754006e-01, 1.36640891e-01, 7.41998386e-03,
       6.16235938e-03, 2.57812999e-03, 6.97981520e-03, 1.01238759e-02,
       8.74048937e-03, 1.01238759e-02, 3.45846685e-03, 6.85405266e-03,
       6.47676550e-03, 1.37709863e-02, 9.93523281e-03], dtype=float32)

In [13]:
y_pred_prob, ttfeature = model.predict_proba(Xn_t)
with open("pred_meta.csv","w") as f:
    f.write("Id,Prediction\n")
    for i in range(1,len(y_pred_prob)+1):
        f.write(str(i) + "," + str(y_pred_prob[i-1][1]) + "\n")

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [16]:
features.shape

31

In [14]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [45]:
batch = 200
epoch = 50

tf.reset_default_graph()

f_num = features.shape[-1]

X_in = tf.placeholder(tf.float32, [None, f_num])
y = tf.placeholder(tf.int32, [None])
y_oh = tf.one_hot(y, 2)

h1 = tf.layers.dense(X_in, f_num, activation=tf.nn.relu)
h2 = tf.layers.dense(h1+X_in, f_num, activation=tf.nn.relu)
out = tf.layers.dense(h2+h1, 2)
loss = tf.losses.softmax_cross_entropy(y_oh, out)
optimizer = tf.train.AdamOptimizer()
train_step = optimizer.minimize(loss)


pred = tf.nn.softmax(out)
_,acc = tf.metrics.accuracy(y, tf.argmax(pred, 1))
auc = tf.metrics.auc(y_oh, pred)

dataset = tf.data.Dataset.from_tensor_slices((features, y_train))
dataset = dataset.repeat(epoch)
dataset = dataset.batch(batch)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

In [46]:
with tf.device("/gpu:0"):
    with tf.Session() as sess:
        
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(iterator.initializer)
        
        count = 0
        while True:
            try:
                bf, by = sess.run(next_element)
                sess.run(train_step, feed_dict={X_in:bf, y:by})
                count += 1
                if count % 100 == 0:
                    a1, a2 = sess.run([acc, auc], feed_dict={X_in:tfeatures, y:y_test})
                    print("accuracy {}, auc {}".format(a1, a2))
            except tf.errors.OutOfRangeError:
                a1, a2 = sess.run([acc, auc], feed_dict={X_in:tfeatures, y:y_test})
                print("accuracy {}, auc {}".format(a1, a2))
                prob = sess.run(pred, feed_dict={X_in:ttfeature})
                break

accuracy 0.8955000042915344, auc (0.0, 0.9505647)
accuracy 0.8955000042915344, auc (0.9505647, 0.9532799)
accuracy 0.8953333497047424, auc (0.9532799, 0.95418054)
accuracy 0.8953750133514404, auc (0.95418054, 0.9548164)
accuracy 0.8955000042915344, auc (0.9548164, 0.95537287)
accuracy 0.8957083225250244, auc (0.95537287, 0.9558708)
accuracy 0.8956785798072815, auc (0.9558708, 0.9561421)
accuracy 0.8957187533378601, auc (0.9561421, 0.9563302)
accuracy 0.8957222104072571, auc (0.9563302, 0.95662004)
accuracy 0.895799994468689, auc (0.95662004, 0.9568718)
accuracy 0.8959090709686279, auc (0.9568718, 0.95702696)
accuracy 0.8958749771118164, auc (0.95702696, 0.95710117)
accuracy 0.8958653807640076, auc (0.95710117, 0.95728564)
accuracy 0.895892858505249, auc (0.95728564, 0.9574387)
accuracy 0.8959833383560181, auc (0.9574387, 0.95754987)
accuracy 0.8958906531333923, auc (0.95754987, 0.95759064)
accuracy 0.8959705829620361, auc (0.95759064, 0.957707)
accuracy 0.8960555791854858, auc (0.95770

In [47]:
with open("pred_meta.csv","w") as f:
    f.write("Id,Prediction\n")
    for i in range(1,len(prob)+1):
        f.write(str(i) + "," + str(prob[i-1][1]) + "\n")