In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from Tree_Machine import Tree_Machine, simple_batcher
from numpy import load
import math

# Read the pre-split feature matrices and label vectors from .npy files
# located in the current working directory.
X_tr, X_te = load('x_train.npy'), load('x_test.npy')
y_tr, y_te = load('y_train.npy'), load('y_test.npy')


#%%

# Baseline: one-hot encode the features and fit a logistic regression.
# The encoder is fitted on the train and test rows stacked together; every
# input is shifted by -1 before encoding (presumably the raw category codes
# are 1-based -- TODO confirm against the data files).
oh = OneHotEncoder().fit(np.vstack((X_tr, X_te)) - 1)
X_tr_sp, X_te_sp = (oh.transform(split - 1) for split in (X_tr, X_te))

logreg = LogisticRegression().fit(X_tr_sp, y_tr)

# Column 1 of predict_proba is used as the score for the test-set ROC AUC.
y_pred = logreg.predict_proba(X_te_sp)[:, 1]
print(roc_auc_score(y_te, y_pred))

# Fitted weights and bias, reused below to initialise the factor matrices.
coef, intercept = logreg.coef_[0], logreg.intercept_[0]
#%%

rank = 30  # must be >= the number of features: feature i writes into column i below
s_features=[7,2,21,19,943,1682,10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]

### Initialization of factor matrices

num_features = len(s_features)

# Offset in the flat coefficient vector at which each feature's slice begins
# (the extra trailing entry is the total width).
# NOTE(review): assumes the one-hot encoder produced exactly s_features[i]
# columns for feature i, in this order -- confirm against the encoder.
begin_feature = [0] + list(np.cumsum(s_features))

coef = logreg.coef_[0]
intercept = logreg.intercept_[0]

# MATRICES
# One (s_features[i] + 1) x rank float32 matrix per feature:
#   row 0   : ones in the first num_features columns, except column i,
#             which holds intercept / num_features;
#   rows 1+ : the logistic-regression coefficients of feature i in column i,
#             zeros everywhere else.
w_cores = []
for feat, width in enumerate(s_features):
    start = begin_feature[feat]
    core = np.zeros((width + 1, rank))
    core[0, :num_features] = 1
    core[0, feat] = intercept / num_features
    core[1:, feat] = coef[start:start + width]
    w_cores.append(core.astype(np.float32))
    
# CORES

# Number of tree levels, excluding the factor matrices; level 0 is the single
# matrix at the top (5 levels for 26 features).
levels = math.ceil(np.log2(num_features))
# Number of tensors on the deepest level, which may be only partially filled
# (26 - 2**4 = 10 in this case).
num_tensors_last_level = num_features - 2 ** (levels - 1)

# rank x rank x rank array with ones on the super-diagonal, zeros elsewhere.
identity_tensor = np.zeros(shape=(rank, rank, rank))
diag = np.arange(rank)
identity_tensor[diag, diag, diag] = 1

# w_tensors[level] is the list of cores on that level: level 0 holds a
# rank x rank identity matrix, every deeper level holds references to the
# same shared identity_tensor object.
w_tensors = []
for level in range(levels):
    width = 2 ** level
    # Only the deepest level can hold fewer than 2**level tensors.
    if level == levels - 1 and 0 <= num_tensors_last_level < width:
        width = num_tensors_last_level
    if level == 0:
        w_tensors.append([np.eye(rank) for _ in range(width)])
    else:
        w_tensors.append([identity_tensor] * width)


#%%
# Values noted by the author: 0.001, 0.00001, 1.1, lr=1e-4
# NOTE(review): presumably an earlier init_std / reg / exp_reg / learning-rate
# setting -- confirm; only some of them match the arguments used below.

# Create the Tree_Machine model, load the initial factor matrices and tensor
# cores built above, then build the graph and start the session (in that order).
# The meaning of init_std, reg and exp_reg is defined inside Tree_Machine,
# which is not visible in this file.
model = Tree_Machine(rank=rank, s_features=s_features, init_std=0.001, reg=5e-7, exp_reg=1.1) 
model.init_from_cores(w_cores) # factor matrix initialization
model.init_from_cores_tensors(w_tensors) # tensor cores initialization
model.build_graph()
model.initialize_session()

# Train for a fixed number of epochs and score the test set after every epoch.
# epoch_hist collects one stats dict per epoch:
#   {'epoch': int, 'train_logloss': float, 'test_auc': float}
epoch_hist = []
for epoch in range(21):
    # train phase
    loss_hist = []
    penalty_hist = []
    for x, y in simple_batcher(X_tr, y_tr, 256):
        # Targets are fed as 2*y-1 (i.e. 0/1 labels become -1/+1).
        fd = {model.X: x, model.Y: 2*y-1}
        run_ops = [model.trainer, model.outputs, model.loss, model.penalty]

        _, outs, batch_loss, penalty = model.session.run(run_ops, fd)

        loss_hist.append(batch_loss)
        penalty_hist.append(penalty)

    # Per-epoch averages over the mini-batches.
    epoch_train_loss = np.mean(loss_hist)
    epoch_train_pen = np.mean(penalty_hist)

    epoch_stats = {
        'epoch': epoch,
        'train_logloss': float(epoch_train_loss)
    }

    # test phase: evaluate on the full test set in a single run
    fd = {model.X: X_te, model.Y: 2*y_te-1}
    run_ops = [model.outputs, model.loss, model.penalty, model.penalized_loss]

    outs, raw_loss, raw_penalty, loss = model.session.run(run_ops, fd)

    # Despite its name, epoch_test_loss holds the test-set ROC AUC.
    epoch_test_loss = roc_auc_score(y_te, outs)
    # Store a plain float; a trailing comma here would store a 1-tuple instead.
    epoch_stats['test_auc'] = float(epoch_test_loss)
    print('{}: te_auc: {:.4f}'.format(epoch, epoch_test_loss))

    # Record this epoch's stats inside the loop so that every epoch is kept,
    # not just the last one.
    epoch_hist.append(epoch_stats)
    
        

Instructions for updating:
non-resource variables are not supported in the long term


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


0.7821230253636114
Instructions for updating:
Use `tf.global_variables_initializer` instead.
0: te_auc: 0.7471
1: te_auc: 0.7562
2: te_auc: 0.7672
3: te_auc: 0.7780
4: te_auc: 0.7793
5: te_auc: 0.7796
6: te_auc: 0.7799
7: te_auc: 0.7802
8: te_auc: 0.7803
9: te_auc: 0.7803
10: te_auc: 0.7803
11: te_auc: 0.7803
12: te_auc: 0.7803
13: te_auc: 0.7803
14: te_auc: 0.7801
15: te_auc: 0.7799
16: te_auc: 0.7796
17: te_auc: 0.7793
18: te_auc: 0.7789
19: te_auc: 0.7786
20: te_auc: 0.7782
