### M6: Binary map + Random Forest + kNN


In [1]:
%matplotlib inline
import math
import os
import data_util
import BMapModel
#from data_util import DataPoint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import faiss
import util
import scipy
# import joblib # version incompatible with sklearn's joblib; cannot load the previously saved model

from scipy.sparse import save_npz, load_npz
from sklearn.externals import joblib # store classifiers
from sklearn.preprocessing import MultiLabelBinarizer # convert y to {0,1}^L
from sklearn.preprocessing import StandardScaler # normalize features 
from sklearn.feature_extraction import DictVectorizer # extract feature vector to x
from numpy.random import normal # generate transforming matrix
from sklearn.neighbors import KDTree #KDTree for fast kNN search
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import average_precision_score
from joblib import Parallel, delayed # Multitread
from pytictoc import TicToc
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

Failed to load GPU Faiss: No module named swigfaiss_gpu
Faiss falling back to CPU-only.


In [2]:
!ls -R ../data

../data:
AmazonCat	   Bibtex	   Eurlex     README_Datasets
AmazonCat-14K	   Delicious	   Mediamill  Wiki10
AmazonCat-14K.zip  DeliciousLarge  RCV1-x     XMLDatasetRead

../data/AmazonCat:
amazonCat_test.txt  amazonCat_train.txt  X_te.npz  X_tr.npz  Y_te.npz  Y_tr.npz

../data/AmazonCat-14K:
amazonCat-14K_test.txt	 X_te.npz  Y_te.npz
amazonCat-14K_train.txt  X_tr.npz  Y_tr.npz

../data/Bibtex:
Bibtex_data.txt     bibtex_tstSplit.txt  X_tr.npz  Y_tr.npz
bibtex_trSplit.txt  X_te.npz		 Y_te.npz

../data/Delicious:
Delicious_data.txt	X_te.npz  X_tr.pkl  Y_tr.npz
delicious_trSplit.txt	X_te.pkl  Y_te.npz  Y_tr.pkl
delicious_tstSplit.txt	X_tr.npz  Y_te.pkl

../data/DeliciousLarge:
deliciousLarge_test.txt   X_te.npz  Y_te.npz
deliciousLarge_train.txt  X_tr.npz  Y_tr.npz

../data/Eurlex:
eurlex_test.txt  eurlex_train.txt  X_te.npz  X_tr.npz  Y_te.npz  Y_tr.npz

../data/Mediamill:
Mediamill_data.txt  mediamill_trSplit.txt  mediamill_tstSplit.txt

../data/RCV1-x:
rc

In [3]:
ls ../data/Delicious/Delicious_data.txt

[0m[01;32m../data/Delicious/Delicious_data.txt[0m*


In [5]:
# Notebook configuration: data/model locations, parallelism and embedding size.
data_dir = "../data"
model_dir = "../model/model6"
# NOTE(review): train_filename / test_filename exist only as the commented-out
# lines below, yet later plotting cells format `train_filename` into titles.
#train_filename = "/AmazonCat-14K/amazonCat-14K_train.txt"
#test_filename = "/AmazonCat-14K/amazonCat-14K_test.txt"
#tr_split_file = "/Bibtex/bibtex_trSplit.txt"
#te_split_file = "/Bibtex/bibtex_tstSplit.txt"

# Dataset sub-directory; appended to both the model and the data root.
path = "/Delicious"
model_path = model_dir + path
data_path = data_dir + path
# Passed as n_jobs to joblib.Parallel in later cells.
num_core = -1
L_hat_ratio = 0.5 # scaling ratio for L_hat = k*log(n)*ratio (only referenced by a commented-out formula later)
# Embedding dimension handed to util.map_2_z.
L_hat = 100
# Shared TicToc timer used by the tic()/toc() calls throughout the notebook.
time = TicToc()

In [5]:
#tr_data, num_point, num_feature, num_label = data_util.read_file(data_dir+train_filename)
#te_data, _, _, _ = data_util.read_file(data_dir+test_filename)
#tr_split = data_util.split_data(data=tr_data, split_file=data_dir+tr_split_file)
#te_split = data_util.split_data(data=tr_data, split_file=data_dir+te_split_file)
#print("num_point={}, num_label={}, num_feature={}".format(num_point, num_label, num_feature))

In [None]:
# Convert the raw train/test data into the X/Y matrices used by the rest of the notebook.
# NOTE(review): tr_data, te_data and num_label are only assigned by the
# commented-out data_util.read_file calls in the preceding cell, so this cell
# fails on a fresh kernel unless those lines are restored — confirm.
time.tic()
X_tr, Y_tr, X_te, Y_te = data_util.data_transform(tr_data, te_data, num_label)
time.toc('data preprocessing')

In [None]:
X_tr.shape, X_te.shape, Y_tr.shape, Y_te.shape

In [None]:
# Cache the four matrices as <data_path>/<name>.npz so later runs can reload them.
for name, x in (('X_tr', X_tr), ('X_te', X_te), ('Y_tr', Y_tr), ('Y_te', Y_te)):
    target = os.path.join(data_path, name + '.npz')
    save_npz(target, x)

In [6]:
# Reload the cached train/test feature and label matrices from data_path.
X_tr, X_te, Y_tr, Y_te = (
    load_npz(os.path.join(data_path, name + '.npz'))
    for name in ('X_tr', 'X_te', 'Y_tr', 'Y_te')
)

In [7]:
X_tr.shape, X_te.shape, Y_tr.shape, Y_te.shape

((12920, 500), (3185, 500), (12920, 983), (3185, 983))

### Step 1: map to $\hat L$ space and kNN search index

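We apply $$\hat L = k \log L$$ where $k$ indicates the sparsity (number of non-zero entries) of each label vector $y_i \in \{0,1\}^L$. By default we choose $k$ to be a high percentile of the per-sample sparsity rather than the maximum, to avoid extreme cases (the cell below takes the 99th percentile).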

The "Delicious" data loaded above contains $L = 983$ labels; we map it into an $\hat L = 100$ dimensional space.

In [8]:
avg_k = Y_tr.getnnz() / float(Y_tr.shape[0])
avg_k

19.035371517027865

In [9]:
# Labels per training point, largest first; k is the count found 1% of the way down that ranking.
labels_per_point = sorted((Y_tr[i, :].getnnz() for i in range(Y_tr.shape[0])), reverse=True)
k = labels_per_point[int(X_tr.shape[0]*0.01)]
k

25

In [10]:
#L_hat = int(math.ceil(avg_k * math.log(Y_tr.shape[1], 2) * L_hat_ratio))
L_hat

100

In [11]:
time.tic()
Z_tr = util.map_2_z(Y_tr, L_hat)
time.toc()

Elapsed time is 0.413983 seconds.


In [12]:
Z_tr.shape

(12920, 100)

In [13]:
# sparsity in embedding space
(Z_tr == 1).sum() / float(Z_tr.shape[0] * Z_tr.shape[1])

0.4808877708978328

### Step 2: Train Model

#### 2.1 train binary classifiers on each bit

In [16]:
# Fit a single random forest on X_tr against all columns of Z_tr at once and
# record the fit duration in training_time (only clf.fit sits between tic and tocvalue).
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=1)
time.tic()
clf.fit(X_tr, Z_tr)
training_time = time.tocvalue()

In [15]:
# Persist the fitted forest (compressed) under model_path.
joblib.dump(clf, os.path.join(model_path , 'label0.pkl'), compress=3)# only one classifier; file name kept for convention

KeyboardInterrupt: 

In [None]:
time.tic()
clf = joblib.load(os.path.join(model_path , 'label0.pkl'))
time.toc()

In [None]:
clf # test if load successfully

#### 2.2 Store the lower degree space info for kNN

We use the open-source faiss library from FAIR to speed up the ANN (Approximate Nearest Neighbor) search.

When the dimension and data size are relatively small, we use brute-force kNN search.

In [17]:

# faiss brute force search: a "Flat" index with the L2 metric over the rows of Z_tr
nn_index = faiss.index_factory(Z_tr.shape[1], "Flat", faiss.METRIC_L2)   # build the index
time.tic()
nn_index.add(Z_tr.astype('float32'))  # vectors are cast to float32 before being added
time.toc()


Elapsed time is 0.003482 seconds.


```Python
# index created by index factory, Approximate kNN search
nn_index = faiss.index_factory(Z_tr.shape[1], "IVF100,Flat", faiss.METRIC_L2) # need train
nn_index.train(Z_tr.astype('float32'))
nn_index.add(Z_tr.astype('float32'))
print "nlist = {}".format(nn_index.nlist) # number of clusters, only IVF indexes have this
print "nprobe = {}".format(nn_index.nprobe)
nn_index.nprobe = 1 # number of clusters to search through, only IVF indexes have this; needs to be validated
```

### Step 3 Prediction and Validation

Test with RandomForest `predict` and `predict_prob` to generate z_pred, and then use kNN to find y.

In [18]:
# Predictor that combines the stored classifier(s) under model_path with the kNN index over Z_tr.
# Parallelism follows the notebook-level num_core setting instead of a duplicated literal.
# NOTE(review): L_hat=1 differs from the notebook-level L_hat; presumably it is the
# number of stored classifier files — confirm against BMapModel.
model = BMapModel.BM_Predictor(Y_tr.shape[1], L_hat=1, index=nn_index, 
                               Y_tr=Y_tr, model_path=model_path, num_core=num_core)

In [None]:
# Time the end-to-end prediction using the forest's hard class predictions.
time.tic()
Y_pred = model.predict_y(X_te, vote=30, classifier='RandomForest') # vote=30 neighbours vote, not 1 — presumably; confirm vote semantics in BMapModel
time.toc()

In [None]:
# Time the end-to-end prediction with predict_prob=True.
time.tic()
Y_pred_p = model.predict_y(X_te,  vote=30, classifier='RandomForest', predict_prob=True) # vote=30 neighbours vote, not 1 — presumably; confirm vote semantics in BMapModel
time.toc()

In [None]:
import util
# Report precision@k for k = 1, 3, 5 on the classification-based prediction.
# print is called with one parenthesised argument so the cell runs on Python 2 and 3 alike.
for i in np.arange(1,6,2):
    print("p@{} for classification:\t {}\n".format(i, util.precision_at_k(Y_te, Y_pred, i)))

In [None]:
# Report precision@k for k = 1, 3, 5 on the probability-based prediction.
# print is called with one parenthesised argument so the cell runs on Python 2 and 3 alike.
for i in np.arange(1,6,2):
    print("p@{} for probability predict:\t {}\n"
          .format(i, util.precision_at_k(Y_te, Y_pred_p, i)))

In [None]:
time.tic()
z_prob = model.predict_prob_z(X_te)
time.toc('predict prob')

In [None]:
time.tic()
dist, ind = nn_index.search(z_prob.astype('float32'), 100)
#time.toc('Approximate-kNN search with probe={}'.format(model.index.nprobe))
time.toc('kNN')

In [None]:
time.tic()
# For every test point, sum the training label rows of its retrieved neighbours.
# NOTE(review): the toc label says "weighted", but np.sum applies no distance weights here.
Y_pred_t = np.array([np.sum([Y_tr[indij] for indij in indi], axis=0)\
                     for indi in ind])
time.toc('weighted votes on kNN')

In [None]:
util.precision_at_k(Y_te, Y_pred_t)

In [None]:
time.tic()
# Run the per-sample vote in parallel over (neighbour indices, distances) pairs.
# NOTE(review): `vote` is not defined or imported in any cell visible in this
# notebook — confirm where it comes from before running on a fresh kernel.
Y_pred = Parallel(n_jobs=num_core)\
    (delayed(vote)(indi, disti, True) for indi, disti in zip(ind, dist))
time.toc()

In [None]:
util.precision_at_k(Y_te, Y_pred)

Result (vote = 100, kNN = brute force)

|Large DataSet  |precision@k| RF Classification | RF predict prob | DiSMEC | SLEEK | FastXML
|----|----|----|----|----|----|----|
|RCV1-2k        |p@1        | 80.6              | 80.9            | *      | *     | * 
|               |p@2        | 68.0              | 68.2            | *      | *     | * 
|               |p@3        | 63.9              | 64.1            | *      | *     | *  
|Wiki10-31k     |p@1        | 80.80             | 80.81           | 85.20  | **85.88** | 83.03 
|               |p@3        | 62.74             | 64.65           | **74.60**  | 72.98 | 67.74  
|               |p@5        | 54.91             | 56.54           | **65.90**  | 62.70 | 57.76 
|Delicious-200k |p@1        | 38.63             | 39.92           | 45.50  | **47.85** | 43.07
|               |p@3        | 35.83             | 36.89           | 38.70  | **42.21** | 38.66  
|               |p@5        | 34.20             | 35.15           | 35.50  | **39.43** | 36.19 
|AmazonCat-13k  |p@1        | 78.51             | 78.59           | 93.40  | 90.53 | 93.11
|               |p@3        | 63.62             | 63.77           | 79.10  | 76.33 | 78.20  
|               |p@5        | 48.72             | 48.66           | 64.10  | 61.52 | 63.41 




Small Dataset:

|Small Dataset  |precision@k| RF Classification | RF predict prob | DiSMEC | SLEEK | FastXML
|----|----|----|----|----|----|----|
|Delicious      |p@1        | 63.17             | 64.27            | *      | **67.59** | 67.13
|               |p@2        | 58.18             | 58.79            | *      | 61.38 | **62.33** 
|               |p@3        | 53.38             | 54.33            | *      | 56.56 | **58.62** 
|Bibtex         |p@1        | 59.80             | 60.00            | *      | **65.08** | 63.46
|(Voter=30)     |p@2        | 31.58             | 31.96            | *      | **39.64** | 39.22 
|               |p@3        | 22.26             | 22.40            | *      | 28.87 | **29.14**


In [19]:
def validate_voter(voter, use_prob):
    """Return precision@1 on the test set for a given number of kNN voters.

    voter    -- value passed as `vote` to model.predict_y
    use_prob -- value passed as `predict_prob` to model.predict_y

    Relies on the notebook-level `model`, `X_te` and `Y_te`.
    """
    predictions = model.predict_y(
        X_te, vote=voter, classifier='RandomForest', predict_prob=use_prob)
    score = util.precision_at_k(Y_te, predictions, 1)
    return score

In [None]:
p_at_k_votes_prob = Parallel(n_jobs=num_core)\
                    (delayed(validate_voter)(voter, True) for voter in np.arange(1, 151, 10))

In [None]:
p_at_k_votes = Parallel(n_jobs=num_core)\
                    (delayed(validate_voter)(voter, False) for voter in np.arange(1, 151, 10))

In [None]:
# p@1 as a function of the number of kNN voters (same voter grid as the sweeps that produced the scores).
#plt.plot(np.arange(1, 151, 10), p_at_k_votes_prob, label='pred_prob')
plt.plot(np.arange(1, 151, 10), p_at_k_votes, label='pred')
plt.xlabel('number of voters in kNN')
plt.ylabel('p@1 score')
# `train_filename` is only assigned in a commented-out config line, so the title uses
# the dataset `path` instead; it also names the plain prediction, which is the series plotted.
plt.title('L_hat={}, RandomForest predict, {}'.format(L_hat, path.strip('/')))
plt.legend()
# Best score and its position in the voter grid, for both prediction modes.
top = np.argmax(p_at_k_votes)
(p_at_k_votes[top], top), (np.max(p_at_k_votes_prob), np.argmax(p_at_k_votes_prob))

#### 3.3 optimize hyperparameter
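Use k-fold cross-validation to optimize over $\hat L$. (The k-fold loop is currently commented out in the cell below, which scores each $\hat L$ on the fixed train/test split.)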

In [None]:
from sklearn.ensemble import RandomForestClassifier
# validate the result with different L_hat under the same model
def validate(L_hat, pk=1, vote=20, use_prob=False): # simple forkable parallel for loop body
    """Train and score the whole pipeline for one embedding size.

    L_hat    -- embedding dimension handed to map_2_z
    pk       -- k used for precision@k
    vote     -- value passed as `vote` to predict_y
    use_prob -- value passed as `predict_prob` to predict_y

    Uses the notebook-level X_tr/Y_tr for training and X_te/Y_te for scoring,
    and returns precision_at_k(Y_te, prediction, k=pk).
    """
    from util import map_2_z
    from util import precision_at_k

    # for train_index, test_index in k_fold.split(X_tr):
    x_train, y_train = X_tr, Y_tr
    x_test, y_test = X_te, Y_te

    # embed the training labels and index the embeddings with a brute-force faiss index
    z_train = map_2_z(y_train, L_hat)
    knn_index = faiss.index_factory(z_train.shape[1], "Flat", faiss.METRIC_L2)
    knn_index.add(z_train.astype('float32'))

    # fit the forest on the embedded labels
    forest = RandomForestClassifier(random_state=1, n_estimators=100)
    forest.fit(x_train, z_train)

    # wrap forest + index in a predictor
    predictor = BMapModel.BM_Predictor(Y_tr.shape[1], 1, index=knn_index, Y_tr=y_train)
    predictor.clfs.append(forest)

    # predict on the test split and score it
    y_pred = predictor.predict_y(x_test, vote=vote, weighted=True, classifier='RandomForest', predict_prob=use_prob)
    return precision_at_k(y_test, y_pred, k=pk)

In [None]:
# Optimize L_hat's value on the metric precision@k
pk=1
vote=100
L_hat_range = range(1, 200)

In [None]:
L_hat_score = Parallel(n_jobs=num_core)\
    (delayed(validate)(L_hat, pk, vote) for L_hat in L_hat_range)

In [None]:
L_hat_score_prob = Parallel(n_jobs=num_core)\
    (delayed(validate)(L_hat, pk, vote, True) for L_hat in L_hat_range)

In [None]:
# The x-axis reuses L_hat_range so it cannot drift from the values actually swept.
line_down, = plt.plot(L_hat_range, L_hat_score, label='random forest performance')
plt.xlabel('L_hat')
plt.ylabel('precision@{}'.format(pk))
plt.title('Delicious validation on L_hat')

### 3.4 Bit Flip Probability
The classifiers' prediction $\hat z$ can be viewed as the result of transmitting $z$ through a BSC (binary symmetric channel) with some bit-flip probability; this probability reflects the prediction accuracy.

In [None]:
L_hat, X_te.shape[0]

In [None]:
def validate_channel(X_te, Y_te):
    """Per-sample fraction of embedding bits the classifiers get wrong.

    X_te -- features to predict from
    Y_te -- true labels; embedded with util.map_2_z using the notebook-level L_hat

    Returns an array with one entry per row: the number of positions where the
    predicted z differs from the true z, divided by the embedding width.
    """
    z_te = util.map_2_z(Y_te, L_hat)
    # predict z_hat with a predictor built on model_path (presumably it loads the stored classifiers)
    predictor = BMapModel.BM_Predictor(Y_tr.shape[1], L_hat=1, model_path=model_path,)
    z_pred = predictor.predict_z(X_te)

    n_rows, n_bits = z_te.shape[0], z_te.shape[1]
    mismatches = [(z_pred[row]!=z_te[row]).sum() for row in range(n_rows)]
    return np.array(mismatches) / float(n_bits)

In [None]:
test_error = validate_channel(X_te, Y_te)

In [None]:
sns.distplot(test_error, bins=20)
plt.xlabel('error rate between z_te and z_pred')
plt.ylabel('density')
plt.title('Delicious: distribution of bit_flip')

In [None]:
test_error.mean()

In [None]:
training_error = validate_channel(X_tr, Y_tr)

In [None]:
# Distribution of the per-sample bit error rate on the training set.
sns.distplot(training_error, bins=20)
plt.xlabel('error rate between z_train and z_pred')
plt.ylabel('density')
# `train_filename` is only assigned in a commented-out config line; label the plot with the dataset `path`.
plt.title('{}: distribution of training error'.format(path.strip('/')))

In [None]:
training_error.mean()

In [None]:
# f1 score

### 3.5 Train and test model directly on the X and Y 

In [None]:
model_dir_mirror = model_dir+"_origin"
model_path_mirror = model_dir_mirror+path

In [None]:
classifier = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1)
time.tic()
classifier.fit(X=X_tr, y=Y_tr)
time.toc()
#joblib.dump(classifier, os.path.join(model_path_mirror , 'RandomForestClassifier.pkl'))# only one classifiers, name for convention

In [None]:
#classifier = joblib.load(os.path.join(model_path_mirror , 'RandomForestClassifier.pkl'))            

In [None]:
time.tic()
y_pred = classifier.predict(X_te)
time.toc('predict')

In [None]:
# Per-sample fraction of label positions where the direct prediction differs from Y_te.
hamming = np.array(
    [(y_pred[i]!=Y_te[i]).sum() for i in range(y_pred.shape[0])]
) / float(Y_te.shape[1])

In [None]:
# Distribution of the per-sample label error rate of the directly trained forest.
sns.distplot(hamming)
plt.xlabel('error_rate')
plt.ylabel('density')
# `train_filename` is only assigned in a commented-out config line; label the plot with the dataset `path`.
plt.title('test error rate distribution, {}, Randomforest, mean={}'.format(path.strip('/'), hamming.mean()))

In [None]:
time.tic()
y_pred_prob = classifier.predict_proba(X_te) #for every label there's a 2D (sample, output(2)) probability
time.toc('predict_proba')

In [None]:
# Take column 1 of every per-label probability array, stack them and transpose
# (presumably giving one row per sample and one column per label).
# NOTE(review): prob[:, 1] assumes each per-label array has a second column —
# confirm this holds for labels that have a single class in the training data.
y_prob = np.ascontiguousarray(np.array([prob[:, 1] for prob in y_pred_prob]).T)

In [None]:
y_prob.shape

In [None]:
Y_te.shape

**This is the precision@k of RandomForest.predict_prob training directly on X_tr and Y_tr**

In [None]:
util.precision_at_k(Y_te, y_prob, 3)

predict_prob + kNN

In [None]:
# Brute-force L2 index over the training label vectors themselves.
Y_tr_index = faiss.index_factory(Y_tr.shape[1], "Flat", faiss.METRIC_L2)   # build the index
# Y_tr comes from load_npz and is therefore a scipy sparse matrix, while faiss
# expects a dense float32 array, so densify before adding.
Y_tr_index.add(Y_tr.astype('float32').toarray())

In [None]:
# Look up the single nearest training label vector for each predicted probability
# vector, then sum the label rows of the returned neighbours per test point.
dist, ind = Y_tr_index.search(y_prob.astype('float32'), 1)
y_prob_vote = np.array([np.sum([Y_tr[neighbour] for neighbour in row], axis=0) for row in ind])

In [None]:
util.precision_at_k(Y_te, y_prob_vote, 1)

In [None]:
y_prob[0]

In [None]:
y_pred[0]

In [None]:
def flip_bits(message, p0, p1):
    '''
    Randomly flip every "1" to "0" with probability p1 and every "0" to "1"
    with probability p0, independently per entry.

    message -- array-like of bits (any shape); it is not modified
    p0      -- flip probability applied to entries equal to 0
    p1      -- flip probability applied to entries equal to 1

    Returns a new array of the same shape. Entries that are neither 0 nor 1
    are returned unchanged.

    Each entry is flipped at most once: both masks are computed from the
    original message, so a "1" that becomes "0" cannot be flipped back by the
    p0 rule. The result is deterministic because a local generator seeded with
    0 is used; the global NumPy random state is left untouched.
    '''
    message = np.asarray(message)
    rng = np.random.RandomState(0)
    # one uniform draw in [0, 1) per entry decides whether that entry flips
    draws = rng.random_sample(message.shape)
    flipped = message.copy()
    flipped[(message == 1) & (draws < p1)] = 0
    flipped[(message == 0) & (draws < p0)] = 1
    return flipped

In [None]:
flip_bits(np.array([[1,0,1],[0,1,1]]), 0,0)