# Sythetic Multilabel Classification

## Data Generation
 y <== L dimension one-hot vector, each entry represent a label
 
 X <== y + N(0, $\sigma$)
 
 ## Train Process
 
* y $\in [0,1]^L$
 
* $\bar y = sign(M\cdot y) \in [0,1]^{\bar L}$, where M is a iid gaussian entry embedding matrix (store $\bar Y$ into local files for Matlab)
 
* $\tilde y = \textbf{BCHencode}(\bar y) \in [0,1]^{\tilde L}$ (need to use Matlab)
 
* Train multi-label random forest on $X, \tilde y$ 

**Notice**: for the BCH code, we choose the message length to be 67, codeword length to be 511, the error correction bit is 87. 

The error correction rate is 0.17

## Implement general One vs All classifier

In [1]:
from pytictoc import TicToc
time = TicToc()

In [2]:
from util import OvsA


In [3]:
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
def fit_bit(method, X, y):
    return method().fit(X, y)

def predict_bit(clf, X):
    return clf.predict(X)

class OvsA():
    '''
    use OvsA technic to predict one bit in y by a base classifier
    '''
    def __init__(self, method=LogisticRegression, n_jobs=-1):
        '''
        method: 
            the function to generate the base classifiers.
        '''
        self.method = method
        self.n_jobs = n_jobs
        
    def predict(self, X):
        bits = Parallel(n_jobs=self.n_jobs)(delayed(predict_bit)(clf, X)
                                           for clf in self.clfs)
#         bits = [clf.predict(X) for clf in self.clfs]
        return np.stack(bits, axis=1)
    
    def fit(self, X, y):
        self.clfs = Parallel(n_jobs=self.n_jobs)(delayed(fit_bit)(self.method, X, y[:,i]) 
                                     for i in range(y.shape[1]))
#         self.clfs = [self.method().fit(X, y[:, i]) for i in range(y.shape[1])]

In [4]:
class CombinedClf():
    '''
    use two different multilabel classifier classifier for
        X -> y[:, :x] and X -> y[:, x:]
    '''
    def __init__(self, clf1, clf2, seperate):
        '''
        input:
            method1,2: 
                method that return a new classifier needed
            sperate:
                the part that seperates which classifier to use
            n_jobs:
                number of treads to use
        '''
        self.clf1 = clf1
        self.clf2 = clf2
        self.seperate = seperate
        
    def predict(self, X):
        y1 = self.clf1.predict(X)
        y2 = self.clf2.predict(X)
        return np.append(y1, y2, axis=1)
    
    def fit(self, X, y):
        self.clf1.fit(X, y[:, :self.seperate])
        self.clf2.fit(X, y[:, self.seperate:])
        

## Training Process

In [5]:
import numpy as np
from numpy.random import binomial
from numpy.random import normal
from numpy.random import randint
import numpy as np
np.random.seed(42)

In [30]:
# constants
SPARSE = 0.05 # sparsity of label vectors
SIGMA = 0. # standard diveation of noise
FLIP_RATE = 0.005 # probability that bits in y flipped in bits of X
L = 500 # feature and label dimension
N = 10000 # number of data points
voter = 30 # number of nearest neighbors to search

L_bar = 45 # embedding dimension, also the message length for BCH code
L_tilde = 255 # codeword length for BCH encoder

In [31]:
# generate synthetic data
y = binomial(1, SPARSE, size=(N, L)) # iid Bernoulli entries
#X = y + normal(loc=0, scale=SIGMA, size=(N, L))
flip_bits = binomial(1, FLIP_RATE, size=(N, L))
X = y^flip_bits
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [34]:
# Source encode + KNN searcher
M = normal(size=[L, L_bar])
y_train_bar = (np.sign(y_train.dot(M))+1)/2
y_test_bar = (np.sign(y_test.dot(M)) + 1) / 2
import faiss
nn_index = faiss.index_factory(y_train_bar.shape[1], "Flat", faiss.METRIC_L2)   # build the index
nn_index.add(y_train_bar.astype('float32'))

Failed to load GPU Faiss: No module named swigfaiss_gpu
Faiss falling back to CPU-only.


In [35]:
# save y_bar to matlab file
from scipy.io import savemat, loadmat
savemat(file_name="../.temp/train/y_bar", mdict={'y_bars':[y_train_bar],
                                                 'y_test':y_test_bar,
                                                 'L_tilde':L_tilde
                                                })

In [36]:
y_train_bar.shape

(6700, 45)

----

Using **Matlab** to encode $\bar y$ into $\tilde y$ ...

----

In [37]:
# load the y_tilde file generated by matlab
from scipy.io import savemat, loadmat
y_tildes = loadmat("../.temp/train/y_tilde")['y_tildes'].astype('float')
y_train_tilde = y_tildes[0]
y_test_tilde = loadmat("../.temp/train/y_tilde")['y_test_tilde'].astype('float')

In [39]:
y_train_tilde.shape

(6700, 255)

In [38]:
float(y_train_tilde.sum()) / (y_train_tilde.shape[0] * y_train_tilde.shape[1])

0.4986713491366696

In [64]:
# train the random forest multi-label classifier
from pytictoc import TicToc
time = TicToc()
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
#clf = OvsA(SVC)
# clf = CombinedClf(OvsA(SVC), RandomForestClassifier(n_estimators=48, n_jobs=-1),
#                   seperate = L_bar)
clf = CombinedClf(OvsA(SVC), MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000),
                  seperate = L_bar)

time.tic()
clf.fit(X_train, y_train_tilde)
time.toc("train classifier")

train classifier 330.299255 seconds.


## Testing Process

In [65]:
time.tic()
y_tilde_hat = clf.predict(X_test)
time.toc("prediction")

prediction 99.326697 seconds.


In [66]:
y_test_tilde.shape

(3300, 255)

In [67]:
y_tilde_hat.shape

(3300, 255)

In [68]:
# random forest prediction error
1-(y_test_tilde == y_tilde_hat).sum() / float(y_test_tilde.shape[0] * y_test_tilde.shape[1])

0.4435448603683898

In [69]:
# random forest prediction error for embedding bits
1-(y_test_tilde[:, :45] == y_tilde_hat[:, :45]).sum() / float(y_test_tilde[:, :45].shape[0] * y_test_tilde[:, :45].shape[1])

0.18396632996633

In [70]:
# random forest prediction error for parity bits
1-(y_test_tilde[:, 46:] == y_tilde_hat[:, 46:]).sum() / float(y_test_tilde[:, 46:].shape[0] * y_test_tilde[:, 46:].shape[1])

0.49919240249383789

In [58]:
from scipy.io import savemat, loadmat
savemat(file_name="../.temp/test/y_tilde_hat", 
        mdict={'y_tilde_hats':[y_tilde_hat],
               'L_bar':L_bar
              }
       )

In [59]:
y_tilde_hat.shape

(3300, 255)

----

Using **Matlab** to decode $\hat{\tilde y}$ into $\hat{\bar y}$ ...

----

In [None]:
# load the y_tilde file generated by matlab
from scipy.io import savemat, loadmat
y_bar_hats = loadmat("../.temp/test/y_bar_hat.mat")['y_bar_hats'].astype(int)
y_bar_hat = y_bar_hats[0]
#y_bar_hat = clf_.predict(X_test)

In [None]:
# use KNN searcher to recover the predicted y_hat
dist, ind = nn_index.search(np.ascontiguousarray(y_bar_hat.astype('float32')), voter)
y_hat = np.stack([
    np.sum(np.array([
        y_train[indij].astype('float32')/float(distij**2 + 0.01) for indij, distij in zip(indi, disti)
    ]), axis=0)
    for indi, disti in zip(ind, dist)
], axis=0)

In [None]:
def precision_at_k(truth, vote, k=1):
    assert(truth.shape == vote.shape)
    success = 0
    for i in range(truth.shape[0]):
        topk = np.argpartition(vote[i], -k)[-k:]
        success += truth[i, topk].sum()
    return success / ((float(truth.shape[0]))*k)

In [None]:
precision_at_k(y_test, y_hat, 1)

In [None]:
1-(y_test == y_hat)

In [None]:
y_predict_bar == y_bar_hat

### Simple Results

* random forest classifer for multi-label task

$\sigma$| p@1 | p@3 | p@5 
  ---   | --- | --- | --- 
    0   | 0.126 | 0.107 | 0.099
    0.1 | 0.112 | 0.091 | 0.085
    0.4 | 0.063 | 0.061 | 0.059
     
* OvsA with logistic regression 

$\sigma$| p@1 | p@3 | p@5 
  ---   | --- | --- | --- 
    0   | 0.281  | 0.231 | 0.209

In [None]:
y_test_tilde

In [None]:
y_tilde_hat