In [1]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re
import random
import arff

from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

# functions

In [2]:
def read_data(dataPath):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,'X.csv'))
    label = pd.read_csv(os.path.join(dataPath,'Y.csv'))
    return data,label

def evaluation(y_pred, y_prob, y_true):
    
    coverage = coverage_error(y_true, y_prob)
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = label_ranking_loss(y_true, y_prob) 
    
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')
    
    acc = 0
    for i in range(y_true.shape[0]):
        acc += jaccard_similarity_score(y_true.iloc[i,:], y_pred.iloc[i,:]) # jaccard_similarity_score
    acc = round(acc / y_true.shape[0],2)
    
    zero_one = zero_one_loss(y_true, y_pred) # 0-1 error 
    
    f1_each = metrics.f1_score(y_true, y_pred, average=None)
    
    performance = {"coverage_error":coverage,
                   "ranking_loss":ranking_loss,
                   "hamming_loss":hamming,
                   "f1_macro":f1_macro,
                   "f1_micro":f1_micro,
                   "Jaccard_Index":acc,
                   "zero_one_error":zero_one,
                   "f1_each_label":f1_each}
    return performance

def get_confusion_matrix(y_pred, y_test, column_names):
    """confusion matrix """
    confusion_matrix = pd.DataFrame(np.array(y_pred) - np.array(y_test), columns=column_names)
    pos = pd.DataFrame((np.array(y_pred) == np.array(y_test)) & (np.array(y_pred) == 1), columns=y_test.columns).sum(axis=0)
    neg = pd.DataFrame((np.array(y_pred) == np.array(y_test)) & (np.array(y_pred) == 0), columns=y_test.columns).sum(axis=0)
    for i in range(confusion_matrix.shape[1]): 
        name = confusion_matrix.iloc[:,i].name
        temp = confusion_matrix.iloc[:,i].value_counts()
        TP = pos[name]
        TN = neg[name]
        if 1 in temp.index:
            FP = temp[1]
        else:
            FP = 0
        if -1 in temp.index:
            FN = temp[-1]
        else:
            FN = 0

# read data



In [3]:
dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/enron/'
dataset = 'enron'
data, label = read_data(dataPath) # read data

# get data information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()
avg_instance_per_label = label.sum(axis=0).mean()
# print data information
print("\n--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance)
print("avgerage number of labels for an instance:",avg_label_per_instance)
print("avgerage number of positive instances for a label:",avg_instance_per_label,"the std:",sqrt(label.sum(axis=0).var()),"\n")

print("-- number of positive instances --")
print(label.sum(axis=0))


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702
avgerage number of labels for an instance: 3.3783783783783785
avgerage number of positive instances for a label: 108.49056603773585 the std: 199.46279994809805 

-- number of positive instances --
A.A8      26
C.C9      64
B.B12      6
C.C11     26
C.C5     108
C.C7      83
B.B2     913
B.B3      42
D.D16     10
A.A7      25
D.D1      12
A.A4     533
C.C2     125
A.A3     165
A.A1     855
D.D9      18
D.D19      9
B.B8      22
D.D12     38
D.D6      21
C.C8     107
A.A6     176
B.B9      55
A.A5      96
C.C10     77
B.B1     680
D.D5      13
B.B11      7
D.D2      20
B.B4     124
D.D15      3
C.C4      63
D.D8       7
B.B6      25
D.D3      22
D.D13      8
D.D7      13
C.C12     33
B.B7      13
C.C6     249
B.B5      15
D.D11     28
A.A2      49
C.C3      71
D.D10    130
D.D18      1
B.B13    311
D.D17      2
B.B10     18
C.C1

In [11]:
cooccurrence_matrix = label.T.dot(label)
np.fill_diagonal(cooccurrence_matrix.values, 0)
#cooccurrence_matrix.to_csv('/Users/jiangjunhao/Desktop/cooccurrence_matrix.csv', index=False)
cooccurrence_matrix

Unnamed: 0,A.A8,C.C9,B.B12,C.C11,C.C5,C.C7,B.B2,B.B3,D.D16,A.A7,...,C.C3,D.D10,D.D18,B.B13,D.D17,B.B10,C.C1,D.D4,C.C13,D.D14
A.A8,0,0,0,0,0,1,18,0,0,0,...,0,3,0,3,0,0,3,0,0,0
C.C9,0,0,0,1,3,2,47,2,0,0,...,3,8,0,15,0,1,7,1,0,0
B.B12,0,0,0,0,0,0,4,0,0,0,...,0,1,0,0,0,0,0,0,0,0
C.C11,0,1,0,0,4,0,15,0,0,0,...,0,5,0,7,0,0,6,0,0,1
C.C5,0,3,0,4,0,2,57,4,0,0,...,4,4,0,15,0,0,22,1,1,2
C.C7,1,2,0,0,2,0,46,2,3,0,...,2,15,0,15,0,2,6,2,1,0
B.B2,18,47,4,15,57,46,0,22,8,7,...,32,63,0,164,0,8,100,15,3,2
B.B3,0,2,0,0,4,2,22,0,0,0,...,0,1,0,16,0,0,4,2,0,0
D.D16,0,0,0,0,0,3,8,0,0,0,...,1,1,0,1,0,0,1,0,0,0
A.A7,0,0,0,0,0,0,7,0,0,0,...,0,2,0,23,0,0,0,0,0,0


# Binary Relevance 

## BR using naive Bayes classifier

In [12]:
def naiveBayes_multi_label_training(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    classifier_list = [MultinomialNB() for i in range(n_label)]
    for i in range(n_label):
        classifier_list[i].fit(X_train,y_train.iloc[:,i])
    
    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list):
    y_predict = pd.DataFrame()
    y_prob = pd.DataFrame()
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i)],axis=1)
        
        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i)],axis=1)
        
    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def BR_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    print("-- test index --")
    print(X_test.index)
    
    # training
    classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)
    
    # testing
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)
    
    # evaluation
    performance = evaluation(y_predict, y_prob, y_test)
    
    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # get confusion matrix
    get_confusion_matrix(y_predict, y_test, y_test.columns)
    
    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for i in range(n_label):
                print("label_"+label.columns[i],"=",round(value[i],2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))
            
# two fold cross-validation
def two_fold_BR_test(data, label, dataPath, n_iter=5, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # 2-fold cross validatiom
    KF=KFold(n_splits=2, shuffle=True, random_state=random_state)
    i = 0
    
    performance = {}
    for train_index,test_index in KF.split(data):
        i += 1
        
        X_train,X_test=data.iloc[train_index,:],data.iloc[test_index,:]
        y_train,y_test=label.iloc[train_index,:],label.iloc[test_index,:]
        
        print("--- kfold time="+str(i)+" ---")
        # training
        classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)
        # testing
        y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)
        
        # evaluation
        if performance == {}:
            performance = evaluation(y_predict, y_prob, y_test)
        else:
            performance_i = evaluation(y_predict, y_prob, y_test)
            for key, value in performance_i.items():
                performance[key] = (performance[key] + value)/2
            else:
                performance[key] = value
    
    # print data information
    print("\n--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # print performance
    print("--- 2 fold cross-validation Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            continue
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))

In [13]:
# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
BR_test(data, label, dataPath,3071980)

#print("")
#print("------ two_fold Binary Relevance using Naive Bayes ------")
#two_fold_BR_test(data, label, dataPath,3071980)

------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([ 104, 1008, 1089,  741,  217, 1455, 1567, 1687,  542,  717,
            ...
            1296,  474,  850,  367,  277,  629,  218,   72,  900,  171],
           dtype='int64', length=851)


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 

--- Performance ---
coverage_error = 17.06 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.13
hamming_loss = 0.11
f1_macro = 0.19
f1_micro = 0.42
Jaccard_Index = 0.89
zero_one_error = 0.99

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.14
label_B.B12 = 0.0
label_C.C11 = 0.12
label_C.C5 = 0.31
label_C.C7 = 0.38
label_B.B2 = 0.74
label_B.B3 = 0.09
label_D.D16 = 0.0
label_A.A7 = 0.09
label_D.D1 = 0.06
label_A.A4 = 0.57
label_C.C2 = 0.16
label_A.A3 = 0.29
label_A.A1 = 0.57
label_D.D9 = 0.09
label_D.D19 = 0.0
label_B.B8 = 0.18
label_D.D12 = 0.06
label_D.D6 = 0.0
label_C.C8 = 0.17
label_A.A6 = 0.29
label_B.B9 = 0.4
label_A.A5 = 0.24
label_C.C10 = 0.27
label_B.B1 = 0.62
label_D.D5 = 0.0
label_B.B11 = 0.12
label_D.D2 = 0.0
label_B.B4 = 0.74
label_D.D15 = 0.0
label_C.C4 = 0.13
label_D.D8 = 0.0
label_B.B6 = 0.19
label_D.D3 =

In [14]:
# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# split training and test data set
# X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# training
print("--- start training ---\n")
classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)

# testing
print("--- start testing ---\n")
y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)

# get confusion matrix
get_confusion_matrix(y_predict, y_test, y_test.columns)
    
# evaluation
performance = evaluation(y_predict, y_prob, y_test)


# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([   5,    7,    8,   15,   16,   17,   21,   23,   24,   31,
            ...
            1683, 1685, 1687, 1688, 1690, 1691, 1693, 1695, 1696, 1699],
           dtype='int64', length=851)
--- start training ---



  self.class_log_prior_ = (np.log(self.class_count_) -


--- start testing ---



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 

--- Performance ---
coverage_error = 17.24 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.13
hamming_loss = 0.11
f1_macro = 0.19
f1_micro = 0.41
Jaccard_Index = 0.89
zero_one_error = 0.98

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.19
label_B.B12 = 0.0
label_C.C11 = 0.0
label_C.C5 = 0.21
label_C.C7 = 0.24
label_B.B2 = 0.76
label_B.B3 = 0.0
label_D.D16 = 0.0
label_A.A7 = 0.1
label_D.D1 = 0.0
label_A.A4 = 0.58
label_C.C2 = 0.1
label_A.A3 = 0.36
label_A.A1 = 0.54
label_D.D9 = 0.07
label_D.D19 = 0.04
label_B.B8 = 0.5
label_D.D12 = 0.03
label_D.D6 = 0.0
label_C.C8 = 0.18
label_A.A6 = 0.26
label_B.B9 = 0.47
label_A.A5 = 0.23
label_C.C10 = 0.28
label_B.B1 = 0.64
label_D.D5 = 0.0
label_B.B11 = 0.15
label_D.D2 = 0.0
label_B.B4 = 0.81
label_D.D15 = 0.0
label_C.C4 = 0.15
label_D.D8 = 0.0
label_B.B6 = 0.16
label_D.D3 = 0.1

## BR using ESKDB

In [15]:
def get_arff(word_occurrence, label_matrix, savePath): # get attributes
    for z in range(len(label_matrix.columns)):
        attributes=[(word_occurrence.columns[i],list(map(str,sorted(word_occurrence.iloc[:,i].unique())))) for i in range(len(word_occurrence.columns))]
        attributes.append(('label_'+label_matrix.columns[z],['0', '1']))

        data=[]
        i = 0
        while i < label_matrix.shape[0]:
            attr_data = [str(j) for j in list(word_occurrence.iloc[i,:])]
            label_data = [str(label_matrix.iloc[i,z])]
            row_data = attr_data+label_data
            data.append(row_data) 
            i+=1
        # set obj
        obj = {
           'description': u'',
           'relation': 'relation',
           'attributes': attributes,
           'data': data,
        }
        arff_data = arff.dumps(obj)
        w_file = open(savePath+label_matrix.columns[z]+".arff", "w")
        w_file.write(arff_data)
        w_file.close()

def run_eskdb(dataPath, resultFile, k, l, e, i):
    command = "./run_eskdb.sh "+resultFile+" "+k+" "+i+" "+l+" "+e+" "+dataPath
    subprocess.call("cd /Volumes/Samsung_T5/research/programme/research_python/", shell=True)
    print(command)
    return subprocess.call(command, shell=True)

def get_result(resultPath):

    y_pred = pd.DataFrame()
    y_true = pd.DataFrame()
    y_prob = pd.DataFrame()
    names = []
    for file in os.listdir(resultPath):
        with open(os.path.join(resultPath,file), 'r') as f:
            try:
                lines = f.readlines()
            except:
                print(file)
            else:
                names.append(file[:-4])
                pred = []
                true = []
                prob = []
                train_time_total = 0
                test_time_total = 0
                error_marco = 0
                for line in lines:
                    if line.startswith('pred'):
                        pred.append(int(re.search('pred :\t(.)',line).group(1)))
                        true.append(int(re.search('true :\t(.)',line).group(1)))
                        prob.append(float(re.search('prob :\t(.*)',line).group(1)))
                    elif line.startswith('RSME'):
                        rsme = float(re.search('RSME :\t\t(.*)',line).group(1))
                    elif line.startswith('Error'):
                        error = float(re.search('Error :\t\t(.*)',line).group(1))
                    elif line.startswith("Training time"):
                        train_time = float(re.search('Training time :\s{1,}(.*)',line).group(1))
                        train_time_total = train_time_total + train_time
                    elif line.startswith("Testing time"):
                        test_time = float(re.search('Testing time :\s{1,}(.*)',line).group(1))
                        test_time_total = test_time_total + test_time
                    elif line.startswith("["):
                        para = line
                    elif line.startswith("test0Indexes"):
                        index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))

                y_pred = pd.concat([y_pred,pd.DataFrame(pred)],axis=1)
                y_true = pd.concat([y_true,pd.DataFrame(true)],axis=1)
                y_prob = pd.concat([y_prob,pd.DataFrame(prob)],axis=1)
    y_pred.columns = names
    y_true.columns = names
    y_pred.index = index
    y_true.index = index
    print(para)
    print("number of label:", y_pred.shape[1])
    print("training time:",train_time_total)
    print("testing time:",test_time_total)
    return y_pred,y_true,y_prob,index

In [16]:
# read data
data, label = read_data(dataPath)

# get arff files

savePath = "/Users/jiangjunhao/Desktop/test/"+dataset+'/'
if not os.path.exists(savePath):
    os.mkdir(savePath)
get_arff(data,label,savePath)

resultFile = dataset+'_k5_e20_i5000'
k = '5'
i = '5000'
l = '2'
e = '20'

run_eskdb(savePath, resultFile, k, l, e, i)

./run_eskdb.sh enron_k5_e20_i5000 5 5000 2 20 /Users/jiangjunhao/Desktop/test/enron/


0

In [17]:
resultPath = '/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/'+resultFile
y_pred,y_true,y_prob,index = get_result(resultPath)

performance = evaluation(y_pred=y_pred, y_true=y_true, y_prob=y_prob)
    
# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

[-t, /Users/jiangjunhao/Desktop/test/enron//D.D9.arff, -S, ESKDB, -K, 5, -I, 5000, -L, 2, -E, 20, -V, -M]

number of label: 53
training time: 10816.0
testing time: 355.0


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


--- Performance ---
coverage_error = 13.31 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.08
hamming_loss = 0.05
f1_macro = 0.16
f1_micro = 0.56
Jaccard_Index = 0.95
zero_one_error = 0.87

- f1 for each label -
label_A.A8 = 0.68
label_C.C9 = 0.29
label_B.B12 = 0.3
label_C.C11 = 0.68
label_C.C5 = 0.25
label_C.C7 = 0.31
label_B.B2 = 0.0
label_B.B3 = 0.0
label_D.D16 = 0.6
label_A.A7 = 0.0
label_D.D1 = 0.5
label_A.A4 = 0.0
label_C.C2 = 0.54
label_A.A3 = 0.84
label_A.A1 = 0.0
label_D.D9 = 0.82
label_D.D19 = 0.0
label_B.B8 = 0.25
label_D.D12 = 0.0
label_D.D6 = 0.14
label_C.C8 = 0.29
label_A.A6 = 0.15
label_B.B9 = 0.16
label_A.A5 = 0.0
label_C.C10 = 0.0
label_B.B1 = 0.0
label_D.D5 = 0.0
label_B.B11 = 0.0
label_D.D2 = 0.0
label_B.B4 = 0.18
label_D.D15 = 0.61
label_C.C4 = 0.33
label_D.D8 = 0.0
label_B.B6 = 0.06
label_D.D3 = 0.0
label_D.D13 = 0.0
label_D.D7 = 0.0
label_C.C12 = 0.0
label_B.B7 = 0.0
label_C.C6 = 0.0
label_B.B5 = 0.0
label_D.D11 = 0.0
label_A.A2 = 0.0
label_C.C3 = 0.0
label_D.D

# Classifier Chain

## CC using naive Bayes

In [18]:
def naiveBayes_multi_label_training(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    
    order = random.sample(list(range(n_label)),n_label) # get orders
    
    classifier_list = [MultinomialNB() for i in range(n_label)] # create a classifier chain
    
    for i in range(n_label):
        if i == 0:
            classifier_list[i].fit(X_train,y_train.iloc[:, order[i]])
        else:
            X_train = pd.concat([X_train, y_train.iloc[:,order[i-1]]],axis=1) # put the previous label into attribute space
            classifier_list[i].fit(X_train,y_train.iloc[:,order[i]])

    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time, order

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order):
    y_predict = pd.DataFrame(index=X_test.index)
    y_prob = pd.DataFrame(index=X_test.index)
    y_true = pd.DataFrame(index=X_test.index)
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index)],axis=1)

        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index)],axis=1)

        X_test = pd.concat([X_test, pd.DataFrame(y_predict_i,index=X_test.index)],axis=1,ignore_index=True) # put the previous label into attribute space

    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def CC_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=random_state)
    
    # training
    print("--- start training ---\n")
    classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)
    
    # testing
    print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)
    
    # evaluation
    y_test = y_test.iloc[:,order]
    performance = evaluation(y_predict, y_prob, y_test)
    
    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # print orders
    print("\n--- Order of the chain ---")
    print(label.columns[order])
    print("")
    
    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for i in range(n_label):
                print("label_"+label.columns[i],"=",round(value[i],2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))
            
def ECC_test(data, label, dataPath, random_state=3071980, ensemble = 5):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # ensemble
    y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    for i in range(ensemble):
        # training
        #print("--- start training ---\n")
        classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

        # print orders
        print("Order of the chain:",label.columns[order])

        # testing
        #print("--- start testing ---\n")
        y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

        y_predict.columns = label.columns[order]
        y_prob.columns = label.columns[order]
        y_predict = y_predict[label.columns]
        y_prob = y_prob[label.columns]

        y_pred_ensemble = y_pred_ensemble + y_predict
        y_prob_ensemble = y_prob_ensemble + y_prob

    y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
    y_prob_ensemble = y_prob_ensemble / ensemble 
    
    # evaluation
    performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # print orders
    print("\n--- Order of the chain ---")
    print(label.columns[order])
    print("")
    
    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for i in range(n_label):
                print("label_"+label.columns[i],"=",round(value[i],2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))

In [19]:
# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
ECC_test(data, label, dataPath, 3071980, 1)

------ Binary Relevance using Naive Bayes ------


  self.class_log_prior_ = (np.log(self.class_count_) -


Order of the chain: Index(['C.C3', 'D.D16', 'D.D9', 'D.D10', 'B.B10', 'B.B9', 'D.D11', 'C.C1',
       'D.D19', 'B.B7', 'A.A8', 'B.B12', 'D.D4', 'C.C2', 'D.D15', 'D.D12',
       'C.C10', 'C.C4', 'D.D17', 'C.C9', 'D.D14', 'C.C8', 'C.C12', 'B.B13',
       'A.A3', 'D.D1', 'B.B6', 'D.D13', 'A.A6', 'A.A4', 'D.D18', 'A.A2',
       'D.D7', 'A.A5', 'D.D2', 'C.C11', 'B.B8', 'C.C13', 'C.C7', 'B.B3',
       'D.D3', 'B.B11', 'D.D6', 'C.C6', 'B.B4', 'A.A7', 'B.B5', 'D.D8', 'B.B2',
       'D.D5', 'B.B1', 'A.A1', 'C.C5'],
      dtype='object')


  'precision', 'predicted', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 


--- Order of the chain ---
Index(['C.C3', 'D.D16', 'D.D9', 'D.D10', 'B.B10', 'B.B9', 'D.D11', 'C.C1',
       'D.D19', 'B.B7', 'A.A8', 'B.B12', 'D.D4', 'C.C2', 'D.D15', 'D.D12',
       'C.C10', 'C.C4', 'D.D17', 'C.C9', 'D.D14', 'C.C8', 'C.C12', 'B.B13',
       'A.A3', 'D.D1', 'B.B6', 'D.D13', 'A.A6', 'A.A4', 'D.D18', 'A.A2',
       'D.D7', 'A.A5', 'D.D2', 'C.C11', 'B.B8', 'C.C13', 'C.C7', 'B.B3',
       'D.D3', 'B.B11', 'D.D6', 'C.C6', 'B.B4', 'A.A7', 'B.B5', 'D.D8', 'B.B2',
       'D.D5', 'B.B1', 'A.A1', 'C.C5'],
      dtype='object')

--- Performance ---
coverage_error = 19.42 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.15
hamming_loss = 0.12
f1_macro = 0.19
f1_micro = 0.41
Jaccard_Index = 0.88
zero_one_error = 0.99

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.12
label_B.B12 = 0.0
label_C.C11 = 0.11
label_C

In [20]:
# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# split training and test data set
# X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)

X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# training
print("--- start training ---\n")
classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

# testing
print("--- start testing ---\n")
y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

# evaluation
y_test = y_test.iloc[:,order]
performance = evaluation(y_predict, y_prob, y_test)

# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print orders
print("\n--- Order of the chain ---")
print(label.columns[order])
print("")


# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([   5,    7,    8,   15,   16,   17,   21,   23,   24,   31,
            ...
            1683, 1685, 1687, 1688, 1690, 1691, 1693, 1695, 1696, 1699],
           dtype='int64', length=851)
--- start training ---



  self.class_log_prior_ = (np.log(self.class_count_) -


--- start testing ---



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 


--- Order of the chain ---
Index(['D.D7', 'C.C6', 'D.D14', 'D.D2', 'B.B4', 'D.D17', 'B.B13', 'C.C10',
       'C.C4', 'A.A4', 'A.A2', 'D.D10', 'D.D3', 'B.B11', 'D.D6', 'C.C1',
       'A.A7', 'C.C7', 'B.B5', 'B.B6', 'A.A3', 'D.D18', 'D.D8', 'D.D9',
       'D.D19', 'D.D11', 'C.C9', 'B.B9', 'C.C12', 'D.D5', 'B.B10', 'C.C11',
       'D.D12', 'D.D15', 'C.C5', 'B.B2', 'D.D16', 'D.D13', 'B.B3', 'C.C13',
       'A.A6', 'B.B1', 'A.A1', 'A.A8', 'B.B8', 'B.B12', 'C.C8', 'C.C2', 'B.B7',
       'A.A5', 'D.D1', 'C.C3', 'D.D4'],
      dtype='object')

--- Performance ---
coverage_error = 18.58 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.15
hamming_loss = 0.12
f1_macro = 0.18
f1_micro = 0.41
Jaccard_Index = 0.88
zero_one_error = 0.99

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.5
label_B.B12 = 0.0
label_C.C11 = 0.0
label_C.C

## Ensemble CC using naive Bayes (E = 10)

In [21]:
# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
ECC_test(data, label, dataPath, random_state=3071980, ensemble=10)

------ Binary Relevance using Naive Bayes ------


  self.class_log_prior_ = (np.log(self.class_count_) -


Order of the chain: Index(['B.B12', 'D.D14', 'C.C4', 'D.D8', 'A.A2', 'D.D3', 'D.D10', 'B.B5',
       'D.D15', 'A.A6', 'B.B9', 'D.D19', 'D.D18', 'B.B8', 'D.D9', 'A.A5',
       'B.B11', 'C.C8', 'B.B7', 'B.B2', 'D.D11', 'C.C1', 'D.D5', 'B.B4',
       'A.A7', 'C.C10', 'B.B10', 'B.B6', 'C.C3', 'D.D7', 'C.C6', 'D.D1',
       'D.D17', 'C.C9', 'B.B3', 'C.C12', 'C.C13', 'D.D2', 'B.B1', 'A.A4',
       'A.A3', 'D.D12', 'A.A1', 'D.D6', 'D.D13', 'C.C7', 'C.C11', 'C.C2',
       'D.D4', 'D.D16', 'B.B13', 'C.C5', 'A.A8'],
      dtype='object')
Order of the chain: Index(['C.C11', 'D.D1', 'C.C9', 'D.D11', 'A.A6', 'B.B3', 'A.A2', 'D.D5',
       'B.B12', 'B.B2', 'C.C13', 'D.D15', 'D.D7', 'B.B4', 'C.C8', 'B.B7',
       'C.C6', 'A.A8', 'B.B1', 'B.B5', 'B.B13', 'C.C5', 'A.A7', 'A.A4', 'C.C1',
       'C.C2', 'A.A3', 'C.C10', 'D.D3', 'D.D6', 'D.D18', 'D.D17', 'D.D2',
       'C.C3', 'C.C4', 'B.B6', 'B.B8', 'A.A5', 'D.D16', 'D.D8', 'C.C12',
       'D.D14', 'B.B9', 'B.B11', 'D.D12', 'A.A1', 'D.D19', 'C.C7', 'D.D9

  'precision', 'predicted', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 


--- Order of the chain ---
Index(['A.A2', 'D.D3', 'D.D17', 'D.D5', 'C.C7', 'D.D9', 'D.D13', 'B.B6',
       'B.B1', 'D.D18', 'D.D16', 'C.C11', 'B.B10', 'D.D15', 'B.B4', 'D.D2',
       'A.A5', 'A.A6', 'A.A1', 'D.D8', 'D.D10', 'B.B8', 'C.C10', 'D.D4',
       'B.B7', 'B.B12', 'B.B11', 'D.D14', 'D.D19', 'C.C12', 'D.D11', 'A.A4',
       'C.C2', 'B.B3', 'C.C6', 'B.B13', 'D.D7', 'D.D6', 'C.C9', 'C.C4', 'C.C3',
       'C.C13', 'C.C1', 'B.B9', 'D.D1', 'A.A7', 'A.A3', 'B.B2', 'A.A8', 'B.B5',
       'C.C8', 'D.D12', 'C.C5'],
      dtype='object')

--- Performance ---
coverage_error = 18.54 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.14
hamming_loss = 0.12
f1_macro = 0.19
f1_micro = 0.41
Jaccard_Index = 0.88
zero_one_error = 0.99

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.13
label_B.B12 = 0.0
label_C.C11 = 0.11
label_C

In [22]:
# read data
data, label = read_data(dataPath)

ensemble=10

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# ensemble
y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
for i in range(ensemble):
    # training
    #print("--- start training ---\n")
    classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

    # print orders
    print("Order of the chain:",label.columns[order])

    # testing
    #print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

    y_predict.columns = label.columns[order]
    y_prob.columns = label.columns[order]
    y_predict = y_predict[label.columns]
    y_prob = y_prob[label.columns]

    y_pred_ensemble = y_pred_ensemble + y_predict
    y_prob_ensemble = y_prob_ensemble + y_prob

y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
y_prob_ensemble = y_prob_ensemble / ensemble 

# evaluation
performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)

# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print orders
print("\n--- Order of the chain ---")
print(label.columns[order])
print("")

# get confusion matrix
get_confusion_matrix(y_pred_ensemble, y_test, y_test.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([   5,    7,    8,   15,   16,   17,   21,   23,   24,   31,
            ...
            1683, 1685, 1687, 1688, 1690, 1691, 1693, 1695, 1696, 1699],
           dtype='int64', length=851)


  self.class_log_prior_ = (np.log(self.class_count_) -


Order of the chain: Index(['B.B9', 'A.A4', 'C.C12', 'D.D5', 'D.D13', 'D.D8', 'D.D9', 'B.B3',
       'A.A7', 'D.D19', 'A.A3', 'C.C6', 'D.D3', 'B.B7', 'A.A2', 'D.D14',
       'B.B5', 'C.C10', 'D.D12', 'B.B2', 'C.C11', 'D.D17', 'B.B11', 'D.D18',
       'B.B13', 'B.B8', 'A.A1', 'D.D16', 'D.D4', 'B.B12', 'B.B10', 'A.A6',
       'C.C5', 'B.B4', 'C.C9', 'B.B1', 'D.D10', 'C.C1', 'B.B6', 'D.D2', 'D.D1',
       'C.C4', 'D.D11', 'C.C13', 'D.D15', 'D.D6', 'D.D7', 'C.C2', 'C.C3',
       'C.C8', 'A.A5', 'C.C7', 'A.A8'],
      dtype='object')
Order of the chain: Index(['B.B12', 'B.B3', 'C.C6', 'D.D17', 'B.B2', 'C.C12', 'B.B10', 'D.D19',
       'D.D4', 'B.B6', 'B.B9', 'B.B11', 'D.D16', 'C.C2', 'A.A2', 'B.B7',
       'B.B4', 'A.A6', 'C.C8', 'A.A7', 'C.C5', 'D.D9', 'D.D11', 'D.D18',
       'C.C13', 'A.A4', 'D.D10', 'C.C4', 'D.D3', 'D.D15', 'A.A8', 'C.C11',
       'C.C1', 'D.D6', 'D.D14', 'D.D2', 'C.C9', 'D.D12', 'C.C7', 'B.B13',
       'C.C10', 'B.B5', 'A.A3', 'B.B8', 'A.A5', 'C.C3', 'A.A1', 'D.D13',
  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 


--- Order of the chain ---
Index(['D.D9', 'D.D15', 'D.D19', 'B.B9', 'D.D16', 'D.D11', 'D.D18', 'C.C8',
       'D.D7', 'C.C7', 'C.C6', 'C.C9', 'A.A2', 'D.D1', 'C.C4', 'B.B2', 'B.B7',
       'D.D14', 'D.D12', 'A.A8', 'D.D13', 'C.C10', 'B.B6', 'D.D4', 'B.B13',
       'D.D5', 'B.B12', 'D.D8', 'A.A5', 'A.A1', 'C.C12', 'B.B1', 'B.B4',
       'C.C3', 'A.A3', 'C.C5', 'D.D17', 'D.D2', 'C.C13', 'C.C2', 'B.B11',
       'B.B3', 'D.D6', 'B.B8', 'A.A4', 'B.B10', 'A.A6', 'D.D10', 'A.A7',
       'B.B5', 'C.C1', 'D.D3', 'C.C11'],
      dtype='object')

--- Performance ---
coverage_error = 17.41 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.13
hamming_loss = 0.12
f1_macro = 0.18
f1_micro = 0.4
Jaccard_Index = 0.88
zero_one_error = 0.98

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.16
label_B.B12 = 0.0
label_C.C11 = 0.03
label_C.

## Ensemble CC using naive Bayes (E = 50)

In [23]:
# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
ECC_test(data, label, dataPath, random_state=3071980, ensemble=50)

------ Binary Relevance using Naive Bayes ------


  self.class_log_prior_ = (np.log(self.class_count_) -


Order of the chain: Index(['D.D13', 'D.D5', 'D.D17', 'C.C13', 'B.B12', 'C.C11', 'D.D6', 'D.D2',
       'A.A7', 'C.C7', 'B.B5', 'C.C12', 'A.A3', 'C.C8', 'A.A6', 'C.C1', 'D.D8',
       'B.B3', 'B.B6', 'C.C5', 'D.D10', 'C.C2', 'A.A4', 'B.B7', 'D.D1',
       'D.D12', 'C.C4', 'A.A2', 'A.A5', 'B.B8', 'D.D18', 'D.D9', 'B.B1',
       'D.D3', 'A.A8', 'A.A1', 'D.D14', 'D.D11', 'C.C6', 'C.C3', 'B.B10',
       'B.B11', 'D.D15', 'D.D16', 'B.B4', 'B.B13', 'D.D7', 'D.D4', 'D.D19',
       'B.B2', 'B.B9', 'C.C9', 'C.C10'],
      dtype='object')
Order of the chain: Index(['D.D11', 'C.C9', 'D.D13', 'A.A4', 'D.D3', 'D.D16', 'A.A5', 'B.B5',
       'D.D10', 'C.C10', 'C.C8', 'D.D14', 'C.C1', 'D.D12', 'B.B8', 'A.A8',
       'A.A7', 'D.D9', 'B.B10', 'C.C12', 'D.D19', 'B.B12', 'B.B3', 'D.D6',
       'A.A1', 'D.D4', 'B.B4', 'B.B7', 'D.D15', 'C.C11', 'B.B9', 'C.C5',
       'D.D5', 'D.D17', 'D.D1', 'D.D7', 'C.C4', 'C.C2', 'C.C7', 'D.D8', 'A.A6',
       'C.C3', 'B.B11', 'D.D18', 'D.D2', 'B.B1', 'C.C13', 'B.B6', 'B.

Order of the chain: Index(['D.D3', 'B.B5', 'A.A6', 'B.B3', 'C.C1', 'D.D4', 'C.C7', 'B.B11',
       'D.D11', 'D.D12', 'C.C13', 'D.D7', 'C.C8', 'C.C3', 'C.C9', 'A.A2',
       'A.A3', 'C.C2', 'C.C4', 'D.D6', 'D.D9', 'B.B2', 'C.C6', 'B.B1', 'A.A1',
       'D.D17', 'A.A7', 'D.D14', 'C.C10', 'A.A8', 'B.B6', 'B.B4', 'D.D15',
       'D.D1', 'B.B12', 'D.D2', 'D.D13', 'D.D5', 'B.B8', 'B.B7', 'D.D19',
       'A.A4', 'D.D10', 'D.D18', 'A.A5', 'C.C5', 'B.B9', 'B.B10', 'C.C12',
       'D.D16', 'C.C11', 'D.D8', 'B.B13'],
      dtype='object')
Order of the chain: Index(['D.D14', 'D.D15', 'D.D13', 'B.B8', 'A.A1', 'D.D3', 'B.B6', 'D.D17',
       'B.B2', 'C.C5', 'C.C10', 'D.D8', 'D.D1', 'D.D18', 'B.B10', 'C.C12',
       'D.D6', 'C.C7', 'A.A6', 'D.D2', 'B.B5', 'B.B13', 'B.B9', 'C.C3', 'D.D5',
       'C.C1', 'B.B4', 'B.B12', 'C.C13', 'B.B11', 'A.A8', 'C.C4', 'D.D11',
       'D.D4', 'B.B1', 'D.D12', 'C.C11', 'C.C2', 'C.C8', 'A.A7', 'A.A4',
       'D.D10', 'C.C6', 'D.D19', 'A.A3', 'A.A2', 'D.D9', 'B.B7', 'D.

Order of the chain: Index(['A.A4', 'C.C10', 'C.C5', 'B.B9', 'C.C4', 'B.B10', 'D.D19', 'C.C6',
       'C.C12', 'B.B8', 'D.D15', 'D.D4', 'C.C13', 'B.B4', 'A.A3', 'C.C11',
       'D.D6', 'D.D13', 'D.D2', 'A.A5', 'C.C8', 'A.A1', 'A.A8', 'D.D10',
       'D.D14', 'C.C3', 'B.B1', 'D.D17', 'D.D1', 'D.D11', 'D.D3', 'B.B6',
       'B.B3', 'B.B7', 'A.A7', 'D.D16', 'A.A6', 'C.C1', 'C.C9', 'D.D5',
       'B.B12', 'B.B13', 'B.B2', 'D.D12', 'B.B5', 'D.D18', 'C.C7', 'A.A2',
       'D.D8', 'C.C2', 'D.D7', 'B.B11', 'D.D9'],
      dtype='object')
Order of the chain: Index(['D.D13', 'B.B2', 'D.D6', 'D.D17', 'D.D3', 'C.C1', 'C.C9', 'B.B12',
       'A.A6', 'A.A2', 'A.A5', 'D.D8', 'A.A1', 'B.B3', 'B.B13', 'A.A7', 'C.C6',
       'A.A8', 'D.D7', 'D.D1', 'D.D10', 'B.B1', 'A.A4', 'C.C13', 'B.B7',
       'C.C7', 'C.C12', 'B.B8', 'B.B4', 'D.D12', 'C.C3', 'D.D4', 'D.D2',
       'D.D18', 'D.D11', 'C.C2', 'B.B10', 'D.D15', 'B.B11', 'C.C8', 'D.D5',
       'C.C5', 'B.B6', 'D.D9', 'D.D19', 'B.B5', 'C.C11', 'C.C10', 'D.D

Order of the chain: Index(['C.C12', 'D.D19', 'D.D13', 'B.B4', 'A.A1', 'D.D5', 'A.A7', 'D.D14',
       'B.B8', 'D.D18', 'C.C5', 'B.B12', 'B.B3', 'A.A2', 'B.B9', 'B.B5',
       'D.D2', 'C.C9', 'D.D12', 'D.D15', 'D.D8', 'B.B13', 'B.B10', 'D.D6',
       'C.C2', 'A.A6', 'D.D11', 'B.B7', 'B.B2', 'D.D16', 'C.C4', 'A.A8',
       'D.D17', 'C.C8', 'B.B1', 'C.C10', 'C.C11', 'C.C6', 'C.C7', 'D.D7',
       'A.A3', 'C.C1', 'D.D10', 'B.B11', 'A.A5', 'C.C3', 'C.C13', 'A.A4',
       'D.D3', 'B.B6', 'D.D1', 'D.D9', 'D.D4'],
      dtype='object')
Order of the chain: Index(['C.C4', 'D.D7', 'B.B13', 'A.A3', 'D.D17', 'C.C1', 'B.B4', 'C.C8',
       'A.A7', 'A.A4', 'C.C5', 'B.B9', 'D.D16', 'D.D9', 'C.C2', 'D.D5',
       'D.D11', 'D.D19', 'B.B1', 'D.D2', 'C.C7', 'B.B2', 'C.C3', 'C.C10',
       'B.B10', 'A.A2', 'C.C12', 'B.B11', 'D.D13', 'B.B7', 'B.B12', 'C.C13',
       'A.A6', 'D.D10', 'B.B3', 'D.D12', 'D.D18', 'D.D8', 'A.A8', 'C.C9',
       'D.D3', 'D.D6', 'D.D4', 'C.C11', 'B.B5', 'B.B8', 'D.D14', 'D.D15',
  

  'precision', 'predicted', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 


--- Order of the chain ---
Index(['C.C4', 'D.D7', 'B.B13', 'A.A3', 'D.D17', 'C.C1', 'B.B4', 'C.C8',
       'A.A7', 'A.A4', 'C.C5', 'B.B9', 'D.D16', 'D.D9', 'C.C2', 'D.D5',
       'D.D11', 'D.D19', 'B.B1', 'D.D2', 'C.C7', 'B.B2', 'C.C3', 'C.C10',
       'B.B10', 'A.A2', 'C.C12', 'B.B11', 'D.D13', 'B.B7', 'B.B12', 'C.C13',
       'A.A6', 'D.D10', 'B.B3', 'D.D12', 'D.D18', 'D.D8', 'A.A8', 'C.C9',
       'D.D3', 'D.D6', 'D.D4', 'C.C11', 'B.B5', 'B.B8', 'D.D14', 'D.D15',
       'C.C6', 'D.D1', 'A.A5', 'A.A1', 'B.B6'],
      dtype='object')

--- Performance ---
coverage_error = 17.32 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.13
hamming_loss = 0.12
f1_macro = 0.19
f1_micro = 0.41
Jaccard_Index = 0.88
zero_one_error = 0.99

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.13
label_B.B12 = 0.0
label_C.C11 = 0.11
label_C

In [24]:
# read data
data, label = read_data(dataPath)

ensemble=50

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# ensemble
y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
for i in range(ensemble):
    # training
    #print("--- start training ---\n")
    classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

    # print orders
    print("Order of the chain:",label.columns[order])

    # testing
    #print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

    y_predict.columns = label.columns[order]
    y_prob.columns = label.columns[order]
    y_predict = y_predict[label.columns]
    y_prob = y_prob[label.columns]

    y_pred_ensemble = y_pred_ensemble + y_predict
    y_prob_ensemble = y_prob_ensemble + y_prob

y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
y_prob_ensemble = y_prob_ensemble / ensemble 

# evaluation
performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)


# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print orders
print("\n--- Order of the chain ---")
print(label.columns[order])
print("")

# get confusion matrix
get_confusion_matrix(y_pred_ensemble, y_test, y_test.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([   5,    7,    8,   15,   16,   17,   21,   23,   24,   31,
            ...
            1683, 1685, 1687, 1688, 1690, 1691, 1693, 1695, 1696, 1699],
           dtype='int64', length=851)


  self.class_log_prior_ = (np.log(self.class_count_) -


Order of the chain: Index(['D.D4', 'A.A4', 'D.D14', 'D.D19', 'B.B2', 'B.B4', 'B.B12', 'D.D9',
       'D.D11', 'C.C8', 'B.B13', 'B.B9', 'A.A8', 'D.D15', 'B.B8', 'B.B11',
       'B.B5', 'D.D17', 'B.B6', 'B.B7', 'D.D18', 'C.C1', 'A.A5', 'D.D10',
       'C.C12', 'C.C2', 'C.C11', 'A.A7', 'C.C10', 'A.A2', 'C.C7', 'D.D12',
       'B.B1', 'D.D6', 'D.D2', 'D.D8', 'C.C5', 'C.C13', 'C.C9', 'B.B3', 'D.D7',
       'D.D1', 'D.D3', 'A.A3', 'B.B10', 'D.D5', 'D.D13', 'A.A1', 'A.A6',
       'D.D16', 'C.C4', 'C.C3', 'C.C6'],
      dtype='object')
Order of the chain: Index(['D.D9', 'B.B10', 'D.D10', 'D.D19', 'B.B5', 'A.A6', 'A.A1', 'B.B4',
       'B.B6', 'C.C9', 'D.D7', 'C.C4', 'B.B2', 'C.C10', 'D.D14', 'C.C7',
       'D.D11', 'D.D16', 'D.D18', 'B.B7', 'D.D15', 'C.C5', 'D.D13', 'D.D1',
       'C.C6', 'A.A8', 'C.C1', 'D.D8', 'D.D17', 'A.A2', 'D.D4', 'C.C13',
       'B.B8', 'B.B1', 'B.B3', 'B.B12', 'A.A3', 'A.A5', 'B.B9', 'B.B13',
       'D.D2', 'C.C3', 'D.D5', 'D.D12', 'D.D6', 'C.C8', 'B.B11', 'D.D3',
    

Order of the chain: Index(['A.A4', 'C.C3', 'C.C1', 'D.D12', 'C.C6', 'D.D15', 'D.D9', 'D.D2',
       'A.A1', 'B.B2', 'D.D19', 'D.D7', 'C.C8', 'D.D10', 'C.C9', 'D.D11',
       'B.B12', 'B.B5', 'B.B8', 'C.C4', 'D.D4', 'C.C5', 'C.C10', 'B.B3',
       'C.C13', 'C.C2', 'D.D3', 'B.B13', 'A.A3', 'D.D14', 'D.D13', 'C.C12',
       'D.D6', 'B.B7', 'D.D18', 'D.D16', 'C.C11', 'D.D5', 'B.B4', 'A.A5',
       'D.D1', 'B.B6', 'B.B1', 'A.A6', 'B.B11', 'D.D8', 'B.B9', 'C.C7', 'A.A7',
       'D.D17', 'A.A2', 'B.B10', 'A.A8'],
      dtype='object')
Order of the chain: Index(['C.C10', 'D.D5', 'C.C12', 'C.C2', 'D.D18', 'C.C6', 'B.B10', 'C.C1',
       'D.D13', 'B.B12', 'C.C3', 'D.D11', 'D.D3', 'B.B8', 'B.B9', 'D.D8',
       'D.D7', 'C.C13', 'B.B3', 'B.B6', 'D.D17', 'A.A6', 'D.D10', 'A.A5',
       'B.B13', 'D.D4', 'C.C4', 'B.B2', 'B.B11', 'A.A3', 'C.C9', 'D.D2',
       'B.B4', 'D.D19', 'A.A1', 'C.C11', 'D.D12', 'B.B7', 'C.C8', 'D.D1',
       'D.D9', 'A.A4', 'C.C5', 'B.B5', 'B.B1', 'D.D6', 'A.A2', 'D.D16',
    

Order of the chain: Index(['A.A3', 'A.A7', 'D.D7', 'B.B5', 'D.D12', 'D.D3', 'B.B6', 'B.B13',
       'C.C4', 'D.D1', 'D.D19', 'C.C5', 'D.D14', 'D.D18', 'D.D13', 'D.D5',
       'D.D2', 'A.A1', 'B.B10', 'C.C3', 'D.D15', 'C.C10', 'D.D11', 'A.A6',
       'D.D4', 'D.D9', 'C.C9', 'A.A8', 'C.C8', 'D.D6', 'D.D17', 'C.C12',
       'C.C2', 'A.A2', 'B.B3', 'B.B1', 'C.C11', 'B.B2', 'C.C6', 'B.B8',
       'D.D10', 'B.B12', 'B.B4', 'D.D8', 'D.D16', 'B.B7', 'B.B9', 'C.C1',
       'C.C7', 'A.A4', 'C.C13', 'B.B11', 'A.A5'],
      dtype='object')
Order of the chain: Index(['C.C10', 'C.C5', 'C.C2', 'B.B11', 'D.D7', 'D.D19', 'A.A3', 'A.A4',
       'C.C8', 'D.D10', 'B.B13', 'B.B12', 'D.D5', 'B.B4', 'B.B1', 'C.C4',
       'C.C11', 'D.D3', 'B.B6', 'B.B8', 'A.A2', 'B.B10', 'D.D9', 'C.C3',
       'C.C12', 'D.D14', 'C.C1', 'D.D4', 'A.A7', 'D.D6', 'D.D8', 'D.D1',
       'C.C13', 'A.A1', 'C.C6', 'A.A6', 'D.D2', 'B.B2', 'B.B7', 'D.D18',
       'A.A5', 'B.B3', 'D.D12', 'C.C9', 'B.B9', 'D.D17', 'D.D13', 'D.D11',
    

Order of the chain: Index(['C.C12', 'B.B4', 'C.C1', 'D.D13', 'D.D9', 'B.B5', 'D.D1', 'D.D4',
       'C.C3', 'B.B1', 'B.B6', 'A.A7', 'C.C6', 'B.B9', 'A.A2', 'D.D16',
       'B.B13', 'C.C8', 'C.C13', 'C.C11', 'D.D10', 'D.D7', 'C.C10', 'C.C7',
       'C.C4', 'C.C5', 'B.B2', 'B.B3', 'D.D11', 'A.A4', 'D.D2', 'D.D18',
       'A.A1', 'A.A5', 'D.D14', 'B.B7', 'C.C2', 'D.D17', 'D.D3', 'B.B10',
       'B.B8', 'D.D15', 'D.D6', 'C.C9', 'D.D12', 'A.A8', 'B.B11', 'A.A6',
       'B.B12', 'D.D19', 'A.A3', 'D.D8', 'D.D5'],
      dtype='object')
Order of the chain: Index(['D.D15', 'D.D9', 'B.B2', 'D.D14', 'C.C5', 'D.D16', 'B.B4', 'C.C1',
       'B.B12', 'A.A3', 'B.B10', 'A.A5', 'D.D4', 'B.B5', 'D.D12', 'B.B9',
       'C.C2', 'C.C9', 'C.C6', 'D.D17', 'A.A7', 'D.D13', 'D.D1', 'B.B8',
       'B.B11', 'C.C3', 'A.A2', 'C.C10', 'D.D18', 'C.C7', 'B.B3', 'B.B13',
       'D.D10', 'A.A1', 'A.A8', 'B.B1', 'D.D19', 'B.B7', 'D.D7', 'D.D6',
       'C.C11', 'C.C12', 'D.D8', 'A.A4', 'D.D3', 'A.A6', 'D.D11', 'D.D5',
   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/enron/
number of label: 53
number of attribute: 1001
number of instance: 1702 


--- Order of the chain ---
Index(['D.D15', 'D.D9', 'B.B2', 'D.D14', 'C.C5', 'D.D16', 'B.B4', 'C.C1',
       'B.B12', 'A.A3', 'B.B10', 'A.A5', 'D.D4', 'B.B5', 'D.D12', 'B.B9',
       'C.C2', 'C.C9', 'C.C6', 'D.D17', 'A.A7', 'D.D13', 'D.D1', 'B.B8',
       'B.B11', 'C.C3', 'A.A2', 'C.C10', 'D.D18', 'C.C7', 'B.B3', 'B.B13',
       'D.D10', 'A.A1', 'A.A8', 'B.B1', 'D.D19', 'B.B7', 'D.D7', 'D.D6',
       'C.C11', 'C.C12', 'D.D8', 'A.A4', 'D.D3', 'A.A6', 'D.D11', 'D.D5',
       'B.B6', 'C.C4', 'C.C13', 'C.C8', 'D.D2'],
      dtype='object')

--- Performance ---
coverage_error = 17.25 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.13
hamming_loss = 0.12
f1_macro = 0.18
f1_micro = 0.41
Jaccard_Index = 0.88
zero_one_error = 0.98

- f1 for each label -
label_A.A8 = 0.0
label_C.C9 = 0.18
label_B.B12 = 0.0
label_C.C11 = 0.03
label_C

## CC using ESKDB


In [5]:
# prepare data
def csv_to_arff(X, label_i, savePath):
    # get attributes
    attributes=[(X.columns[i],['0', '1']) for i in range(len(X.columns))]
    attributes.append(('label_'+label_i.name,['0', '1']))

    data=[]
    i = 0
    while i < len(label_i):
        attr_data = [str(j) for j in list(X.iloc[i,:])]
        label_data = [str(label_i[i])]
        row_data = attr_data+label_data
        data.append(row_data) 
        i+=1
    # set obj
    obj = {
       'description': u'',
       'relation': 'relation',
       'attributes': attributes,
       'data': data,
    }
    arff_data = arff.dumps(obj)
    w_file = open(savePath+label_i.name+".arff", "w")
    w_file.write(arff_data)
    w_file.close()

def get_arff(X, label, savePath):
    
    n_label = label.shape[1]
    # get orders
    order = random.sample(list(range(n_label)),n_label) # get orders
    
    #  get all arff files, one for each label
    for i in range(n_label):
        label_i = label.iloc[:,order[i]]
        print("--Running label:",label_i.name)
        csv_to_arff(X, label_i, savePath)
        
        label_i.name = 'label_' + label_i.name
        X = pd.concat([X, label_i], axis=1)
    print("--finished getting arff files")
    return order

def run_eskdb(label_arff, resultFile, k, l, e, i):
    command = "./run_ECC.sh "+resultFile+" "+k+" "+i+" "+l+" "+e+" "+label_arff
    subprocess.call("cd /Volumes/Samsung_T5/research/programme/research_python/", shell=True)
    #print(command)
    return subprocess.call(command, shell=True)

def get_result(resultPath):

    y_pred = pd.DataFrame()
    y_true = pd.DataFrame()
    y_prob = pd.DataFrame()
    names = []
    for file in os.listdir(resultPath):
        with open(os.path.join(resultPath,file), 'r') as f:
            try:
                lines = f.readlines()
            except:
                print(file)
            else:
                names.append(file[:-4])
                pred = []
                true = []
                prob = []
                train_time_total = 0
                test_time_total = 0
                error_marco = 0
                for line in lines:
                    if line.startswith('pred'):
                        pred.append(int(re.search('pred :\t(.)',line).group(1)))
                        true.append(int(re.search('true :\t(.)',line).group(1)))
                        prob.append(float(re.search('prob :\t(.*)',line).group(1)))
                    elif line.startswith('RSME'):
                        rsme = float(re.search('RSME :\t\t(.*)',line).group(1))
                    elif line.startswith('Error'):
                        error = float(re.search('Error :\t\t(.*)',line).group(1))
                    elif line.startswith("Training time"):
                        train_time = float(re.search('Training time :\s{1,}(.*)',line).group(1))
                        train_time_total = train_time_total + train_time
                    elif line.startswith("Testing time"):
                        test_time = float(re.search('Testing time :\s{1,}(.*)',line).group(1))
                        test_time_total = test_time_total + test_time
                    elif line.startswith("["):
                        para = line
                    elif line.startswith("test0Indexes"):
                        index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))

                y_pred = pd.concat([y_pred,pd.DataFrame(pred)],axis=1)
                y_true = pd.concat([y_true,pd.DataFrame(true)],axis=1)
                y_prob = pd.concat([y_prob,pd.DataFrame(prob)],axis=1)
    y_pred.columns = names
    y_true.columns = names
    y_prob.columns = names
    print(para)
    print("number of label:", y_pred.shape[1])
    print("training time:",train_time_total)
    print("testing time:",test_time_total)
    return y_pred,y_true,y_prob,index

In [26]:
# read data
data, label = read_data(dataPath)
n_label = label.shape[1]

resultFile = dataset+'_k5_e20_i5000_CC'

# train and test on the first label
savePath = "/Users/jiangjunhao/Desktop/test/"+dataset+'/'

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# get orders
order = random.sample(list(range(n_label)),n_label) # get orders


## prepare data, get the arff file
for i in range(n_label):
    label_i = label.iloc[:,order[i]]
    #print("--Running label:",label_i.name)
    csv_to_arff(data, label_i, savePath)

    # run eskdb
    label_arff = os.path.join(savePath,label_i.name+'.arff')
    k = '5'
    i = '5000'
    l = '2'
    e = '20'

    run_eskdb(label_arff, resultFile, k, l, e, i)

    result = os.path.join("/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result",resultFile, label_i.name+'.txt')
    with open(result, 'r') as f:
        try:
            lines = f.readlines()
        except:
            print(file)
        else:
            pred = []
            for line in lines:
                if line.startswith('pred'):
                    pred.append(int(re.search('pred :\t(.)',line).group(1)))
                elif line.startswith("test0Indexes"):
                    index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))

            label.loc[index,label_i.name] = pred
            temp = label.loc[:,label_i.name]
            temp.name = 'label_'+label_i.name
            data = pd.concat([data, label.loc[:,label_i.name]],axis=1)

# get result
resultPath = '/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/'+resultFile+'/'
y_pred,y_true,y_prob,index = get_result(resultPath)
performance = evaluation(y_pred=y_pred, y_true=y_true, y_prob=y_prob)

# print orders:
print(label.columns[order])

# get confusion matrix
print("\n--- Confusion matrix ---")
get_confusion_matrix(y_pred, y_true, y_pred.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

[-t, /Users/jiangjunhao/Desktop/test/enron/A.A1.arff, -S, ESKDB, -K, 5, -I, 5000, -L, 2, -E, 20, -V, -M]

number of label: 53
training time: 15381.0
testing time: 8909.0


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Index(['A.A4', 'D.D8', 'B.B12', 'B.B5', 'B.B1', 'B.B3', 'B.B6', 'C.C6', 'C.C2',
       'B.B11', 'D.D4', 'D.D19', 'D.D16', 'C.C9', 'C.C3', 'A.A8', 'B.B9',
       'D.D15', 'D.D12', 'B.B13', 'D.D13', 'C.C7', 'C.C11', 'B.B4', 'D.D14',
       'D.D9', 'D.D1', 'D.D11', 'A.A3', 'A.A2', 'A.A5', 'C.C1', 'D.D7',
       'B.B10', 'D.D2', 'C.C4', 'D.D18', 'C.C13', 'C.C12', 'D.D3', 'A.A6',
       'B.B8', 'B.B7', 'B.B2', 'D.D10', 'C.C8', 'A.A7', 'C.C5', 'D.D6',
       'D.D17', 'D.D5', 'C.C10', 'A.A1'],
      dtype='object')

--- Confusion matrix ---
--- Performance ---
coverage_error = 13.62 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.09
hamming_loss = 0.05
f1_macro = 0.16
f1_micro = 0.56
Jaccard_Index = 0.95
zero_one_error = 0.87

- f1 for each label -
label_A.A8 = 0.67
label_C.C9 = 0.0
label_B.B12 = 0.0
label_C.C11 = 0.0
label_C.C5 = 0.62
label_C.C7 = 0.0
label_B.B2 = 0.25
label_B.B3 = 0.6
label_D.D16 = 0.0
label_A.A7 = 0.5
label_D.D1 = 0.0
label_A.A4 = 0.0
label_C.C2 = 0.0
label_A.A3 = 0.11


## Ensemble CC using ESKDB(E=10)

In [15]:
# read data
data, label = read_data(dataPath)
n_label = label.shape[1]

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# ensemble
ensemble = 10

y_pred_ensemble = pd.DataFrame(np.zeros((int(label.shape[0]/2),label.shape[1])),columns=label.columns)
y_prob_ensemble = pd.DataFrame(np.zeros((int(label.shape[0]/2),label.shape[1])),columns=label.columns)
    
resultFile = dataset+'_k5_e20_i5000_ECC'   
for i in range(ensemble):

    # read data
    data, label = read_data(dataPath)
    n_label = label.shape[1]

    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()

    # get orders
    order = random.sample(list(range(n_label)),n_label) # get orders

    # train and test on the first label
    savePath = "/Users/jiangjunhao/Desktop/test/"+dataset+'/'

    ## prepare data, get the arff file
    for i in range(n_label):
        label_i = label.iloc[:,order[i]]
        print("--Running label:",label_i.name)
        csv_to_arff(data, label_i, savePath)

        # run eskdb
        label_arff = os.path.join(savePath,label_i.name+'.arff')
        k = '5'
        i = '5000'
        l = '2'
        e = '20'

        run_eskdb(label_arff, resultFile, k, l, e, i)

        result = os.path.join("/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result",resultFile, label_i.name+'.txt')
        with open(result, 'r') as f:
            try:
                lines = f.readlines()
            except:
                print(file)
            else:
                pred = []
                for line in lines:
                    if line.startswith('pred'):
                        if re.search('pred :\t(.)',line).group(1) == '-':
                            pred.append(0)
                        else:
                            pred.append(int(re.search('pred :\t(.)',line).group(1)))
                    elif line.startswith("test0Indexes"):
                        index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))

                label.loc[index,label_i.name] = pred
                temp = label.loc[:,label_i.name]
                temp.name = 'label_'+label_i.name
                data = pd.concat([data, label.loc[:,label_i.name]],axis=1)

    # get result
    resultPath = '/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/'+resultFile+'/'
    y_pred,y_true,y_prob,index = get_result(resultPath)
    y_pred.columns = label.columns
    y_prob.columns = label.columns
    
    y_pred_ensemble = y_pred_ensemble + y_pred
    y_prob_ensemble = y_prob_ensemble + y_prob
    
    
y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
y_prob_ensemble = y_prob_ensemble / ensemble 

--Running label: B.B8
--Running label: D.D11
--Running label: B.B1
--Running label: B.B12
--Running label: D.D14
--Running label: D.D3
--Running label: A.A7
--Running label: D.D18
--Running label: C.C4
--Running label: B.B9
--Running label: B.B4
--Running label: D.D8
--Running label: D.D5
--Running label: A.A6
--Running label: B.B11
--Running label: C.C11
--Running label: C.C1
--Running label: D.D19
--Running label: B.B6
--Running label: C.C2
--Running label: D.D17
--Running label: C.C3
--Running label: C.C7
--Running label: D.D15
--Running label: D.D10
--Running label: C.C9
--Running label: C.C12
--Running label: D.D13
--Running label: D.D16
--Running label: D.D12
--Running label: C.C6
--Running label: A.A3
--Running label: D.D2
--Running label: A.A1
--Running label: B.B13
--Running label: B.B2
--Running label: A.A8
--Running label: B.B7
--Running label: A.A5
--Running label: C.C13
--Running label: C.C5
--Running label: D.D6
--Running label: B.B3
--Running label: D.D9
--Running label:

--Running label: C.C9
--Running label: B.B8
--Running label: A.A8
--Running label: D.D1
--Running label: B.B5
--Running label: C.C4
--Running label: C.C10
--Running label: D.D6
--Running label: D.D7
--Running label: A.A2
--Running label: B.B4
--Running label: D.D16
--Running label: A.A5
--Running label: A.A6
--Running label: D.D8
--Running label: D.D18
--Running label: D.D5
--Running label: D.D11
--Running label: B.B9
--Running label: C.C6
--Running label: B.B12
--Running label: C.C8
--Running label: D.D9
--Running label: D.D3
--Running label: D.D12
--Running label: B.B3
--Running label: B.B13
--Running label: C.C11
--Running label: D.D4
--Running label: C.C12
--Running label: C.C3
--Running label: D.D2
--Running label: C.C2
--Running label: B.B6
--Running label: D.D15
--Running label: C.C1
--Running label: A.A4
--Running label: A.A7
--Running label: C.C13
--Running label: D.D17
--Running label: D.D10
--Running label: B.B2
--Running label: C.C7
--Running label: B.B10
--Running label: D

In [16]:
performance = evaluation(y_pred=y_pred_ensemble, y_true=y_true, y_prob=y_prob_ensemble)

# print orders:
print(label.columns[order])

# get confusion matrix
print("\n--- Confusion matrix ---")
get_confusion_matrix(y_pred, y_true, y_pred.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Index(['C.C3', 'A.A3', 'D.D8', 'A.A8', 'D.D1', 'A.A1', 'A.A6', 'C.C2', 'C.C13',
       'B.B7', 'D.D3', 'B.B5', 'B.B8', 'D.D6', 'D.D7', 'B.B11', 'B.B6', 'C.C1',
       'B.B10', 'D.D16', 'C.C12', 'D.D2', 'D.D17', 'B.B12', 'C.C6', 'B.B9',
       'D.D11', 'D.D12', 'B.B13', 'B.B1', 'C.C9', 'B.B3', 'A.A2', 'C.C10',
       'D.D4', 'C.C7', 'D.D9', 'D.D18', 'A.A7', 'C.C8', 'B.B4', 'C.C5', 'C.C4',
       'A.A5', 'D.D15', 'D.D14', 'B.B2', 'D.D10', 'D.D19', 'A.A4', 'D.D5',
       'D.D13', 'C.C11'],
      dtype='object')

--- Confusion matrix ---
--- Performance ---
coverage_error = 13.16 ( avg_label_per_instance = 3.38 )
ranking_loss = 0.08
hamming_loss = 0.05
f1_macro = 0.16
f1_micro = 0.56
Jaccard_Index = 0.95
zero_one_error = 0.87

- f1 for each label -
label_A.A8 = 0.14
label_C.C9 = 0.0
label_B.B12 = 0.0
label_C.C11 = 0.0
label_C.C5 = 0.0
label_C.C7 = 0.0
label_B.B2 = 0.0
label_B.B3 = 0.32
label_D.D16 = 0.0
label_A.A7 = 0.0
label_D.D1 = 0.0
label_A.A4 = 0.14
label_C.C2 = 0.2
label_A.A3 = 0.0
l

# Result

`
**Performance of BR_naive Bayes**
coverage_error = 13.6 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.13
hamming_loss = 0.1
f1_macro = 0.26
f1_micro = 0.4
Jaccard_Index = 0.26
zero_one_error = 0.98
`

`
**Performance of BR_ESKDB**
coverage_error = 14.76 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.15
hamming_loss = 0.08
f1_macro = 0.15
f1_micro = 0.33
jaccard_index = 0.23
zero_one_error = 0.94
`

`
**Performance of Ensemble Classifier Chain using naive Bayes**
coverage_error = 13.63 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.13
hamming_loss = 0.11
f1_macro = 0.26
f1_micro = 0.38
Jaccard_Index = 0.25
zero_one_error = 0.97
`

`
**Performance of Ensemble Classifier Chain using ESKDB(E=2)**
coverage_error = 15.87 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.16
hamming_loss = 0.08
f1_macro = 0.22
f1_micro = 0.38
Jaccard_Index = 0.25
zero_one_error = 0.95
`


recall 体现了分类模型H对正样本的识别能力，recall 越高，说明模型对正样本的识别能力越强.

precision 体现了模型对负样本的区分能力，precision越高，说明模型对负样本的区分能力越强。F1-score 是两者的综合。F1-score 越高，说明分类模型越稳健。