In [1]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re
import random
import arff

from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

In [2]:
# BR for getting error matrix
def naiveBayes_multi_label_training_BR(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    classifier_list = [MultinomialNB() for i in range(n_label)]
    for i in range(n_label):
        classifier_list[i].fit(X_train,y_train.iloc[:,i])
    
    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time

def naiveBayes_multi_label_testing_BR(X_test, n_label, classifier_list):
    y_predict = pd.DataFrame()
    y_prob = pd.DataFrame()
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i)],axis=1)
        
        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i)],axis=1)
        
    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def BR_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # training
    classifier_list, training_time = naiveBayes_multi_label_training_BR(X_train, y_train)
    
    # testing
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing_BR(X_test, n_label, classifier_list)
    
    y_predict.columns = label.columns
    return y_predict, y_test

In [3]:
def read_data(dataPath, X_file, y_file):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,X_file))
    label = pd.read_csv(os.path.join(dataPath,y_file))
    return data,label

def evaluation(y_pred, y_prob, y_true):
    
    coverage = coverage_error(y_true, y_prob)
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = label_ranking_loss(y_true, y_prob) 
    
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')
    
    acc = 0
    for i in range(y_true.shape[0]):
        acc += jaccard_similarity_score(y_true.iloc[i,:], y_pred.iloc[i,:]) # jaccard_similarity_score
    acc = acc / y_true.shape[0]
    
    zero_one = zero_one_loss(y_true, y_pred) # 0-1 error 
    
    performance = {"coverage_error":coverage,
                   "ranking_loss":ranking_loss,
                   "hamming_loss":hamming,
                   "f1_macro":f1_macro,
                   "f1_micro":f1_micro,
                   "Jaccard_Index":acc,
                   "zero_one_error":zero_one}
    return performance
            
def build_BN(labelFile, labelName, savePng):
    cmd = """cd /Volumes/Samsung_T5/research/programme/Chordalysis/ 
    java -Xmx1g -classpath bin:lib/core/commons-math3-3.2.jar:lib/core/jayes.jar:lib/core/jgrapht-jdk1.6.jar:lib/extra/jgraphx.jar:lib/loader/weka.jar demo.Run %s 0.05 %s false
    """ % (labelFile,savePng)

    p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
    out,err = p.communicate()  
    for line in out.splitlines():  
        if line.decode("utf-8").startswith('['):
            graph_set = [i for i in map(lambda x: x.split(','), line.decode("utf-8").replace(' ',',').strip('[[\,]]').split(',]['))]

    dic = {}
    for l in labelName:
        s = set()
        for i in map(lambda x: set(x) if l in x else None, graph_set):
            if i != None:
                s.update(i)
        s.remove(l)       
        dic[l] = s

    return dic

In [4]:
def naiveBayes_multi_label_training(X_train, y_train, bayes_net, root_name):
    
    n_label = y_train.shape[1]
    
    classifier_list = [MultinomialNB() for i in range(n_label)] # create a classifier chain
    
    learned_label = []
    
    i = 0
    inde_node = 0
    
    for node, par in bayes_net.items():
        if par == set():
            l = node
            classifier_list[i].fit(X_train,y_train.loc[:, l])
            i += 1
            learned_label.append(l)
            inde_node += 1
            
    while True:
        if i == inde_node:
            l = root_name
            classifier_list[i].fit(X_train,y_train.loc[:, l])
            i += 1
            learned_label.append(l)
            children = bayes_net[l]
            
        else:
            children_sub = []
            for child in children:
                par = [p for p in bayes_net[child] if p in learned_label]
                X = pd.concat([X_train, y_train.loc[:,par]],axis=1) # put the previous label into attribute space
                classifier_list[i].fit(X,y_train.loc[:,child])
                i += 1
                learned_label.append(child)
                children_sub.extend([p for p in bayes_net[child] if p not in learned_label])
            children = [p for p in set(children_sub) if p not in learned_label]
                
        if i >= n_label:
            break
    
    return classifier_list, learned_label

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list, bayes_net, learned_label):
    y_predict = pd.DataFrame(index=X_test.index)
    y_prob = pd.DataFrame(index=X_test.index)
    y_true = pd.DataFrame(index=X_test.index)
        
    predicted_list = []
    i = 0
    
    inde_node = 0
    for node, par in bayes_net.items():
        if par == set():
            l = learned_label[i]
            y_predict_i = classifier_list[i].predict(X_test)
            y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
            
            y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index,columns=[l])],axis=1)
            y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index,columns=[l])],axis=1)
            
            predicted_list.append(l)
            
            i += 1
            inde_node += 1
            
    while True:
        if i == inde_node:
            l = learned_label[i]
            y_predict_i = classifier_list[i].predict(X_test)
            y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
            
            y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index,columns=[l])],axis=1)
            y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index,columns=[l])],axis=1)
            
            predicted_list.append(l)
            
            i += 1
            
        else:
            l = learned_label[i]
            par = [p for p in bayes_net[l] if p in predicted_list]
            
            if len(par) != 0:
                X = pd.concat([X_test, y_predict.loc[:,par]],axis=1) # put the previous label into attribute space
            else:
                X= X_test
            y_predict_i = classifier_list[i].predict(X)
            y_predict_prob_i = classifier_list[i].predict_proba(X)[:,1]
            
            y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index,columns=[l])],axis=1)
            y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index,columns=[l])],axis=1)
  
            i += 1
            predicted_list.append(l)
        
        if i >= n_label:
            break
            
    return y_predict, y_prob

def BCC_test(data, label, dataPath, bayes_net, random_state=3071980, ensemble = 5, root = None):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # ensemble
    y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    
    node_list = []
    for node, par in bayes_net.items():
        if par != set():
            node_list.append(node)
    
    en = 0
    for i in range(ensemble):
        if root != None:
            root_name = root
        else:
            root_name = label.columns[i]
            if root_name not in node_list:
                print(root_name)
                continue
                
            else:
                # training
                #print("--- start training ---\n")
                classifier_list, learned_label = naiveBayes_multi_label_training(X_train, y_train, bayes_net, root_name)

                # testing
                #print("--- start testing ---\n")
                y_predict, y_prob = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, bayes_net, learned_label)

                y_predict = y_predict[label.columns]
                y_prob = y_prob[label.columns]

                y_pred_ensemble = y_pred_ensemble + y_predict
                y_prob_ensemble = y_prob_ensemble + y_prob

                en += 1
        
    y_pred_ensemble = (((y_pred_ensemble / en) >= 0.5)*1).astype('int')
    y_prob_ensemble = y_prob_ensemble / en 
    y_pred_ensemble = y_pred_ensemble.fillna(0)
    y_prob_ensemble = y_prob_ensemble.fillna(0)
    
    # evaluation
    performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
    performance_df = pd.DataFrame.from_dict(performance, orient='index')
    
    return performance_df

def BCC_test_2_fold(data, label, dataPath, bayes_net, random_state=3071980, ensemble = 5, root = None):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    performance_df_all = pd.DataFrame(np.zeros([7,1]))
    for j in range(2):
        X_train, y_train = X_test, y_test
        
        # ensemble
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)

        node_list = []
        for node, par in bayes_net.items():
            if par != set():
                node_list.append(node)

        en = 0
        for i in range(ensemble):
            if root != None:
                root_name = root
            else:
                root_name = label.columns[i]
                if root_name not in node_list:
                    continue

                else:
                    # training
                    #print("--- start training ---\n")
                    classifier_list, learned_label = naiveBayes_multi_label_training(X_train, y_train, bayes_net, root_name)

                    # testing
                    #print("--- start testing ---\n")
                    y_predict, y_prob = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, bayes_net, learned_label)

                    y_predict = y_predict[label.columns]
                    y_prob = y_prob[label.columns]

                    y_pred_ensemble = y_pred_ensemble + y_predict
                    y_prob_ensemble = y_prob_ensemble + y_prob

                    en += 1

        y_pred_ensemble = (((y_pred_ensemble / en) >= 0.5)*1).astype('int')
        y_prob_ensemble = y_prob_ensemble / en 
        y_pred_ensemble = y_pred_ensemble.fillna(0)
        y_prob_ensemble = y_prob_ensemble.fillna(0)

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
        performance_df = pd.DataFrame.from_dict(performance, orient='index')
        
        performance_df_all.index = performance_df.index
        performance_df_all.columns = performance_df.columns
        
        performance_df_all = performance_df_all + performance_df
        
    performance_df_all = performance_df_all / 2
    return performance_df_all

# BCC

In [5]:
df_all_1 = pd.DataFrame()

data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    print("learn structure")
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    bayes_net = build_BN(labelFile, label.columns, savePng)
    
    print("BCC test")
    df = BCC_test(data, label, dataPath, bayes_net, 3071980, label.shape[1])
    df.columns = [dataset]

    df_all_1 = pd.concat([df_all_1, df],axis=1)

df_all_1.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/BayesianClassifierChain_naive_bayes.csv")
df_all_1

yeast
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
emotions
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
scene
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
enron
learn structure
BCC test
ensemble: 0


  self.class_log_prior_ = (np.log(self.class_count_) -


ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
D.D16
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
D.D15
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44
ensemble: 45
D.D18
ensemble: 46
ensemble: 47
ensemble: 48
ensemble: 49
ensemble: 50
ensemble: 51
C.C13
ensemble: 52


  'precision', 'predicted', average, warn_for)


genbase
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
PDOC00014
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
PDOC00660
ensemble: 25
PDOC00653
ensemble: 26


  'recall', 'true', average, warn_for)


medical
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
Class-2-786_09
ensemble: 3
ensemble: 4
ensemble: 5
Class-5-786_2
ensemble: 6
Class-6-V72_5
ensemble: 7
Class-7-511_9
ensemble: 8
Class-8-596_8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
Class-14-789_00
ensemble: 15
ensemble: 16
Class-16-462
ensemble: 17
Class-17-592_0
ensemble: 18
Class-18-786_59
ensemble: 19
Class-19-785_6
ensemble: 20
Class-20-V67_09
ensemble: 21
Class-21-795_5
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
Class-26-V42_0
ensemble: 27
ensemble: 28
ensemble: 29
Class-29-783_0
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
Class-42-599_7
ensemble: 43
ensemble: 44


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical
coverage_error,8.094293,2.851852,1.510797,17.144536,1.462236,3.803681
ranking_loss,0.214029,0.174888,0.082879,0.131462,0.004686,0.046311
hamming_loss,0.243885,0.230079,0.204734,0.114449,0.006042,0.025721
f1_macro,0.383105,0.620216,0.631126,0.188882,0.476872,0.157944
f1_micro,0.583955,0.657763,0.610687,0.417118,0.928947,0.536066
Jaccard_Index,0.756115,0.769921,0.795266,0.885551,0.993958,0.974279
zero_one_error,0.873449,0.713805,0.793189,0.989424,0.129909,0.664622


In [6]:
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    print("learn structure")
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    bayes_net = build_BN(labelFile, label.columns, savePng)
    
    print("BCC test")
    df = BCC_test(data, label, dataPath, bayes_net, 3071980, label.shape[1])
    df.columns = [dataset]

    df_all_1 = pd.concat([df_all_1, df],axis=1)

    
df_all_1.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/BayesianClassifierChain_naive_bayes.csv")
df_all_1

tmc2007
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
rcv1subset1
learn structure
BCC test
ensemble: 0


  self.class_log_prior_ = (np.log(self.class_count_) -


ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44
ensemble: 45
ensemble: 46
ensemble: 47
ensemble: 48
ensemble: 49
ensemble: 50
ensemble: 51
ensemble: 52
ensemble: 53
ensemble: 54
ensemble: 55
ensemble: 56
ensemble: 57
ensemble: 58
ensemble: 59
ensemble: 60
ensemble: 61
ensemble: 62
ensemble: 63
ensemble: 64
ensemble: 65
ensemble: 66
ensemble: 67
ensemble: 68
ensemble: 69
ensemble: 70
ensemble: 71
ensemble: 72
ensemble: 73
ensemble: 74
ensemble: 75
ensemble: 76
ensemble: 77
ensemble

  'precision', 'predicted', average, warn_for)


rcv1subset2
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44
ensemble: 45
ensemble: 46
ensemble: 47
ensemble: 48
ensemble: 49
ensemble: 50
ensemble: 51
ensemble: 52
ensemble: 53
ensemble: 54
ensemble: 55
ensemble: 56
ensemble: 57
ensemble: 58
ensemble: 59
ensemble: 60
ensemble: 61
ensemble: 62
ensemble: 63
ensemble: 64
ensemble: 65
ensemble: 66
ensemble: 67
ensemble: 68
ensemble: 69
ensemble: 70
ensemble: 71
ensemble: 72
ensemble: 73
ensemble: 7

  'recall', 'true', average, warn_for)


rcv1subset3
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44
ensemble: 45
ensemble: 46
ensemble: 47
ensemble: 48
ensemble: 49
ensemble: 50
ensemble: 51
ensemble: 52
ensemble: 53
ensemble: 54
ensemble: 55
ensemble: 56
ensemble: 57
ensemble: 58
ensemble: 59
ensemble: 60
ensemble: 61
ensemble: 62
ensemble: 63
ensemble: 64
ensemble: 65
ensemble: 66
ensemble: 67
ensemble: 68
ensemble: 69
ensemble: 70
ensemble: 71
ensemble: 72
ensemble: 73
ensemble: 7

Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,8.094293,2.851852,1.510797,17.144536,1.462236,3.803681,4.154008,13.701,13.546333,13.694333,12.373667,14.248
ranking_loss,0.214029,0.174888,0.082879,0.131462,0.004686,0.046311,0.060175,0.054673,0.055343,0.056344,0.051297,0.058726
hamming_loss,0.243885,0.230079,0.204734,0.114449,0.006042,0.025721,0.115913,0.032967,0.035244,0.03603,0.033033,0.038789
f1_macro,0.383105,0.620216,0.631126,0.188882,0.476872,0.157944,0.478522,0.231094,0.210592,0.191668,0.175994,0.188055
f1_micro,0.583955,0.657763,0.610687,0.417118,0.928947,0.536066,0.583241,0.438599,0.389876,0.383185,0.377821,0.366381
Jaccard_Index,0.756115,0.769921,0.795266,0.885551,0.993958,0.974279,0.884087,0.967033,0.964756,0.96397,0.966967,0.961211
zero_one_error,0.873449,0.713805,0.793189,0.989424,0.129909,0.664622,0.88334,0.961333,0.894667,0.894667,0.815,0.912


## 5 times 2-fold

In [5]:
df_all_1_twofold= pd.DataFrame()

seed = [1234,2234,12345,12346,1234567]

data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        labelFile = dataPath+"y.csv"
        savePng = dataPath+"/bayes_net.png"
        bayes_net = build_BN(labelFile, label.columns, savePng)
        df = BCC_test_2_fold(data, label, dataPath, bayes_net, s, label.shape[1])
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    df_all_1_twofold = pd.concat([df_all_1_twofold, d/5],axis=1)

data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        labelFile = dataPath+"y.csv"
        savePng = dataPath+"/bayes_net.png"
        bayes_net = build_BN(labelFile, label.columns, savePng)
        df = BCC_test_2_fold(data, label, dataPath, bayes_net, s, label.shape[1])
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    df_all_1_twofold = pd.concat([df_all_1_twofold, d/5],axis=1)
    
df_all_1_twofold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BayesianClassifierChain_naive_bayes.csv")
df_all_1_twofold

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


genbase
medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,7.802812,2.753535,1.488538,12.071915,1.358308,1.87771,3.979564,8.805133,7.695667,7.696733,7.2838,7.785933
ranking_loss,0.196125,0.164368,0.080126,0.078935,0.002219,0.012031,0.054211,0.030883,0.026532,0.026616,0.025777,0.026631
hamming_loss,0.232884,0.217284,0.198726,0.092757,0.004811,0.014133,0.11274,0.028131,0.026733,0.026926,0.026366,0.026969
f1_macro,0.426581,0.64269,0.631035,0.378736,0.504426,0.27515,0.496631,0.339901,0.329695,0.318412,0.270732,0.304354
f1_micro,0.603068,0.672793,0.612512,0.518102,0.944995,0.725644,0.595921,0.510375,0.491632,0.494747,0.467463,0.492696
Jaccard_Index,0.767116,0.782716,0.801274,0.907243,0.995189,0.985867,0.88726,0.971869,0.973267,0.973074,0.973634,0.973031
zero_one_error,0.851282,0.706397,0.798505,0.964512,0.107553,0.470757,0.874444,0.953667,0.878,0.863533,0.797933,0.887267


# LEAD

In [7]:
df_all_2 = pd.DataFrame()

data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    y_predict, y_test = BR_test(data, label, dataPath,3071980)

    error_matrix = pd.DataFrame(np.array(y_predict) - np.array(y_test), columns=y_test.columns)

    labelFile = dataPath+"error_matrix.csv"
    savePng = dataPath+"/bayes_net_error_matrix.png"
    error_matrix.to_csv(os.path.join(dataPath,labelFile),index=False)
    less_error_label = (error_matrix!=0).sum().idxmin()
    bayes_net = build_BN(labelFile, label.columns, savePng)

    df = BCC_test(data, label, dataPath, bayes_net, 3071980, label.shape[1])
    df.columns = [dataset]

    df_all_2 = pd.concat([df_all_2, df],axis=1)
    
df_all_2.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/LEAD_naive_bayes.csv")
df_all_2

yeast
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
emotions
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
scene
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
enron
learn structure


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


BCC test
ensemble: 0
A.A8
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44
ensemble: 45
D.D18
ensemble: 46
ensemble: 47
D.D17
ensemble: 48
ensemble: 49
ensemble: 50
ensemble: 51
ensemble: 52
genbase
learn structure


  'recall', 'true', average, warn_for)


BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
medical
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical
coverage_error,8.120761,2.838384,1.510797,17.195065,1.374622,3.777096
ranking_loss,0.210724,0.172316,0.083211,0.131091,0.002929,0.046129
hamming_loss,0.242054,0.247475,0.213178,0.111633,0.00593,0.025721
f1_macro,0.376699,0.634074,0.619488,0.187949,0.491257,0.16407
f1_micro,0.584272,0.650278,0.600622,0.42173,0.930355,0.537582
Jaccard_Index,0.757946,0.752525,0.786822,0.888367,0.99407,0.974279
zero_one_error,0.870141,0.781145,0.805648,0.988249,0.132931,0.668712


In [9]:
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    print("learn structure")
    y_predict, y_test = BR_test(data, label, dataPath,3071980)

    error_matrix = pd.DataFrame(np.array(y_predict) - np.array(y_test), columns=y_test.columns)

    labelFile = dataPath+"error_matrix.csv"
    savePng = dataPath+"/bayes_net_error_matrix.png"
    error_matrix.to_csv(os.path.join(dataPath,labelFile),index=False)
    less_error_label = (error_matrix!=0).sum().idxmin()
    bayes_net = build_BN(labelFile, label.columns, savePng)

    print("BCC test")
    df = BCC_test(data, label, dataPath, bayes_net, 3071980, label.shape[1])
    df.columns = [dataset]

    df_all_2 = pd.concat([df_all_2, df],axis=1)
    
df_all_2.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/LEAD_naive_bayes.csv")

tmc2007
learn structure
BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
rcv1subset1
learn structure


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44
ensemble: 45
ensemble: 46
ensemble: 47
ensemble: 48
ensemble: 49
Class50
ensemble: 50
ensemble: 51
ensemble: 52
ensemble: 53
ensemble: 54
ensemble: 55
ensemble: 56
ensemble: 57
ensemble: 58
ensemble: 59
ensemble: 60
ensemble: 61
ensemble: 62
ensemble: 63
ensemble: 64
ensemble: 65
ensemble: 66
ensemble: 67
ensemble: 68
ensemble: 69
ensemble: 70
ensemble: 71
ensemble: 72
ensemble: 73
ensemble: 74
ensemble: 75
ensem

  'recall', 'true', average, warn_for)


BCC test
ensemble: 0
ensemble: 1
ensemble: 2
ensemble: 3
ensemble: 4
ensemble: 5
ensemble: 6
ensemble: 7
ensemble: 8
ensemble: 9
ensemble: 10
ensemble: 11
ensemble: 12
ensemble: 13
ensemble: 14
ensemble: 15
ensemble: 16
ensemble: 17
ensemble: 18
ensemble: 19
ensemble: 20
ensemble: 21
ensemble: 22
ensemble: 23
ensemble: 24
ensemble: 25
ensemble: 26
ensemble: 27
ensemble: 28
ensemble: 29
ensemble: 30
ensemble: 31
ensemble: 32
ensemble: 33
ensemble: 34
ensemble: 35
ensemble: 36
ensemble: 37
ensemble: 38
ensemble: 39
ensemble: 40
ensemble: 41
ensemble: 42
ensemble: 43
ensemble: 44
ensemble: 45
ensemble: 46
ensemble: 47
ensemble: 48
ensemble: 49
ensemble: 50
ensemble: 51
ensemble: 52
ensemble: 53
ensemble: 54
ensemble: 55
ensemble: 56
ensemble: 57
ensemble: 58
ensemble: 59
ensemble: 60
ensemble: 61
ensemble: 62
ensemble: 63
ensemble: 64
ensemble: 65
ensemble: 66
ensemble: 67
ensemble: 68
ensemble: 69
ensemble: 70
ensemble: 71
ensemble: 72
ensemble: 73
ensemble: 74
ensemble: 75
ensemble: 76


## 5 times 2 fold

In [5]:
df_all_2_2fold = pd.DataFrame()

seed = [1234,2234,12345,12346,1234567]

data_list = ["yeast","emotions","scene"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        y_predict, y_test = BR_test(data, label, dataPath,s)

        error_matrix = pd.DataFrame(np.array(y_predict) - np.array(y_test), columns=y_test.columns)

        labelFile = dataPath+"error_matrix.csv"
        savePng = dataPath+"/bayes_net_error_matrix.png"
        error_matrix.to_csv(os.path.join(dataPath,labelFile),index=False)
        bayes_net = build_BN(labelFile, label.columns, savePng)

        df = BCC_test_2_fold(data, label, dataPath, bayes_net, s, label.shape[1])
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
        

    df_all_2_2fold = pd.concat([df_all_2_2fold, d/5],axis=1)
df_all_2_2fold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/LEAD_naive_bayes.csv")


df_all_2_2fold

yeast
emotions
scene


Unnamed: 0,yeast,emotions,scene
coverage_error,7.807113,2.758249,1.497674
ranking_loss,0.196283,0.16596,0.081957
hamming_loss,0.23409,0.221998,0.211489
f1_macro,0.414332,0.631359,0.616057
f1_micro,0.597466,0.666152,0.598532
Jaccard_Index,0.76591,0.778002,0.788511
zero_one_error,0.858561,0.708418,0.811794


In [6]:
data_list = ["enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        y_predict, y_test = BR_test(data, label, dataPath,s)

        error_matrix = pd.DataFrame(np.array(y_predict) - np.array(y_test), columns=y_test.columns)

        labelFile = dataPath+"error_matrix.csv"
        savePng = dataPath+"/bayes_net_error_matrix.png"
        error_matrix.to_csv(os.path.join(dataPath,labelFile),index=False)
        bayes_net = build_BN(labelFile, label.columns, savePng)

        df = BCC_test_2_fold(data, label, dataPath, bayes_net, s, label.shape[1])
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
        

    df_all_2_2fold = pd.concat([df_all_2_2fold, d/5],axis=1)
df_all_2_2fold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/LEAD_naive_bayes.csv")


df_all_2_2fold

enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


KeyboardInterrupt: 

In [None]:
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        y_predict, y_test = BR_test(data, label, dataPath,s)

        error_matrix = pd.DataFrame(np.array(y_predict) - np.array(y_test), columns=y_test.columns)

        labelFile = dataPath+"error_matrix.csv"
        savePng = dataPath+"/bayes_net_error_matrix.png"
        error_matrix.to_csv(os.path.join(dataPath,labelFile),index=False)
        bayes_net = build_BN(labelFile, label.columns, savePng)

        df = BCC_test_2_fold(data, label, dataPath, bayes_net, s, label.shape[1])
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
        

    df_all_2_2fold = pd.concat([df_all_2_2fold, d/5],axis=1)
    
df_all_2_2fold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/LEAD_naive_bayes.csv")

# different orders of chain

In [6]:
# BR for getting error matrix
def naiveBayes_multi_label_training_BR(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    classifier_list = [MultinomialNB() for i in range(n_label)]
    for i in range(n_label):
        classifier_list[i].fit(X_train,y_train.iloc[:,i])
    
    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time

def naiveBayes_multi_label_testing_BR(X_test, n_label, classifier_list):
    y_predict = pd.DataFrame()
    y_prob = pd.DataFrame()
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i)],axis=1)
        
        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i)],axis=1)
        
    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def BR_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # training
    classifier_list, training_time = naiveBayes_multi_label_training_BR(X_train, y_train)
    
    # testing
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing_BR(X_test, n_label, classifier_list)
    
    y_predict.columns = label.columns
    return y_predict, y_test

def naiveBayes_multi_label_training_order(X_train, y_train, bayes_net, order):
    start = time.time()
    
    n_label = y_train.shape[1]
    
    classifier_list = [MultinomialNB() for i in range(n_label)] # create a classifier chain
    
    learned_label = []
    
    for i in range(n_label):
        if i == 0:
            l = order[i]
            classifier_list[i].fit(X_train, y_train.loc[:, l])
            learned_label.append(l)
            
        else:
            l = order[i]
            par = [x for x in bayes_net[l] if x in learned_label]
            X = pd.concat([X_train, y_train.loc[:,par]],axis=1) # put the previous label into attribute space
            classifier_list[i].fit(X, y_train.loc[:, l])
            learned_label.append(l)

    end = time.time()
    training_time = end-start
    
    return classifier_list, learned_label

def naiveBayes_multi_label_testing_order(X_test, n_label, classifier_list, bayes_net, learned_label):
    y_predict = pd.DataFrame(index=X_test.index)
    y_prob = pd.DataFrame(index=X_test.index)
    y_true = pd.DataFrame(index=X_test.index)
    
    start = time.time()
    
    predicted_list = []
    
    for i in range(n_label):
        if i == 0:
            l = learned_label[i]
            y_predict_i = classifier_list[i].predict(X_test)
            y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
            y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index,columns=[l])],axis=1)
            y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index,columns=[l])],axis=1)
            predicted_list.append(l)
        
        else:
            l = learned_label[i]
            par = [p for p in bayes_net[l] if p in predicted_list]
            if len(par) != 0:
                X = pd.concat([X_test, y_predict.loc[:,par]],axis=1) # put the previous label into attribute space
            else:
                X= X_test
            y_predict_i = classifier_list[i].predict(X)
            y_predict_prob_i = classifier_list[i].predict_proba(X)[:,1]
            
            y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index,columns=[l])],axis=1)
            y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index,columns=[l])],axis=1)
  
            predicted_list.append(l)            
        
    return y_predict, y_prob

def BCC_test_order(data, label, dataPath, bayes_net, random_state=3071980, ensemble = 5, order_method="random"):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # get order
    if order_method=="best_prediction":
        y_predict, y_test = BR_test(data, label, dataPath,3071980)
        acc = (y_predict.values == y_test.values).mean(axis = 0)
        order = list(label.columns[np.argsort(-acc)])
    
    elif order_method=="largest_edges":
        a = [(x,len(y)) for x,y in bayes_net.items()]
        a_sort = sorted(a, key=lambda x:x[1], reverse=True)
        order = [x[0] for x in a_sort]
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # ensemble
    y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    
    for i in range(ensemble):
        if order_method=="random":
            order = random.sample(list(range(n_label)),n_label) # get orders

        # training
        #print("--- start training ---\n")
        classifier_list, learned_label = naiveBayes_multi_label_training_order(X_train, y_train, bayes_net, order)

        # testing
        #print("--- start testing ---\n")
        y_predict, y_prob = naiveBayes_multi_label_testing_order(X_test, n_label, classifier_list, bayes_net, learned_label)

        y_predict = y_predict[label.columns]
        y_prob = y_prob[label.columns]

        y_pred_ensemble = y_pred_ensemble + y_predict
        y_prob_ensemble = y_prob_ensemble + y_prob

        
    y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
    y_prob_ensemble = y_prob_ensemble / ensemble
    y_pred_ensemble = y_pred_ensemble.fillna(0)
    y_prob_ensemble = y_prob_ensemble.fillna(0)
    
    # evaluation
    performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
    performance_df = pd.DataFrame.from_dict(performance, orient='index')
    
    return performance_df


def BCC_test_order_twofold(data, label, dataPath, bayes_net, random_state=3071980, ensemble = 5, order_method="random"):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # get order
    if order_method=="best_prediction":
        y_predict, y_test = BR_test(data, label, dataPath,3071980)
        acc = (y_predict.values == y_test.values).mean(axis = 0)
        order = list(label.columns[np.argsort(-acc)])
    
    elif order_method=="largest_edges":
        a = [(x,len(y)) for x,y in bayes_net.items()]
        a_sort = sorted(a, key=lambda x:x[1], reverse=True)
        order = [x[0] for x in a_sort]
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    performance_df_all = pd.DataFrame(np.zeros([7,1]))
    for j in range(2):
        X_train, y_train = X_test, y_test
    # ensemble
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)

        for i in range(ensemble):
            if order_method=="random":
                order = random.sample(list(range(n_label)),n_label) # get orders

            # training
            #print("--- start training ---\n")
            classifier_list, learned_label = naiveBayes_multi_label_training_order(X_train, y_train, bayes_net, order)

            # testing
            #print("--- start testing ---\n")
            y_predict, y_prob = naiveBayes_multi_label_testing_order(X_test, n_label, classifier_list, bayes_net, learned_label)

            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]

            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob

        
        y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
        y_prob_ensemble = y_prob_ensemble / ensemble
        y_pred_ensemble = y_pred_ensemble.fillna(0)
        y_prob_ensemble = y_prob_ensemble.fillna(0)

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
        performance_df = pd.DataFrame.from_dict(performance, orient='index')
        
        performance_df_all.index = performance_df.index
        performance_df_all.columns = performance_df.columns
        
        performance_df_all = performance_df_all + performance_df
        
    performance_df_all = performance_df_all / 2
    return performance_df_all

In [62]:
df_all_3 = pd.DataFrame()

data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    bayes_net = build_BN(labelFile, label.columns, savePng)
    
    df = BCC_test_order(data, label, dataPath, bayes_net, 3071980, 1, order_method="best_prediction")
    df.columns = [dataset]

    df_all_3 = pd.concat([df_all_3, df],axis=1)
    
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    bayes_net = build_BN(labelFile, label.columns, savePng)
    
    df = BCC_test_order(data, label, dataPath, bayes_net, 3071980, 1, order_method="best_prediction")
    df.columns = [dataset]

    df_all_3 = pd.concat([df_all_3, df],axis=1)


df_all_3.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/BayesianClassifierChain_best_prediction.csv")
df_all_3

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


genbase


  'recall', 'true', average, warn_for)


medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,8.19603,2.86532,1.538206,17.457109,1.398792,3.760736,4.171283,13.853667,13.870333,13.902,12.25,14.270333
ranking_loss,0.216811,0.178853,0.088464,0.13264,0.003454,0.045918,0.060312,0.055099,0.055898,0.056682,0.050152,0.05801
hamming_loss,0.241108,0.231762,0.203904,0.112498,0.006266,0.026221,0.115773,0.032498,0.03463,0.035056,0.033066,0.038535
f1_macro,0.382511,0.631798,0.627739,0.189732,0.473191,0.159631,0.480735,0.231007,0.203404,0.187474,0.170644,0.179251
f1_micro,0.587069,0.657829,0.610626,0.421173,0.926121,0.52898,0.582532,0.435217,0.377602,0.380208,0.364962,0.354775
Jaccard_Index,0.758892,0.768238,0.796096,0.887502,0.993734,0.973779,0.884227,0.967502,0.96537,0.964944,0.966934,0.961465
zero_one_error,0.864351,0.750842,0.791528,0.988249,0.135952,0.668712,0.882221,0.970333,0.897,0.877667,0.816667,0.914


In [63]:
df_all_4 = pd.DataFrame()

data_list = ["yeast","emotions","scene","enron","genbase","medical",]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    bayes_net = build_BN(labelFile, label.columns, savePng)
    
    df = BCC_test_order(data, label, dataPath, bayes_net, 3071980, 1, order_method="largest_edges")
    df.columns = [dataset]

    df_all_4 = pd.concat([df_all_4, df],axis=1)
    
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    bayes_net = build_BN(labelFile, label.columns, savePng)
    
    df = BCC_test_order(data, label, dataPath, bayes_net, 3071980, 1, order_method="largest_edges")
    df.columns = [dataset]

    df_all_4 = pd.concat([df_all_4, df],axis=1)


df_all_4.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/BayesianClassifierChain_largest_edges.csv")
df_all_4

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


genbase


  'recall', 'true', average, warn_for)


medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,8.229115,2.888889,1.5299,17.139835,1.465257,3.787321,4.170443,13.860667,13.608667,13.812667,12.636333,14.229333
ranking_loss,0.219101,0.182005,0.086741,0.131616,0.004705,0.046038,0.060937,0.055406,0.055727,0.057057,0.052921,0.058546
hamming_loss,0.240754,0.228956,0.199336,0.113984,0.006266,0.025721,0.116504,0.03304,0.035347,0.036007,0.032736,0.038581
f1_macro,0.380726,0.620771,0.637587,0.188612,0.473191,0.157944,0.477964,0.231615,0.210076,0.192876,0.180666,0.187544
f1_micro,0.57994,0.658291,0.616205,0.417318,0.926121,0.536066,0.58099,0.437868,0.388,0.381589,0.383798,0.367219
Jaccard_Index,0.759246,0.771044,0.800664,0.886016,0.993734,0.974279,0.883496,0.96696,0.964653,0.963993,0.967264,0.961419
zero_one_error,0.860215,0.717172,0.788206,0.989424,0.135952,0.664622,0.884739,0.966,0.896667,0.89,0.811667,0.913


## 5 times 2 fold

In [10]:
df_all_3_twofold = pd.DataFrame()
seed = [1234,2234,12345,12346,1234567]
data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        bayes_net = build_BN(labelFile, label.columns, savePng)
        df = BCC_test_order_twofold(data, label, dataPath, bayes_net, s, 1, order_method="best_prediction")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df

    df_all_3_twofold = pd.concat([df_all_3_twofold, d/5],axis=1)
    
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        bayes_net = build_BN(labelFile, label.columns, savePng)
        df = BCC_test_order_twofold(data, label, dataPath, bayes_net, s, 1, order_method="best_prediction")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df

    df_all_3_twofold = pd.concat([df_all_3_twofold, d/5],axis=1)


df_all_3_twofold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BayesianClassifierChain_best_prediction.csv")
df_all_3_twofold

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


genbase
medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,7.900579,2.756229,1.52691,12.247239,1.32568,1.86135,3.992474,8.7946,7.9396,7.955133,7.240533,7.837333
ranking_loss,0.197997,0.167563,0.088103,0.078855,0.001563,0.011625,0.054149,0.030923,0.027268,0.027741,0.025862,0.026757
hamming_loss,0.227083,0.214366,0.204097,0.092349,0.004879,0.013751,0.112231,0.028045,0.026412,0.02629,0.026786,0.026954
f1_macro,0.425463,0.642672,0.621628,0.377857,0.503626,0.278517,0.499024,0.329935,0.319179,0.307629,0.260168,0.292653
f1_micro,0.611755,0.674031,0.605983,0.521008,0.944079,0.731639,0.59611,0.502243,0.479331,0.487235,0.449833,0.4777
Jaccard_Index,0.772917,0.785634,0.795903,0.907651,0.995121,0.986249,0.887769,0.971955,0.973588,0.97371,0.973214,0.973046
zero_one_error,0.841853,0.721886,0.793854,0.962867,0.110574,0.466667,0.870737,0.959,0.879667,0.8508,0.802933,0.8916


In [11]:
df_all_4_twofold = pd.DataFrame()

data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        bayes_net = build_BN(labelFile, label.columns, savePng)
        df = BCC_test_order_twofold(data, label, dataPath, bayes_net, s, 1, order_method="largest_edges")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df

    df_all_4_twofold = pd.concat([df_all_4_twofold, d/5],axis=1)
    
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    labelFile = dataPath+"y.csv"
    savePng = dataPath+"/bayes_net.png"

    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        bayes_net = build_BN(labelFile, label.columns, savePng)
        df = BCC_test_order_twofold(data, label, dataPath, bayes_net, s, 1, order_method="largest_edges")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df

    df_all_4_twofold = pd.concat([df_all_4_twofold, d/5],axis=1)


df_all_4_twofold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BayesianClassifierChain_largest_edges.csv")
df_all_4_twofold

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


genbase
medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,7.908354,2.767003,1.52392,12.078496,1.367372,1.876074,3.988586,8.9694,7.7668,7.816533,7.4286,7.7548
ranking_loss,0.202651,0.166631,0.087443,0.079084,0.002381,0.012064,0.054633,0.031511,0.026992,0.027338,0.026572,0.026602
hamming_loss,0.232731,0.215713,0.198782,0.092637,0.004834,0.014115,0.112941,0.028143,0.026757,0.026941,0.026332,0.0269
f1_macro,0.408855,0.643596,0.632083,0.378851,0.503859,0.275172,0.496238,0.340018,0.330757,0.31864,0.272458,0.303122
f1_micro,0.592752,0.673806,0.612737,0.518382,0.944615,0.725898,0.594558,0.509039,0.492226,0.494038,0.467363,0.493221
Jaccard_Index,0.767269,0.784287,0.801218,0.907363,0.995166,0.985885,0.887059,0.971857,0.973243,0.973059,0.973668,0.9731
zero_one_error,0.849628,0.700337,0.791362,0.964747,0.109366,0.470757,0.875591,0.948,0.877667,0.860467,0.799933,0.8864
