In [1]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re
import random
import arff

from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

# functions

In [2]:
def read_data(dataPath):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,'X.csv'), index_col=0).reset_index(drop=True)
    label = pd.read_csv(os.path.join(dataPath,'Y.csv'), index_col=0).reset_index(drop=True)
    return data,label

def evaluation(y_pred, y_prob, y_true):
    coverage = 0
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = 0
    
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')
    
    acc = 0
    for i in range(y_true.shape[0]):
        acc += jaccard_similarity_score(y_true.iloc[i,:], y_pred.iloc[i,:]) # jaccard_similarity_score
    acc = round(acc / y_true.shape[0],2)
    
    zero_one = zero_one_loss(y_true, y_pred) # 0-1 error 
    
    f1_each = metrics.f1_score(y_true, y_pred, average=None)
    
    #confusion_matrix = y_pred - y_true
    
    performance = {"coverage_error":coverage,
                   "ranking_loss":ranking_loss,
                   "hamming_loss":hamming,
                   "f1_macro":f1_macro,
                   "f1_micro":f1_micro,
                   "Jaccard_Index":acc,
                   "zero_one_error":zero_one,
                   "f1_each_label":f1_each}
    return performance

def get_confusion_matrix(y_pred, y_test, column_names):
    """confusion matrix """
    confusion_matrix = pd.DataFrame(np.array(y_pred) - np.array(y_test), columns=column_names)
    pos = pd.DataFrame((np.array(y_pred) == np.array(y_test)) & (np.array(y_pred) == 1), columns=y_test.columns).sum(axis=0)
    neg = pd.DataFrame((np.array(y_pred) == np.array(y_test)) & (np.array(y_pred) == 0), columns=y_test.columns).sum(axis=0)
    for i in range(confusion_matrix.shape[1]): 
        name = confusion_matrix.iloc[:,i].name
        temp = confusion_matrix.iloc[:,i].value_counts()
        TP = pos[name]
        TN = neg[name]
        if 1 in temp.index:
            FP = temp[1]
        else:
            FP = 0
        if -1 in temp.index:
            FN = temp[-1]
        else:
            FN = 0
            
        
        print("{0}: TP:{1}, TN:{2}, FP:{3}, FN:{4}".format(name, TP, TN, FP, FN))
        
    

# read data



In [136]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

data, label = read_data(dataPath) # read data

# get data information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()
avg_instance_per_label = label.sum(axis=0).mean()
# print data information
print("\n--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance)
print("avgerage number of labels for an instance:",avg_label_per_instance)
print("avgerage number of positive instances for a label:",avg_instance_per_label,"the std:",sqrt(label.sum(axis=0).var()),"\n")

print("-- number of positive instances --")
print(label.sum(axis=0))


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/ABC_news_data/obesity/
number of label: 42
number of attribute: 335
number of instance: 891
avgerage number of labels for an instance: 3.2480359147025815
avgerage number of positive instances for a label: 68.9047619047619 the std: 87.33362147641554 

-- number of positive instances --
act                                                  26
advertising                                          23
advertising-and-marketing                            27
australia                                           342
business-economics-and-finance                       31
cancer                                               21
child-health-and-behaviour                           97
children                                             59
community-and-society                                55
diabetes                                             57
diet-and-nutrition                                  119
diseases-and-disorders       

## co-occurance

In [137]:
cooccurrence_matrix = label.T.dot(label)
np.fill_diagonal(cooccurrence_matrix.values, 0)
#cooccurrence_matrix.to_csv('/Users/jiangjunhao/Desktop/cooccurrence_matrix.csv', index=False)

# Binary Relevance 

## BR using naive Bayes classifier

In [138]:
def naiveBayes_multi_label_training(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    classifier_list = [MultinomialNB() for i in range(n_label)]
    for i in range(n_label):
        classifier_list[i].fit(X_train,y_train.iloc[:,i])
    
    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list):
    y_predict = pd.DataFrame()
    y_prob = pd.DataFrame()
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i)],axis=1)
        
        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i)],axis=1)
        
    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def BR_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    print("-- test index --")
    print(X_test.index)
    
    # training
    print("--- start training ---\n")
    classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)
    
    # testing
    print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)
    
    # evaluation
    performance = evaluation(y_predict, y_prob, y_test)
    
    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # get confusion matrix
    get_confusion_matrix(y_predict, y_test, y_test.columns)
    
    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for i in range(n_label):
                print("label_"+label.columns[i],"=",round(value[i],2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))
            
# two fold cross-validation
def two_fold_BR_test(data, label, dataPath, n_iter=5, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # 2-fold cross validatiom
    KF=KFold(n_splits=2, shuffle=True, random_state=random_state)
    i = 0
    
    performance = {}
    for train_index,test_index in KF.split(data):
        i += 1
        
        X_train,X_test=data.iloc[train_index,:],data.iloc[test_index,:]
        y_train,y_test=label.iloc[train_index,:],label.iloc[test_index,:]
        
        print("--- kfold time="+str(i)+" ---")
        # training
        classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)
        # testing
        y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)
        
        # evaluation
        if performance == {}:
            performance = evaluation(y_predict, y_prob, y_test)
        else:
            performance_i = evaluation(y_predict, y_prob, y_test)
            for key, value in performance_i.items():
                performance[key] = (performance[key] + value)/2
            else:
                performance[key] = value
    
    # print data information
    print("\n--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # print performance
    print("--- 2 fold cross-validation Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            continue
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))

In [139]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
BR_test(data, label, dataPath,3071980)

#print("")
#print("------ two_fold Binary Relevance using Naive Bayes ------")
#two_fold_BR_test(data, label, dataPath,3071980)

------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([465, 368, 408, 584, 701, 805, 625, 615, 862, 495,
            ...
             92, 490, 367, 800, 491, 627, 655, 274, 867,  44],
           dtype='int64', length=446)
--- start training ---

--- start testing ---

--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/ABC_news_data/obesity/
number of label: 42
number of attribute: 335
number of instance: 891 

act: TP:0, TN:427, FP:4, FN:15
advertising: TP:9, TN:416, FP:19, FN:2
advertising-and-marketing: TP:10, TN:404, FP:31, FN:1
australia: TP:92, TN:209, FP:57, FN:88
business-economics-and-finance: TP:6, TN:418, FP:16, FN:6
cancer: TP:2, TN:428, FP:8, FN:8
child-health-and-behaviour: TP:20, TN:343, FP:55, FN:28
children: TP:6, TN:385, FP:30, FN:25
community-and-society: TP:2, TN:403, FP:17, FN:24
diabetes: TP:14, TN:399, FP:24, FN:9
diet-and-nutrition: TP:12, TN:361, FP:21, FN:52
diseases-and-disorders: TP:8, TN:378, FP:37, FN:23
doctors-and-m

In [117]:
# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# split training and test data set
# X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# training
print("--- start training ---\n")
classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)

# testing
print("--- start testing ---\n")
y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)

# get confusion matrix
get_confusion_matrix(y_predict, y_test, y_test.columns)
    
# evaluation
performance = evaluation(y_predict, y_prob, y_test)


# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([  4,   5,   7,   8,  15,  16,  17,  21,  23,  24,
            ...
            876, 878, 879, 880, 881, 882, 884, 888, 889, 890],
           dtype='int64', length=446)
--- start training ---

--- start testing ---

act: TP:1, TN:422, FP:16, FN:7
advertising: TP:7, TN:417, FP:18, FN:4
advertising-and-marketing: TP:8, TN:409, FP:24, FN:5
australia: TP:96, TN:206, FP:64, FN:80
business-economics-and-finance: TP:6, TN:408, FP:25, FN:7
cancer: TP:3, TN:430, FP:2, FN:11
child-health-and-behaviour: TP:21, TN:341, FP:47, FN:37
children: TP:7, TN:386, FP:23, FN:30
community-and-society: TP:2, TN:408, FP:11, FN:25
diabetes: TP:10, TN:392, FP:24, FN:20
diet-and-nutrition: TP:20, TN:337, FP:54, FN:35
diseases-and-disorders: TP:7, TN:389, FP:23, FN:27
doctors-and-medical-professionals: TP:3, TN:414, FP:8, FN:21
education: TP:1, TN:429, FP:9, FN:7
exercise-and-fitness: TP:5, TN:406, FP:12, FN:23
family-and-children: TP:1, TN:415, FP:13, FN:17
federal---state-issues: TP:6,

## BR using ESKDB

In [5]:
def get_arff(word_occurrence, label_matrix, savePath): # get attributes
    for z in range(len(label_matrix.columns)):
        attributes=[(word_occurrence.columns[i],['0', '1']) for i in range(len(word_occurrence.columns))]
        attributes.append(('label_'+label_matrix.columns[z],['0', '1']))

        data=[]
        i = 0
        while i < label_matrix.shape[0]:
            attr_data = [str(j) for j in list(word_occurrence.iloc[i,:])]
            label_data = [str(label_matrix.iloc[i,z])]
            row_data = attr_data+label_data
            data.append(row_data) 
            i+=1
        # set obj
        obj = {
           'description': u'',
           'relation': 'relation',
           'attributes': attributes,
           'data': data,
        }
        arff_data = arff.dumps(obj)
        w_file = open(savePath+label_matrix.columns[z]+".arff", "w")
        w_file.write(arff_data)
        w_file.close()

def run_eskdb(dataPath, resultFile, k, l, e, i):
    command = "./run_eskdb.sh "+resultFile+" "+k+" "+i+" "+l+" "+e+" "+dataPath
    subprocess.call("cd /Volumes/Samsung_T5/research/programme/research_python/", shell=True)
    print(command)
    return subprocess.call(command, shell=True)

def get_result(resultPath):

    y_pred = pd.DataFrame()
    y_true = pd.DataFrame()
    y_prob = pd.DataFrame()
    names = []
    for file in os.listdir(resultPath):
        with open(os.path.join(resultPath,file), 'r') as f:
            try:
                lines = f.readlines()
            except:
                print(file)
            else:
                names.append(file[:-4])
                pred = []
                true = []
                prob = []
                train_time_total = 0
                test_time_total = 0
                error_marco = 0
                for line in lines:
                    if line.startswith('pred'):
                        pred.append(int(re.search('pred :\t(.)',line).group(1)))
                        true.append(int(re.search('true :\t(.)',line).group(1)))
                        prob.append(float(re.search('prob :\t(.*)',line).group(1)))
                    elif line.startswith('RSME'):
                        rsme = float(re.search('RSME :\t\t(.*)',line).group(1))
                    elif line.startswith('Error'):
                        error = float(re.search('Error :\t\t(.*)',line).group(1))
                    elif line.startswith("Training time"):
                        train_time = float(re.search('Training time :\s{1,}(.*)',line).group(1))
                        train_time_total = train_time_total + train_time
                    elif line.startswith("Testing time"):
                        test_time = float(re.search('Testing time :\s{1,}(.*)',line).group(1))
                        test_time_total = test_time_total + test_time
                    elif line.startswith("["):
                        para = line
                    elif line.startswith("test0Indexes"):
                        index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))

                y_pred = pd.concat([y_pred,pd.DataFrame(pred)],axis=1)
                y_true = pd.concat([y_true,pd.DataFrame(true)],axis=1)
                y_prob = pd.concat([y_prob,pd.DataFrame(prob)],axis=1)
    y_pred.columns = names
    y_true.columns = names
    y_pred.index = index
    y_true.index = index
    print(para)
    print("number of label:", y_pred.shape[1])
    print("training time:",train_time_total)
    print("testing time:",test_time_total)
    print("-- test index --")
    print(index)
    return y_pred,y_true,y_prob,index

In [None]:
# obesity
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# get arff files
savePath = "/Users/jiangjunhao/Desktop/test/"
get_arff(data,label,savePath)

dataPath = "/Users/jiangjunhao/Desktop/test/"
resultFile = 'obesity_k5_e20_i5000'
k = '5'
i = '5000'
l = '2'
e = '20'

run_eskdb(dataPath, resultFile, k, l, e, i)

In [120]:
resultPath = '/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/obesity_k5_e20_i5000/'
y_pred,y_true,y_prob,index = get_result(resultPath)

performance = evaluation(y_pred=y_pred, y_true=y_true, y_prob=y_prob)

# get confusion mbatrix
print("\n--- Confusion matrix ---")
get_confusion_matrix(y_pred, y_true, y_true.columns)
    
# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

[-t, /Users/jiangjunhao/Desktop/test//wa.arff, -S, ESKDB, -K, 5, -I, 5000, -L, 2, -E, 20, -V, -M]

number of label: 42
training time: 2339.0
testing time: 1427.0
-- test index --
[4, 5, 7, 8, 15, 16, 17, 21, 23, 24, 31, 34, 35, 36, 39, 40, 41, 42, 43, 45, 46, 47, 48, 50, 52, 55, 56, 59, 60, 62, 63, 65, 66, 67, 70, 72, 75, 76, 78, 80, 84, 87, 90, 91, 92, 95, 100, 102, 103, 104, 107, 109, 110, 111, 112, 113, 114, 115, 116, 118, 120, 121, 126, 127, 129, 131, 132, 133, 134, 135, 136, 146, 154, 158, 163, 164, 167, 171, 172, 173, 175, 176, 178, 183, 185, 187, 189, 194, 195, 197, 198, 200, 202, 203, 204, 207, 208, 212, 213, 214, 217, 219, 221, 222, 223, 225, 228, 229, 230, 231, 232, 235, 239, 240, 241, 242, 243, 245, 247, 249, 251, 252, 253, 254, 256, 258, 260, 262, 263, 264, 265, 266, 268, 269, 270, 273, 275, 276, 277, 279, 281, 282, 283, 284, 285, 286, 288, 291, 293, 294, 295, 299, 300, 302, 304, 306, 307, 308, 311, 315, 316, 318, 321, 322, 325, 328, 329, 332, 333, 334, 336, 337, 338, 339, 

  'precision', 'predicted', average, warn_for)


In [162]:
# obesity
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# get arff files
savePath = "/Users/jiangjunhao/Desktop/test/"
get_arff(data,label,savePath)

dataPath = "/Users/jiangjunhao/Desktop/test/"
resultFile = 'obesity_k2_e20_i5000'
k = '2'
i = '5000'
l = '2'
e = '20'

run_eskdb(dataPath, resultFile, k, l, e, i)

KeyboardInterrupt: 

In [6]:
resultPath = '/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/obesity_k2_e20_i5000/'
y_pred,y_true,y_prob,index = get_result(resultPath)

performance = evaluation(y_pred=y_pred, y_true=y_true, y_prob=y_prob)

# get confusion mbatrix
print("\n--- Confusion matrix ---")
get_confusion_matrix(y_pred, y_true, y_true.columns)
    
# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

ValueError: Length mismatch: Expected axis has 42 elements, new values have 43 elements

## analysis

In [11]:
print("--- number of positive instances in test data set ---")
print(y_test.sum())

print("--- number of positive instances in training data set ---")
print(y_train.sum())

--- number of positive instances in test data set ---
act                                                   8
advertising                                          11
advertising-and-marketing                            13
australia                                           176
business-economics-and-finance                       13
cancer                                               14
child-health-and-behaviour                           58
children                                             37
community-and-society                                27
diabetes                                             30
diet-and-nutrition                                   55
diseases-and-disorders                               34
doctors-and-medical-professionals                    24
education                                             8
exercise-and-fitness                                 28
family-and-children                                  18
federal---state-issues                            

``
--- Performance of BR-NB ---
coverage_error = 14.29 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.14
hamming_loss = 0.1
f1_macro = 0.26
f1_micro = 0.39
Jaccard_Index = 0.9
zero_one_error = 0.98
``

``
--- Performance of BR-ESKDB --- I=5000, E=20, K=5
coverage_error = 14.69 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.15
hamming_loss = 0.08
f1_macro = 0.17
f1_micro = 0.33
Jaccard_Index = 0.92
zero_one_error = 0.95
``

- There is a non-balanced data problem that when the number of positive instance for a label is small, the f1 score of this label is very low. It makes the F1-macro(averageg by the number of labels) low. It also appears in other text data set like medical and enron
![title](img/1.png)

- The memory used depends on the number of attributes, there are 1449 attributes in dataset `Medical` and it can run on my desktop. 

- there's a problem that when spliting the data set into training and test, there is no positive instance for some label in training set or testing set, and therefore, ESKDB does not work on that label.

# Classifier Chain

## CC using naive Bayes

In [123]:
def naiveBayes_multi_label_training(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    
    order = random.sample(list(range(n_label)),n_label) # get orders
    
    classifier_list = [MultinomialNB() for i in range(n_label)] # create a classifier chain
    
    for i in range(n_label):
        if i == 0:
            classifier_list[i].fit(X_train,y_train.iloc[:, order[i]])
        else:
            X_train = pd.concat([X_train, y_train.iloc[:,order[i-1]]],axis=1) # put the previous label into attribute space
            classifier_list[i].fit(X_train,y_train.iloc[:,order[i]])

    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time, order

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order):
    y_predict = pd.DataFrame(index=X_test.index)
    y_prob = pd.DataFrame(index=X_test.index)
    y_true = pd.DataFrame(index=X_test.index)
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index)],axis=1)

        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index)],axis=1)

        X_test = pd.concat([X_test, pd.DataFrame(y_predict_i,index=X_test.index)],axis=1,ignore_index=True) # put the previous label into attribute space

    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def CC_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=random_state)
    
    # training
    print("--- start training ---\n")
    classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)
    
    # testing
    print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)
    
    # evaluation
    y_test = y_test.iloc[:,order]
    performance = evaluation(y_predict, y_prob, y_test)
    
    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # print orders
    print("\n--- Order of the chain ---")
    print(label.columns[order])
    print("")
    
    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for i in range(n_label):
                print("label_"+label.columns[i],"=",round(value[i],2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))
            
def ECC_test(data, label, dataPath, random_state=3071980, ensemble = 5):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # ensemble
    y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    for i in range(ensemble):
        # training
        #print("--- start training ---\n")
        classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

        # print orders
        print("Order of the chain:",label.columns[order])

        # testing
        #print("--- start testing ---\n")
        y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

        y_predict.columns = label.columns[order]
        y_prob.columns = label.columns[order]
        y_predict = y_predict[label.columns]
        y_prob = y_prob[label.columns]

        y_pred_ensemble = y_pred_ensemble + y_predict
        y_prob_ensemble = y_prob_ensemble + y_prob

    y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
    y_prob_ensemble = y_prob_ensemble / ensemble 
    
    # evaluation
    performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # print orders
    print("\n--- Order of the chain ---")
    print(label.columns[order])
    print("")
    
    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for i in range(n_label):
                print("label_"+label.columns[i],"=",round(value[i],2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))

In [124]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
ECC_test(data, label, dataPath, 3071980, 1)

------ Binary Relevance using Naive Bayes ------
Order of the chain: Index(['community-and-society', 'nsw', 'qld', 'education',
       'diseases-and-disorders', 'exercise-and-fitness', 'act', 'australia',
       'wa', 'doctors-and-medical-professionals',
       'indigenous-aboriginal-and-torres-strait-islander',
       'states-and-territories', 'lifestyle-and-leisure',
       'advertising-and-marketing', 'healthcare-facilities',
       'government-and-politics', 'family-and-children', 'sa', 'health',
       'federal-government', 'health-policy', 'diabetes', 'obesity', 'cancer',
       'advertising', 'schools', 'federal---state-issues', 'research',
       'smoking', 'united-states', 'children', 'united-kingdom',
       'science-and-technology', 'industry', 'vic',
       'business-economics-and-finance', 'medical-research',
       'child-health-and-behaviour', 'tas', 'heart-disease',
       'food-and-beverage', 'diet-and-nutrition'],
      dtype='object')
--- Data Information ---
dataset

In [125]:
# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# split training and test data set
# X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)

X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# training
print("--- start training ---\n")
classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

# testing
print("--- start testing ---\n")
y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

# evaluation
y_test = y_test.iloc[:,order]
performance = evaluation(y_predict, y_prob, y_test)

# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print orders
print("\n--- Order of the chain ---")
print(label.columns[order])
print("")


# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([  4,   5,   7,   8,  15,  16,  17,  21,  23,  24,
            ...
            876, 878, 879, 880, 881, 882, 884, 888, 889, 890],
           dtype='int64', length=446)
--- start training ---

--- start testing ---

--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/ABC_news_data/obesity/
number of label: 42
number of attribute: 335
number of instance: 891 


--- Order of the chain ---
Index(['research', 'cancer', 'advertising', 'act', 'food-and-beverage', 'qld',
       'vic', 'tas', 'family-and-children', 'heart-disease',
       'federal---state-issues', 'children', 'diseases-and-disorders',
       'diabetes', 'smoking', 'wa', 'government-and-politics', 'nsw',
       'industry', 'education', 'united-kingdom', 'healthcare-facilities',
       'doctors-and-medical-professionals', 'united-states',
       'business-economics-and-finance', 'schools', 'medical-research',
       'community-and-society', 'health-policy', 'federal-government',
       

## Ensemble CC using naive Bayes (E = 10)

In [126]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
ECC_test(data, label, dataPath, random_state=3071980, ensemble=10)

------ Binary Relevance using Naive Bayes ------
Order of the chain: Index(['sa', 'doctors-and-medical-professionals', 'exercise-and-fitness',
       'child-health-and-behaviour', 'federal-government', 'advertising',
       'industry', 'medical-research', 'smoking', 'nsw', 'vic',
       'diseases-and-disorders', 'cancer', 'diet-and-nutrition',
       'states-and-territories', 'united-states', 'food-and-beverage', 'qld',
       'government-and-politics', 'australia', 'federal---state-issues',
       'united-kingdom', 'research', 'tas', 'schools', 'heart-disease',
       'education', 'advertising-and-marketing', 'children', 'wa',
       'lifestyle-and-leisure', 'health', 'health-policy',
       'science-and-technology',
       'indigenous-aboriginal-and-torres-strait-islander', 'diabetes',
       'family-and-children', 'community-and-society', 'act',
       'healthcare-facilities', 'business-economics-and-finance', 'obesity'],
      dtype='object')
Order of the chain: Index(['diet-and-nu

Order of the chain: Index(['federal---state-issues', 'diabetes', 'education', 'research',
       'healthcare-facilities', 'sa',
       'indigenous-aboriginal-and-torres-strait-islander', 'heart-disease',
       'child-health-and-behaviour', 'doctors-and-medical-professionals',
       'cancer', 'act', 'australia', 'tas', 'nsw',
       'business-economics-and-finance', 'wa', 'states-and-territories',
       'federal-government', 'science-and-technology',
       'government-and-politics', 'lifestyle-and-leisure',
       'community-and-society', 'united-states', 'exercise-and-fitness',
       'smoking', 'food-and-beverage', 'schools', 'medical-research', 'health',
       'advertising-and-marketing', 'advertising', 'industry', 'obesity',
       'vic', 'diseases-and-disorders', 'united-kingdom',
       'family-and-children', 'health-policy', 'qld', 'diet-and-nutrition',
       'children'],
      dtype='object')
--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/ABC_news_data

In [127]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

ensemble=10

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# ensemble
y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
for i in range(ensemble):
    # training
    #print("--- start training ---\n")
    classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

    # print orders
    print("Order of the chain:",label.columns[order])

    # testing
    #print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

    y_predict.columns = label.columns[order]
    y_prob.columns = label.columns[order]
    y_predict = y_predict[label.columns]
    y_prob = y_prob[label.columns]

    y_pred_ensemble = y_pred_ensemble + y_predict
    y_prob_ensemble = y_prob_ensemble + y_prob

y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
y_prob_ensemble = y_prob_ensemble / ensemble 

# evaluation
performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)

# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print orders
print("\n--- Order of the chain ---")
print(label.columns[order])
print("")

# get confusion matrix
get_confusion_matrix(y_pred_ensemble, y_test, y_test.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([  4,   5,   7,   8,  15,  16,  17,  21,  23,  24,
            ...
            876, 878, 879, 880, 881, 882, 884, 888, 889, 890],
           dtype='int64', length=446)
Order of the chain: Index(['tas', 'united-kingdom', 'nsw', 'industry', 'united-states',
       'australia', 'lifestyle-and-leisure', 'smoking',
       'healthcare-facilities', 'states-and-territories', 'children',
       'advertising', 'vic', 'research', 'family-and-children',
       'diseases-and-disorders', 'education', 'diet-and-nutrition',
       'federal-government', 'advertising-and-marketing',
       'doctors-and-medical-professionals', 'diabetes', 'obesity', 'wa',
       'food-and-beverage', 'business-economics-and-finance',
       'medical-research', 'government-and-politics',
       'child-health-and-behaviour', 'cancer',
       'indigenous-aboriginal-and-torres-strait-islander', 'schools',
       'health-policy', 'health', 'community-and-society', 'sa', 'act',
       'federal---stat

Order of the chain: Index(['federal-government', 'united-states', 'states-and-territories',
       'health', 'sa', 'business-economics-and-finance',
       'indigenous-aboriginal-and-torres-strait-islander', 'education',
       'food-and-beverage', 'nsw', 'act', 'united-kingdom', 'smoking', 'qld',
       'healthcare-facilities', 'heart-disease', 'exercise-and-fitness',
       'diabetes', 'health-policy', 'federal---state-issues',
       'doctors-and-medical-professionals', 'diseases-and-disorders',
       'medical-research', 'advertising', 'wa', 'child-health-and-behaviour',
       'schools', 'government-and-politics', 'obesity', 'family-and-children',
       'australia', 'children', 'vic', 'tas', 'lifestyle-and-leisure',
       'industry', 'research', 'diet-and-nutrition', 'community-and-society',
       'cancer', 'advertising-and-marketing', 'science-and-technology'],
      dtype='object')
--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/ABC_news_data/obesity/
numb

## Ensemble CC using naive Bayes (E = 50)

In [128]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
ECC_test(data, label, dataPath, random_state=3071980, ensemble=50)

------ Binary Relevance using Naive Bayes ------
Order of the chain: Index(['indigenous-aboriginal-and-torres-strait-islander',
       'child-health-and-behaviour', 'research', 'health', 'smoking',
       'obesity', 'industry', 'exercise-and-fitness',
       'advertising-and-marketing', 'sa', 'states-and-territories', 'qld',
       'diseases-and-disorders', 'wa', 'united-states',
       'business-economics-and-finance', 'federal-government',
       'doctors-and-medical-professionals', 'united-kingdom', 'diabetes',
       'medical-research', 'cancer', 'vic', 'government-and-politics',
       'health-policy', 'nsw', 'heart-disease', 'tas', 'healthcare-facilities',
       'family-and-children', 'community-and-society', 'advertising',
       'federal---state-issues', 'food-and-beverage', 'education', 'australia',
       'schools', 'children', 'science-and-technology', 'act',
       'diet-and-nutrition', 'lifestyle-and-leisure'],
      dtype='object')
Order of the chain: Index(['healthcare-

Order of the chain: Index(['tas', 'research', 'family-and-children', 'states-and-territories',
       'food-and-beverage', 'medical-research', 'united-states',
       'federal---state-issues', 'schools', 'diabetes',
       'business-economics-and-finance', 'children', 'community-and-society',
       'wa', 'diseases-and-disorders', 'smoking', 'vic',
       'child-health-and-behaviour', 'health-policy', 'advertising',
       'diet-and-nutrition', 'federal-government', 'science-and-technology',
       'sa', 'obesity', 'doctors-and-medical-professionals', 'australia',
       'united-kingdom', 'indigenous-aboriginal-and-torres-strait-islander',
       'cancer', 'health', 'industry', 'exercise-and-fitness',
       'lifestyle-and-leisure', 'advertising-and-marketing', 'qld', 'nsw',
       'act', 'education', 'government-and-politics', 'heart-disease',
       'healthcare-facilities'],
      dtype='object')
Order of the chain: Index(['advertising', 'schools', 'federal---state-issues', 'food-and

Order of the chain: Index(['federal---state-issues', 'diseases-and-disorders', 'heart-disease',
       'vic', 'cancer', 'states-and-territories', 'smoking', 'obesity',
       'united-states', 'industry',
       'indigenous-aboriginal-and-torres-strait-islander', 'food-and-beverage',
       'nsw', 'qld', 'tas', 'research', 'diabetes', 'healthcare-facilities',
       'wa', 'exercise-and-fitness', 'doctors-and-medical-professionals',
       'advertising', 'advertising-and-marketing', 'australia',
       'community-and-society', 'health-policy', 'lifestyle-and-leisure',
       'health', 'children', 'education', 'child-health-and-behaviour',
       'science-and-technology', 'family-and-children', 'united-kingdom',
       'schools', 'business-economics-and-finance', 'diet-and-nutrition', 'sa',
       'federal-government', 'government-and-politics', 'act',
       'medical-research'],
      dtype='object')
Order of the chain: Index(['smoking', 'indigenous-aboriginal-and-torres-strait-islander'

Order of the chain: Index(['doctors-and-medical-professionals', 'diabetes',
       'federal---state-issues', 'states-and-territories',
       'government-and-politics', 'cancer', 'australia', 'research',
       'food-and-beverage', 'sa', 'united-states', 'united-kingdom', 'qld',
       'obesity', 'family-and-children', 'smoking', 'tas',
       'lifestyle-and-leisure', 'children', 'wa', 'diet-and-nutrition',
       'medical-research', 'business-economics-and-finance', 'act',
       'advertising-and-marketing', 'exercise-and-fitness', 'education',
       'community-and-society', 'federal-government', 'heart-disease',
       'science-and-technology', 'diseases-and-disorders', 'schools',
       'advertising', 'vic', 'healthcare-facilities',
       'child-health-and-behaviour', 'nsw',
       'indigenous-aboriginal-and-torres-strait-islander', 'industry',
       'health', 'health-policy'],
      dtype='object')
Order of the chain: Index(['exercise-and-fitness', 'diseases-and-disorders', 'edu

Order of the chain: Index(['exercise-and-fitness', 'australia', 'family-and-children',
       'science-and-technology', 'government-and-politics', 'cancer',
       'medical-research', 'doctors-and-medical-professionals', 'diabetes',
       'vic', 'united-kingdom', 'child-health-and-behaviour',
       'advertising-and-marketing', 'health-policy', 'schools', 'health',
       'obesity', 'food-and-beverage', 'wa', 'diet-and-nutrition', 'research',
       'nsw', 'federal---state-issues', 'lifestyle-and-leisure', 'act',
       'business-economics-and-finance', 'industry', 'community-and-society',
       'healthcare-facilities', 'united-states', 'states-and-territories',
       'advertising', 'heart-disease', 'education', 'federal-government',
       'tas', 'indigenous-aboriginal-and-torres-strait-islander', 'qld', 'sa',
       'diseases-and-disorders', 'smoking', 'children'],
      dtype='object')
Order of the chain: Index(['united-kingdom', 'research', 'community-and-society', 'smoking', 's

Order of the chain: Index(['sa', 'qld', 'united-states', 'nsw', 'diet-and-nutrition', 'wa',
       'child-health-and-behaviour', 'schools', 'advertising',
       'exercise-and-fitness', 'cancer', 'medical-research',
       'federal---state-issues', 'health', 'united-kingdom',
       'indigenous-aboriginal-and-torres-strait-islander',
       'advertising-and-marketing', 'tas', 'health-policy', 'education', 'act',
       'research', 'industry', 'community-and-society',
       'doctors-and-medical-professionals', 'smoking', 'diabetes',
       'science-and-technology', 'federal-government', 'australia', 'children',
       'lifestyle-and-leisure', 'heart-disease', 'food-and-beverage',
       'obesity', 'diseases-and-disorders', 'business-economics-and-finance',
       'government-and-politics', 'healthcare-facilities',
       'family-and-children', 'states-and-territories', 'vic'],
      dtype='object')
Order of the chain: Index(['child-health-and-behaviour', 'advertising-and-marketing',
  

In [129]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

ensemble=50

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

X_train = data[data.index.isin(index)==False]
X_test = data[data.index.isin(index)==True]
y_train = label[label.index.isin(index)==False]
y_test = label[label.index.isin(index)==True]

print("-- test index --")
print(X_test.index)

# ensemble
y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
for i in range(ensemble):
    # training
    #print("--- start training ---\n")
    classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

    # print orders
    print("Order of the chain:",label.columns[order])

    # testing
    #print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

    y_predict.columns = label.columns[order]
    y_prob.columns = label.columns[order]
    y_predict = y_predict[label.columns]
    y_prob = y_prob[label.columns]

    y_pred_ensemble = y_pred_ensemble + y_predict
    y_prob_ensemble = y_prob_ensemble + y_prob

y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
y_prob_ensemble = y_prob_ensemble / ensemble 

# evaluation
performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)


# print data information
print("--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance,"\n")

# print orders
print("\n--- Order of the chain ---")
print(label.columns[order])
print("")

# get confusion matrix
get_confusion_matrix(y_pred_ensemble, y_test, y_test.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

-- test index --
Int64Index([  4,   5,   7,   8,  15,  16,  17,  21,  23,  24,
            ...
            876, 878, 879, 880, 881, 882, 884, 888, 889, 890],
           dtype='int64', length=446)
Order of the chain: Index(['health-policy', 'healthcare-facilities', 'federal---state-issues',
       'government-and-politics', 'lifestyle-and-leisure', 'health',
       'food-and-beverage', 'science-and-technology', 'advertising',
       'diseases-and-disorders', 'act', 'heart-disease',
       'exercise-and-fitness', 'industry', 'sa', 'child-health-and-behaviour',
       'qld', 'united-states', 'schools', 'business-economics-and-finance',
       'states-and-territories', 'family-and-children', 'diabetes',
       'education', 'research', 'vic', 'tas', 'federal-government', 'wa',
       'community-and-society', 'smoking', 'doctors-and-medical-professionals',
       'indigenous-aboriginal-and-torres-strait-islander', 'children',
       'united-kingdom', 'cancer', 'advertising-and-marketing',
  

Order of the chain: Index(['heart-disease', 'obesity', 'sa', 'advertising-and-marketing', 'vic',
       'education', 'child-health-and-behaviour', 'act', 'health-policy',
       'cancer', 'nsw', 'schools', 'diet-and-nutrition',
       'states-and-territories', 'medical-research', 'family-and-children',
       'children', 'federal-government', 'food-and-beverage', 'qld',
       'diseases-and-disorders', 'exercise-and-fitness',
       'doctors-and-medical-professionals', 'australia',
       'business-economics-and-finance', 'research', 'wa', 'united-states',
       'industry', 'healthcare-facilities', 'federal---state-issues',
       'indigenous-aboriginal-and-torres-strait-islander',
       'lifestyle-and-leisure', 'health', 'smoking', 'advertising',
       'united-kingdom', 'tas', 'diabetes', 'community-and-society',
       'government-and-politics', 'science-and-technology'],
      dtype='object')
Order of the chain: Index(['science-and-technology', 'federal-government',
       'adver

Order of the chain: Index(['united-states', 'lifestyle-and-leisure', 'smoking', 'tas',
       'diseases-and-disorders', 'health-policy',
       'indigenous-aboriginal-and-torres-strait-islander', 'cancer',
       'united-kingdom', 'wa', 'children', 'health', 'diet-and-nutrition',
       'qld', 'industry', 'australia', 'nsw', 'federal-government',
       'federal---state-issues', 'sa', 'advertising-and-marketing', 'research',
       'obesity', 'schools', 'heart-disease', 'states-and-territories',
       'government-and-politics', 'business-economics-and-finance', 'vic',
       'family-and-children', 'education', 'advertising',
       'exercise-and-fitness', 'healthcare-facilities',
       'community-and-society', 'child-health-and-behaviour',
       'science-and-technology', 'doctors-and-medical-professionals',
       'diabetes', 'medical-research', 'act', 'food-and-beverage'],
      dtype='object')
Order of the chain: Index(['wa', 'schools', 'united-states', 'lifestyle-and-leisure', 'h

Order of the chain: Index(['food-and-beverage', 'family-and-children', 'research',
       'states-and-territories', 'health-policy', 'medical-research',
       'australia', 'qld', 'cancer', 'business-economics-and-finance', 'tas',
       'indigenous-aboriginal-and-torres-strait-islander',
       'science-and-technology', 'advertising', 'united-states',
       'heart-disease', 'government-and-politics', 'industry', 'nsw',
       'doctors-and-medical-professionals', 'exercise-and-fitness',
       'advertising-and-marketing', 'smoking', 'act', 'diet-and-nutrition',
       'lifestyle-and-leisure', 'schools', 'education', 'sa',
       'child-health-and-behaviour', 'diabetes', 'united-kingdom', 'children',
       'vic', 'obesity', 'federal-government', 'health', 'wa',
       'community-and-society', 'federal---state-issues',
       'healthcare-facilities', 'diseases-and-disorders'],
      dtype='object')
Order of the chain: Index(['united-kingdom', 'food-and-beverage', 'smoking', 'cancer',
 

Order of the chain: Index(['doctors-and-medical-professionals', 'united-states',
       'business-economics-and-finance', 'research', 'healthcare-facilities',
       'lifestyle-and-leisure', 'advertising', 'health',
       'science-and-technology', 'medical-research', 'food-and-beverage',
       'federal-government', 'cancer', 'diet-and-nutrition', 'nsw', 'children',
       'smoking', 'diseases-and-disorders', 'tas', 'vic',
       'child-health-and-behaviour', 'industry', 'health-policy', 'qld',
       'diabetes', 'sa', 'family-and-children', 'australia', 'act',
       'federal---state-issues', 'schools', 'exercise-and-fitness',
       'advertising-and-marketing', 'obesity', 'united-kingdom',
       'indigenous-aboriginal-and-torres-strait-islander', 'wa',
       'heart-disease', 'community-and-society', 'education',
       'states-and-territories', 'government-and-politics'],
      dtype='object')
Order of the chain: Index(['diseases-and-disorders', 'vic', 'united-kingdom', 'education

Order of the chain: Index(['industry', 'wa', 'child-health-and-behaviour',
       'diseases-and-disorders', 'australia', 'healthcare-facilities',
       'children', 'act', 'business-economics-and-finance', 'research',
       'heart-disease', 'lifestyle-and-leisure', 'united-states',
       'diet-and-nutrition', 'nsw', 'science-and-technology', 'smoking',
       'health', 'schools', 'exercise-and-fitness',
       'indigenous-aboriginal-and-torres-strait-islander', 'obesity', 'sa',
       'qld', 'united-kingdom', 'government-and-politics',
       'federal-government', 'federal---state-issues', 'family-and-children',
       'advertising', 'diabetes', 'food-and-beverage',
       'doctors-and-medical-professionals', 'advertising-and-marketing',
       'health-policy', 'medical-research', 'vic', 'states-and-territories',
       'tas', 'cancer', 'education', 'community-and-society'],
      dtype='object')
Order of the chain: Index(['sa', 'advertising-and-marketing', 'community-and-society',
 

  'precision', 'predicted', average, warn_for)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/ABC_news_data/obesity/
number of label: 42
number of attribute: 335
number of instance: 891 


--- Order of the chain ---
Index(['act', 'schools', 'qld', 'smoking', 'federal-government',
       'lifestyle-and-leisure', 'exercise-and-fitness',
       'science-and-technology', 'community-and-society',
       'family-and-children', 'united-kingdom', 'food-and-beverage',
       'diet-and-nutrition', 'nsw', 'federal---state-issues',
       'advertising-and-marketing', 'doctors-and-medical-professionals',
       'diabetes', 'united-states', 'health', 'heart-disease',
       'healthcare-facilities', 'advertising', 'industry', 'vic',
       'business-economics-and-finance', 'wa', 'states-and-territories', 'tas',
       'diseases-and-disorders', 'education', 'medical-research',
       'indigenous-aboriginal-and-torres-strait-islander',
       'government-and-politics', 'obesity', 'child-health-and-behaviour',
       'australia'

## CC using ESKDB


In [3]:
# prepare data
def csv_to_arff(X, label_i, savePath):
    # get attributes
    attributes=[(X.columns[i],['0', '1']) for i in range(len(X.columns))]
    attributes.append(('label_'+label_i.name,['0', '1']))

    data=[]
    i = 0
    while i < len(label_i):
        attr_data = [str(j) for j in list(X.iloc[i,:])]
        label_data = [str(label_i[i])]
        row_data = attr_data+label_data
        data.append(row_data) 
        i+=1
    # set obj
    obj = {
       'description': u'',
       'relation': 'relation',
       'attributes': attributes,
       'data': data,
    }
    arff_data = arff.dumps(obj)
    w_file = open(savePath+label_i.name+".arff", "w")
    w_file.write(arff_data)
    w_file.close()

def get_arff(X, label, savePath):
    
    n_label = label.shape[1]
    # get orders
    order = random.sample(list(range(n_label)),n_label) # get orders
    
    #  get all arff files, one for each label
    for i in range(n_label):
        label_i = label.iloc[:,order[i]]
        print("--Running label:",label_i.name)
        csv_to_arff(X, label_i, savePath)
        
        label_i.name = 'label_' + label_i.name
        X = pd.concat([X, label_i], axis=1)
    print("--finished getting arff files")
    return order

def run_eskdb(label_arff, resultFile, k, l, e, i):
    command = "./run_ECC.sh "+resultFile+" "+k+" "+i+" "+l+" "+e+" "+label_arff
    subprocess.call("cd /Volumes/Samsung_T5/research/programme/research_python/", shell=True)
    #print(command)
    return subprocess.call(command, shell=True)

def get_result(resultPath):

    y_pred = pd.DataFrame()
    y_true = pd.DataFrame()
    y_prob = pd.DataFrame()
    names = []
    for file in os.listdir(resultPath):
        with open(os.path.join(resultPath,file), 'r') as f:
            try:
                lines = f.readlines()
            except:
                print(file)
            else:
                names.append(file[:-4])
                pred = []
                true = []
                prob = []
                train_time_total = 0
                test_time_total = 0
                error_marco = 0
                for line in lines:
                    if line.startswith('pred'):
                        pred.append(int(re.search('pred :\t(.)',line).group(1)))
                        true.append(int(re.search('true :\t(.)',line).group(1)))
                        prob.append(float(re.search('prob :\t(.*)',line).group(1)))
                    elif line.startswith('RSME'):
                        rsme = float(re.search('RSME :\t\t(.*)',line).group(1))
                    elif line.startswith('Error'):
                        error = float(re.search('Error :\t\t(.*)',line).group(1))
                    elif line.startswith("Training time"):
                        train_time = float(re.search('Training time :\s{1,}(.*)',line).group(1))
                        train_time_total = train_time_total + train_time
                    elif line.startswith("Testing time"):
                        test_time = float(re.search('Testing time :\s{1,}(.*)',line).group(1))
                        test_time_total = test_time_total + test_time
                    elif line.startswith("["):
                        para = line
                    elif line.startswith("test0Indexes"):
                        index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))
                        
                y_pred = pd.concat([y_pred,pd.DataFrame(pred)],axis=1)
                y_true = pd.concat([y_true,pd.DataFrame(true)],axis=1)
                y_prob = pd.concat([y_prob,pd.DataFrame(prob)],axis=1)
    y_pred.columns = names
    y_true.columns = names
    print(para)
    print("number of label:", y_pred.shape[1])
    print("training time:",train_time_total)
    print("testing time:",test_time_total)
    return y_pred,y_true,y_prob

In [153]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)
n_label = label.shape[1]

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# get orders
order = random.sample(list(range(n_label)),n_label) # get orders

# train and test on the first label
savePath = '/Users/jiangjunhao/Desktop/test/'
## prepare data, get the arff file
for i in range(n_label):
    label_i = label.iloc[:,order[i]]
    #print("--Running label:",label_i.name)
    csv_to_arff(data, label_i, savePath)

    # run eskdb
    label_arff = os.path.join(savePath,label_i.name+'.arff')
    resultFile = 'a' # /Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/resultFile
    k = '5'
    i = '5000'
    l = '2'
    e = '20'

    run_eskdb(label_arff, resultFile, k, l, e, i)

    result = os.path.join("/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result",resultFile, label_i.name+'.txt')
    with open(result, 'r') as f:
        try:
            lines = f.readlines()
        except:
            print(file)
        else:
            pred = []
            for line in lines:
                if line.startswith('pred'):
                    pred.append(int(re.search('pred :\t(.)',line).group(1)))
                elif line.startswith("test0Indexes"):
                    index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))

            label.loc[index,label_i.name] = pred
            temp = label.loc[:,label_i.name]
            temp.name = 'label_'+label_i.name
            data = pd.concat([data, label.loc[:,label_i.name]],axis=1)

# get result
resultPath = '/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/'+resultFile+'/'
y_pred,y_true,y_prob = get_result(resultPath)
performance = evaluation(y_pred=y_pred, y_true=y_true, y_prob=y_prob)

# print orders:
print(label.columns[order])

# get confusion matrix
print("\n--- Confusion matrix ---")
get_confusion_matrix(y_pred, y_true, y_pred.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

[-t, /Users/jiangjunhao/Desktop/test/food-and-beverage.arff, -S, ESKDB, -K, 5, -I, 5000, -L, 2, -E, 20, -V, -M]

number of label: 42
training time: 2682.0
testing time: 2210.0


  'precision', 'predicted', average, warn_for)


Index(['advertising-and-marketing', 'act', 'lifestyle-and-leisure',
       'food-and-beverage', 'heart-disease', 'children', 'health',
       'medical-research', 'doctors-and-medical-professionals',
       'federal---state-issues', 'advertising', 'united-states', 'industry',
       'healthcare-facilities', 'government-and-politics', 'qld',
       'family-and-children', 'exercise-and-fitness', 'federal-government',
       'cancer', 'research', 'diet-and-nutrition', 'vic', 'smoking', 'wa',
       'diseases-and-disorders', 'sa', 'obesity', 'schools', 'health-policy',
       'education', 'diabetes', 'nsw', 'child-health-and-behaviour',
       'australia', 'community-and-society', 'states-and-territories',
       'science-and-technology',
       'indigenous-aboriginal-and-torres-strait-islander', 'tas',
       'united-kingdom', 'business-economics-and-finance'],
      dtype='object')

--- Confusion matrix ---
united-states: TP:29, TN:389, FP:7, FN:21
australia: TP:84, TN:229, FP:41, FN:92
g

## Ensemble CC using ESKDB(E=10)

In [6]:
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)
n_label = label.shape[1]

# data set information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()

# ensemble
ensemble = 10

y_pred_ensemble = pd.DataFrame(np.zeros((int(label.shape[0]/2),label.shape[1])),columns=label.columns)
y_prob_ensemble = pd.DataFrame(np.zeros((int(label.shape[0]/2),label.shape[1])),columns=label.columns)
    
    
for i in range(ensemble):
    dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

    # read data
    data, label = read_data(dataPath)
    n_label = label.shape[1]

    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()

    # get orders
    order = random.sample(list(range(n_label)),n_label) # get orders

    # train and test on the first label
    savePath = '/Users/jiangjunhao/Desktop/test/'
    ## prepare data, get the arff file
    for i in range(n_label):
        label_i = label.iloc[:,order[i]]
        print("--Running label:",label_i.name)
        csv_to_arff(data, label_i, savePath)

        # run eskdb
        label_arff = os.path.join(savePath,label_i.name+'.arff')
        resultFile = 'a' # /Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/resultFile
        k = '5'
        i = '5000'
        l = '2'
        e = '20'

        run_eskdb(label_arff, resultFile, k, l, e, i)

        result = os.path.join("/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result",resultFile, label_i.name+'.txt')
        with open(result, 'r') as f:
            try:
                lines = f.readlines()
            except:
                print(file)
            else:
                pred = []
                for line in lines:
                    if line.startswith('pred'):
                        pred.append(int(re.search('pred :\t(.)',line).group(1)))
                    elif line.startswith("test0Indexes"):
                        index = list(map(int,re.search('test0Indexes: {(.*)}',line).group(1).split(', ')))

                label.loc[index,label_i.name] = pred
                temp = label.loc[:,label_i.name]
                temp.name = 'label_'+label_i.name
                data = pd.concat([data, label.loc[:,label_i.name]],axis=1)

    # get result
    resultPath = '/Volumes/Samsung_T5/research/programme/ESKDB_HDP/result/'+resultFile+'/'
    y_pred,y_true,y_prob = get_result(resultPath)
    
    y_pred.columns = label.columns
    y_prob.columns = label.columns
    
    y_pred_ensemble = y_pred_ensemble + y_pred
    y_prob_ensemble = y_prob_ensemble + y_prob
    
    
y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
y_prob_ensemble = y_prob_ensemble / ensemble 

--Running label: doctors-and-medical-professionals
--Running label: heart-disease
--Running label: health-policy
--Running label: united-states
--Running label: lifestyle-and-leisure
--Running label: smoking
--Running label: advertising-and-marketing
--Running label: food-and-beverage
--Running label: healthcare-facilities
--Running label: sa
--Running label: family-and-children
--Running label: schools
--Running label: qld
--Running label: diseases-and-disorders
--Running label: research
--Running label: australia
--Running label: indigenous-aboriginal-and-torres-strait-islander
--Running label: wa
--Running label: united-kingdom
--Running label: industry
--Running label: federal---state-issues
--Running label: community-and-society
--Running label: tas
--Running label: cancer
--Running label: advertising
--Running label: federal-government
--Running label: diabetes
--Running label: children
--Running label: science-and-technology
--Running label: exercise-and-fitness
--Running label:

--Running label: act
--Running label: wa
--Running label: research
--Running label: family-and-children
--Running label: science-and-technology
--Running label: health-policy
--Running label: indigenous-aboriginal-and-torres-strait-islander
--Running label: federal---state-issues
--Running label: community-and-society
--Running label: children
--Running label: federal-government
--Running label: sa
--Running label: united-kingdom
--Running label: united-states
--Running label: heart-disease
--Running label: nsw
--Running label: diseases-and-disorders
--Running label: government-and-politics
--Running label: smoking
--Running label: advertising-and-marketing
--Running label: obesity
--Running label: lifestyle-and-leisure
--Running label: australia
--Running label: states-and-territories
--Running label: tas
--Running label: schools
--Running label: vic
[-t, /Users/jiangjunhao/Desktop/test/food-and-beverage.arff, -S, ESKDB, -K, 5, -I, 5000, -L, 2, -E, 20, -V, -M]

number of label: 42
tra

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [4]:
performance = evaluation(y_pred=y_pred_ensemble, y_true=y_true, y_prob=y_prob_ensemble)

# print orders:
print(label.columns[order])

# get confusion matrix
print("\n--- Confusion matrix ---")
get_confusion_matrix(y_pred, y_true, y_pred.columns)

# print performance
print("--- Performance ---")
for key, value in performance.items():
    if key == "f1_each_label":
        print("\n- f1 for each label -")
        for i in range(n_label):
            print("label_"+label.columns[i],"=",round(value[i],2))
    elif key == "coverage_error":
        print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
    else:
        print(key,'=',round(value,2))

NameError: name 'y_pred_ensemble' is not defined

# Result

`
**Performance of BR_naive Bayes**
coverage_error = 13.6 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.13
hamming_loss = 0.1
f1_macro = 0.26
f1_micro = 0.4
Jaccard_Index = 0.26
zero_one_error = 0.98
`

`
**Performance of BR_ESKDB**
coverage_error = 14.76 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.15
hamming_loss = 0.08
f1_macro = 0.15
f1_micro = 0.33
jaccard_index = 0.23
zero_one_error = 0.94
`

`
**Performance of Ensemble Classifier Chain using naive Bayes**
coverage_error = 13.63 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.13
hamming_loss = 0.11
f1_macro = 0.26
f1_micro = 0.38
Jaccard_Index = 0.25
zero_one_error = 0.97
`

`
**Performance of Ensemble Classifier Chain using ESKDB(E=2)**
coverage_error = 15.87 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.16
hamming_loss = 0.08
f1_macro = 0.22
f1_micro = 0.38
Jaccard_Index = 0.25
zero_one_error = 0.95
`


recall 体现了分类模型H对正样本的识别能力，recall 越高，说明模型对正样本的识别能力越强.

precision 体现了模型对负样本的区分能力，precision越高，说明模型对负样本的区分能力越强。F1-score 是两者的综合。F1-score 越高，说明分类模型越稳健。