In [1]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re

import random
import arff

from pomegranate import BayesianNetwork
import pomegranate
from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

#Python program to print topological sorting of a DAG 
from collections import defaultdict 

## Basic function

In [2]:
#Class to represent a graph 
class Graph: 
    def __init__(self,vertices): 
        self.graph = defaultdict(list) #dictionary containing adjacency List 
        self.V = vertices #No. of vertices 
  
    # function to add an edge to graph 
    def addEdge(self,u,v): 
        self.graph[u].extend(v) 
  
    # A recursive function used by topologicalSort 
    def topologicalSortUtil(self,v,visited,stack): 
  
        # Mark the current node as visited. 
        visited[v] = True
  
        # Recur for all the vertices adjacent to this vertex 
        for i in self.graph[v]: 
            if visited[i] == False: 
                self.topologicalSortUtil(i,visited,stack) 
  
        # Push current vertex to stack which stores result 
        stack.insert(0,v) 
  
    # The function to do Topological Sort. It uses recursive  
    # topologicalSortUtil() 
    def topologicalSort(self): 
        # Mark all the vertices as not visited 
        visited = [False]*self.V 
        stack =[] 
  
        # Call the recursive helper function to store Topological 
        # Sort starting from all vertices one by one 
        for i in range(self.V): 
            if visited[i] == False: 
                self.topologicalSortUtil(i,visited,stack) 
  
        # Print contents of the stack 
        return stack

In [3]:
def read_data(dataPath, X_file, y_file):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,X_file))
    label = pd.read_csv(os.path.join(dataPath,y_file))
    return data,label

def get_structure(model, labels):
    dic = {}
    for item, attr in zip(model.structure, labels):
        if item == ():
            dic[attr] = {}
        else:
            dic[attr] = set(labels[list(item)])
    return dic

def get_order(model, labels):
    
    g = Graph(len(labels))
    for item, i in zip(model.structure, range(len(labels))):
        if item == ():
            pass
        else:
            g.addEdge(i, list(item))
    
    # get order
    a = g.topologicalSort()
    a.reverse()

    return labels[a]

def evaluation(y_pred, y_prob, y_true):
    
    coverage = coverage_error(y_true, y_prob)
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = label_ranking_loss(y_true, y_prob) 
    
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')
    
    acc = 0
    for i in range(y_true.shape[0]):
        acc += jaccard_similarity_score(y_true.iloc[i,:], y_pred.iloc[i,:]) # jaccard_similarity_score
    acc = acc / y_true.shape[0]
    
    zero_one = zero_one_loss(y_true, y_pred) # 0-1 error 
    
    performance = {"coverage_error":coverage,
                   "ranking_loss":ranking_loss,
                   "hamming_loss":hamming,
                   "f1_macro":f1_macro,
                   "f1_micro":f1_micro,
                   "Jaccard_Index":acc,
                   "zero_one_error":zero_one}
    return performance



## Binary Relevance

In [4]:
# BR for getting error matrix
def naiveBayes_multi_label_training_BR(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    classifier_list = [MultinomialNB() for i in range(n_label)]
    for i in range(n_label):
        classifier_list[i].fit(X_train,y_train.iloc[:,i])
    
    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time

def naiveBayes_multi_label_testing_BR(X_test, n_label, classifier_list):
    y_predict = pd.DataFrame()
    y_prob = pd.DataFrame()
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i)],axis=1)
        
        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i)],axis=1)
        
    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def BR_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # training
    classifier_list, training_time = naiveBayes_multi_label_training_BR(X_train, y_train)
    
    # testing
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing_BR(X_test, n_label, classifier_list)
    
    y_predict.columns = label.columns
    return y_predict, y_test

## BCC with different Bayesian network structure

In [5]:
def naiveBayes_multi_label_training_order(X_train, y_train, bayes_net, order):
    start = time.time()
    
    n_label = y_train.shape[1]
    
    classifier_list = [MultinomialNB() for i in range(n_label)] # create a classifier chain
    
    learned_label = []
    
    for i in range(n_label):
        if i == 0:
            l = order[i]
            classifier_list[i].fit(X_train, y_train.loc[:, l])
            learned_label.append(l)
            
        else:
            l = order[i]
            par = [x for x in bayes_net[l] if x in learned_label]
            X = pd.concat([X_train, y_train.loc[:,par]],axis=1) # put the previous label into attribute space
            classifier_list[i].fit(X, y_train.loc[:, l])
            learned_label.append(l)

    end = time.time()
    training_time = end-start
    
    return classifier_list, learned_label

def naiveBayes_multi_label_testing_order(X_test, n_label, classifier_list, bayes_net, learned_label):
    y_predict = pd.DataFrame(index=X_test.index)
    y_prob = pd.DataFrame(index=X_test.index)
    y_true = pd.DataFrame(index=X_test.index)
    
    start = time.time()
    
    predicted_list = []
    
    for i in range(n_label):
        if i == 0:
            l = learned_label[i]
            y_predict_i = classifier_list[i].predict(X_test)
            y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
            y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index,columns=[l])],axis=1)
            y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index,columns=[l])],axis=1)
            predicted_list.append(l)
        
        else:
            l = learned_label[i]
            par = [p for p in bayes_net[l] if p in predicted_list]
            if len(par) != 0:
                X = pd.concat([X_test, y_predict.loc[:,par]],axis=1) # put the previous label into attribute space
            else:
                X= X_test
            y_predict_i = classifier_list[i].predict(X)
            y_predict_prob_i = classifier_list[i].predict_proba(X)[:,1]
            
            y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index,columns=[l])],axis=1)
            y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index,columns=[l])],axis=1)
  
            predicted_list.append(l)            
        
    return y_predict, y_prob

def BCC_test_structure(data, label, dataPath, random_state=3071980, ensemble = 5, structure="random"):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    if structure=="DAG":
        model = BayesianNetwork.from_samples(label, algorithm='exact-dp')
        bayes_net = get_structure(model, label.columns)
        order = get_order(model, label.columns)
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # ensemble
    y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    
    for i in range(ensemble):
        if order_method=="tree":
            if len(label.columns) <= ensemble:
                root_index = i
            else:
                root_index = random.randint(0,len(label.columns))
            model = BayesianNetwork.from_samples(label, algorithm='chow-liu', root=root_index)
            bayes_net = get_structure(model, label.columns)
            order = get_order(model, label.columns)

        # training
        #print("--- start training ---\n")
        classifier_list, learned_label = naiveBayes_multi_label_training_order(X_train, y_train, bayes_net, order)

        # testing
        #print("--- start testing ---\n")
        y_predict, y_prob = naiveBayes_multi_label_testing_order(X_test, n_label, classifier_list, bayes_net, learned_label)

        y_predict = y_predict[label.columns]
        y_prob = y_prob[label.columns]

        y_pred_ensemble = y_pred_ensemble + y_predict
        y_prob_ensemble = y_prob_ensemble + y_prob

        
    y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
    y_prob_ensemble = y_prob_ensemble / ensemble
    y_pred_ensemble = y_pred_ensemble.fillna(0)
    y_prob_ensemble = y_prob_ensemble.fillna(0)
    
    # evaluation
    performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
    performance_df = pd.DataFrame.from_dict(performance, orient='index')
    
    return performance_df


def BCC_test_structure_twofold(data, label, dataPath, random_state=3071980, ensemble = 5, structure="random"):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    
    # get order
    if structure=="DAG":
        model = BayesianNetwork.from_samples(label, algorithm='greedy')
        bayes_net = get_structure(model, label.columns)
        order = get_order(model, label.columns)
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    performance_df_all = pd.DataFrame(np.zeros([7,1]))
    for j in range(2):
        X_train, y_train = X_test, y_test
    # ensemble
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)

        for i in range(ensemble):
            if structure=="tree":
                if len(label.columns) <= ensemble:
                    root_index = i
                else:
                    root_index = random.randint(0,len(label.columns)-1)
                model = BayesianNetwork.from_samples(label, algorithm='chow-liu', root=root_index)
                bayes_net = get_structure(model, label.columns)
                order = get_order(model, label.columns)
                
                
            # training
            #print("--- start training ---\n")
            classifier_list, learned_label = naiveBayes_multi_label_training_order(X_train, y_train, bayes_net, order)

            # testing
            #print("--- start testing ---\n")
            y_predict, y_prob = naiveBayes_multi_label_testing_order(X_test, n_label, classifier_list, bayes_net, learned_label)

            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]

            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob

        
        y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
        y_prob_ensemble = y_prob_ensemble / ensemble
        y_pred_ensemble = y_pred_ensemble.fillna(0)
        y_prob_ensemble = y_prob_ensemble.fillna(0)

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
        performance_df = pd.DataFrame.from_dict(performance, orient='index')
        
        performance_df_all.index = performance_df.index
        performance_df_all.columns = performance_df.columns
        
        performance_df_all = performance_df_all + performance_df
        
    performance_df_all = performance_df_all / 2
    return performance_df_all

# Tree

In [None]:
BCC_tree= pd.DataFrame()

seed = [1234,2234,12345,12346,1234567]

data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    if label.shape[1] > 30:
        ensemble = 30
    else:
        ensemble = label.shape[1]
    for s in seed:
        df = BCC_test_structure_twofold(data, label, dataPath, random_state=s, ensemble = ensemble, structure="tree")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    BCC_tree = pd.concat([BCC_tree, d/5],axis=1)

BCC_tree.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BCC_tree.csv")
BCC_tree

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


genbase
medical


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical
coverage_error,7.822498,2.753535,1.502159,12.07309,1.389728,1.888753
ranking_loss,0.19875,0.164368,0.082875,0.079046,0.002847,0.012165
hamming_loss,0.240683,0.217284,0.203128,0.093003,0.004811,0.01412
f1_macro,0.422559,0.64269,0.626591,0.378793,0.504426,0.275167
f1_micro,0.594736,0.672793,0.607796,0.517522,0.944995,0.725833
Jaccard_Index,0.759317,0.782716,0.796872,0.906997,0.995189,0.98588
zero_one_error,0.862862,0.706397,0.803654,0.964512,0.107553,0.470757


In [6]:
BCC_tree = pd.read_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BCC_tree.csv", index_col=0)
BCC_tree

Unnamed: 0,yeast,emotions,scene,enron,genbase,medical
coverage_error,7.822498,2.753535,1.502159,12.07309,1.389728,1.888753
ranking_loss,0.19875,0.164368,0.082875,0.079046,0.002847,0.012165
hamming_loss,0.240683,0.217284,0.203128,0.093003,0.004811,0.01412
f1_macro,0.422559,0.64269,0.626591,0.378793,0.504426,0.275167
f1_micro,0.594736,0.672793,0.607796,0.517522,0.944995,0.725833
Jaccard_Index,0.759317,0.782716,0.796872,0.906997,0.995189,0.98588
zero_one_error,0.862862,0.706397,0.803654,0.964512,0.107553,0.470757


In [7]:
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
seed = [1234,2234,12345,12346,1234567]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    if label.shape[1] > 30:
        ensemble = 30
    else:
        ensemble = label.shape[1]
        
    for s in seed:
        df = BCC_test_structure_twofold(data, label, dataPath, random_state=s, ensemble = ensemble, structure="tree")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    BCC_tree = pd.concat([BCC_tree, d/5],axis=1)
    
BCC_tree.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BCC_tree.csv")
BCC_tree

tmc2007
rcv1subset1


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,7.822498,2.753535,1.502159,12.07309,1.389728,1.888753,3.965142,8.843567,7.666133,7.6754,7.259567,7.696333
ranking_loss,0.19875,0.164368,0.082875,0.079046,0.002847,0.012165,0.053674,0.031038,0.026412,0.026597,0.025732,0.026246
hamming_loss,0.240683,0.217284,0.203128,0.093003,0.004811,0.01412,0.112906,0.028498,0.027052,0.027366,0.026751,0.02724
f1_macro,0.422559,0.64269,0.626591,0.378793,0.504426,0.275167,0.495739,0.339552,0.329761,0.316706,0.269549,0.304158
f1_micro,0.594736,0.672793,0.607796,0.517522,0.944995,0.725833,0.595701,0.511235,0.494076,0.491553,0.463382,0.490171
Jaccard_Index,0.759317,0.782716,0.796872,0.906997,0.995189,0.98588,0.887094,0.971502,0.972948,0.972634,0.973249,0.97276
zero_one_error,0.862862,0.706397,0.803654,0.964512,0.107553,0.470757,0.875857,0.954767,0.8848,0.864133,0.798167,0.892167


# DAG

In [6]:
BCC_dag= pd.DataFrame()

seed = [1234,2234,12345,12346,1234567]

data_list = ["yeast","emotions","scene"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        df = BCC_test_structure_twofold(data, label, dataPath, random_state = s, ensemble = 1, structure="DAG")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    BCC_dag = pd.concat([BCC_dag, d/5],axis=1)

BCC_dag.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BCC_dag.csv")
BCC_dag

yeast
emotions
scene


Unnamed: 0,yeast,emotions,scene
coverage_error,7.848304,2.776431,1.525914
ranking_loss,0.196679,0.169306,0.087787
hamming_loss,0.230249,0.221212,0.200831
f1_macro,0.411008,0.651435,0.629346
f1_micro,0.600733,0.673136,0.609666
Jaccard_Index,0.769751,0.778788,0.799169
zero_one_error,0.843176,0.709764,0.794518


In [None]:
data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        df = BCC_test_structure_twofold(data, label, dataPath, random_state = s, ensemble = 1, structure="DAG")
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    BCC_dag = pd.concat([BCC_dag, d/5],axis=1)
    
BCC_dag.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/BCC_dag.csv")
BCC_dag

In [6]:
data_list = ["enron"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data

enron


In [None]:
model = BayesianNetwork.from_samples(label, algorithm='greedy',max_parents=2)
bayes_net = get_structure(model, label.columns)
order = get_order(model, label.columns)