In [1]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re
import random
import arff

from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

In [2]:
def read_data(dataPath, X_file, y_file):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,X_file))
    label = pd.read_csv(os.path.join(dataPath,y_file))
    return data,label

def evaluation(y_pred, y_prob, y_true):
    
    coverage = coverage_error(y_true, y_prob)
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = label_ranking_loss(y_true, y_prob) 
    
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')
    
    acc = 0
    for i in range(y_true.shape[0]):
        acc += jaccard_similarity_score(y_true.iloc[i,:], y_pred.iloc[i,:]) # jaccard_similarity_score
    acc = acc / y_true.shape[0]
    
    zero_one = zero_one_loss(y_true, y_pred) # 0-1 error 
    
    performance = {"coverage_error":coverage,
                   "ranking_loss":ranking_loss,
                   "hamming_loss":hamming,
                   "f1_macro":f1_macro,
                   "f1_micro":f1_micro,
                   "Jaccard_Index":acc,
                   "zero_one_error":zero_one}
    return performance

In [3]:
def naiveBayes_multi_label_training(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    
    order = random.sample(list(range(n_label)),n_label) # get orders
    
    classifier_list = [MultinomialNB() for i in range(n_label)] # create a classifier chain
    
    for i in range(n_label):
        if i == 0:
            classifier_list[i].fit(X_train,y_train.iloc[:, order[i]])
        else:
            X_train = pd.concat([X_train, y_train.iloc[:,order[i-1]]],axis=1) # put the previous label into attribute space
            classifier_list[i].fit(X_train,y_train.iloc[:,order[i]])

    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time, order

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order):
    y_predict = pd.DataFrame(index=X_test.index)
    y_prob = pd.DataFrame(index=X_test.index)
    y_true = pd.DataFrame(index=X_test.index)
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i,index=X_test.index)],axis=1)

        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i,index=X_test.index)],axis=1)

        X_test = pd.concat([X_test, pd.DataFrame(y_predict_i,index=X_test.index)],axis=1,ignore_index=True) # put the previous label into attribute space

    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

            
def ECC_test(data, label, dataPath, random_state=3071980, ensemble = 5):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    # ensemble
    y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
    for i in range(ensemble):
        # training
        #print("--- start training ---\n")
        classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

        # testing
        #print("--- start testing ---\n")
        y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

        y_predict.columns = label.columns[order]
        y_prob.columns = label.columns[order]
        y_predict = y_predict[label.columns]
        y_prob = y_prob[label.columns]

        y_pred_ensemble = y_pred_ensemble + y_predict
        y_prob_ensemble = y_prob_ensemble + y_prob

    y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
    y_prob_ensemble = y_prob_ensemble / ensemble 
    
    # evaluation
    performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
    
    performance_df = pd.DataFrame.from_dict(performance, orient='index')
    
    return performance_df

def ECC_test_2_fold(data, label, dataPath, random_state=3071980, ensemble = 5):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    
    performance_df_all = pd.DataFrame(np.zeros([7,1]))
    
    for j in range(2):
        X_train, y_train = X_test, y_test
        
        # ensemble
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),columns=y_test.columns, index=y_test.index)
        for i in range(ensemble):
            # training
            #print("--- start training ---\n")
            classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

            # testing
            #print("--- start testing ---\n")
            y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

            y_predict.columns = label.columns[order]
            y_prob.columns = label.columns[order]
            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]

            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob

        y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5)*1).astype('int')
        y_prob_ensemble = y_prob_ensemble / ensemble 

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
        performance_df = pd.DataFrame.from_dict(performance, orient='index')
        performance_df_all.index = performance_df.index
        performance_df_all.columns = performance_df.columns
        performance_df_all = performance_df_all + performance_df
    
    performance_df_all = performance_df_all / 2
    return performance_df_all

# CC

In [9]:
df_all_1 = pd.DataFrame()
data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data

    # train - test
    df = ECC_test(data, label, dataPath, random_state=3071980, ensemble=1)
    df.columns = [dataset]
    
    df_all_1 = pd.concat([df_all_1, df],axis=1)

data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data

    # train - test
    df = ECC_test(data, label, dataPath, random_state=3071980, ensemble=1)
    df.columns = [dataset]
    
    df_all_1 = pd.concat([df_all_1, df],axis=1)
    
df_all_1.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/ClassifierChain_naive_bayes.csv")
df_all_1

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


genbase


  'recall', 'true', average, warn_for)


medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,8.314309,2.875421,1.510797,17.390129,1.389728,3.824131,4.219052,14.054667,13.936,14.311,12.762333,14.293333
ranking_loss,0.230073,0.182473,0.082962,0.13306,0.003236,0.047049,0.062098,0.056307,0.057106,0.060514,0.054238,0.058865
hamming_loss,0.253574,0.231762,0.197259,0.123717,0.005595,0.02354,0.117804,0.029957,0.027937,0.028604,0.024838,0.028809
f1_macro,0.384789,0.62011,0.63909,0.186216,0.485587,0.15463,0.474961,0.247974,0.21646,0.188286,0.189647,0.196622
f1_micro,0.570112,0.657829,0.618065,0.402186,0.934555,0.561017,0.576928,0.465083,0.422697,0.414194,0.431055,0.415612
Jaccard_Index,0.75,0.77,0.8,0.88,0.99,0.98,0.88,0.97,0.97,0.97,0.98,0.97
zero_one_error,0.885856,0.710438,0.787375,0.990599,0.117825,0.658487,0.885159,0.956,0.867667,0.866,0.799333,0.886333


## two-fold 

In [6]:
df_all_1_fold = pd.DataFrame()
seed = [1234,2234,12345,12346,1234567]
data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data  
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        df = ECC_test_2_fold(data, label, dataPath, random_state=3071980, ensemble=1)
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    df_all_1_fold = pd.concat([df_all_1_fold, d/5],axis=1)

data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        df = ECC_test_2_fold(data, label, dataPath, random_state=3071980, ensemble=1)
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    df_all_1_fold = pd.concat([df_all_1_fold, d/5],axis=1)
    
df_all_1_fold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/ClassifierChain_naive_bayes.csv")
df_all_1_fold

yeast


  'precision', 'predicted', average, warn_for)


emotions
scene
enron
genbase


  self.class_log_prior_ = (np.log(self.class_count_) -
  'recall', 'true', average, warn_for)


medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,8.085277,2.812795,1.512625,12.860282,1.331118,1.771984,4.019562,9.401933,8.2606,8.415567,7.8166,8.407567
ranking_loss,0.204726,0.168115,0.083603,0.087931,0.002091,0.010484,0.05573,0.033625,0.029344,0.030343,0.028701,0.02904
hamming_loss,0.235307,0.229293,0.19276,0.097801,0.004274,0.01352,0.112968,0.026829,0.023318,0.023456,0.020536,0.024045
f1_macro,0.431348,0.64808,0.642607,0.340286,0.553365,0.321123,0.497193,0.340807,0.356302,0.340567,0.336662,0.344566
f1_micro,0.601635,0.668027,0.624526,0.501544,0.950593,0.733876,0.593046,0.525019,0.51363,0.514507,0.515441,0.506585
Jaccard_Index,0.764693,0.770707,0.80724,0.902199,0.995726,0.98648,0.887032,0.973171,0.976682,0.976544,0.979464,0.975955
zero_one_error,0.862945,0.722222,0.791362,0.967215,0.099094,0.448262,0.877948,0.947533,0.837767,0.828667,0.753933,0.861133


# ECC

In [5]:
df_all = pd.DataFrame()
data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data

    # train - test
    df = ECC_test(data, label, dataPath, random_state=3071980, ensemble=10)
    df.columns = [dataset]
    
    df_all = pd.concat([df_all, df],axis=1)

data_list = ['tmc2007', 'rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data

    # train - test
    df = ECC_test(data, label, dataPath, random_state=3071980, ensemble=10)
    df.columns = [dataset]
    
    df_all = pd.concat([df_all, df],axis=1)
    
df_all.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/EnsembleClassifierChain_naive_bayes.csv")

yeast
emotions
scene
enron


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


genbase


  'recall', 'true', average, warn_for)


medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


## 5 times 2 fold

In [7]:
df_all_2_fold = pd.DataFrame()
seed = [1234,2234,12345,12346,1234567]
data_list = ["yeast","emotions","scene","enron","genbase","medical"]
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/'+dataset+"/"
    X_file = "X.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data  
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        df = ECC_test_2_fold(data, label, dataPath, random_state=3071980, ensemble=10)
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    df_all_2_fold = pd.concat([df_all_2_fold, d/5],axis=1)

data_list = ['tmc2007','rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5']
for dataset in data_list:
    print(dataset)
    dataPath = '/Volumes/Samsung_T5/research/data/large_datasets/'+dataset+"/"
    X_file = "X_dis_1500.csv"
    y_file = "y.csv"
    data, label = read_data(dataPath, X_file, y_file) # read data
    
    d = pd.DataFrame(np.zeros([7,1]))
    for s in seed:
        df = ECC_test_2_fold(data, label, dataPath, random_state=3071980, ensemble=10)
        df.columns = [dataset]
        d.columns = [dataset]
        d.index = df.index
        d = d + df
    df_all_2_fold = pd.concat([df_all_2_fold, d/5],axis=1)
    
df_all_2_fold.to_csv("/Users/jiangjunhao/Desktop/results_algorithms/twofold/EnsembleClassifierChain_naive_bayes.csv")
df_all_2_fold

yeast


  'precision', 'predicted', average, warn_for)


emotions
scene
enron
genbase


  self.class_log_prior_ = (np.log(self.class_count_) -
  'recall', 'true', average, warn_for)


medical
tmc2007
rcv1subset1
rcv1subset2
rcv1subset3
rcv1subset4
rcv1subset5


Unnamed: 0,yeast,emotions,scene,enron,genbase,medical,tmc2007,rcv1subset1,rcv1subset2,rcv1subset3,rcv1subset4,rcv1subset5
coverage_error,7.834243,2.817845,1.497924,11.575323,1.314502,1.765235,3.996944,8.5487,7.532367,7.6773,7.116467,7.701533
ranking_loss,0.194215,0.167507,0.080375,0.076747,0.001802,0.010274,0.054764,0.029843,0.025817,0.02677,0.025019,0.025863
hamming_loss,0.231673,0.22862,0.194477,0.09792,0.004006,0.013601,0.113594,0.026616,0.022713,0.022758,0.019913,0.023224
f1_macro,0.429116,0.653563,0.641782,0.339263,0.560672,0.323638,0.496086,0.345292,0.379203,0.364055,0.362358,0.366712
f1_micro,0.607744,0.671188,0.622727,0.501604,0.953876,0.733506,0.592886,0.532414,0.530172,0.531616,0.533935,0.524146
Jaccard_Index,0.768327,0.77138,0.805523,0.90208,0.995994,0.986399,0.886406,0.973384,0.977287,0.977242,0.980087,0.976776
zero_one_error,0.864847,0.717845,0.793688,0.968508,0.090937,0.451738,0.87878,0.954,0.850733,0.839733,0.765933,0.8666


# result