In [70]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re
import random

from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

In [185]:
def read_data(dataPath):
    """Load the feature matrix and the label matrix stored under `dataPath`.

    Parameters
    ----------
    dataPath : str
        Directory containing 'X.csv' (attributes) and 'Y.csv' (labels),
        e.g. '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'.

    Returns
    -------
    (data, label) : tuple of pandas.DataFrame
        Both frames are read with their first CSV column as the index, which
        is then discarded in favour of a fresh 0..n-1 index.
    """
    def _load(file_name):
        # First CSV column is the stored index; drop it after reading.
        frame = pd.read_csv(os.path.join(dataPath, file_name), index_col=0)
        return frame.reset_index(drop=True)

    return _load('X.csv'), _load('Y.csv')

def naiveBayes_multi_label_training(X_train, y_train, random_state=None):
    """Fit a classifier chain of MultinomialNB models in a random label order.

    The classifier at chain position 0 is fit on `X_train` only. The classifier
    at position i > 0 is fit on `X_train` extended with the *true* training
    columns of the labels at chain positions 0..i-1.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training attributes (not modified).
    y_train : pandas.DataFrame
        Training labels, one column per label.
    random_state : int or None, optional
        Seed for the chain order. When given, the order is drawn from a
        private `random.Random(random_state)` so the result is reproducible
        and the global `random` state is left untouched. When None (default)
        the module-level `random` generator is used, exactly as before.

    Returns
    -------
    classifier_list : list
        Fitted classifiers, indexed by chain position.
    training_time : float
        Wall-clock seconds spent in this function.
    order : list of int
        `order[i]` is the column position in `y_train` of the label predicted
        at chain position i.
    """
    start = time.time()

    n_label = y_train.shape[1]

    # A private RNG keeps seeded runs reproducible without reseeding the
    # global generator that other code may rely on.
    rng = random if random_state is None else random.Random(random_state)
    order = rng.sample(range(n_label), n_label) # get orders

    classifier_list = [MultinomialNB() for _ in range(n_label)] # create a classifier chain

    features = X_train
    for position, label_idx in enumerate(order):
        if position > 0:
            # put the previous label into attribute space
            features = pd.concat([features, y_train.iloc[:, order[position - 1]]], axis=1)
        classifier_list[position].fit(features, y_train.iloc[:, label_idx])

    training_time = time.time() - start

    return classifier_list, training_time, order

def _positive_class_proba(classifier, proba):
    """Return P(label == 1) per row from a `predict_proba` result.

    With two or more probability columns, column 1 is used. A classifier that
    saw a single class during fitting returns only one column; in that case
    the probability is that column if the sole class is 1, otherwise 0.
    """
    proba = np.asarray(proba)
    if proba.shape[1] > 1:
        return proba[:, 1]
    seen_classes = getattr(classifier, "classes_", ())
    if len(seen_classes) == 1 and seen_classes[0] == 1:
        return proba[:, 0]
    return np.zeros(proba.shape[0])


def naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order):
    """Run a fitted classifier chain over `X_test`.

    The classifier at chain position i receives `X_test` extended with the
    *predicted* labels of chain positions 0..i-1 (the extended frame has its
    columns renumbered 0..k-1).

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test attributes (not modified).
    n_label : int
        Number of chain positions to evaluate.
    classifier_list : list
        Fitted classifiers indexed by chain position; each must provide
        `predict` and `predict_proba`.
    order : list of int
        Chain order. Not used here; kept so callers need not change.

    Returns
    -------
    y_predict : pandas.DataFrame
        Hard predictions, one column per chain position (chain order, not
        original label order). Every column is labelled 0.
    y_prob : pandas.DataFrame
        Probability of the positive class per chain position, same layout.
    testing_time : float
        Wall-clock seconds spent predicting.
    """
    row_index = X_test.index
    predicted_frames = []
    proba_frames = []

    start = time.time()

    features = X_test
    for i in range(n_label):
        classifier = classifier_list[i]

        y_predict_i = classifier.predict(features)
        predicted_frames.append(pd.DataFrame(y_predict_i, index=row_index))

        # Indexing column 1 directly fails for a classifier fit on one class.
        y_predict_prob_i = _positive_class_proba(classifier, classifier.predict_proba(features))
        proba_frames.append(pd.DataFrame(y_predict_prob_i, index=row_index))

        if i < n_label - 1:
            # put the previous label into attribute space
            features = pd.concat([features, pd.DataFrame(y_predict_i, index=row_index)], axis=1, ignore_index=True)

    # Assemble the outputs once instead of re-copying them on every iteration.
    y_predict = pd.concat(predicted_frames, axis=1) if predicted_frames else pd.DataFrame(index=row_index)
    y_prob = pd.concat(proba_frames, axis=1) if proba_frames else pd.DataFrame(index=row_index)

    testing_time = time.time() - start

    return y_predict, y_prob, testing_time

def evaluation(y_pred, y_prob, y_true):
    """Compute the multi-label metrics reported by this notebook.

    Parameters
    ----------
    y_pred : array-like
        Hard label predictions.
    y_prob : array-like
        Per-label scores, used by the ranking-based metrics.
    y_true : array-like
        Ground-truth labels; columns must line up with `y_pred` / `y_prob`.

    Returns
    -------
    dict
        Keys, in order: "coverage_error", "ranking_loss", "hamming_loss",
        "f1_macro", "f1_micro", "Jaccard_Index", "zero_one_error",
        "f1_each_label" (the last one holds one F1 value per label column).
    """
    def f1(average):
        return metrics.f1_score(y_true, y_pred, average=average)

    # Ranking-based metrics are computed from the scores ...
    ranking_based = [
        ("coverage_error", coverage_error),
        ("ranking_loss", label_ranking_loss),
    ]
    performance = {name: scorer(y_true, y_prob) for name, scorer in ranking_based}

    # ... everything else from the hard predictions.
    performance["hamming_loss"] = hamming_loss(y_true, y_pred)
    performance["f1_macro"] = f1('macro')
    performance["f1_micro"] = f1('micro')
    performance["Jaccard_Index"] = jaccard_similarity_score(y_true, y_pred) # jaccard_similarity_score
    performance["zero_one_error"] = zero_one_loss(y_true, y_pred) # 0-1 error
    performance["f1_each_label"] = f1(None)

    return performance


def CC_test(data, label, dataPath, random_state=3071980):
    """Train and evaluate one Naive Bayes classifier chain and print a report.

    The data is split 50/50 into train and test, a chain is fit with
    `naiveBayes_multi_label_training`, applied with
    `naiveBayes_multi_label_testing`, and scored with `evaluation`.

    Parameters
    ----------
    data : pandas.DataFrame
        Attributes, one row per instance.
    label : pandas.DataFrame
        Labels, one column per label, rows aligned with `data`.
    dataPath : str
        Only echoed in the printed report.
    random_state : int, optional
        Seed for the train/test split. It does not control the chain order,
        which the training function draws itself.

    Returns
    -------
    None
        All results are printed.
    """
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()

    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)

    # training
    print("--- start training ---\n")
    classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

    # testing
    print("--- start testing ---\n")
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

    # evaluation
    # Predictions come back in chain order. Put them back into the original
    # label order so that the per-label F1 values line up with label.columns
    # (previously y_test was permuted instead and the F1 values were printed
    # next to the wrong label names).
    inverse = np.argsort(order)
    y_predict = y_predict.iloc[:, inverse]
    y_prob = y_prob.iloc[:, inverse]
    y_predict.columns = label.columns
    y_prob.columns = label.columns
    performance = evaluation(y_predict, y_prob, y_test)

    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")

    # print orders
    print("\n--- Order of the chain ---")
    print(label.columns[order])
    print("")

    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for name, score in zip(label.columns, value):
                # str() keeps this working for non-string column names
                print("label_"+str(name),"=",round(score,2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))
            
def ECC_test(data, label, dataPath, random_state=3071980, ensemble = 5):
    """Train and evaluate an ensemble of Naive Bayes classifier chains.

    The data is split 50/50 once. `ensemble` chains are then trained on the
    same training half, each with its own chain order as returned by
    `naiveBayes_multi_label_training`. Hard predictions are combined by
    majority vote (a label is set when at least half of the chains predict
    it) and scores are averaged. The combined result is passed to
    `evaluation` and a report is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Attributes, one row per instance.
    label : pandas.DataFrame
        Labels, one column per label, rows aligned with `data`.
    dataPath : str
        Only echoed in the printed report.
    random_state : int, optional
        Seed for the train/test split. It does not control the chain orders.
    ensemble : int, optional
        Number of chains to train; must be at least 1.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If `ensemble` is smaller than 1.
    """
    # Without at least one chain there is nothing to average and no chain
    # order to report; fail early with a clear message.
    if ensemble < 1:
        raise ValueError("ensemble must be at least 1, got {!r}".format(ensemble))

    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()

    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)

    # ensemble
    vote_sum = np.zeros(y_test.shape)
    prob_sum = np.zeros(y_test.shape)
    for _ in range(ensemble):
        # training
        classifier_list, training_time, order = naiveBayes_multi_label_training(X_train, y_train)

        # print orders
        print("Order of the chain:",label.columns[order])

        # testing
        y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list, order)

        # Chain outputs are in chain order; the inverse permutation puts them
        # back into original label order by position, so this also works when
        # label names are not unique.
        inverse = np.argsort(order)
        vote_sum += np.asarray(y_predict)[:, inverse]
        prob_sum += np.asarray(y_prob)[:, inverse]

    y_pred_ensemble = pd.DataFrame(((vote_sum / ensemble) >= 0.5).astype('int'),
                                   columns=y_test.columns, index=y_test.index)
    y_prob_ensemble = pd.DataFrame(prob_sum / ensemble,
                                   columns=y_test.columns, index=y_test.index)

    # evaluation
    performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)

    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")

    # print orders
    # NOTE: this is the order of the last chain trained; every chain's order
    # was already printed inside the loop above.
    print("\n--- Order of the chain ---")
    print(label.columns[order])
    print("")

    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for name, score in zip(label.columns, value):
                # str() keeps this working for non-string column names
                print("label_"+str(name),"=",round(score,2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))

In [190]:
# Directory that read_data() expects to contain X.csv and Y.csv.
# NOTE(review): absolute, machine-specific path — adjust before re-running elsewhere.
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# train - test
# ECC_test trains an ensemble of classifier chains (50 chains here, split seed
# 12343), so the header names that method; it previously said "Binary
# Relevance", which is not what this cell runs.
print("------ Ensemble of Classifier Chains using Naive Bayes ------")
ECC_test(data, label, dataPath, 12343, 50)

------ Binary Relevance using Naive Bayes ------
Order of the chain: Index(['federal---state-issues', 'act', 'smoking', 'federal-government', 'sa',
       'united-states', 'health', 'united-kingdom', 'tas',
       'healthcare-facilities', 'lifestyle-and-leisure',
       'science-and-technology', 'community-and-society',
       'government-and-politics', 'education', 'exercise-and-fitness',
       'obesity', 'research', 'nsw', 'heart-disease', 'states-and-territories',
       'diet-and-nutrition', 'advertising', 'industry', 'qld',
       'diseases-and-disorders', 'medical-research',
       'doctors-and-medical-professionals', 'cancer',
       'indigenous-aboriginal-and-torres-strait-islander',
       'family-and-children', 'vic', 'children', 'food-and-beverage',
       'australia', 'diabetes', 'health-policy', 'advertising-and-marketing',
       'schools', 'child-health-and-behaviour',
       'business-economics-and-finance', 'wa'],
      dtype='object')
Order of the chain: Index(['act'

Order of the chain: Index(['federal---state-issues', 'healthcare-facilities',
       'states-and-territories', 'exercise-and-fitness',
       'indigenous-aboriginal-and-torres-strait-islander', 'sa', 'act',
       'medical-research', 'community-and-society', 'health-policy', 'smoking',
       'advertising-and-marketing', 'government-and-politics', 'diabetes',
       'united-kingdom', 'cancer', 'nsw', 'tas', 'diet-and-nutrition', 'wa',
       'research', 'industry', 'heart-disease', 'advertising',
       'family-and-children', 'science-and-technology', 'food-and-beverage',
       'australia', 'health', 'obesity', 'children', 'vic',
       'doctors-and-medical-professionals', 'federal-government', 'education',
       'schools', 'lifestyle-and-leisure', 'business-economics-and-finance',
       'child-health-and-behaviour', 'qld', 'united-states',
       'diseases-and-disorders'],
      dtype='object')
Order of the chain: Index(['advertising-and-marketing', 'health-policy', 'united-kingdom

Order of the chain: Index(['wa', 'heart-disease', 'sa', 'exercise-and-fitness', 'medical-research',
       'healthcare-facilities', 'advertising', 'children', 'smoking',
       'states-and-territories', 'health-policy', 'schools', 'education',
       'science-and-technology', 'tas', 'advertising-and-marketing',
       'united-kingdom', 'community-and-society', 'government-and-politics',
       'food-and-beverage', 'obesity', 'child-health-and-behaviour',
       'doctors-and-medical-professionals', 'health', 'qld', 'industry',
       'federal---state-issues', 'vic', 'act', 'research', 'united-states',
       'cancer', 'australia',
       'indigenous-aboriginal-and-torres-strait-islander',
       'family-and-children', 'diabetes', 'lifestyle-and-leisure',
       'federal-government', 'diseases-and-disorders', 'diet-and-nutrition',
       'nsw', 'business-economics-and-finance'],
      dtype='object')
Order of the chain: Index(['doctors-and-medical-professionals', 'australia',
       'hea

Order of the chain: Index(['qld', 'children', 'child-health-and-behaviour', 'education',
       'federal-government', 'diseases-and-disorders',
       'business-economics-and-finance', 'medical-research', 'schools',
       'heart-disease', 'smoking', 'diet-and-nutrition',
       'lifestyle-and-leisure', 'research', 'exercise-and-fitness',
       'government-and-politics', 'doctors-and-medical-professionals', 'tas',
       'vic', 'industry', 'cancer', 'diabetes', 'advertising-and-marketing',
       'nsw', 'family-and-children', 'advertising', 'united-states', 'wa',
       'community-and-society', 'sa', 'australia', 'health-policy',
       'states-and-territories', 'health', 'food-and-beverage',
       'united-kingdom', 'indigenous-aboriginal-and-torres-strait-islander',
       'federal---state-issues', 'obesity', 'act', 'healthcare-facilities',
       'science-and-technology'],
      dtype='object')
Order of the chain: Index(['united-kingdom', 'states-and-territories', 'qld', 'diet-and-

Order of the chain: Index(['food-and-beverage', 'states-and-territories', 'lifestyle-and-leisure',
       'vic', 'obesity', 'government-and-politics', 'medical-research',
       'united-states', 'united-kingdom', 'education',
       'indigenous-aboriginal-and-torres-strait-islander', 'industry', 'wa',
       'diet-and-nutrition', 'australia', 'federal-government',
       'federal---state-issues', 'diabetes', 'research', 'act',
       'heart-disease', 'tas', 'family-and-children',
       'doctors-and-medical-professionals', 'smoking',
       'child-health-and-behaviour', 'children', 'advertising-and-marketing',
       'cancer', 'science-and-technology', 'advertising',
       'business-economics-and-finance', 'qld', 'diseases-and-disorders',
       'nsw', 'healthcare-facilities', 'health-policy',
       'community-and-society', 'sa', 'schools', 'exercise-and-fitness',
       'health'],
      dtype='object')
Order of the chain: Index(['vic', 'doctors-and-medical-professionals', 'united-st

Order of the chain: Index(['diet-and-nutrition', 'health-policy', 'government-and-politics',
       'united-kingdom', 'healthcare-facilities', 'qld', 'wa',
       'federal-government', 'science-and-technology', 'children',
       'indigenous-aboriginal-and-torres-strait-islander',
       'states-and-territories', 'child-health-and-behaviour', 'cancer',
       'schools', 'family-and-children', 'medical-research',
       'doctors-and-medical-professionals', 'food-and-beverage',
       'community-and-society', 'diabetes', 'lifestyle-and-leisure',
       'exercise-and-fitness', 'australia', 'advertising', 'nsw', 'research',
       'advertising-and-marketing', 'health', 'heart-disease',
       'federal---state-issues', 'obesity', 'smoking',
       'diseases-and-disorders', 'act', 'tas', 'united-states',
       'business-economics-and-finance', 'sa', 'industry', 'education', 'vic'],
      dtype='object')
Order of the chain: Index(['australia', 'lifestyle-and-leisure', 'education', 'medical-r