In [2]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re
import random
import arff

from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

In [6]:
def read_data(dataPath, X_file, y_file):
    """Load the feature table and the label table of one data set.

    Parameters
    ----------
    dataPath : str
        Directory that holds both CSV files, e.g.
        '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'.
    X_file : str
        File name of the feature CSV inside ``dataPath``.
    y_file : str
        File name of the label CSV inside ``dataPath``.

    Returns
    -------
    tuple
        ``(data, label)`` -- two DataFrames, each parsed by ``pd.read_csv``
        with its default settings.
    """
    # The feature file is read first, then the label file.
    features, labels = (
        pd.read_csv(os.path.join(dataPath, file_name))
        for file_name in (X_file, y_file)
    )
    return features, labels

def evaluation(y_pred, y_prob, y_true):
    """Compute multi-label quality metrics for one set of predictions.

    Parameters
    ----------
    y_pred : DataFrame or 2-D array-like
        Predicted label indicators, one row per instance, one column per label.
    y_prob : DataFrame or 2-D array-like
        Per-label scores used by the ranking-based metrics.
    y_true : DataFrame or 2-D array-like
        Ground-truth label indicators with the same layout as ``y_pred``.
        Rows are matched to ``y_pred`` by position, not by index label.

    Returns
    -------
    dict
        Keys ``coverage_error``, ``ranking_loss``, ``hamming_loss``,
        ``f1_macro``, ``f1_micro``, ``Jaccard_Index`` (rounded to two
        decimals) and ``zero_one_error``.
    """
    coverage = coverage_error(y_true, y_prob)
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = label_ranking_loss(y_true, y_prob)

    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')

    # Example-based Jaccard index: |true AND pred| / |true OR pred| per row,
    # averaged over rows.  The previous per-row call to
    # jaccard_similarity_score on 1-D vectors computed the fraction of
    # matching labels (i.e. 1 - Hamming loss) rather than this ratio.
    # assumes labels are 0/1 indicators so that astype(bool) is lossless
    true_mask = np.asarray(y_true).astype(bool)
    pred_mask = np.asarray(y_pred).astype(bool)
    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)
    # A row with no true and no predicted label is counted as a perfect
    # match (score 1.0); np.maximum only guards the division for such rows.
    per_sample = np.where(union == 0, 1.0, intersection / np.maximum(union, 1))
    acc = round(float(per_sample.mean()), 2)

    zero_one = zero_one_loss(y_true, y_pred)  # 0-1 error

    performance = {"coverage_error":coverage,
                   "ranking_loss":ranking_loss,
                   "hamming_loss":hamming,
                   "f1_macro":f1_macro,
                   "f1_micro":f1_micro,
                   "Jaccard_Index":acc,
                   "zero_one_error":zero_one}
    return performance

In [15]:
def naiveBayes_multi_label_training(X_train, y_train):
    """Fit one MultinomialNB classifier per label column (binary relevance).

    Parameters
    ----------
    X_train : DataFrame
        Training features, passed unchanged to every classifier's ``fit``.
    y_train : DataFrame
        Training labels; column ``i`` (selected by position) is the target
        of classifier ``i``.

    Returns
    -------
    tuple
        ``(classifier_list, training_time)`` -- the fitted classifiers in
        label-column order and the elapsed wall-clock seconds.
    """
    t0 = time.time()

    fitted_models = []
    for column in range(y_train.shape[1]):
        # Each label gets its own independent model.
        model = MultinomialNB()
        model.fit(X_train, y_train.iloc[:, column])
        fitted_models.append(model)

    elapsed = time.time() - t0
    return fitted_models, elapsed

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list):
    """Predict every label with its own fitted classifier.

    Parameters
    ----------
    X_test : DataFrame
        Test features, passed unchanged to ``predict`` / ``predict_proba``.
    n_label : int
        Number of labels; classifiers ``0 .. n_label-1`` are used.
    classifier_list : list
        Fitted classifiers, one per label, each exposing ``predict`` and
        ``predict_proba``.

    Returns
    -------
    tuple
        ``(y_predict, y_prob, testing_time)``.  Both frames have one row per
        test instance (default 0..n-1 index, NOT ``X_test.index``) and one
        column per label, labelled ``0 .. n_label-1``.  ``testing_time`` is
        the elapsed wall-clock seconds.  With ``n_label == 0`` both frames
        are empty.
    """
    start = time.time()

    # Collect the columns first and build each frame once; concatenating
    # inside the loop re-copied all previous columns on every iteration and
    # gave every column the same label (0).
    predicted_columns = {}
    probability_columns = {}
    for i in range(n_label):
        classifier = classifier_list[i]
        predicted_columns[i] = classifier.predict(X_test)
        # Column 1 of predict_proba is taken as the score of the label.
        # NOTE(review): presumably the positive class -- confirm that each
        # classifier's classes_ is ordered [0, 1].
        probability_columns[i] = classifier.predict_proba(X_test)[:, 1]

    y_predict = pd.DataFrame(predicted_columns)
    y_prob = pd.DataFrame(probability_columns)

    end = time.time()
    testing_time = end - start

    return y_predict, y_prob, testing_time

def BR_test(data, label, dataPath, random_state=3071980):
    """Run one 50/50 train/test evaluation of binary-relevance Naive Bayes.

    Parameters
    ----------
    data : DataFrame
        Feature table, one row per instance.
    label : DataFrame
        Label table, one column per label, rows aligned with ``data``.
    dataPath : str
        Not used by this function; kept so existing callers keep working.
    random_state : int, optional
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    DataFrame
        One row per metric returned by ``evaluation`` (metric name as index)
        and a single value column.

    Side effects
    ------------
    Prints the index of the test split.
    """
    n_label = label.shape[1]

    # split training and test data set (half / half)
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=0.5, random_state=random_state)

    print("-- test index --")
    print(X_test.index)

    # training (the reported training time is not used here)
    classifier_list, _ = naiveBayes_multi_label_training(X_train, y_train)

    # testing (the reported testing time is not used here)
    y_predict, y_prob, _ = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)

    # evaluation
    performance = evaluation(y_predict, y_prob, y_test)

    return pd.DataFrame.from_dict(performance, orient='index')
            
# two fold cross-validation
def two_fold_BR_test(data, label, dataPath, n_iter=5, random_state=3071980):
    """Evaluate binary-relevance Naive Bayes with shuffled 2-fold CV.

    Parameters
    ----------
    data : DataFrame
        Feature table, one row per instance.
    label : DataFrame
        Label table, one column per label, rows aligned with ``data``.
    dataPath : str
        Not used by this function; kept so existing callers keep working.
    n_iter : int, optional
        Not used by this function; kept so existing callers keep working.
    random_state : int, optional
        Seed forwarded to ``KFold``.

    Returns
    -------
    DataFrame
        One row per metric returned by ``evaluation`` (metric name as index)
        and a single column holding the mean of that metric over the folds.

    Side effects
    ------------
    Prints a progress line for every fold.
    """
    n_label = label.shape[1]

    # 2-fold cross validation
    KF = KFold(n_splits=2, shuffle=True, random_state=random_state)

    fold_performances = []
    for fold, (train_index, test_index) in enumerate(KF.split(data), start=1):
        X_train, X_test = data.iloc[train_index, :], data.iloc[test_index, :]
        y_train, y_test = label.iloc[train_index, :], label.iloc[test_index, :]

        print("--- kfold time="+str(fold)+" ---")
        # training
        classifier_list, _ = naiveBayes_multi_label_training(X_train, y_train)
        # testing
        y_predict, y_prob, _ = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)

        # evaluation
        fold_performances.append(evaluation(y_predict, y_prob, y_test))

    # Average every metric over the folds.  The earlier in-place update used
    # a for/else, whose else branch ran after the loop and replaced the last
    # metric's average with the second fold's raw value.
    n_folds = len(fold_performances)
    performance = {
        key: sum(fold[key] for fold in fold_performances) / n_folds
        for key in fold_performances[0]
    }

    return pd.DataFrame.from_dict(performance, orient='index')

In [46]:
# Seed passed to every BR_test call.
# NOTE(review): differs from BR_test's default (3071980) -- possibly a typo;
# kept as is so the recorded results stay reproducible.
SPLIT_SEED = 307190

# Locations are machine-specific; adjust them here when running elsewhere.
DATA_ROOT = '/Volumes/Samsung_T5/research/data/'
RESULT_CSV = "/Users/jiangjunhao/Desktop/results_algorithms/binary_relevance_naive_bayes.csv"

# One entry per dataset group:
# (sub-directory, feature file, dataset names, print the method banner?)
# The banner is printed only for the small datasets, as before.
experiment_groups = [
    ("small_datasets/", "X.csv",
     ["yeast","enron","emotions","genbase","scene","medical"], True),
    ("large_datasets/", "X_dis_1500.csv",
     ['rcv1subset1','rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5','tmc2007'], False),
]

# Collect one single-column result frame per dataset and join them once at
# the end instead of re-concatenating the growing table on every iteration.
result_columns = []
for group_dir, X_file, data_list, show_banner in experiment_groups:
    for dataset in data_list:
        print(dataset)
        dataPath = DATA_ROOT + group_dir + dataset + "/"
        y_file = "y.csv"
        data, label = read_data(dataPath, X_file, y_file) # read data

        # train - test
        if show_banner:
            print("------ Binary Relevance using Naive Bayes ------")
        df = BR_test(data, label, dataPath, SPLIT_SEED)
        df.columns = [dataset]

        result_columns.append(df)

# Rows are the metrics, columns are the datasets in processing order.
df_all = pd.concat(result_columns, axis=1)

df_all.to_csv(RESULT_CSV)

yeast
------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([1922, 2019, 1276, 2123,  700, 1114,   28, 2386,  406, 1129,
            ...
            1424,  627,  805, 1659, 1850,  988, 2373,  293, 2012,  531],
           dtype='int64', length=1209)
enron
------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([ 996,  964,  846,  734, 1275, 1610, 1483, 1084,  113, 1168,
            ...
            1673, 1040,  639, 1227,  954,  566,  541,  707,  532,  202],
           dtype='int64', length=851)


  self.class_log_prior_ = (np.log(self.class_count_) -
  'precision', 'predicted', average, warn_for)


emotions
------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([178, 264,  79, 405, 208, 219, 589,  28,  93,  23,
            ...
            408, 413, 568, 297, 313,  43, 259, 552, 424, 166],
           dtype='int64', length=297)
genbase
------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([537, 405, 549, 196, 321, 493, 458, 127, 113, 390,
            ...
            475, 420, 325,  12, 586, 560, 635, 588,  53, 246],
           dtype='int64', length=331)


  'recall', 'true', average, warn_for)


scene
------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([ 454, 1014,  590, 2047, 1794,  611,  345,  484, 1202,  101,
            ...
            2311, 1993,  593,   41, 1415, 1623,  404,  661,  252, 2012],
           dtype='int64', length=1204)
medical
------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([843, 914, 822, 933, 510, 267, 520, 329, 820, 491,
            ...
            139, 800, 884, 697, 709, 952,  94, 850,  75, 798],
           dtype='int64', length=489)
rcv1subset1
-- test index --
Int64Index([1718, 3386,  593, 5318,  431, 5926, 4997, 4959, 1332, 5838,
            ...
            5003, 5033, 2254, 1444, 5457, 1144, 4496,  838, 2001,  592],
           dtype='int64', length=3000)
rcv1subset2
-- test index --
Int64Index([1718, 3386,  593, 5318,  431, 5926, 4997, 4959, 1332, 5838,
            ...
            5003, 5033, 2254, 1444, 5457, 1144, 4496,  838, 2001,  592],
           dtype='int64', length=3000)
rcv1subset3
-- te