In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

# functions

In [2]:
def read_data(dataPath):
    """Load the feature table and the label table of one dataset directory.

    Parameters
    ----------
    dataPath : str
        Directory containing ``X.csv`` and ``Y.csv``,
        e.g. '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'.

    Returns
    -------
    (data, label) : tuple of DataFrame
        Contents of ``X.csv`` and ``Y.csv``. The first CSV column is consumed
        as the index and then discarded, so both frames come back with a
        fresh 0..n-1 index.
    """
    def _load(file_name):
        # First column holds the stored index; drop it after reading.
        frame = pd.read_csv(os.path.join(dataPath, file_name), index_col=0)
        return frame.reset_index(drop=True)

    return _load('X.csv'), _load('Y.csv')

def naiveBayes_multi_label_training(X_train, y_train):
    """Fit one independent MultinomialNB per label column (binary relevance).

    Parameters
    ----------
    X_train : feature table handed unchanged to every classifier's ``fit``.
    y_train : DataFrame with one column per label; column ``i`` is the
        target of classifier ``i``.

    Returns
    -------
    classifier_list : list
        Fitted classifiers, in the same order as the columns of ``y_train``.
    training_time : float
        Wall-clock seconds spent creating and fitting the classifiers.
    """
    t0 = time.time()

    label_count = y_train.shape[1]
    classifier_list = []
    for _ in range(label_count):
        classifier_list.append(MultinomialNB())
    # Fit each classifier against its own label column.
    for position, model in enumerate(classifier_list):
        model.fit(X_train, y_train.iloc[:, position])

    training_time = time.time() - t0

    return classifier_list, training_time

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list):
    """Run every per-label classifier on ``X_test`` and collect the results.

    Parameters
    ----------
    X_test : feature table handed unchanged to ``predict`` / ``predict_proba``.
    n_label : int
        Number of classifiers to apply; the first ``n_label`` entries of
        ``classifier_list`` are used.
    classifier_list : sequence
        Fitted classifiers exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    y_predict : DataFrame
        Hard predictions, one column per label (columns labelled 0..n_label-1).
    y_prob : DataFrame
        Score of the positive class, one column per label (same labelling).
    testing_time : float
        Wall-clock seconds spent predicting and assembling the two frames.
    """
    # Columns are gathered in dicts and turned into frames once at the end;
    # concatenating inside the loop copies every previous column each time.
    predicted_columns = {}
    probability_columns = {}

    start = time.time()

    for i in range(n_label):
        classifier = classifier_list[i]
        predicted_columns[i] = np.asarray(classifier.predict(X_test))

        proba = np.asarray(classifier.predict_proba(X_test))
        if proba.ndim == 2 and proba.shape[1] > 1:
            # Usual binary case: second column is the positive class.
            probability_columns[i] = proba[:, 1]
        else:
            # The classifier saw a single class during training, so there is
            # no second column to index. Assumes 0/1 labels: the positive
            # score is 1 if that lone class is 1, otherwise 0.
            classes = getattr(classifier, "classes_", None)
            lone_class_is_positive = (
                classes is not None and len(classes) > 0 and classes[0] == 1
            )
            probability_columns[i] = np.full(
                proba.shape[0], 1.0 if lone_class_is_positive else 0.0
            )

    y_predict = pd.DataFrame(predicted_columns)
    y_prob = pd.DataFrame(probability_columns)

    end = time.time()
    testing_time = end-start

    return y_predict, y_prob, testing_time

def evaluation(y_pred, y_prob, y_true):
    """Compute the multi-label metrics reported by this notebook.

    Parameters
    ----------
    y_pred : hard label predictions (used by the classification metrics).
    y_prob : per-label scores (used by the two ranking metrics).
    y_true : ground-truth label indicators.

    Returns
    -------
    dict
        Keys, in insertion order: "coverage_error", "ranking_loss",
        "hamming_loss", "f1_macro", "f1_micro", "accuracy",
        "zero_one_error", "f1_each_label". The "accuracy" entry holds the
        value of ``jaccard_similarity_score``; "f1_each_label" holds the
        result of ``f1_score(..., average=None)``.
    """
    # NOTE(review): jaccard_similarity_score was deprecated in scikit-learn
    # 0.21 and removed in 0.23; jaccard_score is its documented successor.
    # Confirm the averaging mode before switching, the two are not drop-in.

    def _f1(averaging):
        return metrics.f1_score(y_true, y_pred, average=averaging)

    # Score-based ranking metrics and the Hamming loss.
    cov = coverage_error(y_true, y_prob)
    ham = hamming_loss(y_true, y_pred)
    rank = label_ranking_loss(y_true, y_prob)

    # Aggregated F1 variants.
    macro_f1 = _f1('macro')
    micro_f1 = _f1('micro')

    jaccard = jaccard_similarity_score(y_true, y_pred)
    subset_error = zero_one_loss(y_true, y_pred)  # 0-1 error

    per_label_f1 = _f1(None)

    # Callers iterate over this dict to print, so key order is significant.
    performance = {}
    performance["coverage_error"] = cov
    performance["ranking_loss"] = rank
    performance["hamming_loss"] = ham
    performance["f1_macro"] = macro_f1
    performance["f1_micro"] = micro_f1
    performance["accuracy"] = jaccard
    performance["zero_one_error"] = subset_error
    performance["f1_each_label"] = per_label_f1
    return performance


def BR_test(data, label, dataPath, random_state=301234, test_size=0.5):
    """Run one train/test evaluation of Binary Relevance with Naive Bayes.

    The data is split once, one classifier per label is trained on the
    training part, and the metrics computed on the test part are printed.

    Parameters
    ----------
    data : DataFrame of features (rows = instances).
    label : DataFrame of label indicators, one column per label.
    dataPath : value printed as the dataset identifier; not read from here.
    random_state : seed forwarded to ``train_test_split``.
    test_size : share of the instances held out for testing, forwarded to
        ``train_test_split`` (default 0.5, the value previously hard-coded).

    Returns
    -------
    None. All results are written to standard output.
    """
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()

    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=test_size, random_state=random_state)

    # training (the measured time is currently not reported)
    print("--- start training ---\n")
    classifier_list, _training_time = naiveBayes_multi_label_training(X_train, y_train)

    # testing (the measured time is currently not reported)
    print("--- start testing ---\n")
    y_predict, y_prob, _testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)

    # evaluation
    performance = evaluation(y_predict, y_prob, y_test)

    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")

    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for column, score in zip(label.columns, value):
                # str() keeps this working when column labels are not strings.
                print("label_"+str(column),"=",round(score,2))
        elif key == "coverage_error":
            # Coverage error is shown next to the average label count per instance.
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))
            
# two fold cross-validation
def two_fold_BR_test(data, label, dataPath, n_iter=5, random_state=301234):
    """Evaluate Binary Relevance with Naive Bayes by 2-fold cross-validation.

    For each fold one classifier per label is trained on the training half,
    the metrics are computed on the other half, and the per-fold metrics are
    averaged and printed.

    Parameters
    ----------
    data : DataFrame of features (rows = instances).
    label : DataFrame of label indicators, one column per label.
    dataPath : value printed as the dataset identifier; not read from here.
    n_iter : accepted for backward compatibility and currently unused.
        It is the fourth positional parameter, so a seed passed
        positionally lands here rather than in ``random_state``.
    random_state : seed forwarded to the shuffling ``KFold`` splitter.

    Returns
    -------
    None. All results are written to standard output.
    """
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()

    # 2-fold cross validation; the caller's seed controls the shuffle
    KF = KFold(n_splits=2, shuffle=True, random_state=random_state)

    # metric name -> list with one value per fold
    fold_scores = {}
    for fold, (train_index, test_index) in enumerate(KF.split(data), start=1):
        X_train, X_test = data.iloc[train_index, :], data.iloc[test_index, :]
        y_train, y_test = label.iloc[train_index, :], label.iloc[test_index, :]

        print("--- kfold time="+str(fold)+" ---")
        # training (timings are measured by the helpers but not reported here)
        classifier_list, _training_time = naiveBayes_multi_label_training(X_train, y_train)
        # testing
        y_predict, y_prob, _testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)

        # evaluation
        for key, value in evaluation(y_predict, y_prob, y_test).items():
            fold_scores.setdefault(key, []).append(value)

    # Average every metric over the folds; axis=0 keeps per-label arrays
    # (e.g. "f1_each_label") element-wise.
    performance = {key: np.mean(values, axis=0) for key, values in fold_scores.items()}

    # print data information
    print("\n--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")

    # print performance
    print("--- 2 fold cross-validation Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            # Per-label scores are not part of the cross-validation summary.
            continue
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))

# test 

## obesity

In [3]:
# Obesity dataset: single train/test run followed by 2-fold cross-validation.
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
BR_test(data, label, dataPath, random_state=123)

print("")
print("------ two_fold Binary Relevance using Naive Bayes ------")
# The fourth positional parameter of two_fold_BR_test is n_iter, so the seed
# has to be passed by keyword to reach random_state.
two_fold_BR_test(data, label, dataPath, random_state=123)

------ Binary Relevance using Naive Bayes ------
--- start training ---

--- start testing ---

--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/ABC_news_data/obesity/
number of label: 42
number of attribute: 335
number of instance: 891 

--- Performance ---
coverage_error = 14.05 ( avg_label_per_instance = 3.25 )
ranking_loss = 0.14
hamming_loss = 0.1
f1_macro = 0.28
f1_micro = 0.4
accuracy = 0.26
zero_one_error = 0.97

- f1 for each label -
label_act = 0.0
label_advertising = 0.56
label_advertising-and-marketing = 0.43
label_australia = 0.57
label_business-economics-and-finance = 0.25
label_cancer = 0.29
label_child-health-and-behaviour = 0.37
label_children = 0.29
label_community-and-society = 0.05
label_diabetes = 0.25
label_diet-and-nutrition = 0.31
label_diseases-and-disorders = 0.32
label_doctors-and-medical-professionals = 0.31
label_education = 0.12
label_exercise-and-fitness = 0.15
label_family-and-children = 0.08
label_federal---state-issues = 0.35
label_f

## test medical

In [4]:
# Medical dataset: one 50/50 train-test run of Binary Relevance with Naive Bayes.
dataPath = "/Volumes/Samsung_T5/research/data/small_datasets/medical/"
data, label = read_data(dataPath)

print("------ Binary Relevance using Naive Bayes ------")
BR_test(data, label, dataPath, random_state=123)

# The 2-fold cross-validation run is disabled for this dataset:
#print("------ two_fold Binary Relevance using Naive Bayes ------")
#two_fold_BR_test(data, label, dataPath, random_state=123)

------ Binary Relevance using Naive Bayes ------
--- start training ---

--- start testing ---



  self.class_log_prior_ = (np.log(self.class_count_) -


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/medical/
number of label: 44
number of attribute: 1448
number of instance: 978 

--- Performance ---
coverage_error = 3.39 ( avg_label_per_instance = 1.14 )
ranking_loss = 0.04
hamming_loss = 0.02
f1_macro = 0.13
f1_micro = 0.52
accuracy = 0.45
zero_one_error = 0.66

- f1 for each label -
label_Class-1-079_99 = 0.0
label_Class-2-786_09 = 0.0
label_Class-3-759_89 = 0.0
label_Class-4-753_0 = 0.75
label_Class-5-786_2 = 0.0
label_Class-6-V72_5 = 0.0
label_Class-7-511_9 = 0.0
label_Class-8-596_8 = 0.0
label_Class-9-599_0 = 0.73
label_Class-10-518_0 = 0.0
label_Class-11-593_5 = 0.0
label_Class-12-V13_09 = 0.0
label_Class-13-791_0 = 0.0
label_Class-14-789_00 = 0.0
label_Class-15-593_1 = 0.0
label_Class-16-462 = 0.0
label_Class-17-592_0 = 0.0
label_Class-18-786_59 = 0.0
label_Class-19-785_6 = 0.0
label_Class-20-V67_09 = 0.0
label_Class-21-795_5 = 0.4
label_Class-22-789_09 = 0.0
label_Class-23-786_50 = 0.4
label_

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# test for one label

In [5]:
# Reload the obesity dataset so that its label names can be inspected.
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# Last expression of the cell: shows the label column names as the cell output.
label.columns

Index(['act', 'advertising', 'advertising-and-marketing', 'australia',
       'business-economics-and-finance', 'cancer',
       'child-health-and-behaviour', 'children', 'community-and-society',
       'diabetes', 'diet-and-nutrition', 'diseases-and-disorders',
       'doctors-and-medical-professionals', 'education',
       'exercise-and-fitness', 'family-and-children', 'federal---state-issues',
       'federal-government', 'food-and-beverage', 'government-and-politics',
       'health', 'health-policy', 'healthcare-facilities', 'heart-disease',
       'indigenous-aboriginal-and-torres-strait-islander', 'industry',
       'lifestyle-and-leisure', 'medical-research', 'nsw', 'obesity', 'qld',
       'research', 'sa', 'schools', 'science-and-technology', 'smoking',
       'states-and-territories', 'tas', 'united-kingdom', 'united-states',
       'vic', 'wa'],
      dtype='object')

In [6]:
# Per-label hold-out accuracy on the obesity dataset: one MultinomialNB is
# fitted for every label column on a fixed 50/50 split.
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
data, label = read_data(dataPath)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=11)

for col in label.columns:
    cls = MultinomialNB()
    cls.fit(X_train, y_train[col])
    y_predict = cls.predict(X_test)
    y_predict_prob = cls.predict_proba(X_test)  # computed but not used by the accuracy below
    n_correct = sum(y_predict == y_test[col])
    # Printed as a (label name, fraction of correct predictions) tuple.
    print((col, n_correct / len(y_predict)))

('act', 0.9506726457399103)
('advertising', 0.9304932735426009)
('advertising-and-marketing', 0.9394618834080718)
('australia', 0.6973094170403588)
('business-economics-and-finance', 0.9439461883408071)
('cancer', 0.9708520179372198)
('child-health-and-behaviour', 0.827354260089686)
('children', 0.8878923766816144)
('community-and-society', 0.9260089686098655)
('diabetes', 0.8923766816143498)
('diet-and-nutrition', 0.8071748878923767)
('diseases-and-disorders', 0.8385650224215246)
('doctors-and-medical-professionals', 0.9170403587443946)
('education', 0.9753363228699552)
('exercise-and-fitness', 0.9237668161434978)
('family-and-children', 0.9461883408071748)
('federal---state-issues', 0.9372197309417041)
('federal-government', 0.8542600896860987)
('food-and-beverage', 0.9192825112107623)
('government-and-politics', 0.8430493273542601)
('health', 0.6278026905829597)
('health-policy', 0.9439461883408071)
('healthcare-facilities', 0.9417040358744395)
('heart-disease', 0.9730941704035875)


In [7]:
# Experiment on the 'australia' label: compare a classifier trained on the
# features alone with one that also receives every other label as a feature.
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
data, label = read_data(dataPath)

# Baseline: features only.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=11)
cls = MultinomialNB()
cls.fit(X_train, y_train['australia'])
y_predict = cls.predict(X_test)
y_predict_prob = cls.predict_proba(X_test)
baseline_hits = sum(y_predict == y_test['australia'])
print("only X accuracy:", baseline_hits / len(y_predict))

# Augmented: append all remaining label columns to the feature table.
# Note that `data` and `label` are rebound here; the cell reloads them at the top.
other_labels = label.drop(columns='australia')
data = pd.concat([data, other_labels], axis=1)
label = label['australia']
# Same test_size and seed as the baseline split.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=11)
cls = MultinomialNB()
cls.fit(X_train, y_train)
y_predict = cls.predict(X_test)
y_predict_prob = cls.predict_proba(X_test)
augmented_hits = sum(y_predict == y_test)
print("X and all other labels except austrilia accuracy:", augmented_hits / len(y_predict))

only X accuracy: 0.6973094170403588
X and all other labels except austrilia accuracy: 0.7488789237668162


In [10]:
# Experiment on the 'act' label: compare a classifier trained on the features
# alone with one that also receives every other label as a feature.
dataPath = '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'

# read data
data, label = read_data(dataPath)

# label act, features only
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=11)

cls = MultinomialNB()
cls.fit(X_train,y_train['act'])
y_predict = cls.predict(X_test)
y_predict_prob = cls.predict_proba(X_test)
print("only X accuracy:",sum(y_predict == y_test['act']) / len(y_predict))
# This line reports the F1 score, so it is labelled as such.
print("only X f1:",metrics.f1_score(y_test['act'], y_predict))

# label act, features plus all other label columns
# Note that `data` and `label` are rebound here; the cell reloads them at the top.
cls = MultinomialNB()
data = pd.concat([data, label.drop(columns='act')],axis=1)
label = label['act']
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=11)
cls.fit(X_train,y_train)
y_predict = cls.predict(X_test)
y_predict_prob = cls.predict_proba(X_test)
# The value printed is an F1 score, not an accuracy.
print("X and all other labels except act f1:",metrics.f1_score(y_test, y_predict))

only X accuracy: 0.9506726457399103
only X accuracy: 0.0
X and all other labels except act accuracy: 0.0
