In [1]:
import subprocess
import pandas as pd

In [2]:
import pandas as pd
import numpy as np
import os
import time
import subprocess
import re
import random
import arff

from math import sqrt
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_similarity_score

In [3]:
def read_data(dataPath):
    # input: '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'
    # read data
    data = pd.read_csv(os.path.join(dataPath,'X.csv'))
    label = pd.read_csv(os.path.join(dataPath,'Y.csv'))
    return data,label

def evaluation(y_pred, y_prob, y_true):
    
    coverage = coverage_error(y_true, y_prob)
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = label_ranking_loss(y_true, y_prob) 
    
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')
    
    acc = 0
    for i in range(y_true.shape[0]):
        acc += jaccard_similarity_score(y_true.iloc[i,:], y_pred.iloc[i,:]) # jaccard_similarity_score
    acc = round(acc / y_true.shape[0],2)
    
    zero_one = zero_one_loss(y_true, y_pred) # 0-1 error 
    
    f1_each = metrics.f1_score(y_true, y_pred, average=None)
    
    performance = {"coverage_error":coverage,
                   "ranking_loss":ranking_loss,
                   "hamming_loss":hamming,
                   "f1_macro":f1_macro,
                   "f1_micro":f1_micro,
                   "Jaccard_Index":acc,
                   "zero_one_error":zero_one,
                   "f1_each_label":f1_each}
    return performance

def get_confusion_matrix(y_pred, y_test, column_names):
    """confusion matrix """
    confusion_matrix = pd.DataFrame(np.array(y_pred) - np.array(y_test), columns=column_names)
    pos = pd.DataFrame((np.array(y_pred) == np.array(y_test)) & (np.array(y_pred) == 1), columns=y_test.columns).sum(axis=0)
    neg = pd.DataFrame((np.array(y_pred) == np.array(y_test)) & (np.array(y_pred) == 0), columns=y_test.columns).sum(axis=0)
    for i in range(confusion_matrix.shape[1]): 
        name = confusion_matrix.iloc[:,i].name
        temp = confusion_matrix.iloc[:,i].value_counts()
        TP = pos[name]
        TN = neg[name]
        if 1 in temp.index:
            FP = temp[1]
        else:
            FP = 0
        if -1 in temp.index:
            FN = temp[-1]
        else:
            FN = 0

def naiveBayes_multi_label_training(X_train, y_train):
    start = time.time()
    
    n_label = y_train.shape[1]
    classifier_list = [MultinomialNB() for i in range(n_label)]
    for i in range(n_label):
        classifier_list[i].fit(X_train,y_train.iloc[:,i])
    
    end = time.time()
    training_time = end-start
    
    return classifier_list, training_time

def naiveBayes_multi_label_testing(X_test, n_label, classifier_list):
    y_predict = pd.DataFrame()
    y_prob = pd.DataFrame()
    
    start = time.time()
    
    for i in range(n_label):
        y_predict_i = classifier_list[i].predict(X_test)
        y_predict = pd.concat([y_predict, pd.DataFrame(y_predict_i)],axis=1)
        
        y_predict_prob_i = classifier_list[i].predict_proba(X_test)[:,1]
        y_prob = pd.concat([y_prob, pd.DataFrame(y_predict_prob_i)],axis=1)
        
    end = time.time()
    testing_time = end-start
        
    return y_predict, y_prob, testing_time

def BR_test(data, label, dataPath, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # split training and test data set
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5, random_state=random_state)
    
    print("-- test index --")
    print(X_test.index)
    
    # training
    classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)
    
    # testing
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)
    
    # evaluation
    performance = evaluation(y_predict, y_prob, y_test)
    
    # print data information
    print("--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # get confusion matrix
    get_confusion_matrix(y_predict, y_test, y_test.columns)
    
    # print performance
    print("--- Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            print("\n- f1 for each label -")
            for i in range(n_label):
                print("label_"+label.columns[i],"=",round(value[i],2))
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))
    
    return y_predict, y_test

# two fold cross-validation
def two_fold_BR_test(data, label, dataPath, n_iter=5, random_state=3071980):
    
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()
    
    # 2-fold cross validatiom
    KF=KFold(n_splits=2, shuffle=True, random_state=random_state)
    i = 0
    
    performance = {}
    for train_index,test_index in KF.split(data):
        i += 1
        
        X_train,X_test=data.iloc[train_index,:],data.iloc[test_index,:]
        y_train,y_test=label.iloc[train_index,:],label.iloc[test_index,:]
        
        print("--- kfold time="+str(i)+" ---")
        # training
        classifier_list, training_time = naiveBayes_multi_label_training(X_train, y_train)
        # testing
        y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(X_test, n_label, classifier_list)
        
        # evaluation
        if performance == {}:
            performance = evaluation(y_predict, y_prob, y_test)
        else:
            performance_i = evaluation(y_predict, y_prob, y_test)
            for key, value in performance_i.items():
                performance[key] = (performance[key] + value)/2
            else:
                performance[key] = value
    
    # print data information
    print("\n--- Data Information ---")
    print("dataset:", dataPath)
    print("number of label:",n_label)
    print("number of attribute:",n_attr)
    print("number of instance:",n_instance,"\n")
    
    # print performance
    print("--- 2 fold cross-validation Performance ---")
    for key, value in performance.items():
        if key == "f1_each_label":
            continue
        elif key == "coverage_error":
            print(key,'=',round(value,2),"( avg_label_per_instance =",round(avg_label_per_instance,2),")")
        else:
            print(key,'=',round(value,2))

In [4]:
dataPath = '/Volumes/Samsung_T5/research/data/small_datasets/emotions/'
dataset = 'emotions'
data, label = read_data(dataPath) # read data

# get data information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()
avg_instance_per_label = label.sum(axis=0).mean()
# print data information
print("\n--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance)
print("avgerage number of labels for an instance:",avg_label_per_instance)
print("avgerage number of positive instances for a label:",avg_instance_per_label,"the std:",sqrt(label.sum(axis=0).var()),"\n")

print("-- number of positive instances --")
print(label.sum(axis=0))

# read data
data, label = read_data(dataPath)

# train - test
print("------ Binary Relevance using Naive Bayes ------")
y_predict, y_test = BR_test(data, label, dataPath,3071980)

error_matrix = pd.DataFrame(np.array(y_predict) - np.array(y_test), columns=y_test.columns)
error_matrix.to_csv("/Users/jiangjunhao/Desktop/error_matrix.csv",index=False)


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/emotions/
number of label: 6
number of attribute: 72
number of instance: 593
avgerage number of labels for an instance: 1.8684654300168635
avgerage number of positive instances for a label: 184.66666666666666 the std: 41.03494445794543 

-- number of positive instances --
amazed-suprised    173
happy-pleased      166
relaxing-calm      264
quiet-still        148
sad-lonely         168
angry-aggresive    189
dtype: int64
------ Binary Relevance using Naive Bayes ------
-- test index --
Int64Index([305,  40, 469, 422, 166,  29, 537, 285,  57, 112,
            ...
            317,  27, 249, 551, 591,  36, 334, 480, 494, 511],
           dtype='int64', length=297)
--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/emotions/
number of label: 6
number of attribute: 72
number of instance: 593 

--- Performance ---
coverage_error = 2.83 ( avg_label_per_instance = 1.87 )
ranking_l

In [5]:
def build_BN(labelFile, labelName, savePng):
    cmd = """cd /Volumes/Samsung_T5/research/programme/Chordalysis/ 
    java -Xmx1g -classpath bin:lib/core/commons-math3-3.2.jar:lib/core/jayes.jar:lib/core/jgrapht-jdk1.6.jar:lib/extra/jgraphx.jar:lib/loader/weka.jar demo.Run %s 0.05 %s false
    """ % (labelFile,savePng)

    p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
    out,err = p.communicate()  
    for line in out.splitlines():  
        if line.decode("utf-8").startswith('['):
            graph_set = [i for i in map(lambda x: x.split(','), line.decode("utf-8").replace(' ',',').strip('[[\,]]').split(',]['))]

    dic = {}
    for l in labelName:
        s = set()
        for i in map(lambda x: set(x) if l in x else None, graph_set):
            if i != None:
                s.update(i)
        s.remove(l)       
        dic[l] = s

    return dic

In [8]:
data, label = read_data(dataPath) # read data

# get data information
n_label = label.shape[1]
n_attr = data.shape[1]
n_instance = data.shape[0]
avg_label_per_instance = label.sum(axis=1).mean()
avg_instance_per_label = label.sum(axis=0).mean()
# print data information
print("\n--- Data Information ---")
print("dataset:", dataPath)
print("number of label:",n_label)
print("number of attribute:",n_attr)
print("number of instance:",n_instance)
print("avgerage number of labels for an instance:",avg_label_per_instance)
print("avgerage number of positive instances for a label:",avg_instance_per_label,"the std:",sqrt(label.sum(axis=0).var()),"\n")

print("-- number of positive instances --")
print(label.sum(axis=0))


--- Data Information ---
dataset: /Volumes/Samsung_T5/research/data/small_datasets/emotions/
number of label: 6
number of attribute: 72
number of instance: 593
avgerage number of labels for an instance: 1.8684654300168635
avgerage number of positive instances for a label: 184.66666666666666 the std: 41.03494445794543 

-- number of positive instances --
amazed-suprised    173
happy-pleased      166
relaxing-calm      264
quiet-still        148
sad-lonely         168
angry-aggresive    189
dtype: int64


In [10]:
cooccurrence_matrix = label.T.dot(label)
np.fill_diagonal(cooccurrence_matrix.values, 0)
#cooccurrence_matrix.to_csv('/Users/jiangjunhao/Desktop/cooccurrence_matrix.csv', index=False)
cooccurrence_matrix

Unnamed: 0,amazed-suprised,happy-pleased,relaxing-calm,quiet-still,sad-lonely,angry-aggresive
amazed-suprised,0,56,13,0,10,92
happy-pleased,56,0,91,7,1,12
relaxing-calm,13,91,0,104,95,7
quiet-still,0,7,104,0,105,2
sad-lonely,10,1,95,105,0,20
angry-aggresive,92,12,7,2,20,0


In [13]:
cooccurrence_matrix = error_matrix.T.dot(error_matrix)
np.fill_diagonal(cooccurrence_matrix.values, 0)
#cooccurrence_matrix.to_csv('/Users/jiangjunhao/Desktop/cooccurrence_matrix.csv', index=False)
cooccurrence_matrix

Unnamed: 0,amazed-suprised,happy-pleased,relaxing-calm,quiet-still,sad-lonely,angry-aggresive
amazed-suprised,0,-4,-16,-9,-17,11
happy-pleased,-4,0,13,-7,-6,-18
relaxing-calm,-16,13,0,12,19,-15
quiet-still,-9,-7,12,0,39,-6
sad-lonely,-17,-6,19,39,0,-6
angry-aggresive,11,-18,-15,-6,-6,0


In [6]:
labelFile = "/Volumes/Samsung_T5/research/data/small_datasets/emotions/y.csv"
savePng = "/Users/jiangjunhao/Desktop/1.png"
label = pd.read_csv(labelFile)
dic = build_BN(labelFile, label.columns, savePng)
dic

{'amazed-suprised': {'quiet-still', 'relaxing-calm'},
 'angry-aggresive': {'relaxing-calm'},
 'happy-pleased': {'sad-lonely'},
 'quiet-still': {'amazed-suprised', 'sad-lonely'},
 'relaxing-calm': {'amazed-suprised', 'angry-aggresive'},
 'sad-lonely': {'happy-pleased', 'quiet-still'}}

{'amazed-suprised': {'relaxing-calm', 'sad-lonely'},
 'angry-aggresive': {'happy-pleased', 'sad-lonely'},
 'happy-pleased': {'angry-aggresive'},
 'quiet-still': {'sad-lonely'},
 'relaxing-calm': {'amazed-suprised'},
 'sad-lonely': {'amazed-suprised', 'angry-aggresive', 'quiet-still'}}