In [1]:
import os
import numpy as np
import scipy.stats as stats
from scipy.stats._stats import _kendall_dis
import statistics
import time
# import warnings

## Evaluation metrics

In [17]:
# Evaluation metrics
def top_n_accuracy(pred_val, gt_val, n: int):
    """Return the overlap ratio between the top-n% predicted and ground-truth nodes.

    Each input is ranked by score in descending order (the sort is stable, so
    tied scores keep their input order). The first
    ``k = int(len(pred_val) * n / 100)`` node ids of each ranking are
    collected, and the number of ids present in both selections is divided
    by ``k``.

    Args:
        pred_val: Iterable of ``[node_id, score]`` pairs with predicted scores.
            It is not modified.
        gt_val: Iterable of ``[node_id, score]`` pairs with reference scores.
            It is not modified.
        n: Percentage of ``pred_val`` entries to keep (e.g. ``5`` for top-5%).

    Returns:
        float: ``|top_k(pred) & top_k(gt)| / k``. Returns ``0.0`` when ``k``
        is zero (empty input or a percentage too small to select any entry)
        instead of dividing by zero.
    """
    # sorted() builds new lists, so the callers' data is left untouched and
    # any iterable (not only list) is accepted.
    ranked_pred = sorted(pred_val, key=lambda pair: pair[1], reverse=True)
    ranked_gt = sorted(gt_val, key=lambda pair: pair[1], reverse=True)

    top_k = int(len(ranked_pred) * n / 100)
    if top_k <= 0:
        # Nothing selected: there is no overlap to measure.
        return 0.0

    pred_ids = {pair[0] for pair in ranked_pred[:top_k]}
    gt_ids = {pair[0] for pair in ranked_gt[:top_k]}

    return float(len(gt_ids & pred_ids) / top_k)

def Kendall_tau_distance(pred_val, gt_val):
    """Compute Kendall's tau between predicted and ground-truth scores.

    The second element of every entry is taken from each input, in input
    order, and the two resulting score lists are handed to
    ``scipy.stats.kendalltau``. Entries are paired by position, so both
    inputs must list the same items in the same order.

    Note that the returned value is the tau statistic reported by
    ``scipy.stats.kendalltau``; the accompanying p-value is discarded.

    Args:
        pred_val: Iterable of ``[node_id, score]`` entries with predictions.
        gt_val: Iterable of ``[node_id, score]`` entries with reference values.

    Returns:
        float: The tau statistic.
    """
    predicted_scores = [entry[1] for entry in pred_val]
    reference_scores = [entry[1] for entry in gt_val]

    coefficient, _p_value = stats.kendalltau(predicted_scores, reference_scores)
    return float(coefficient)

## K-BC

In [3]:
# Switch the working directory so the relative paths used by the later cells
# ('./KPATH', '../../hw1_data/...', '../../baseline_model/...') resolve from
# the BeBeCA source folder.
# NOTE(review): the target is a relative path, so running this cell a second
# time in the same kernel resolves it against the already-changed directory
# and will fail unless that nested path exists — restart the kernel before
# re-running.
os.chdir('./BeBeCA/Source_Code/')

In [4]:
# Display the current working directory to confirm where relative paths will
# be resolved from.
os.getcwd()

'/home/lin-chia/Desktop/BeBeCA/Source_Code'

### 處理資料格式讓其符合kbc的input

#### Synthetic data

In [None]:
# Rewrite each of the 30 synthetic edge lists into the k-BC input layout:
# a "<first field> <edge count>" header line followed by one "src tgt" pair
# per line.
# NOTE(review): the header's first field is hard-coded to 5000; presumably the
# node count of every synthetic graph (it matches the dataset directory name)
# — TODO confirm.
for graph_idx in range(30):
    data_path = '../../hw1_data/Synthetic/5000/' + str(graph_idx) + '.txt'
    out_data_path = '../../hw1_data/Synthetic/5000/kbc_data/' + str(graph_idx) + '.txt'

    # All edges are read before writing because the header needs their count.
    tmp_src, tmp_tgt = [], []
    with open(data_path, mode='r') as f_graph:
        for line in f_graph:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            src, tgt = line.split()
            tmp_src.append(src)
            tmp_tgt.append(tgt)
    edges = len(tmp_src)

    # Make sure the destination folder exists so open(..., 'w') cannot fail
    # on a fresh checkout.
    os.makedirs(os.path.dirname(out_data_path), exist_ok=True)
    with open(out_data_path, mode='w') as out_f_graph:
        out_f_graph.write("5000 " + str(edges) + "\n")
        for src, tgt in zip(tmp_src, tmp_tgt):
            out_f_graph.write(src + ' ' + tgt + "\n")

##### Computing betweenness centrality

In [54]:
# Run the KPATH executable once per synthetic graph and collect the wall-clock
# duration of every run, then report the mean and sample standard deviation.
# NOTE(review): the literal 4 is passed as KPATH's first argument; presumably
# the k (path length) parameter — TODO confirm against the BeBeCA docs.
wall_clock_time_list = []
for i in range(30):
    input_file = '../../hw1_data/Synthetic/5000/kbc_data/'+str(i)+'.txt'
    output_file = '../../baseline_model/k_bc/Synthetic/'+str(i)+'_score.txt'
    cmd = './KPATH 4 '+input_file+ ' '+output_file

    # perf_counter() is monotonic, so the interval cannot be distorted by
    # system clock adjustments during a run.
    t0 = time.perf_counter()
    exit_status = os.system(cmd)
    wall_clock_time = time.perf_counter() - t0

    # A failed run must not be averaged in as if it were a valid measurement.
    if exit_status != 0:
        raise RuntimeError(
            'command failed with status ' + str(exit_status) + ': ' + cmd
        )
    wall_clock_time_list.append(wall_clock_time)

print('avg excute time',sum(wall_clock_time_list)/len(wall_clock_time_list))
print('stdev excute time',statistics.stdev(wall_clock_time_list))

#### Real-world data

In [21]:
# Rewrite the YouTube edge list into the k-BC input layout:
# a "<node count> <edge count>" header line followed by one "src tgt" pair
# per line.
data_path = '../../hw1_data/youtube/com-youtube.txt'
score_path = '../../hw1_data/youtube/com-youtube_score.txt'
out_data_path = '../../hw1_data/youtube/kbc_data/com-youtube.txt'

# Get number of nodes: one "<node>:<rest>" line per node in the score file.
nodes = 0
with open(score_path, mode='r') as f_score:
    for line in f_score:
        # Unpacking keeps the original format check (exactly one ':' per line).
        _node_id, _ = line.split(':')
        nodes += 1

# Get number of edges. All edges are kept in memory because the header, which
# is written first, needs their total count.
tmp_src, tmp_tgt = [], []
with open(data_path, mode='r') as f_graph:
    for line in f_graph:
        src, tgt = line.split()
        tmp_src.append(src)
        tmp_tgt.append(tgt)
edges = len(tmp_src)

# Write the new data file for the k-BC run.
# Make sure the destination folder exists so open(..., 'w') cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(out_data_path), exist_ok=True)
with open(out_data_path, mode='w') as out_f_graph:
    out_f_graph.write(str(nodes)+" "+str(edges)+"\n")
    for src, tgt in zip(tmp_src, tmp_tgt):
        out_f_graph.write(src+' '+tgt+"\n")


##### Computing betweenness centrality

In [22]:
# Run the KPATH executable on the converted YouTube graph and report the
# wall-clock duration of the run.
# NOTE(review): the literal 4 is passed as KPATH's first argument; presumably
# the k (path length) parameter — TODO confirm against the BeBeCA docs.
input_file = '../../hw1_data/youtube/kbc_data/com-youtube.txt'
output_file = '../../baseline_model/k_bc/Youtube/com-youtube_score.txt'
cmd = './KPATH 4 '+input_file+ ' '+output_file

# perf_counter() is monotonic, so the interval cannot be distorted by system
# clock adjustments during this long-running command.
t0 = time.perf_counter()
exit_status = os.system(cmd)
wall_clock_time = time.perf_counter() - t0

# Do not report a timing for a run that did not complete successfully.
if exit_status != 0:
    raise RuntimeError(
        'command failed with status ' + str(exit_status) + ': ' + cmd
    )

print('excute time: ', wall_clock_time)

excute time:  127602.76922130585


### Testing

In [20]:
# Score the k-BC output for each of the 30 synthetic graphs against the
# reference scores, then print mean +- sample standard deviation of the
# top-1%/5%/10% overlap and of Kendall's tau.
top1_list = []
top5_list = []
top10_list = []
kendall_ist = []
for i in range(30):
    pred_val_path = '../../baseline_model/k_bc/Synthetic/' + str(i) + '_score.txt'
    gt_val_path = '../../hw1_data/Synthetic/5000/' + str(i) + '_score.txt'

    # Prediction lines are parsed as "<node_id>:<score>"; int()/float() ignore
    # the whitespace and newline surrounding each field.
    pred_list = []
    with open(pred_val_path, mode='r') as f_pred:
        for line in f_pred:
            ind, score = line.split(':')
            pred_list.append([int(ind), float(score)])

    # Ground-truth lines are parsed as "<node_id> <score>" (whitespace separated).
    gt_list = []
    with open(gt_val_path, mode='r') as f_gt:
        for line in f_gt:
            ind, score = line.split()
            gt_list.append([int(ind), float(score)])

    top1_list.append(top_n_accuracy(pred_list, gt_list, 1))
    top5_list.append(top_n_accuracy(pred_list, gt_list, 5))
    top10_list.append(top_n_accuracy(pred_list, gt_list, 10))
    # NOTE(review): Kendall_tau_distance pairs entries by position, so this
    # assumes both files list the nodes in the same order — TODO confirm.
    kendall_ist.append(Kendall_tau_distance(pred_list, gt_list))

print("top-1%:", sum(top1_list) / len(top1_list),'+-',statistics.stdev(top1_list))
print("top-5%:", sum(top5_list) / len(top5_list),'+-',statistics.stdev(top5_list))
print("top-10%:", sum(top10_list) / len(top10_list),'+-',statistics.stdev(top10_list))
print("kendall tau disance:",sum(kendall_ist) / len(kendall_ist),'+-',statistics.stdev(kendall_ist),"\n")

top-1%: 0.9953333333333334 +- 0.008603661343041535
top-5%: 0.9889333333333336 +- 0.004540494681642412
top-10%: 0.9864666666666665 +- 0.0027131014614698613
kendall tau disance: 0.9718834646929385 +- 0.0011523943180940996 

