In [None]:
import os
import sys
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange
import scipy.sparse as sp
# import wandb
# import weave

# Metrics

In [None]:
# base_path = '../results/metrics/'
# nsms = os.listdir(base_path)
# methods = os.listdir('')
# nsm = 'random'
# method = 'sl2mf'
# path = f'../results/metrics/{nsm}/{method}'
# res_name = os.listdir(path)
# # res_name
# pd.read_csv(f'{path}/{res_name[1]}').loc[2,:].values[3:].astype(float)

In [None]:
# Collect the per-scenario metric files under ../results/metrics/<nsm>/<method>/ into
# one summary table per (nsm, method), keep them in res_dict and write each to CSV.
base_path = '../results/metrics/'
summary_dir = '../results/summary'
summary_res_df = pd.DataFrame(columns=['AUROC', 'AUPR', 'F1', 'N10', 'N20', 'N50', 'R10', 'R20', 'R50', 'P10', 'P20', 'P50', 'M10', 'M20', 'M50'],
                              index = ['CV1,1','CV2,1','CV3,1','CV1,5','CV2,5','CV3,5','CV1,20','CV2,20','CV3,20','CV1,50','CV2,50','CV3,50'])


def _visible_entries(folder, want_dir):
    """Return the non-hidden entries of `folder` that are directories (want_dir=True) or files (want_dir=False)."""
    keep = os.path.isdir if want_dir else os.path.isfile
    return [name for name in os.listdir(folder)
            if not name.startswith('.') and keep(os.path.join(folder, name))]


# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(summary_dir, exist_ok=True)

# Hidden entries (.DS_Store, .ipynb_checkpoints, ...) are skipped at every level so
# they are not mistaken for a sampling method, a model or a result file.
nsms = _visible_entries(base_path, want_dir=True)
res_dict = {}
for nsm in nsms:
    res_dict[nsm] = {}
    nsm_dir = os.path.join(base_path, nsm)
    for method in _visible_entries(nsm_dir, want_dir=True):
        method_dir = os.path.join(nsm_dir, method)
        summary = summary_res_df.copy()
        for n in _visible_entries(method_dir, want_dir=False):
            res = pd.read_csv(os.path.join(method_dir, n))
            # Take the row labelled 2 and drop its first three columns; the rest is
            # written in summary-column order.
            # NOTE(review): presumably row 2 is the aggregated result row - confirm.
            res = res.loc[2,:].values[3:].astype(float)
            n_split = n.split('_')
            # Field 3 of the file name is the pos/neg ratio (e.g. 0.05) and field 4 the
            # CV setting; round() keeps 1/ratio from being truncated one too low.
            cvx, pnr = n_split[4], int(round(1/float(n_split[3])))
            summary.loc[f'{cvx},{pnr}', :] = res
        res_dict[nsm][method] = summary
        summary.to_csv(os.path.join(summary_dir, f'summary_{nsm}_{method}.csv'))


In [None]:
# Column labels of the wide summary table: two identifier columns followed by one
# '<CV>,<ratio>,<metric>' label for every scenario/metric pair (scenario-major order).
all_col_name = ['Model', 'NSM'] + [
    scenario + ',' + metric
    for scenario in ['CV1,1','CV2,1','CV3,1','CV1,5','CV2,5','CV3,5','CV1,20','CV2,20','CV3,20','CV1,50','CV2,50','CV3,50']
    for metric in ['AUROC', 'AUPR', 'F1', 'N10', 'N20', 'N50', 'R10', 'R20', 'R50', 'P10', 'P20', 'P50', 'M10', 'M20', 'M50']
]
len(all_col_name)

In [None]:
# Build the wide table: one row per (negative sampling method, model) pair, with the
# model name, the sampling method and then every '<CV>,<ratio>,<metric>' value.
nsms = ['random', 'exp', 'dep']
models = ['sl2mf', 'slmgae', 'cmfw', 'ddgcn', 'gcatsl', 'grsmf', 'kg4sl', 'mge4sl', 'nsf4sl', 'pilsl', 'ptgnn']

# The table size follows from the lists above instead of hard-coded 33 x 182, so
# adding a model, a sampling method or a column cannot leave it out of step.
score_values = np.zeros((len(nsms)*len(models), len(all_col_name)), dtype=object)
for i, (nsm, model) in enumerate((n, m) for n in nsms for m in models):
    score_values[i,0] = model
    score_values[i,1] = nsm
    for j in range(2, len(all_col_name)):
        # Column labels look like 'CV1,20,AUPR': the first two fields select the row of
        # the per-method summary table, the last one its column.
        cvx,pnr,met = all_col_name[j].split(',')
        score_values[i,j] = res_dict[nsm][model].loc[cvx+','+pnr, met]

In [None]:
# Wrap the filled score matrix in a DataFrame whose columns are labelled by all_col_name.
new_df = pd.DataFrame(data = score_values,columns=all_col_name)

In [None]:
# Write the wide summary table to the current working directory, without the row index.
# NOTE(review): 'matrics' in the file name looks like a typo for 'metrics'; it is left
# unchanged here because other tooling may already read this file name - confirm.
new_df.to_csv('summary_all_matrics.csv', index=False)

In [None]:
# Sanity check: show which method keys res_dict holds under 'random'.
res_dict['random'].keys()

# Score distribution

#### Note: the following code generates the score-distribution figures for the paper.
#### **You must use the `--save_mat` parameter for the following code to run successfully.**

In [None]:
# List the saved score matrices of one model directory and split the file names by
# the task tag they contain.
nsm = 'random'
base_path = f'../results/{nsm}_score_mat/'
models = os.listdir(base_path)
# NOTE(review): os.listdir returns entries in arbitrary order, so models[-2] is not
# guaranteed to be the same directory on every machine.
res_names = os.listdir(f'{base_path}/{models[-2]}')
res_names_classify, res_names_ranking = (
    [fname for fname in res_names if tag in fname]
    for tag in ('classify', 'ranking')
)


In [None]:
# Load the CV1_1 data split file; it unpacks into a positive and a negative part.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load split files
# that come from a trusted source.
pos_samples, neg_samples = np.load('../data/data_split/CV1_1.npy',allow_pickle=True)
# Each part unpacks into four items; only the last two are kept. They are indexed by
# fold number further down, so presumably they hold the per-fold train/test pairs.
_, _, train_pos_kfold, test_pos_kfold = pos_samples
_, _, train_neg_kfold, test_neg_kfold = neg_samples

In [None]:
# Inspect the shape of the first entry of the positive training split.
train_pos_kfold[0].shape

In [None]:
# Collect, for each of the five folds, the SLMGAE ranking scores of the train/test
# positive and negative sample pairs (CV1, pos/neg ratio 1.0).
cvx = ['CV1', 'CV2', 'CV3']
pnrs = [1.0,0.2,0.05,0.02]
train_pos_scores = []
train_neg_scores = []
test_pos_scores =[]
test_neg_scores =[]

# The file name below is fixed to SLMGAE, so the directory is named explicitly too.
# It used to be models[-2], which depends on the arbitrary order of os.listdir and on
# `models` not having been reassigned by a later cell.
score_model = 'slmgae'
for fold_num in range(5):
    score_mat = np.load(f'{base_path}/{score_model}/{score_model}_fold_{fold_num}_pos_neg_1.0_CV1_Random_ranking.npy')
    for bucket, kfold in ((train_pos_scores, train_pos_kfold),
                          (train_neg_scores, train_neg_kfold),
                          (test_pos_scores, test_pos_kfold),
                          (test_neg_scores, test_neg_kfold)):
        # Columns 0 and 1 of the fold's pair array index the rows and columns of score_mat.
        pairs = kfold[fold_num]
        bucket.append(score_mat[pairs[:,0], pairs[:,1]])

In [None]:
# Close every open matplotlib figure before the plotting cells below.
plt.close('all')

In [None]:
def except_abnormal(data, lower=5, upper=95):
    """Drop extreme values, keeping only those between two percentiles (inclusive).

    Args:
        data: array-like of numeric scores.
        lower: percentile used as the lower bound (default 5).
        upper: percentile used as the upper bound (default 95).

    Returns:
        numpy array with the values v satisfying
        percentile(lower) <= v <= percentile(upper). Empty input is returned
        unchanged (as an array), because percentiles of nothing are undefined.
    """
    data = np.asarray(data)  # lets plain lists be masked below
    if data.size == 0:
        return data
    low, high = np.percentile(data, [lower, upper])
    return data[(data >= low) & (data <= high)]


In [None]:
def extract_data(cv,pnr,fold_num,base_path,model):
    """Load one fold's classification scores and split them by sample group.

    Reads the data split file for (`cv`, `pnr`) and the score matrix saved for
    `model` under `base_path`, looks up the score of every sample pair of fold
    `fold_num` and trims each group with `except_abnormal`.

    Returns:
        Tuple (train_pos, train_neg, test_pos, test_neg) of trimmed score arrays.
    """
    split_file = f'../data/data_split/{cv}_{int(1/float(pnr))}.npy'
    pos_samples, neg_samples = np.load(split_file, allow_pickle=True)
    _, _, train_pos_kfold, test_pos_kfold = pos_samples
    _, _, train_neg_kfold, test_neg_kfold = neg_samples

    score_mat = np.load(f'{base_path}/{model}/{model}_fold_{fold_num}_pos_neg_{pnr}_{cv}_Random_classify.npy')

    def _trimmed_scores(kfold):
        # Columns 0 and 1 of the fold's pair array index rows and columns of score_mat.
        pairs = kfold[fold_num]
        return except_abnormal(score_mat[pairs[:,0], pairs[:,1]])

    groups = (train_pos_kfold, train_neg_kfold, test_pos_kfold, test_neg_kfold)
    return tuple(_trimmed_scores(kfold) for kfold in groups)

In [None]:
def load_score_data(train_pos_score, train_neg_score, test_pos_score, test_neg_score):
    """Stack four score groups into one long-format DataFrame for plotting.

    Args:
        train_pos_score, train_neg_score, test_pos_score, test_neg_score:
            1-D array-likes of scores, one per sample group.

    Returns:
        DataFrame with columns 'Predict score' and 'Sample type', the groups stacked
        in the order train-pos, train-neg, test-pos, test-neg, with a unique
        0..n-1 index.
    """
    labelled = [
        ('$Train_{pos}$', train_pos_score),
        ('$Train_{neg}$', train_neg_score),
        ('$Test_{pos}$', test_pos_score),
        ('$Test_{neg}$', test_neg_score),
    ]
    # DataFrame.append was removed in pandas 2.0. Building the frame once also keeps
    # the score column numeric and avoids the duplicated index labels that repeated
    # appends produced.
    return pd.DataFrame({
        'Predict score': np.concatenate([np.asarray(scores) for _, scores in labelled]),
        'Sample type': [label for label, scores in labelled for _ in range(len(scores))],
    })

In [None]:
def normalize_data(all_score):
    """Min-max rescale every score array with one shared minimum and maximum.

    Args:
        all_score: dict mapping a key to a 4-tuple of score arrays
            (train_pos, train_neg, test_pos, test_neg).

    Returns:
        New dict with the same keys; each value is a 4-tuple of arrays rescaled as
        (score - smallest) / (biggest - smallest), where smallest and biggest are
        taken over every array of every key.
    """
    highs, lows = [-np.inf], [np.inf]
    for quad in all_score.values():
        tr_pos, tr_neg, te_pos, te_neg = quad
        highs.append(max(tr_pos.max(), tr_neg.max(), te_pos.max(), te_neg.max()))
        lows.append(min(tr_pos.min(), tr_neg.min(), te_pos.min(), te_neg.min()))
    biggest_score = max(highs)
    smallest_score = min(lows)
    span = biggest_score - smallest_score

    def _rescale(scores):
        return (scores - smallest_score) / span

    return {key: tuple(_rescale(scores) for scores in quad)
            for key, quad in all_score.items()}

In [None]:
def prepare_data(fold_num,base_path,model):
    """Build one plotting DataFrame per scenario for a given fold and model.

    For each of the five fixed '<CV>_<ratio>' scenarios the scores are extracted with
    `extract_data`, then all scenarios are rescaled together with `normalize_data`
    and finally converted with `load_score_data`.

    Returns:
        Dict mapping the scenario string to its long-format DataFrame.
    """
    scenarios = ['CV1_1.0','CV2_1.0','CV3_1.0','CV1_0.2','CV1_0.05']
    raw_scores = {}
    for scenario in scenarios:
        cv, pnr = scenario.split('_')
        raw_scores[scenario] = extract_data(cv,pnr,fold_num,base_path,model)
    scaled_scores = normalize_data(raw_scores)
    return {scenario: load_score_data(*quad) for scenario, quad in scaled_scores.items()}

In [None]:
# Draw, for every fold, a 2x3 grid of score-density plots for PiLSL (one panel per
# scenario returned by prepare_data) and save it as an SVG file.
nsm = 'random'
base_path = f'../results/{nsm}_score_mat/'
models = ['pilsl']
# (row, column) of the panel used for the 1st, 2nd, ... scenario.
fig_pos = [[0,0],[0,1],[0,2],[1,0],[1,1],[1,2]]

for fold_num in range(5):
    for model in models:
        fig, axes = plt.subplots(2, 3, figsize=(12, 8))
        plot_legend = False
        processed_df = prepare_data(fold_num,base_path,model)
        for (row, col), (k, scenario_df) in zip(fig_pos, processed_df.items()):
            cv, pnr = k.split('_')
            panel = axes[row, col]
            sns.kdeplot(data=scenario_df, x='Predict score', hue='Sample type', fill=True, common_norm=False, alpha=.4,
                        linewidth=0.5, ax=panel, legend=plot_legend)
            panel.set_title(f'PiLSL ({cv}, 1:{int(1/float(pnr))})')
            panel.set_xlim(-0.1, 1.1)
        plt.savefig(f"../results/score_dist/{model}_{fold_num}_Random.svg", bbox_inches='tight')
        # plt.savefig(f"../score_dist/{model}_{fold_num}_Random.svg",format='svg', bbox_inches='tight')

In [None]:
# Draw, for every fold and model, a 2x3 grid of score-density plots and save it as SVG.
# CV1 is shown at every pos/neg ratio; CV2 and CV3 only at ratio 1.0.
cvx = ['CV1', 'CV2', 'CV3']
pnrs = [1.0,0.2,0.05,0.02]
nsm = 'random'
base_path = f'../results/{nsm}_score_mat/'
models = ['gcatsl','slmgae','sl2mf','cmfw','ddgcn','grsmf','kg4sl','mge4sl','nsf4sl','ptgnn']
# (row, column) of the panel used for the 1st, 2nd, ... scenario.
fig_pos = [[0,0],[1,0],[1,1],[1,2],[0,1],[0,2]]

# Panels in plotting order: the four CV1 ratios first, then CV2 and CV3 at 1.0.
scenarios = [(cv, pnr) for cv in cvx for pnr in (pnrs if cv == 'CV1' else [1.0])]

for fold_num in range(5):
    for model in models:
        fig, axes = plt.subplots(2, 3, figsize=(12, 8))
        for fig_num, (cv, pnr) in enumerate(scenarios):
            row, col = fig_pos[fig_num]
            panel = axes[row, col]
            # Only the last panel carries the legend.
            plot_legend = fig_num == len(scenarios) - 1

            # load_score_data takes the four score arrays, not (cv, pnr, ...); calling it
            # with the five scenario arguments raised TypeError, so the arrays are
            # extracted first.
            # NOTE(review): the scores are not rescaled with normalize_data here, unlike
            # in prepare_data - confirm that the fixed x-limits below suit raw scores.
            plot_df = load_score_data(*extract_data(cv,pnr,fold_num,base_path,model))

            sns.kdeplot(data=plot_df, x='Predict score', hue='Sample type', fill=True, common_norm=False, alpha=.4,
                        linewidth=0.5, ax=panel, legend=plot_legend)
            panel.set_title(f'{model.upper()} ({cv}, 1:{int(1/float(pnr))})')
            panel.set_xlim(-0.1, 1.1)

        plt.savefig(f"../results/score_dist/{model}_{fold_num}_Random.svg",format='svg', bbox_inches='tight')

In [None]:
# Close every open matplotlib figure.
plt.close('all')

In [None]:
# Extract the untrimmed PTGNN classification scores of one fold for CV1 at ratio 1.0.
# NOTE(review): fold_num and base_path are not set in this cell; they are whatever an
# earlier cell left behind, so this cell only runs after one of the cells above.
cv, pnr, model = 'CV1', 1.0, 'ptgnn'
pos_samples, neg_samples = np.load(f'../data/data_split/{cv}_{int(1/float(pnr))}.npy',allow_pickle=True)
_, _, train_pos_kfold, test_pos_kfold = pos_samples
_, _, train_neg_kfold, test_neg_kfold = neg_samples

score_mat = np.load(f'{base_path}{model}/{model}_fold_{fold_num}_pos_neg_{pnr}_{cv}_Random_classify.npy')
# Columns 0 and 1 of each fold's pair array index the rows and columns of score_mat.
train_pos_score, train_neg_score, test_pos_score, test_neg_score = (
    score_mat[kfold[fold_num][:,0], kfold[fold_num][:,1]]
    for kfold in (train_pos_kfold, train_neg_kfold, test_pos_kfold, test_neg_kfold)
)

# Running time

In [None]:
# Load the iris dataset as DataFrames and add a 'target' column that maps each row's
# target code to its name, then display the result.
# NOTE(review): nothing else in the visible notebook uses `df` - possibly a scratch cell.
from sklearn.datasets import load_iris
iris = load_iris(as_frame=True)
df = iris.data.assign(target=iris.target_names[iris.target])
df

In [None]:
from weave.monitoring import StreamTable

In [None]:
# The `import wandb` line in the first cell is commented out, so without this import
# the cell stops with NameError on a fresh kernel.
import wandb

# Query the runs of the 'slbench/Benchmarking' project through the W&B public API.
api = wandb.Api()
done_methods = ['KG4SL','NSF4SL','PTGNN','SLMGAE','PiLSL','CMFW','DDGCN','SL2MF','GRSMF','GCATSL','MGE4SL']
# done_methods = ['PiLSL']
runs = api.runs('slbench/Benchmarking')

In [None]:
# Scan the logged history of the first run, restricted to the 'test_M10' key.
runs[0].scan_history(keys = ['test_M10'])

In [None]:
# NOTE(review): runs[0].history is passed without being called, so StreamTable receives
# the attribute itself rather than history data - confirm this is what StreamTable expects.
StreamTable(runs[0].history)