In [1]:
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Latex
import numpy as np
import json
import pandas as pd
from scipy import stats
import os

# Experiments and Functions

In [51]:
# Transfer-learning experiments analysed by this notebook. Each entry names a
# source/target dataset pair together with the predicate mapped between them;
# load_data() combines 'id', 'source' and 'target' into the results directory
# name. Entries that are commented out are excluded from the analysis.
experiments = [
    #{'id': '1', 'source':'imdb', 'target':'uwcse', 'predicate':'workedunder', 'to_predicate':'advisedby', 'arity': 2},
    #{'id': '2', 'source':'uwcse', 'target':'imdb', 'predicate':'advisedby', 'to_predicate':'workedunder', 'arity': 2},
    #{'id': '3', 'source':'imdb', 'target':'cora', 'predicate':'workedunder', 'to_predicate':'samevenue', 'arity': 2},
    #{'id': '4', 'source':'cora', 'target':'imdb', 'predicate':'samevenue', 'to_predicate':'workedunder', 'arity': 2},
    ##{'id': '5', 'source':'uwcse', 'target':'cora', 'predicate':'advisedby', 'to_predicate':'samevenue', 'arity': 2},
    ##{'id': '6', 'source':'cora', 'target':'uwcse', 'predicate':'samevenue', 'to_predicate':'advisedby', 'arity': 2},
    #{'id': '7', 'source':'yeast', 'target':'twitter', 'predicate':'proteinclass', 'to_predicate':'accounttype', 'arity': 2},
    #{'id': '8', 'source':'twitter', 'target':'yeast', 'predicate':'accounttype', 'to_predicate':'proteinclass', 'arity': 2},
    dict(
        id='9',
        source='nell_sports',
        target='nell_finances',
        predicate='teamplayssport',
        to_predicate='companyeconomicsector',
        arity=2,
    ),
    #{'id': '10', 'source':'nell_finances', 'target':'nell_sports', 'predicate':'companyeconomicsector', 'to_predicate':'teamplayssport', 'arity': 2}
]

In [10]:
def load_data(experiments, metric, directory):
    """Read one JSON result file per experiment and return them keyed by title.

    Args:
        experiments: iterable of dicts providing the string keys 'id',
            'source' and 'target'.
        metric: similarity-metric name used in the file name
            ('<title>_fasttext_<metric>.json'); pass '' to read
            '<title>.json' instead.
        directory: path fragment appended directly to the current working
            directory, so it must start and end with '/'.

    Returns:
        dict mapping '<id>_<source>_<target>' to the parsed JSON content.

    Raises:
        OSError: if an expected result file cannot be opened.
    """
    # The file-name suffix depends only on the metric, so pick it once.
    suffix = '_fasttext_{}.json'.format(metric) if metric != '' else '.json'

    loaded = {}
    for experiment in experiments:
        title = '_'.join([experiment['id'], experiment['source'], experiment['target']])
        # Plain concatenation is deliberate: `directory` begins with '/', which
        # os.path.join would interpret as an absolute path and drop the cwd.
        file_path = os.getcwd() + directory + title + '/' + title + suffix
        with open(file_path, 'r') as handle:
            loaded[title] = json.load(handle)
    return loaded

# Study of the Mappings

## Depth-First Mapping

In [52]:
# Path parameters: they select which result directories the analysis cell reads.
compare_stopwords = False  # not referenced by any cell visible in this section
k = 1                      # interpolated into the 'k_{k}' directory name
mapping_order = 'deep'     # interpolated into '/results/transfer/{mapping_order}/'

# Disabled: drop the Yeast/Twitter pair of experiments, because there are no
# stopwords in their predicates.
#exp = experiments.copy()
#exp.remove({'id': '7', 'source':'yeast', 'target':'twitter', 'predicate':'proteinclass', 'to_predicate':'accounttype', 'arity': 2})
#exp.remove({'id': '8', 'source':'twitter', 'target':'yeast', 'predicate':'accounttype', 'to_predicate':'proteinclass', 'arity': 2})

In [50]:
def _mean_over_runs(runs, extract):
    """Average a per-entry value over every element of the outer list.

    Args:
        runs: list of lists of result dicts. The outer level is what gets
            averaged over (presumably repeated runs, with the inner level being
            folds — TODO confirm against the code that writes the JSON files).
        extract: callable mapping one result dict to a number.

    Returns:
        np.ndarray with one averaged value per inner position.
    """
    total = np.zeros(len(runs[0]))
    # Iterate over the outer list itself. The previous code took the loop bound
    # from the inner length of the baseline data, which only matched by
    # coincidence when both lengths were equal.
    for run in runs:
        total += np.array([extract(item) for item in run])
    return total / len(runs)


def _pvalue_row(label, scores, tree_scores, rdnb_scores):
    """Build one p-value table row from paired t-tests against both baselines."""
    _, p_tree = stats.ttest_rel(scores, tree_scores)
    _, p_rdnb = stats.ttest_rel(scores, rdnb_scores)
    return [label, '%.3f' % (p_tree), '%.3f' % (p_rdnb)]


def _print_series(label, values):
    """Print a label followed by the comma-separated values."""
    print(label, ','.join(str(v) for v in values))


# TransBoostler results with theory revision.
path = f'/results/transfer/{mapping_order}/w_stopwords/k_{k}/'
data_softcosine = load_data(experiments, 'softcosine', path)
data_euclidean = load_data(experiments, 'euclidean', path)
data_wmd = load_data(experiments, 'wmd', path)
data_rwmd = load_data(experiments, 'relax-wmd', path)

# TransBoostler results without theory revision (marked with '*' in the output).
path = f'/results/transfer/{mapping_order}/w_stopwords/k_{k}_no_revision/'
data_softcosine_no_rev = load_data(experiments, 'softcosine', path)
data_euclidean_no_rev = load_data(experiments, 'euclidean', path)
data_wmd_no_rev = load_data(experiments, 'wmd', path)
data_rwmd_no_rev = load_data(experiments, 'relax-wmd', path)

# Baseline results (read below through the 'transfer' and 'rdn_b' keys).
baseline_path = '/results/baselines/transfer/'
baseline_data = load_data(experiments, '', baseline_path)

# (display label, results with revision, results without revision), in the
# order in which rows and series are reported.
similarity_results = [
    ('Soft Cosine', data_softcosine, data_softcosine_no_rev),
    ('Euclidean', data_euclidean, data_euclidean_no_rev),
    ('WMD', data_wmd, data_wmd_no_rev),
    ('Relax-WMD', data_rwmd, data_rwmd_no_rev),
]

for experiment in experiments:
    dataset = experiment['id'] + '_' + experiment['source'] + '_' + experiment['target']
    display(Markdown('# Results for ' + dataset))

    for metric in ['CLL', 'AUC ROC', 'AUC PR']: #, 'Learning and Revision time', 'Inference time']:
        display(Markdown('## ' + metric))

        # Averaged TransBoostler scores, one (label, array) pair per similarity.
        revised = [
            (label, _mean_over_runs(with_rev[dataset], lambda item: item['transfer'][metric]))
            for label, with_rev, _ in similarity_results
        ]
        unrevised = [
            (label, _mean_over_runs(no_rev[dataset], lambda item: item['transfer'][metric]))
            for label, _, no_rev in similarity_results
        ]

        # Averaged baseline scores.
        baseline_runs = baseline_data[dataset]
        tree = _mean_over_runs(baseline_runs, lambda item: item['transfer'][metric])
        tree_no_rev = _mean_over_runs(baseline_runs, lambda item: item['transfer']['parameter'][metric])
        rdnb = _mean_over_runs(baseline_runs, lambda item: item['rdn_b'][metric])
        # The original code read the very same 'rdn_b' values for the
        # no-revision comparison, so the same array is reused here.
        rdnb_no_rev = rdnb

        # Paired t-tests: revised variants against the revised baselines, then
        # the no-revision variants against the no-revision baselines. Every
        # no-revision row now uses its own no-revision scores (the Relax-WMD
        # row previously tested the revised scores against TreeBoostler*).
        pvalue_table = [
            _pvalue_row('TransBoostler ' + label, scores, tree, rdnb)
            for label, scores in revised
        ]
        pvalue_table += [
            _pvalue_row('TransBoostler* ' + label, scores, tree_no_rev, rdnb_no_rev)
            for label, scores in unrevised
        ]
        display(pd.DataFrame(pvalue_table, columns=['p-value', 'TreeBoostler', 'RDN-B']))

        # Raw averaged series, in the same order as before. The Relax-WMD
        # no-revision series is now labelled as such instead of repeating the
        # 'TransBoostler* WMD' label.
        for label, scores in revised:
            _print_series('TransBoostler ' + label, scores)
        _print_series('TreeBoostler', tree)
        for label, scores in unrevised:
            _print_series('TransBoostler* ' + label, scores)
        _print_series('TreeBoostler*', tree_no_rev)
        _print_series('RDN-B', rdnb)
       

# Results for 10_nell_finances_nell_sports

## CLL

Unnamed: 0,p-value,TreeBoostler,RDN-B
0,TransBoostler Soft Cosine,0.0,0.445
1,TransBoostler Euclidean,0.0,0.574
2,TransBoostler WMD,0.0,0.297
3,TransBoostler Relax-WMD,0.0,0.328
4,TransBoostler* Soft Cosine,0.0,0.0
5,TransBoostler* Euclidean,0.0,0.0
6,TransBoostler* WMD,0.0,0.0
7,TransBoostler* Relax-WMD,0.0,0.0


TransBoostler Soft Cosine -0.08438733333333333,-0.14791866666666667,-0.08346733333333334
TransBoostler Euclidean -0.085682,-0.08486333333333333,-0.08446466666666667
TransBoostler WMD -0.08638733333333333,-0.08490833333333332,-0.08464733333333334
TransBoostler Relax-WMD -0.084913,-0.08352833333333333,-0.08502066666666667
TreeBoostler 0.0,0.0,0.0
TransBoostler* Soft Cosine -0.373484,-0.36630633333333334,-0.372123
TransBoostler* Euclidean -0.3653056666666667,-0.370523,-0.368628
TransBoostler* WMD -0.37368666666666667,-0.37293933333333334,-0.3751393333333333
TransBoostler* WMD -0.363486,-0.3689103333333333,-0.365693
TreeBoostler* 0.0,0.0,0.0
RDN-B -0.085418,-0.08386833333333334,-0.08489866666666668


## AUC ROC

Unnamed: 0,p-value,TreeBoostler,RDN-B
0,TransBoostler Soft Cosine,0.0,0.485
1,TransBoostler Euclidean,0.0,0.812
2,TransBoostler WMD,0.0,0.29
3,TransBoostler Relax-WMD,0.0,0.185
4,TransBoostler* Soft Cosine,0.0,0.0
5,TransBoostler* Euclidean,0.0,0.0
6,TransBoostler* WMD,0.0,0.0
7,TransBoostler* Relax-WMD,0.0,0.0


TransBoostler Soft Cosine 0.9960656666666666,0.957949,0.9959233333333334
TransBoostler Euclidean 0.9956663333333333,0.9918713333333334,0.9925756666666666
TransBoostler WMD 0.994744,0.9928366666666667,0.9938146666666667
TransBoostler Relax-WMD 0.9939096666666667,0.9918619999999999,0.9936153333333334
TreeBoostler 0.0,0.0,0.0
TransBoostler* Soft Cosine 0.48187,0.4839166666666667,0.4872373333333333
TransBoostler* Euclidean 0.48196333333333335,0.48771766666666666,0.4953343333333333
TransBoostler* WMD 0.4816766666666667,0.48927699999999996,0.48981366666666665
TransBoostler* WMD 0.48569833333333334,0.4962856666666666,0.492752
TreeBoostler* 0.0,0.0,0.0
RDN-B 0.9946986666666667,0.9922093333333333,0.9937016666666666


## AUC PR

Unnamed: 0,p-value,TreeBoostler,RDN-B
0,TransBoostler Soft Cosine,0.0,0.806
1,TransBoostler Euclidean,0.0,0.466
2,TransBoostler WMD,0.0,0.855
3,TransBoostler Relax-WMD,0.0,0.72
4,TransBoostler* Soft Cosine,0.0,0.0
5,TransBoostler* Euclidean,0.0,0.0
6,TransBoostler* WMD,0.0,0.0
7,TransBoostler* Relax-WMD,0.0,0.0


TransBoostler Soft Cosine 0.33943833333333334,0.2075823333333333,0.35328933333333334
TransBoostler Euclidean 0.30529000000000006,0.26551033333333335,0.3219096666666667
TransBoostler WMD 0.31409533333333334,0.273001,0.34030933333333335
TransBoostler Relax-WMD 0.3297853333333333,0.333119,0.2925526666666667
TreeBoostler 0.0,0.0,0.0
TransBoostler* Soft Cosine 0.002353333333333333,0.002316,0.00237
TransBoostler* Euclidean 0.0023586666666666665,0.002396333333333333,0.0024386666666666667
TransBoostler* WMD 0.0023393333333333335,0.0023983333333333335,0.002415
TransBoostler* WMD 0.0023453333333333334,0.0024246666666666666,0.002387666666666667
TreeBoostler* 0.0,0.0,0.0
RDN-B 0.31262266666666666,0.31537699999999996,0.312174
