In [1]:
from IPython.display import display, Markdown, Latex
import pandas as pd
import numpy as np
import json
import os

## Experiments and Functions

In [2]:
# Transfer experiments analysed in this notebook.
# Each row is (id, source, target, predicate, to_predicate); 'arity' is 2 for
# every experiment. The id/source/target triple also names the results folder
# that load_data reads. Commented-out rows are excluded from the analysis.
experiments = [
    dict(zip(('id', 'source', 'target', 'predicate', 'to_predicate'), row), arity=2)
    for row in (
        ('1', 'imdb', 'uwcse', 'workedunder', 'advisedby'),
        #('2', 'uwcse', 'imdb', 'advisedby', 'workedunder'),
        ('3', 'imdb', 'cora', 'workedunder', 'samevenue'),
        #('4', 'cora', 'imdb', 'samevenue', 'workedunder'),
        ##('5', 'cora', 'imdb', 'sametitle', 'workedunder'),
        ##('6', 'imdb', 'cora', 'workedunder', 'sametitle'),
        ##('5', 'uwcse', 'cora', 'advisedby', 'samevenue'),
        ##('6', 'cora', 'uwcse', 'samevenue', 'advisedby'),
        #('7', 'yeast', 'twitter', 'proteinclass', 'accounttype'),
        #('8', 'twitter', 'yeast', 'accounttype', 'proteinclass'),
        ('9', 'nell_sports', 'nell_finances', 'teamplayssport', 'companyeconomicsector'),
        ('10', 'nell_finances', 'nell_sports', 'companyeconomicsector', 'teamplayssport'),
    )
]

In [3]:
def load_data(experiments, metric, directory):
    """Load one JSON results file per experiment.

    The file for an experiment lives at
    ``<cwd><directory><title>/<title><suffix>`` where ``title`` is
    ``<id>_<source>_<target>`` and ``suffix`` is ``_fasttext_<metric>.json``
    when ``metric`` is non-empty, or ``.json`` when ``metric`` is ``''``.

    Args:
        experiments: iterable of dicts with string 'id', 'source' and
            'target' entries.
        metric: similarity-metric name used in the file name, or '' to load
            the plain ``<title>.json`` file.
        directory: path fragment appended directly to the current working
            directory (plain string concatenation, so it must carry its own
            leading and trailing '/').

    Returns:
        dict mapping each experiment title to the parsed JSON content.

    Raises:
        FileNotFoundError (via ``open``) if a results file is missing.
    """
    root = os.getcwd() + directory
    loaded = {}
    for experiment in experiments:
        title = '_'.join((experiment['id'], experiment['source'], experiment['target']))
        suffix = '.json' if metric == '' else '_fasttext_{}.json'.format(metric)
        with open(root + title + '/' + title + suffix, 'r') as handle:
            loaded[title] = json.load(handle)
    return loaded

## Study of the Mappings

### Depth-First Mapping

In [4]:
# Path parameters: the next cell interpolates `mapping_order` and `k` into the
# results directory it loads ('/results/transfer/<mapping_order>/.../k_<k>/').
mapping_order = 'deep'
k = 1
# When True, the next cell also loads the 'no_stopwords' results and adds a
# second row (marked with '*') per experiment.
compare_stopwords = False

# Disabled: remove the Yeast/Twitter pair of experiments because there are no
# stopwords in their predicates.
#exp = experiments.copy()
#exp.remove({'id': '7', 'source':'yeast', 'target':'twitter', 'predicate':'proteinclass', 'to_predicate':'accounttype', 'arity': 2})
#exp.remove({'id': '8', 'source':'twitter', 'target':'yeast', 'predicate':'accounttype', 'to_predicate':'proteinclass', 'arity': 2})

In [12]:
# Results obtained while keeping stopwords in the predicates.
path = f'/results/transfer/{mapping_order}/w_stopwords/k_{k}/'
data_softcosine_stopwords = load_data(experiments, 'softcosine', path)
data_euclidean_stopwords = load_data(experiments, 'euclidean', path)
data_wmd_stopwords = load_data(experiments, 'wmd', path)
data_rwmd_stopwords = load_data(experiments, 'relax-wmd', path)

if compare_stopwords:
    # Results obtained after removing stopwords from the predicates.
    path = f'/results/transfer/{mapping_order}/no_stopwords/k_{k}/'
    data_softcosine_no_stopwords = load_data(experiments, 'softcosine', path)
    data_euclidean_no_stopwords = load_data(experiments, 'euclidean', path)
    data_wmd_no_stopwords = load_data(experiments, 'wmd', path)
    data_rwmd_no_stopwords = load_data(experiments, 'relax-wmd', path)

baseline_path = '/results/baselines/transfer/'
baseline_data = load_data(experiments, '', baseline_path)


def _fold_means(nested_results, section, metric):
    """Average `metric` over each inner list of a nested results structure.

    Args:
        nested_results: list of lists of dicts as loaded from a results JSON
            (presumably runs x folds -- TODO confirm).
        section: top-level key of each dict, e.g. 'transfer' or 'rdn_b'.
        metric: key inside the section, e.g. 'AUC ROC'.

    Returns:
        1-D numpy array with one mean per inner list.
    """
    return np.array([
        np.array([entry[section][metric] for entry in inner]).mean()
        for inner in nested_results
    ])


def _mean_and_spread(values):
    """Format an array as 'mean +/- 2*std' with three decimals."""
    return '%.3f +/- %.3f' % (values.mean(), 2 * values.std())


# Column order of the report; the similarity lists below follow the same order.
result_columns = ['Experiment', 'Transfer Learning - SoftCosine', 'Transfer Learning - Euclidean',
                  'Transfer Learning - WMD', 'Transfer Learning - Relax-WMD', 'TreeBoostler', 'RDN-B']
with_stopwords = [data_softcosine_stopwords, data_euclidean_stopwords,
                  data_wmd_stopwords, data_rwmd_stopwords]
without_stopwords = []
if compare_stopwords:
    without_stopwords = [data_softcosine_no_stopwords, data_euclidean_no_stopwords,
                         data_wmd_no_stopwords, data_rwmd_no_stopwords]

# Further metrics can be added to this list if the result files contain them
# (e.g. 'Recall', 'F1', 'Precision', 'Learning and Revision time',
# 'Inference time') -- TODO confirm they are present before enabling.
for metric in ['AUC ROC', 'AUC PR', 'CLL', 'Learning time']:
    display(Markdown('# Results for ' + metric))
    table = []
    for experiment in experiments:
        dataset = experiment['id'] + '_' + experiment['source'] + '_' + experiment['target']
        if dataset not in data_euclidean_stopwords:
            continue

        # Experiment '10' has no TreeBoostler value to report (presumably the
        # baseline file lacks a 'transfer' section for it -- TODO confirm), so
        # a placeholder keeps the row aligned with the column headers.
        if experiment['id'] != '10':
            treeboostler_cell = _mean_and_spread(_fold_means(baseline_data[dataset], 'transfer', metric))
        else:
            treeboostler_cell = 'NaN +/- NaN'
        rdnb_cell = _mean_and_spread(_fold_means(baseline_data[dataset], 'rdn_b', metric))
        baseline_cells = [treeboostler_cell, rdnb_cell]

        table.append(
            [dataset]
            + [_mean_and_spread(_fold_means(results[dataset], 'transfer', metric)) for results in with_stopwords]
            + baseline_cells
        )

        if compare_stopwords:
            # The '*' suffix marks the row computed without stopwords; the
            # baseline cells are the same as in the row above.
            table.append(
                [dataset + '*']
                + [_mean_and_spread(_fold_means(results[dataset], 'transfer', metric)) for results in without_stopwords]
                + baseline_cells
            )

    display(pd.DataFrame(table, columns=result_columns))

# Results for AUC ROC

Unnamed: 0,Experiment,Transfer Learning - SoftCosine,Transfer Learning - Euclidean,Transfer Learning - WMD,Transfer Learning - Relax-WMD,TreeBoostler,RDN-B
0,1_imdb_uwcse,0.937 +/- 0.002,0.947 +/- 0.001,0.937 +/- 0.003,0.939 +/- 0.004,0.939 +/- 0.005,0.936 +/- 0.006
1,3_imdb_cora,0.591 +/- 0.055,0.595 +/- 0.054,0.592 +/- 0.056,0.611 +/- 0.005,0.576 +/- 0.048,0.566 +/- 0.041
2,9_nell_sports_nell_finances,0.731 +/- 0.017,0.733 +/- 0.058,0.737 +/- 0.089,0.739 +/- 0.038,0.746 +/- 0.019,0.762 +/- 0.041
3,10_nell_finances_nell_sports,0.983 +/- 0.030,0.993 +/- 0.001,0.994 +/- 0.001,0.993 +/- 0.002,NaN +/- NaN,0.994 +/- 0.001


# Results for AUC PR

Unnamed: 0,Experiment,Transfer Learning - SoftCosine,Transfer Learning - Euclidean,Transfer Learning - WMD,Transfer Learning - Relax-WMD,TreeBoostler,RDN-B
0,1_imdb_uwcse,0.269 +/- 0.020,0.334 +/- 0.033,0.258 +/- 0.046,0.289 +/- 0.040,0.287 +/- 0.037,0.272 +/- 0.026
1,3_imdb_cora,0.457 +/- 0.052,0.459 +/- 0.050,0.456 +/- 0.052,0.474 +/- 0.003,0.450 +/- 0.077,0.431 +/- 0.033
2,9_nell_sports_nell_finances,0.077 +/- 0.014,0.085 +/- 0.027,0.093 +/- 0.041,0.080 +/- 0.021,0.092 +/- 0.009,0.089 +/- 0.017
3,10_nell_finances_nell_sports,0.300 +/- 0.070,0.298 +/- 0.035,0.309 +/- 0.049,0.318 +/- 0.036,NaN +/- NaN,0.313 +/- 0.047


# Results for CLL

Unnamed: 0,Experiment,Transfer Learning - SoftCosine,Transfer Learning - Euclidean,Transfer Learning - WMD,Transfer Learning - Relax-WMD,TreeBoostler,RDN-B
0,1_imdb_uwcse,-0.261 +/- 0.020,-0.243 +/- 0.009,-0.252 +/- 0.009,-0.246 +/- 0.014,-0.248 +/- 0.022,-0.262 +/- 0.025
1,3_imdb_cora,-0.679 +/- 0.028,-0.675 +/- 0.032,-0.676 +/- 0.034,-0.666 +/- 0.014,-0.682 +/- 0.028,-0.688 +/- 0.021
2,9_nell_sports_nell_finances,-0.327 +/- 0.004,-0.320 +/- 0.017,-0.319 +/- 0.019,-0.323 +/- 0.010,-0.307 +/- 0.003,-0.303 +/- 0.006
3,10_nell_finances_nell_sports,-0.105 +/- 0.059,-0.085 +/- 0.002,-0.085 +/- 0.001,-0.084 +/- 0.001,NaN +/- NaN,-0.085 +/- 0.001


# Results for Learning time

Unnamed: 0,Experiment,Transfer Learning - SoftCosine,Transfer Learning - Euclidean,Transfer Learning - WMD,Transfer Learning - Relax-WMD,TreeBoostler,RDN-B
0,1_imdb_uwcse,12.386 +/- 1.135,13.195 +/- 3.957,13.831 +/- 0.970,37.649 +/- 0.748,15.725 +/- 12.166,10.457 +/- 1.078
1,3_imdb_cora,110.966 +/- 8.470,112.129 +/- 18.092,112.681 +/- 11.092,137.193 +/- 16.142,247.337 +/- 126.544,133.098 +/- 12.012
2,9_nell_sports_nell_finances,159.339 +/- 55.864,127.456 +/- 7.967,153.813 +/- 25.551,214.023 +/- 51.197,278.228 +/- 97.144,42.075 +/- 9.081
3,10_nell_finances_nell_sports,590.475 +/- 133.093,640.814 +/- 252.994,496.680 +/- 166.363,486.343 +/- 136.785,NaN +/- NaN,349.809 +/- 11.872


### Ranked-First Mapping

In [None]:
# Path parameters: the next cell interpolates `mapping_order` and `k` into the
# results directory it loads ('/results/transfer/<mapping_order>/.../k_<k>/').
mapping_order = 'most_similar'
k = 1

# When True, the next cell also loads the 'no_stopwords' results and adds a
# second row (marked with '*') per experiment.
compare_stopwords = False

In [None]:
# Results obtained while keeping stopwords in the predicates.
path = f'/results/transfer/{mapping_order}/w_stopwords/k_{k}/'
data_softcosine_stopwords = load_data(experiments, 'softcosine', path)
data_euclidean_stopwords = load_data(experiments, 'euclidean', path)
data_wmd_stopwords = load_data(experiments, 'wmd', path)
data_rwmd_stopwords = load_data(experiments, 'relax-wmd', path)

if compare_stopwords:
    # Results obtained after removing stopwords from the predicates.
    path = f'/results/transfer/{mapping_order}/no_stopwords/k_{k}/'
    data_softcosine_no_stopwords = load_data(experiments, 'softcosine', path)
    data_euclidean_no_stopwords = load_data(experiments, 'euclidean', path)
    data_wmd_no_stopwords = load_data(experiments, 'wmd', path)
    data_rwmd_no_stopwords = load_data(experiments, 'relax-wmd', path)

baseline_path = '/results/baselines/transfer/'
baseline_data = load_data(experiments, '', baseline_path)


def _fold_means(nested_results, section, metric):
    """Average `metric` over each inner list of a nested results structure.

    Args:
        nested_results: list of lists of dicts as loaded from a results JSON
            (presumably runs x folds -- TODO confirm).
        section: top-level key of each dict, e.g. 'transfer' or 'rdn_b'.
        metric: key inside the section, e.g. 'AUC ROC'.

    Returns:
        1-D numpy array with one mean per inner list.
    """
    return np.array([
        np.array([entry[section][metric] for entry in inner]).mean()
        for inner in nested_results
    ])


def _mean_and_spread(values):
    """Format an array as 'mean +/- 2*std' with three decimals."""
    return '%.3f +/- %.3f' % (values.mean(), 2 * values.std())


# Column order of the report; the similarity lists below follow the same order.
result_columns = ['Experiment', 'Transfer Learning - SoftCosine', 'Transfer Learning - Euclidean',
                  'Transfer Learning - WMD', 'Transfer Learning - Relax-WMD', 'TreeBoostler', 'RDN-B']
with_stopwords = [data_softcosine_stopwords, data_euclidean_stopwords,
                  data_wmd_stopwords, data_rwmd_stopwords]
without_stopwords = []
if compare_stopwords:
    without_stopwords = [data_softcosine_no_stopwords, data_euclidean_no_stopwords,
                         data_wmd_no_stopwords, data_rwmd_no_stopwords]

# Further metrics can be added to this list if the result files contain them
# (e.g. 'Recall', 'F1', 'Precision', 'Learning and Revision time',
# 'Inference time') -- TODO confirm they are present before enabling.
for metric in ['AUC ROC', 'AUC PR', 'CLL', 'Learning time']:
    display(Markdown('# Results for ' + metric))
    table = []
    for experiment in experiments:
        dataset = experiment['id'] + '_' + experiment['source'] + '_' + experiment['target']
        if dataset not in data_euclidean_stopwords:
            continue

        # Experiment '10' has no TreeBoostler value to report (presumably the
        # baseline file lacks a 'transfer' section for it -- TODO confirm).
        # A placeholder is required: leaving the cell out makes the row one
        # value short, which shifts the RDN-B value into the TreeBoostler
        # column of the DataFrame.
        if experiment['id'] != '10':
            treeboostler_cell = _mean_and_spread(_fold_means(baseline_data[dataset], 'transfer', metric))
        else:
            treeboostler_cell = 'NaN +/- NaN'
        rdnb_cell = _mean_and_spread(_fold_means(baseline_data[dataset], 'rdn_b', metric))
        baseline_cells = [treeboostler_cell, rdnb_cell]

        table.append(
            [dataset]
            + [_mean_and_spread(_fold_means(results[dataset], 'transfer', metric)) for results in with_stopwords]
            + baseline_cells
        )

        if compare_stopwords:
            # The '*' suffix marks the row computed without stopwords; the
            # baseline cells are the same as in the row above.
            table.append(
                [dataset + '*']
                + [_mean_and_spread(_fold_means(results[dataset], 'transfer', metric)) for results in without_stopwords]
                + baseline_cells
            )

    display(pd.DataFrame(table, columns=result_columns))