In [None]:
import sys, csv, os
from nplinker.nplinker import NPLinker
from nplinker.logconfig import LogConfig
%reload_ext autoreload
%autoreload 2

In [None]:
# Create the NPLinker instance from 'local_fbmn.toml' (presumably the dataset
# configuration file -- confirm against the NPLinker docs) and load its data.
npl = NPLinker('local_fbmn.toml')
npl.load_data()
# There is no need to call process_dataset any more; it is handled internally.

In [None]:
# Show the names of every scoring method that is available
# (they are defined in nplinker/scoring/methods.py), one per line.
print('Available scoring methods:')
for method_name in npl.scoring_methods:
    print(f' - {method_name}')
    
# To get an instance of a particular scoring method, pass its name to
# scoring_method(). Two instances are created here: the metcalf method and
# the "testscore" method described further down.
mc = npl.scoring_method('metcalf')
test = npl.scoring_method('testscore')

# Once you have an instance of a method you are free to change its
# parameters. For metcalf scoring these currently include:
# - cutoff (float): the scoring threshold; links with scores less than this are excluded
# - standardised (bool): True to use standardised scores (the default), False for regular scores
mc.cutoff = 3.5
mc.standardised = True

# The testscore method is just a wrapper around a metcalf object that provides a
# second method for testing. Its metcalf parameters are configured through the
# .mc attribute, like this...
test.mc.cutoff = 3.5
# ... and it also has a "value" parameter (range 0-1.0) that sets the proportion
# of the metcalf results which will be returned. Setting it to 0.5 means only the
# first 50% of whatever the actual metcalf scoring produces is returned.
# (This is not very useful in itself; it is just a way to have two methods that
# can produce different results for testing.)
test.value = 0.5

# Examples of the different ways you can ask nplinker to generate scoring results.

# 1. Simplest case: one set of objects and one scoring method. When the 3rd
# parameter of get_links is omitted, results from different methods are ANDed
# together, which makes no difference when only one method is supplied.
results = npl.get_links(npl.gcfs[:10], mc)

# "results" is an instance of the LinkCollection class, which is defined in
# nplinker/scoring/methods.py.
#
# Useful attributes/methods:
# - len(results) or .source_count: how many of the input objects were found to have links
# - .sources: a list of those objects
# - .links: a dict with structure {input_object: {linked_object: ObjectLink}}
# - .get_all_targets(): *all* the linked objects (for all sources)
#
# The link data itself lives in ObjectLink objects, which have these basic
# attributes:
# - .source: the input object to the method
# - .target: the linked object
# - .methods: a list of the methods that found this link
#
# Method-specific information for a link is retrieved by subscripting the
# ObjectLink with the appropriate method object (see below).

# Output templates, kept outside the loops so the loop bodies stay short.
header_template = 'Results for object: {}, {} total links, {} methods used'
link_template = '  --> [{}] {} | {} | shared strains = {}'

# Walk over the link information: <source> is an object originally supplied to
# get_links and <links_by_target> is a dict with structure {linked_object: ObjectLink}.
for source, links_by_target in results.links.items():
    # report the object, how many links it has, and how many methods produced them
    print(header_template.format(source, len(links_by_target), results.method_count))

    # Sorting is method-dependent because methods may have very different
    # "scores", so the method object is passed in to do it. For metcalf scoring
    # the ObjectLinks come back sorted by score in descending order by default.
    for link in results.get_sorted_links(mc, source):
        # link[method] gives the per-link data generated by that method
        # (in this case the metcalf score)
        method_names = ','.join(method.name for method in link.methods)
        print(link_template.format(method_names, link.target, link[mc], len(link.shared_strains)))

    # Alternatively, if ordering does not matter, iterate directly over the
    # linked objects like this:
    #for link_target, link in links_by_target.items():
    #    print(link_target, link)
    
# Some simple filtering can be applied to the results
# (NOTE: every one of these calls modifies the original data).
def has_even_id(obj):
    """Return True when the object's id is an even number."""
    return obj.id % 2 == 0


def metcalf_score_above_3_6(link):
    """Return True when the link's metcalf data is greater than 3.6."""
    return link[mc] > 3.6


# a) exclude any sources for which an arbitrary function is false. The sources
#    are the objects that were passed to get_links, i.e. GCFs in this example.
results.filter_sources(has_even_id)
# b) exclude any linked objects for which an arbitrary function is false (here
#    these are whatever the GCFs were linked to -- presumably Spectrum objects).
results.filter_targets(has_even_id)
# c) exclude any links for which an arbitrary function is false (the "link"
#    passed to the function is an ObjectLink).
results.filter_links(metcalf_score_above_3_6)
            
# 2. Use the same set of objects with two different methods and AND the results
# together, so that objects are only returned if they have links according to
# both of the supplied methods (and_mode=True).
#
# If you provide 2 or more scoring methods but only a single set of objects,
# that set will be used as input to every method.
results = npl.get_links(npl.gcfs[:10], [test, mc], and_mode=True)

# 3. The same thing as #2, but showing how to provide a (potentially different) set
# of objects for each method: the first argument is a list of object sets, given
# in the same order as the list of methods -- TODO confirm the pairing order.
# The results are ORed instead (and_mode=False), so different methods can return
# different numbers of results.
# NOTE: each call rebinds `results`, replacing the previous collection.
results = npl.get_links([npl.gcfs[:10], npl.gcfs[:10]], [test, mc], and_mode=False)