In [1]:
import csv
import json
import pandas as pd
import networkx as nx
from collections import Counter

In [2]:
# Directed multigraph (parallel edges allowed) that later cells populate with
# metanodes as nodes and input/output relations as edges.
g = nx.MultiDiGraph()

In [3]:
# Load the tab-separated metanode table; per the preview below it carries the
# columns id, type, input, output, label, description and color.
df_metanodes = pd.read_csv('data/metanodes.tsv', sep='\t')

In [4]:
# Preview the metanode table (pandas truncates the middle rows on display).
df_metanodes

Unnamed: 0,id,type,input,output,label,description,color
0,ARCHS4SignatureResults,Data,,,ARCHS4 Signature Search Results,ARCHS4 Signatures Query Results,
1,AlleleRegistryExternalRecordsTable,Data,,,AlleleRegistryExternalRecordsTable,,
2,AlleleSpecificEvidencesTable,Data,,,Allele Specific Evidences Table,A table of allele specific evidences,
3,AnnData,Data,,,Annotated data,A gene count matrix paired with sample annotat...,
4,BokehPlot,Data,,,Bokeh Plot,A figure created with the [Bokeh Library](http...,
...,...,...,...,...,...,...,...
351,VariantInfoFromVariantTerm,Resolver,"{""variant"":""Term[Variant]""}",VariantInfo,Resolve Variant Info from Term,Resolve variant info (Allele registry API) fro...,
352,VisualizeLibrarySizes,Resolver,"{""matrix"":""GeneCountMatrix""}",PlotlyPlot,Library Size Bar Plot from Gene Count Matrix,Construct a bar plot which displays the total ...,
353,VisualizeLibrarySizesFromAnnData,Resolver,"{""anndata"":""AnnData""}",PlotlyPlot,Library Size Bar Plot from AnnData File,Construct a bar plot which displays the total ...,
354,VolcanoPlot,Resolver,"{""sig"":""GeneSignature""}",PlotlyPlot,Volcano Plot from Differential Expression Table,Construct a scatter plot which displays the lo...,


In [5]:
# Register every metanode as a graph node. The node key is the row's index in
# df_metanodes and all of the row's columns become node attributes.
# The loop variable is intentionally not named `id`, which would shadow the
# builtin id() for the rest of the kernel session.
for node_index, node in df_metanodes.iterrows():
  g.add_node(node_index, **node.to_dict())

# Same table indexed by metanode id, used for .loc lookups by id.
metanodes = df_metanodes.set_index('id')
# Map metanode id -> row index in df_metanodes (i.e. the node key used in g).
metanode_lookup = dict(zip(df_metanodes['id'], df_metanodes.index))

In [6]:
# The dump file has no header row (header=None), so the four column names are
# supplied explicitly.
df = pd.read_csv('data/dump.csv', header=None, names=['id', 'parent', 'process_type', 'input_data_type'])

In [7]:
# Preview the loaded dump table.
df

Unnamed: 0,id,parent,process_type,input_data_type
0,3af79be2-c018-4926-6298-076b7c8b4dbb,6643d55a-355e-3371-c6b7-45fc96a74513,OneScoredT[Scored[Gene]],Term[Gene]
1,15293aeb-7703-3a75-e4d4-bf8028159a0c,,FileInput,FileURL
2,ecef3f2a-4357-88b6-97d5-4199f370d6d5,15293aeb-7703-3a75-e4d4-bf8028159a0c,GeneCountMatrixFromFile,
3,031adf2a-6035-1bd4-1d7e-22db34991c80,,FileInput,FileURL
4,20946800-c926-b0cb-2ea4-2416250ca2c4,031adf2a-6035-1bd4-1d7e-22db34991c80,GeneCountMatrixFromFile,
...,...,...,...,...
2435,c689c679-f02a-3f90-06b1-873a5d94cb53,e4970b43-04fd-0362-d73a-bab64815ae3b,ExtractEnrichrTermSearch[LINCS_L1000_Chem_Pert...,
2436,5a11a546-052a-0a0c-799c-ec8fcfaef122,c689c679-f02a-3f90-06b1-873a5d94cb53,EnrichrSetTToGMT[Drug],
2437,725f2705-8a6f-1d6e-53b3-142ae21e108c,5a11a546-052a-0a0c-799c-ec8fcfaef122,GenesetsToGMT,GMT
2438,c06b0d4b-ef84-4a56-0f60-afecab26181c,725f2705-8a6f-1d6e-53b3-142ae21e108c,GMTConcatenate,


In [8]:
def _metanode_field(spec, field):
  """Return column `field` of the metanode whose id is `spec`.

  Yields None when `spec` is not present in the `metanodes` index.
  """
  if spec not in metanodes.index:
    return None
  return metanodes.loc[spec, field]

# Annotate each dump row with the 'input' / 'output' values of the metanode
# named by its process_type.
df['input'] = df['process_type'].apply(_metanode_field, args=('input',))
df['output'] = df['process_type'].apply(_metanode_field, args=('output',))

In [9]:
# Add one pair of edge sets per dump row: process -> its output type, and each
# declared input type -> process. Rows whose process_type is not a known
# metanode are skipped. Because g is a multigraph, repeated rows add parallel
# edges.
for _, edge in df.iterrows():
  if edge['process_type'] not in metanode_lookup:
    continue
  process_type_id = metanode_lookup[edge['process_type']]

  # A missing value in the metanode table comes through as NaN, which is
  # truthy; test for a non-empty string explicitly instead of relying on
  # truthiness so NaN/None cells are skipped rather than crashing.
  output_type = edge['output']
  if isinstance(output_type, str) and output_type:
    g.add_edge(process_type_id, metanode_lookup[output_type])

  input_spec = edge['input']
  if isinstance(input_spec, str) and input_spec:
    # The spec is parsed as a JSON object; only its values (type names) are used.
    for input_type in json.loads(input_spec).values():
      # A list-valued entry contributes its first element as the type name.
      if isinstance(input_type, list):
        input_type = input_type[0]
      g.add_edge(metanode_lookup[input_type], process_type_id)

In [10]:
# Serialize the graph to a GraphML file.
nx.write_graphml(g, 'data/output.graphml')

In [11]:
def n_gram(n, iterable):
  """Lazily yield each run of `n` consecutive items of `iterable` as a tuple.

  Windows overlap and advance by one item. Nothing is yielded when the
  iterable has fewer than `n` items, or when `n` is less than 1.
  """
  window = ()
  for element in iterable:
    window += (element,)
    if len(window) == n:
      yield window
      # Drop the oldest element so the next item completes the next window.
      window = window[1:]

In [16]:
# Weighted 1-, 2- and 3-gram counts. Each row of the analytics file is a hit
# count followed by a sequence of ids; every n-gram of that sequence has its
# counter increased by the row's hit count.
grams = {1: Counter(), 2: Counter(), 3: Counter()}
# newline='' is what the csv module documents for files passed to csv.reader.
with open('data/analytics.tsv', 'r', newline='') as fr:
  for row in csv.reader(fr, delimiter='\t'):
    if not row:
      # Blank line: nothing to unpack.
      continue
    hits, *ids = row
    weight = int(hits)
    for n, c in grams.items():
      # The loop variable must not be called `g`: that name is the graph
      # built earlier in the notebook and would be overwritten with a tuple.
      for gram in n_gram(n, ids):
        c[gram] += weight

In [17]:
# 1-grams ordered from highest to lowest accumulated hit count.
grams[1].most_common()

[(('FileInput',), 4189),
 (('EnrichrSetTToGMT[Phenotype]',), 2624),
 (('GMTUnion',), 2348),
 (('GeneCountMatrixFromFile',), 1907),
 (('EnrichrGenesetSearch',), 1855),
 (('Input[Gene]',), 1746),
 (('ExtractEnrichrTermSearch[MGI_Mammalian_Phenotype_Level_4_2019]',), 1394),
 (('Input[Variant]',), 1360),
 (('TargetRangerScreenTargets[GTEx_transcriptomics]',), 1293),
 (('GeneSigFromFile',), 1239),
 (('OneScoredT[Scored[Gene]]',), 1133),
 (('LINCSL1000ReverseSearch',), 1060),
 (('Input[Drug]',), 958),
 (('GeneTermFromVariantTerm',), 901),
 (('ExtractEnrichrTermSearch[Human_Phenotype_Ontology]',), 838),
 (('Input[Pathway]',), 832),
 (('EnrichrTermSearch[Pathway]',), 827),
 (('GTExTissueExpressionFromGene',), 732),
 (('UpGeneSetFromSignature',), 695),
 (('ExtractEnrichrGenesetSearch[LINCS_L1000_Chem_Pert_Consensus_Sigs]',), 664),
 (('ExtractEnrichrTermSearch[GWAS_Catalog_2019]',), 630),
 (('EnrichrSetTToGMT[Pathway]',), 605),
 (('KFTumorExpressionFromGene',), 589),
 (('Input[Phenotype]',), 585

In [18]:
# 2-grams (consecutive id pairs) ordered from highest to lowest accumulated hit count.
grams[2].most_common()

[(('FileInput', 'GeneCountMatrixFromFile'), 1887),
 (('GeneCountMatrixFromFile',
   'TargetRangerScreenTargets[GTEx_transcriptomics]'),
  1293),
 (('FileInput', 'GeneSigFromFile'), 1239),
 (('EnrichrSetTToGMT[Phenotype]', 'GMTUnion'), 1154),
 (('TargetRangerScreenTargets[GTEx_transcriptomics]',
   'OneScoredT[Scored[Gene]]'),
  963),
 (('ExtractEnrichrTermSearch[MGI_Mammalian_Phenotype_Level_4_2019]',
   'EnrichrSetTToGMT[Phenotype]'),
  914),
 (('Input[Variant]', 'GeneTermFromVariantTerm'), 846),
 (('Input[Pathway]', 'EnrichrTermSearch[Pathway]'), 803),
 (('OneScoredT[Scored[Gene]]', 'LINCSL1000ReverseSearch'), 792),
 (('EnrichrGenesetSearch',
   'ExtractEnrichrGenesetSearch[LINCS_L1000_Chem_Pert_Consensus_Sigs]'),
  664),
 (('GMTUnion', 'GMTUnion'), 636),
 (('GeneSigFromFile', 'FileInput'), 605),
 (('EnrichrSetTToGMT[Pathway]', 'GMTUnion'), 540),
 (('EnrichrSetTToGMT[Phenotype]', 'EnrichrSetTToGMT[Phenotype]'), 501),
 (('EnrichrGenesetSearch',
   'ExtractEnrichrGenesetSearch[LINCS_L1

In [19]:
# 3-grams (consecutive id triples) ordered from highest to lowest accumulated hit count.
grams[3].most_common()

[(('FileInput',
   'GeneCountMatrixFromFile',
   'TargetRangerScreenTargets[GTEx_transcriptomics]'),
  1293),
 (('GeneCountMatrixFromFile',
   'TargetRangerScreenTargets[GTEx_transcriptomics]',
   'OneScoredT[Scored[Gene]]'),
  963),
 (('TargetRangerScreenTargets[GTEx_transcriptomics]',
   'OneScoredT[Scored[Gene]]',
   'LINCSL1000ReverseSearch'),
  641),
 (('FileInput', 'GeneSigFromFile', 'FileInput'), 605),
 (('ExtractEnrichrTermSearch[MGI_Mammalian_Phenotype_Level_4_2019]',
   'EnrichrSetTToGMT[Phenotype]',
   'GMTUnion'),
  558),
 (('GeneSigFromFile', 'FileInput', 'GeneSigFromFile'), 546),
 (('Input[Pathway]',
   'EnrichrTermSearch[Pathway]',
   'ExtractEnrichrTermSearch[Human_Phenotype_Ontology]'),
  463),
 (('EnrichrTermSearch[Pathway]',
   'ExtractEnrichrTermSearch[Human_Phenotype_Ontology]',
   'ExtractEnrichrTermSearch[MGI_Mammalian_Phenotype_Level_4_2019]'),
  436),
 (('ExtractEnrichrTermSearch[Human_Phenotype_Ontology]',
   'ExtractEnrichrTermSearch[MGI_Mammalian_Phenotype_L