In [1]:
import csv
import json
import pandas as pd
import networkx as nx
from collections import Counter

In [2]:
# Directed multigraph that later cells populate: one node per metanode row,
# and one edge per input/output relationship found in the dump.
g = nx.MultiDiGraph()

In [3]:
# Load the tab-separated metanode table.
df_metanodes = pd.read_csv('data/metanodes.tsv', sep='\t')

In [4]:
# Preview the loaded metanode table.
df_metanodes

Unnamed: 0,id,type,input,output,label,description,color
0,ARCHS4SignatureResults,Data,,,ARCHS4 Signature Search Results,ARCHS4 Signatures Query Results,
1,AlleleRegistryExternalRecordsTable,Data,,,AlleleRegistryExternalRecordsTable,,
2,AlleleSpecificEvidencesTable,Data,,,Allele Specific Evidences Table,A table of allele specific evidences,
3,AnnData,Data,,,Annotated data,A gene count matrix paired with sample annotat...,
4,BokehPlot,Data,,,Bokeh Plot,A figure created with the [Bokeh Library](http...,
...,...,...,...,...,...,...,...
351,VariantInfoFromVariantTerm,Resolver,"{""variant"":""Term[Variant]""}",VariantInfo,Resolve Variant Info from Term,Resolve variant info (Allele registry API) fro...,
352,VisualizeLibrarySizes,Resolver,"{""matrix"":""GeneCountMatrix""}",PlotlyPlot,Library Size Bar Plot from Gene Count Matrix,Construct a bar plot which displays the total ...,
353,VisualizeLibrarySizesFromAnnData,Resolver,"{""anndata"":""AnnData""}",PlotlyPlot,Library Size Bar Plot from AnnData File,Construct a bar plot which displays the total ...,
354,VolcanoPlot,Resolver,"{""sig"":""GeneSignature""}",PlotlyPlot,Volcano Plot from Differential Expression Table,Construct a scatter plot which displays the lo...,


In [5]:
# Register every metanode as a graph node keyed by its row label in
# df_metanodes; the row's columns are stored as node attributes.
# The loop variable is deliberately not named `id`: at notebook scope that
# would shadow the builtin `id` for every later cell.
for node_index, node in df_metanodes.iterrows():
  g.add_node(node_index, **node.to_dict())

# The same table indexed by metanode id, for label-based lookups.
metanodes = df_metanodes.set_index('id')
# Maps metanode id -> row label, i.e. the key of that metanode's node in `g`.
metanode_lookup = df_metanodes.reset_index().set_index('id')['index'].to_dict()

In [6]:
# The dump is read without a header row (header=None), so the column names are supplied here.
df = pd.read_csv('data/dump.csv', header=None, names=['id', 'parent', 'process_type', 'input_data_type'])

In [7]:
# Preview the dump rows.
df

Unnamed: 0,id,parent,process_type,input_data_type
0,3af79be2-c018-4926-6298-076b7c8b4dbb,6643d55a-355e-3371-c6b7-45fc96a74513,OneScoredT[Scored[Gene]],Term[Gene]
1,15293aeb-7703-3a75-e4d4-bf8028159a0c,,FileInput,FileURL
2,ecef3f2a-4357-88b6-97d5-4199f370d6d5,15293aeb-7703-3a75-e4d4-bf8028159a0c,GeneCountMatrixFromFile,
3,031adf2a-6035-1bd4-1d7e-22db34991c80,,FileInput,FileURL
4,20946800-c926-b0cb-2ea4-2416250ca2c4,031adf2a-6035-1bd4-1d7e-22db34991c80,GeneCountMatrixFromFile,
...,...,...,...,...
2435,c689c679-f02a-3f90-06b1-873a5d94cb53,e4970b43-04fd-0362-d73a-bab64815ae3b,ExtractEnrichrTermSearch[LINCS_L1000_Chem_Pert...,
2436,5a11a546-052a-0a0c-799c-ec8fcfaef122,c689c679-f02a-3f90-06b1-873a5d94cb53,EnrichrSetTToGMT[Drug],
2437,725f2705-8a6f-1d6e-53b3-142ae21e108c,5a11a546-052a-0a0c-799c-ec8fcfaef122,GenesetsToGMT,GMT
2438,c06b0d4b-ef84-4a56-0f60-afecab26181c,725f2705-8a6f-1d6e-53b3-142ae21e108c,GMTConcatenate,


In [8]:
# Annotate each dump row with the 'input' and 'output' values recorded for its
# process type in the metanode table; rows whose process type is not a known
# metanode get None.
for field in ('input', 'output'):
  df[field] = df['process_type'].apply(
    lambda spec, field=field: metanodes.loc[spec, field] if spec in metanodes.index else None
  )

In [9]:
# Add edges for every dump row whose process type is a known metanode:
#   process -> its output type, and each input type -> process.
for _, edge in df.iterrows():
  process_type = edge['process_type']
  if process_type not in metanode_lookup:
    continue
  process_type_id = metanode_lookup[process_type]

  # Empty cells in the metanode table arrive here as NaN, which is truthy;
  # require a non-empty string so they are skipped instead of raising.
  output_type = edge['output']
  if isinstance(output_type, str) and output_type:
    g.add_edge(process_type_id, metanode_lookup[output_type])

  input_spec = edge['input']
  if isinstance(input_spec, str) and input_spec:
    # The input spec is a JSON object of argument name -> type; only the types are needed.
    for input_type in json.loads(input_spec).values():
      # A list-valued type is represented by its first element.
      if isinstance(input_type, list):
        input_type = input_type[0]
      g.add_edge(metanode_lookup[input_type], process_type_id)

In [10]:
# Persist the assembled graph in GraphML format.
nx.write_graphml(g, 'data/output.graphml')

In [11]:
def n_gram(n, iterable):
  """Yield each run of `n` consecutive items from `iterable` as a tuple.

  Runs overlap: after a run is yielded, its first item is dropped and the
  next item is appended. Nothing is yielded when `iterable` has fewer than
  `n` items, or when `n` is less than 1.
  """
  window = ()
  for element in iterable:
    window = (*window, element)
    if len(window) == n:
      yield window
      # Slide forward by one item for the next run.
      window = window[1:]

In [12]:
# Hit-weighted counts of every 1-, 2- and 3-step run in the analytics paths.
# Each row is read as `hits<TAB>step<TAB>step...`; a synthetic 'Start' step is
# prepended so the first real step also gets a predecessor.
grams = {1: Counter(), 2: Counter(), 3: Counter()}
# newline='' is how the csv module documentation says to open files for csv.reader.
with open('data/analytics.tsv', 'r', newline='') as fr:
  for row in csv.reader(fr, delimiter='\t'):
    if not row:
      # csv.reader yields [] for blank lines, which cannot be unpacked below.
      continue
    hits, *ids = row
    weight = int(hits)
    path = ['Start', *ids]
    for n, counter in grams.items():
      # Named `gram`, not `g`, so the graph object `g` is not overwritten.
      for gram in n_gram(n, path):
        counter[gram] += weight

In [14]:
# Single-step counts, most frequent first.
pd.DataFrame(grams[1].most_common(), columns=['Path', 'Hits'])

Unnamed: 0,Path,Hits
0,"(Start,)",7576
1,"(FileInput,)",4189
2,"(EnrichrSetTToGMT[Phenotype],)",2624
3,"(GMTUnion,)",2348
4,"(GeneCountMatrixFromFile,)",1907
...,...,...
169,"(BarplotFrom[Scored[Pathway]],)",1
170,"(ExtractEnrichrTermSearch[ARCHS4_Tissues],)",1
171,"(GeneTermFromMyVariantInfo,)",1
172,"(SomeSetT[Set[Gene]],)",1


In [16]:
# Two-step counts, most frequent first.
pd.DataFrame(grams[2].most_common(), columns=['Path', 'Hits'])

Unnamed: 0,Path,Hits
0,"(Start, FileInput)",3095
1,"(FileInput, GeneCountMatrixFromFile)",1887
2,"(GeneCountMatrixFromFile, TargetRangerScreenTa...",1293
3,"(FileInput, GeneSigFromFile)",1239
4,"(Start, Input[Gene])",1206
...,...,...
509,"(Input[Set[Gene]], SomeSetT[Set[Gene]])",1
510,"(Input[Set[Gene]], GeneshotGeneSetAugmentation...",1
511,"(MetaboliteSetInfo, Call_MetENP_on_MetSet)",1
512,"(SupervennFromGMT, Input[Gene])",1


In [17]:
# Three-step counts, most frequent first.
pd.DataFrame(grams[3].most_common(), columns=['Path', 'Hits'])

Unnamed: 0,Path,Hits
0,"(Start, FileInput, GeneCountMatrixFromFile)",1802
1,"(FileInput, GeneCountMatrixFromFile, TargetRan...",1293
2,"(GeneCountMatrixFromFile, TargetRangerScreenTa...",963
3,"(Start, Input[Pathway], EnrichrTermSearch[Path...",795
4,"(Start, FileInput, GeneSigFromFile)",693
...,...,...
685,"(Start, Input[Set[Gene]], SomeSetT[Set[Gene]])",1
686,"(Start, Input[Set[Gene]], GeneshotGeneSetAugme...",1
687,"(MGMetTable2MetSet, MetaboliteSetInfo, Call_Me...",1
688,"(GMTConcatenate, SupervennFromGMT, Input[Gene])",1


In [18]:
# Regroup the two-step counts as {first step: {second step: hits}}.
d = pd.DataFrame(grams[2].most_common(), columns=['Path', 'Hits'])
d['Head'] = [path[-1] for path in d['Path']]
d['Tail'] = [path[0] for path in d['Path']]
_2grams = {
  tail: dict(zip(records['Head'], records['Hits'].tolist()))  # .tolist() yields plain Python ints
  for tail, records in d.groupby('Tail')
}

In [19]:
# Regroup the three-step counts as {"first second": {third step: hits}},
# where the key is the first two steps joined by a single space.
d = pd.DataFrame(grams[3].most_common(), columns=['Path', 'Hits'])
d['Head'] = [path[-1] for path in d['Path']]
d['Tail'] = [' '.join(path[:-1]) for path in d['Path']]
_3grams = {
  tail: dict(zip(records['Head'], records['Hits'].tolist()))  # .tolist() yields plain Python ints
  for tail, records in d.groupby('Tail')
}

In [20]:
# Write the three-step and two-step tables out together as a single JSON object.
with open('../app/public/weights.json', 'w') as weights_file:
  combined = dict(**_3grams, **_2grams)
  json.dump(combined, weights_file)