# Using DotMotif to search for motifs in a custom graph

[DotMotif](https://github.com/aplbrain/dotmotif) is a performant, powerful query framework to search for network motifs.


In [1]:
%%capture
# Install dotmotif with one line.
# On your own computer, you can run this in the terminal.
!pip3 install dotmotif networkx

import networkx as nx
from dotmotif import Motif, GrandIsoExecutor

In [2]:

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
from scipy.stats import ranksums
import statsmodels.stats.multitest as smm

In [4]:
import numpy as np

#table = pd.read_feather('gdrive/My Drive/Allen Institute- Internship/Data/v1dd_with_additional_coregistered_neurons/additional_coregistered_neurons/synapse_table_v1dd_targeting_proofread_True_only_from_proofread_True_additional_cells.feather')

In [5]:
julian_v1_file = np.load('gdrive/My Drive/Allen Institute- Internship/Downloads/Data/ground_truth_connectome_v1dd_1 (1).npy',allow_pickle = 'True').clip(0,1)

In [6]:
cell_type = np.load('gdrive/My Drive/Allen Institute- Internship/Downloads/cell_types_v1dd.npy', allow_pickle=True)
cell_layers = np.load('gdrive/My Drive/Allen Institute- Internship/Downloads/cell_layers_v1dd.npy', allow_pickle=True)

In [7]:
mask =  np.isin(cell_layers,'23')
v1dd_layers_23 = julian_v1_file[mask][:,mask]

In [8]:
indices = np.where(mask)[0]    #gives no. where mask is TRUE

In [51]:
'''indtst = np.where(mask)
indtst

(array([14]),)

In [9]:
remap = []
ind_23 = []
for i,j in zip(range(len(indices)), indices):
  ind_23.append(i)
  remap.append(j)



In [10]:
remapped_indices = pd.DataFrame({'index_l23': ind_23, 'index_v1dd': remap})
remapped_indices

Unnamed: 0,index_l23,index_v1dd
0,0,2
1,1,5
2,2,6
3,3,8
4,4,10
...,...,...
238,238,658
239,239,659
240,240,664
241,241,666


In [11]:
np.shape(v1dd_layers_23)

(243, 243)

In [12]:
l23_connectome_network = nx.from_numpy_array(v1dd_layers_23, create_using= nx.DiGraph)

In [6]:
#WHEN YOU WANT TO TEST FOR 668x668
'''
#build the network graph!
network = nx.from_numpy_array(julian_v1_file,create_using= nx.DiGraph)

In [7]:
'''
network

<networkx.classes.digraph.DiGraph at 0x7966707392d0>

In [8]:
'''
network_toarray = nx.to_numpy_array(network)

In [9]:
'''
network_toarray

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [10]:
'''
np.shape(network_toarray)

(668, 668)

In [11]:
'''
network_toarray[0][3]

1.0

In [97]:
'''
diff_adj = julian_v1_file - network_toarray

In [103]:
'''
np.max(diff_adj**2)

0.0

# **BUILDING THE SAME NETWORK USING ADJACENCY INSTEAD OF from_numpy_array**

In [11]:
'''
network_adjacency = nx.adjacency_matrix(network)
network_adjacency

<668x668 sparse array of type '<class 'numpy.float64'>'
	with 37383 stored elements in Compressed Sparse Row format>

In [None]:
'''connectme_numpy = np.array(connectome_adjacency.todense()).clip(0,1)
connectme_numpy'''

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [None]:
''''diff_adj = julian_v1_file - connectme_numpy
diff_adj'''

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#**Starting dotMotif Analysis**

In [76]:
from dotmotif import Motif, GrandIsoExecutor
from dotmotif.ingest import CSVEdgelistConverter

In [None]:
#Since, we already created the graph n/w this step is not needed, go directly to next step
''''graph = CSVEdgelistConverter(
    connectome,
    # Tell DotMotif which columns represent the "source" and "target"
    # of the edgelist:

).to_graph()

In [13]:
# Create the search engine.
E = GrandIsoExecutor(graph=l23_connectome_network)

In [14]:
# Build your own motif here!
motif = Motif("""

# Example motif:

onewayEdge(a, b, c) {
    # An edge that only points in one direction,
    # with no reciprocal edge:
    a -> b
    b -> c
}

# A triangle that only has edges pointing
# in one direction:
onewayEdge(A, B, C)


""")

In [15]:
results = E.find(motif)

In [16]:
print(len(results))

378725


In [45]:
dotMotif_chains = pd.DataFrame(results)

In [46]:
combine = []
for i,j,k in zip(dotMotif_chains['A'], dotMotif_chains['B'], dotMotif_chains['C']):
  combine.append([i,j,k])

In [47]:
dotMotif_chains['motif_cells'] = combine

In [48]:
dotMotif_chains

Unnamed: 0,A,B,C,motif_cells
0,0,1,12,"[0, 1, 12]"
1,0,1,40,"[0, 1, 40]"
2,0,1,45,"[0, 1, 45]"
3,0,1,49,"[0, 1, 49]"
4,0,1,73,"[0, 1, 73]"
...,...,...,...,...
378720,242,241,232,"[242, 241, 232]"
378721,242,241,233,"[242, 241, 233]"
378722,242,241,234,"[242, 241, 234]"
378723,242,241,236,"[242, 241, 236]"


#Remap 243x243 cells to 668x668 connectome

In [21]:
A_remap =pd.DataFrame()
B_remap =pd.DataFrame()
C_remap =pd.DataFrame()
for i,j,k in  tqdm(zip(dotMotif_chains['A'], dotMotif_chains['B'], dotMotif_chains['C'])):
  #a
  mask1 = remapped_indices.index_l23.apply(lambda x: i == x)
  get_idx1 = remapped_indices.index_v1dd[mask1]
  A_remap = pd.concat([A_remap,get_idx1])

  #b
  mask2 = remapped_indices.index_l23.apply(lambda x: j == x)
  get_idx2 = remapped_indices.index_v1dd[mask2]
  B_remap = pd.concat([B_remap,get_idx2])

  #c
  mask3 = remapped_indices.index_l23.apply(lambda x: k == x)
  get_idx3 = remapped_indices.index_v1dd[mask3]
  C_remap = pd.concat([C_remap,get_idx3])

378725it [33:37, 187.76it/s]


In [None]:
dotMotif_chains['A_remap'] = np.array(A_remap)
dotMotif_chains['B_remap'] = np.array(B_remap)
dotMotif_chains['C_remap'] = np.array(C_remap)


In [50]:
dot

Unnamed: 0,A,B,C,motif_cells,A_remap,B_remap,C_remap
0,0,1,12,"[0, 1, 12]",2.0,5.0,31.0
1,0,1,40,"[0, 1, 40]",2.0,5.0,108.0
2,0,1,45,"[0, 1, 45]",2.0,5.0,125.0
3,0,1,49,"[0, 1, 49]",2.0,5.0,146.0
4,0,1,73,"[0, 1, 73]",2.0,5.0,198.0
...,...,...,...,...,...,...,...
378720,242,241,232,"[242, 241, 232]",667.0,666.0,641.0
378721,242,241,233,"[242, 241, 233]",667.0,666.0,644.0
378722,242,241,234,"[242, 241, 234]",667.0,666.0,650.0
378723,242,241,236,"[242, 241, 236]",667.0,666.0,654.0


In [53]:
combine_remap = []
for i,j,k in zip(dotMotif_chains['A_remap'], dotMotif_chains['B_remap'], dotMotif_chains['C_remap']):
  combine_remap.append([i,j,k])

In [54]:
dotMotif_chains['motif_cells_remapped'] = combine_remap

In [55]:
dotMotif_chains

Unnamed: 0,A,B,C,motif_cells,A_remap,B_remap,C_remap,motif_cells_remapped
0,0,1,12,"[0, 1, 12]",2.0,5.0,31.0,"[2.0, 5.0, 31.0]"
1,0,1,40,"[0, 1, 40]",2.0,5.0,108.0,"[2.0, 5.0, 108.0]"
2,0,1,45,"[0, 1, 45]",2.0,5.0,125.0,"[2.0, 5.0, 125.0]"
3,0,1,49,"[0, 1, 49]",2.0,5.0,146.0,"[2.0, 5.0, 146.0]"
4,0,1,73,"[0, 1, 73]",2.0,5.0,198.0,"[2.0, 5.0, 198.0]"
...,...,...,...,...,...,...,...,...
378720,242,241,232,"[242, 241, 232]",667.0,666.0,641.0,"[667.0, 666.0, 641.0]"
378721,242,241,233,"[242, 241, 233]",667.0,666.0,644.0,"[667.0, 666.0, 644.0]"
378722,242,241,234,"[242, 241, 234]",667.0,666.0,650.0,"[667.0, 666.0, 650.0]"
378723,242,241,236,"[242, 241, 236]",667.0,666.0,654.0,"[667.0, 666.0, 654.0]"


In [77]:
for i,j,k in tqdm(zip(dotMotif_chains['A'], dotMotif_chains['B'], dotMotif_chains['C'])):
  if v1dd_layers_23[i][j] == 0:
    print(i,j)
  if v1dd_layers_23[j][k] == 0:
    print(j,k)

378725it [00:00, 698475.87it/s]


In [None]:
#####-----LOAD *SAI- MOTIF*-----------######

In [59]:
##chain_motif_oldCode_wthDLT = pd.read_pickle('gdrive/My Drive/Allen Institute- Internship/Downloads/Data/chain_motifs_og.pkl')

In [60]:
chain_motif_newCode_wthIDX = pd.read_pickle('gdrive/My Drive/chain_motifs_idx_set.pkl')

In [62]:
chain_motif_newCode_wthIDX

Unnamed: 0,motif_type,motif_cells,neu_cat0,neu_cat1,neu_cat2,cell_types,cell_type_tuple
5308,chain,"[21, 62, 30]",21,62,30,"[BC-23, BC-23, BC-23]","(BC-23, BC-23, BC-23)"
37018,chain,"[106, 483, 10]",106,483,10,"[BC-23, BC-23, BC-23]","(BC-23, BC-23, BC-23)"
37017,chain,"[106, 377, 567]",106,377,567,"[BC-23, BC-23, BC-23]","(BC-23, BC-23, BC-23)"
37016,chain,"[106, 377, 553]",106,377,553,"[BC-23, BC-23, BC-23]","(BC-23, BC-23, BC-23)"
37015,chain,"[106, 377, 515]",106,377,515,"[BC-23, BC-23, BC-23]","(BC-23, BC-23, BC-23)"
...,...,...,...,...,...,...,...
116588,chain,"[500, 464, 235]",500,464,235,"[PYC-23, PYC-23, PYC-23]","(PYC-23, PYC-23, PYC-23)"
116589,chain,"[500, 464, 239]",500,464,239,"[PYC-23, PYC-23, PYC-23]","(PYC-23, PYC-23, PYC-23)"
116590,chain,"[500, 464, 258]",500,464,258,"[PYC-23, PYC-23, PYC-23]","(PYC-23, PYC-23, PYC-23)"
116579,chain,"[500, 464, 173]",500,464,173,"[PYC-23, PYC-23, PYC-23]","(PYC-23, PYC-23, PYC-23)"


In [None]:
chain_motif_oldCode_wthDLT

# **CAUTION!!**  - CHECK IF INDICES REMAPPED!!!!!

In [66]:
#INDICES ARE NOT REMAPPED!!!!!


chain_check = []
for i,j in zip(chain_motif_newCode_wthIDX['motif_cells'],dotMotif_chains['motif_cells_remapped']) :
  k = np.intersect1d(i,j)
  if k.any() == True:
    chain_check.append(i)

In [None]:
chain_check

In [82]:
len(chain_check)

1983

In [69]:
chain_not_found_check = []
for i,j in zip(chain_motif_newCode_wthIDX['motif_cells'],dotMotif_chains['motif_cells_remapped']) :
  k = np.intersect1d(i,j)
  if k.any() == False:
    chain_not_found_check.append(i)

In [74]:
len(chain_not_found_check)

153844

In [78]:
np.save('gdrive/My Drive/Allen Institute- Internship/Downloads/Data/chains_not_overlapping_Mcode', chain_not_found_check)

In [72]:
#Check if all the cinnections are valid in Groundtruth connectome
for i,j,k in zip(chain_motif_newCode_wthIDX['neu_cat0'], chain_motif_newCode_wthIDX['neu_cat1'], chain_motif_newCode_wthIDX['neu_cat2']):
  if julian_v1_file[j][i] == 0:
    print(i,j)
  if julian_v1_file[i][k] == 0:
    print(j,k)

In [None]:
#Check if all the cinnections are valid in NetworkX connectome
for i,j,k in zip(chain_motif_newCode_wthIDX['neu_cat0'], chain_motif_newCode_wthIDX['neu_cat1'], chain_motif_newCode_wthIDX['neu_cat2']):
  if network_toarray[j][i] == 0:
    print(i,j)
  if network_toarray[i][k] == 0:
    print(j,k)

#*JUST GARBAGE*

In [14]:
len(cell_layers)

668

In [4]:
import numpy as np
import networkx as nx
from dotmotif import Motif, GrandIsoExecutor
import pandas
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
from scipy.stats import ranksums
import statsmodels.stats.multitest as smm

In [5]:
# Make a graph of just excitatory cells
cell_table = pd.read_feather('gdrive/My Drive/Allen Institute- Internship/Downloads/Data/pre_cell_table_v1dd_proofread_True_668.feather')
cell_table['connectome_index'] = cell_table.index
cell_table = cell_table.query('cell_type == "PYC"')[['connectome_index', 'pt_root_id', 'soma_layer']].reset_index()
synapse_table = pd.read_feather('gdrive/My Drive/Allen Institute- Internship/Downloads/Data/synapse_table_668.feather')
#pyr_graph = nx.from_numpy_array(adjacency_matrix, create_using=nx.DiGraph)

In [7]:
cells_23 = cell_table.query("(soma_layer == '23')")

In [18]:
len(cell_table)

410

In [16]:
np.sum(cells_23.value_counts())

166

In [9]:
synapse_table

Unnamed: 0,id,pre_pt_root_id,post_pt_root_id,size,ctr_pt_position
0,490761772,864691132665392728,864691132739309209,254,"[947.453118013254, 262.422830509623, -90.69420..."
1,506916116,864691132665392728,864691132719214835,836,"[969.4904036444191, 277.73379460356006, 38.876..."
2,504382711,864691132665392728,864691132623038658,407,"[971.2679645669083, 385.4028553115192, -7.9926..."
3,367290238,864691132665392728,864691130302199283,211,"[771.7989877306841, 203.08676281174291, -8.860..."
4,502435137,864691132665392728,864691132687656170,244,"[967.0521061345537, 228.29936221721715, -44.48..."
...,...,...,...,...,...
1497050,393279538,864691133121018469,864691132833446308,363,"[820.700894202906, 326.578901884546, 71.749369..."
1497051,376121375,864691133121018469,864691132833446308,413,"[805.6361326758964, 289.45799227634546, 69.986..."
1497052,402592644,864691132779328111,864691132833446308,1164,"[843.6520494252051, 326.71368238429403, 58.042..."
1497053,403791524,864691132903015812,864691132833446308,1274,"[839.3116273678567, 304.38019459612747, 78.171..."


In [66]:
julian_v1_file[666][:] = 0

In [67]:
julian_v1_file

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#**OLD ANALYSIS - DO NOT REFER TO ANYTHING FROM THIS POINT!!!**
**Checking chains in old motif code**

In [54]:
status = []
for i,j in zip(chain_motif_newCode_wthIDX['motif_cells'],dotMotif_chains['motif_cells']):
  found = np.intersect1d(i,j)
  if found.any() == True:
    status.append('In dotMotif')
  else:
      status.append('Not in dotMotif')

In [55]:
chain_motif_newCode_wthIDX['status'] = status

In [67]:
chains_not_in_DM = chain_motif_newCode_wthIDX.query("(status == 'Not in dotMotif')")

In [78]:
chains_layer23_DM = chain_motif_newCode_wthIDX.query("(status == 'In dotMotif')")

In [68]:
len(chains_not_in_DM)

155986

In [69]:
len(np.unique(chains_not_in_DM['motif_cells']))

155986

*chains are not being repeated in the ones not found in DM

In [70]:
len(np.unique(chain_motif_newCode_wthIDX['motif_cells']))

159274

In [74]:
X = []
for i,j in zip(chains_not_in_DM['neu_cat0'], dotMotif_chains['B']):
  KK = np.intersect1d(i,j)
  if KK.any() == True:
    X.append(i)


In [76]:
np.unique(X)

array([ 42,  91, 123, 146, 217, 351, 380, 488, 500, 539, 557, 559, 569,
       577, 622])

In [81]:
len(np.unique(chains_not_in_DM['neu_cat0']))

239

In [82]:
len(np.unique(chain_motif_newCode_wthIDX['neu_cat0']))

239

In [79]:
np.unique(chains_layer23_DM['neu_cat0'])

array([  2,   5,   6,   8,  10,  13,  17,  20,  21,  26,  28,  31,  32,
        35,  38,  39,  42,  51,  53,  60,  61,  67,  76,  77,  81,  84,
        89,  91,  94,  98, 101, 103, 106, 110, 112, 123, 134, 144, 146,
       149, 151, 159, 160, 170, 172, 173, 179, 180, 182, 192, 193, 194,
       195, 196, 204, 208, 210, 212, 217, 221, 224, 226, 233, 234, 249,
       253, 257, 258, 259, 266, 268, 270, 280, 281, 283, 285, 289, 292,
       293, 294, 296, 299, 311, 316, 320, 321, 326, 329, 330, 336, 338,
       340, 342, 343, 347, 351, 352, 354, 358, 362, 364, 365, 377, 379,
       380, 394, 399, 400, 408, 420, 421, 424, 431, 434, 435, 438, 442,
       443, 458, 464, 466, 470, 475, 479, 484, 485, 486, 488, 492, 493,
       497, 499, 500, 502, 503, 505, 507, 516, 517, 518, 525, 530, 531,
       538, 539, 541, 549, 554, 557, 558, 559, 561, 569, 571, 572, 576,
       577, 579, 581, 582, 588, 592, 595, 597, 598, 604, 611, 622, 628,
       641, 651, 654])

*chains are not being repeated in Motif code

In [19]:
table = np.load('gdrive/My Drive/Allen Institute- Internship/Downloads/Data/chain_motifs.npy', allow_pickle='True')

In [None]:
pd.DataFrame(table)

Unnamed: 0,motif_type,motif_cells,neu_cat0,neu_cat1,neu_cat2,cell_types,assembly_cell_id,assembly_id
3816,chain,"[5, 20, 30]",5,20,30,"[PYC-23, BC-23, BC-23]",5,()
3817,chain,"[5, 20, 291]",5,20,291,"[PYC-23, BC-23, BC-23]",5,()
3818,chain,"[5, 25, 30]",5,25,30,"[PYC-23, BC-23, BC-23]",5,()
3819,chain,"[5, 25, 291]",5,25,291,"[PYC-23, BC-23, BC-23]",5,()
3821,chain,"[5, 30, 291]",5,30,291,"[PYC-23, BC-23, BC-23]",5,()
...,...,...,...,...,...,...,...,...
258299,chain,"[588, 314, 170]",588,314,170,"[MC-23, PYC-23, PYC-23]",170,()
258313,chain,"[588, 420, 170]",588,420,170,"[MC-23, PYC-23, PYC-23]",170,()
258327,chain,"[588, 513, 170]",588,513,170,"[MC-23, PYC-23, PYC-23]",170,()
258341,chain,"[588, 577, 170]",588,577,170,"[MC-23, PYC-23, PYC-23]",170,()


In [None]:
table = pd.read_pickle('gdrive/My Drive/Allen Institute- Internship/Data/chain_motifs_og.pkl')

In [None]:
pd.DataFrame(table)

Unnamed: 0,motif_type,motif_cells,neu_cat0,neu_cat1,neu_cat2,cell_types
35,chain,"[2, 25, 30]",2,25,30,"[MC-23, BC-23, BC-23]"
36,chain,"[2, 25, 34]",2,25,34,"[MC-23, BC-23, BC-23]"
37,chain,"[2, 25, 37]",2,25,37,"[MC-23, BC-23, BC-23]"
38,chain,"[2, 25, 41]",2,25,41,"[MC-23, BC-23, BC-23]"
39,chain,"[2, 25, 75]",2,25,75,"[MC-23, BC-23, BC-23]"
...,...,...,...,...,...,...
279647,chain,"[664, 500, 234]",664,500,234,"[PYC-23, PYC-23, PYC-23]"
279648,chain,"[664, 559, 123]",664,559,123,"[PYC-23, PYC-23, PYC-23]"
279649,chain,"[664, 559, 234]",664,559,234,"[PYC-23, PYC-23, PYC-23]"
279650,chain,"[664, 604, 123]",664,604,123,"[PYC-23, PYC-23, PYC-23]"


In [None]:
table = pd.read_pickle('gdrive/My Drive/Allen Institute- Internship/Downloads/Data/chain_motifs.pkl')

In [None]:
pd.DataFrame(table)

Unnamed: 0,motif_type,motif_cells,neu_cat0,neu_cat1,neu_cat2,cell_types
1231,chain,"[2, 25, 30]",2,25,30,"[MC-23, BC-23, BC-23]"
1232,chain,"[2, 25, 34]",2,25,34,"[MC-23, BC-23, BC-23]"
1233,chain,"[2, 25, 37]",2,25,37,"[MC-23, BC-23, BC-23]"
1234,chain,"[2, 25, 41]",2,25,41,"[MC-23, BC-23, BC-23]"
1235,chain,"[2, 25, 75]",2,25,75,"[MC-23, BC-23, BC-23]"
...,...,...,...,...,...,...
169953,chain,"[664, 500, 234]",664,500,234,"[PYC-23, PYC-23, PYC-23]"
169954,chain,"[664, 559, 123]",664,559,123,"[PYC-23, PYC-23, PYC-23]"
169955,chain,"[664, 559, 234]",664,559,234,"[PYC-23, PYC-23, PYC-23]"
169956,chain,"[664, 604, 123]",664,604,123,"[PYC-23, PYC-23, PYC-23]"


In [None]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,):print(table)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

