# Ride or die neurons in the Celltype connectome
This notebook uses the ride_or_die.ipynb notebook written by Dr. Gabrielle J Gutierrez as a basis. The goal is the find the "ride or die" celltypes in the oviINs connectome of celltypes. These are the celltypes that stick together throughout all the modularity resolutions.

1. Set the maximum resolution to use to asses wether neurons stuck together or not
2. Within each module at the maximum resolution, find which neurons traveled together consistently through the same modules at lower resolutions. \
    a. If yes, they are "ride or die" celltype. \
    b. if no, the whole module is discarded.
3. Plot a sankey of the ride or die contigent to check results

We have gone over this file (6/13/24) and decided that this isn't doing what we want fully. We need a pairwise analysis instead of a module path based filtering

In [203]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import plotly.graph_objects as go

In [236]:
# Use on lab computer
ovi_cell  = pd.read_csv('/Users/rw2822/Documents/GitHub/flybrain-clustering/oviIN_celltype/oviIN_combined/full/preprocessed-v1.2.1/preprocessed_nodes.csv', index_col=0)

# import excel file
df_test = pd.read_excel('sampledata_communitypillars.xlsx', )

df_test = df_test.rename(columns={0: '0.0', 0.5:'0.5', 1:'1.0'})

# make id column the index
df_test = df_test.set_index('id')
df_test

# Use on personal computer
#ovi_cell  = pd.read_csv('/Users/rhessa/flybrain-clustering/oviIN_celltype/oviIN_combined/full/preprocessed-v1.2.1/preprocessed_nodes.csv')

Unnamed: 0_level_0,0.0,0.5,1.0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000,1,1,1
2000,1,1,2
3000,1,1,3
4000,1,2,4
5000,1,2,5
6000,2,2,6
7000,2,2,5
8000,2,3,7


In [242]:
ovi_cell[ovi_cell['celltype'] == 'FS1A']

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
409,FS1A,410,236,6,13,37,186,263,6


In [244]:
ovi_cell[ovi_cell['1.0'] == 263]

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
409,FS1A,410,236,6,13,37,186,263,6
1454,oviIN,1455,236,2,2,37,186,263,2


In [243]:
# Finding the FS1A celltype
fs_test = ovi_cell[ovi_cell['0.0'] == 6]
fs_test[fs_test['1.0'] == 263]  

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
409,FS1A,410,236,6,13,37,186,263,6


## Sankey function that accepts in a dataframe and a list of chi values

This is a simple function tested in sankey_sandbox.ipynb!

In [206]:
# Function for creating the sankey diagram
def create_sankey(df,  prominent=False, prom_types=None, width = None, height = None, title=None):
    """This function creates a sankey diagram from the data in the dataframe df.
    The columns to be used are in the list chis. If prominent is True, then only 
    the prominent types are used.
    
    df: dataframe with partition data
    prominent: boolean, whether to use only prominent types
    prom_types: dataframe of prominent types and their weights, columns are 'type_pre' and 'weight'
    FigSize: int of figure size

    Returns: Sankey Figure
    """

    # Columns to be used
    chis = df.columns

    # prominent types only option:
    if prominent:
        ovi_chunk_df = df[df['celltype'].isin(prom_types['type_pre'])]

        # Take only the columns that are needed
        ovi_chunk_df = ovi_chunk_df[chis]
    
    else:
        ovi_chunk_df = df[chis]

    # Append correct _r values to the columns
    for i, x in enumerate(chis):
        ovi_chunk_df[x] = ovi_chunk_df[x].astype(str) + '_r' + str(x)

    # add a column of ones to ovi_HB_chunk
    ovi_chunk_df['counts'] = 1
    
    # find all the unique nodes
    nodes = []
    for chi in chis:    
        nodes += ovi_chunk_df[chi].unique().tolist()


    # Finding the links and putting it into a dataframe
    links = pd.DataFrame()
    for i in range(len(chis)-1):
        df = ovi_chunk_df.groupby([chis[i],chis[i+1]])['counts'].count().reset_index().rename(columns={chis[i]:'source',chis[i+1]:'target','counts':'value'})
        links = pd.concat([links, df], axis=0)

    # create a mapping dictionary
    mapping_dict = {k: v for v, k in enumerate(nodes)}

    # replace source and target with mapping dictionary
    links['source'] = links['source'].map(mapping_dict)
    links['target'] = links['target'].map(mapping_dict)

    # turn this table into a dictionary for making the sankey diagram
    links_dict = links.to_dict(orient='list')

    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 15,
            thickness=20,
            line=dict(width=0.5),
            label = nodes,
        ),
        link = dict(
            source= links_dict['source'],
            target = links_dict['target'],
            value = links_dict['value']
        )
        )
    ])
    # make figure larger
    fig.update_layout(height = height, width= width, title=title)
    fig.show()

In [207]:
ovi_cell_chunk = ovi_cell[['0.0', '0.05', '0.1']].copy()
create_sankey(ovi_cell_chunk, width=800, height=800, title='Ovi Cell Types')

## Dummy data testing

In [234]:
# max res to use
max_res = '0.5'

# df to use
cell_test = df_test[["0.0", '0.5']].copy()

# Get list of module ids
module_ids = cell_test[max_res].unique().tolist()

In [235]:
the_homies = []

# loop through each module id
for i in module_ids:
    # Grab the roes that have module id 
    temp_df = cell_test[cell_test[max_res] == i]
    print(temp_df)
    #print(temp_df.drop_duplicates())
    #print(temp_df.drop_duplicates().shape[0])

    # test whether there is more than one row since singletons dont count
    #if temp_df.shape[0] > 1:
        
        # test for number of unique rows
    if temp_df.drop_duplicates().shape[0] == 1:
        the_homies.append(i)
    print(the_homies)

# Retrieve rows of the original dataframe that have ride or die modules
ride_or_die = df_test[df_test[max_res].isin(the_homies)]
ride_or_die

      0.0  0.5
id            
1000    1    1
2000    1    1
3000    1    1
[1]
      0.0  0.5
id            
4000    1    2
5000    1    2
6000    2    2
7000    2    2
[1]
      0.0  0.5
id            
8000    2    3
[1, 3]


Unnamed: 0_level_0,0.0,0.5,1.0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000,1,1,1
2000,1,1,2
3000,1,1,3
8000,2,3,7


## Ride or die from 0.1 max res for celltypes

In [208]:
# max res to use
max_res = '0.1'

# df to use
cell_test = ovi_cell[["0.0","0.05", max_res]].copy()

# Get list of module ids
module_ids = cell_test[max_res].unique().tolist()

In [210]:
cell_test[cell_test['0.1'] == 1]

Unnamed: 0,0.0,0.05,0.1
0,1,1,1
1,1,1,1
79,1,1,1
80,1,1,1
86,1,1,1
...,...,...,...
1444,1,1,1
1455,1,1,1
1456,1,1,1
1457,1,1,1


In [211]:
# create an empty list to store mod ids of riders
the_homies = []

# loop through each module id
for i in module_ids:
    # Grab the roes that have module id 
    temp_df = cell_test[cell_test[max_res] == i]
    print(temp_df)
    print(temp_df.drop_duplicates())
    print(temp_df.drop_duplicates().shape[0])

    # test whether there is more than one row since singletons dont count
    #if temp_df.shape[0] > 1:
        
        # test for number of unique rows
    if temp_df.drop_duplicates().shape[0] == 1:
        the_homies.append(i)
    print(the_homies)

# Retrieve rows of the original dataframe that have ride or die modules
ride_or_die = ovi_cell[ovi_cell[max_res].isin(the_homies)]
ride_or_die

      0.0  0.05  0.1
0       1     1    1
1       1     1    1
79      1     1    1
80      1     1    1
86      1     1    1
...   ...   ...  ...
1444    1     1    1
1455    1     1    1
1456    1     1    1
1457    1     1    1
1461    1     1    1

[258 rows x 3 columns]
     0.0  0.05  0.1
0      1     1    1
280    3     3    1
398    6     6    1
646    2     2    1
996    5     5    1
5
[]
      0.0  0.05  0.1
2       2     2    2
8       2     2    2
9       2     2    2
11      2     2    2
20      2     2    2
...   ...   ...  ...
1443    2     2    2
1445    2     2    2
1454    2     2    2
1458    2     2    2
1459    2     2    2

[270 rows x 3 columns]
    0.0  0.05  0.1
2     2     2    2
60    1     1    2
2
[]
      0.0  0.05  0.1
3       2     2    3
38      1     1    3
52      2     2    3
55      2     2    3
56      2     2    3
...   ...   ...  ...
1427    2     2    3
1431    4     2    3
1434    2     2    3
1435    2     2    3
1462    2     2    3

[169 row

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
5,AOTU003,6,6,4,5,5,6,6,4
68,AVLP495,69,57,4,11,13,48,59,4
412,FS4A,413,238,6,14,47,187,265,6
487,LAL056,488,268,4,15,25,208,302,4
502,LAL087,503,256,7,16,26,202,288,4
503,LAL090,504,66,4,17,22,58,67,4
605,LHPV11a1,606,308,9,18,60,241,353,3
627,MBON07,628,318,7,21,26,248,362,1
661,PAM04_a,662,326,3,22,62,254,375,3
662,PAM04_b,663,327,3,22,26,255,376,3


In [212]:
ride_or_die= ride_or_die[['0.0', '0.05', '0.1', '0.5', '0.75', '1.0']].copy()
ride_or_die

Unnamed: 0,0.0,0.05,0.1,0.5,0.75,1.0
5,4,4,5,6,6,6
68,4,4,11,48,57,59
412,6,6,14,187,238,265
487,4,4,15,208,268,302
502,4,7,16,202,256,288
503,4,4,17,58,66,67
605,3,9,18,241,308,353
627,1,7,21,248,318,362
661,3,3,22,254,326,375
662,3,3,22,255,327,376


In [213]:
the_homies

[5, 11, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]

In [214]:
create_sankey(ride_or_die, width=1000, height=600, title='Ride or Die Modules')

In [215]:
ovi_cell[ovi_cell['celltype'] == 'FS1A']

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
409,FS1A,410,236,6,13,37,186,263,6


Here we see that there are only neurons from cluster 1, 3 and 4 that are left at the end. Doing jaccard on this dataframe would result in 100% similaritity

## Ride or die from 1.0 max res

In [216]:
# max res to use
max_res = '1.0'

# df to use
cell_test = ovi_cell[["0.0","0.05", '0.1', '0.5', '0.75', '1.0']].copy()

# Get list of module ids
module_ids = cell_test[max_res].unique().tolist()

In [217]:
# create an empty list to store mod ids of riders
the_homies = []

# loop through each module id
for i in module_ids:
    # Grab the rows that have module id 
    temp_df = cell_test[cell_test[max_res] == i]

    # test for number of unique rows
    if temp_df.drop_duplicates().shape[0] == 1:
        the_homies.append(i)

# Retrieve rows of the original dataframe that have ride or die modules
ride_or_die = ovi_cell[ovi_cell[max_res].isin(the_homies)]
ride_or_die

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
0,,1,1,1,1,1,1,1,1
2,5-HTPMPV01,3,3,2,2,3,3,3,2
7,AOTU007,8,8,2,7,3,8,8,2
8,AOTU008_a,9,9,2,2,3,9,9,2
13,AOTU019,14,14,4,6,8,13,14,4
...,...,...,...,...,...,...,...,...,...
1453,oviDNb,1454,398,2,8,76,300,596,5
1455,pC1a,1456,488,1,1,59,348,552,1
1456,pC1b,1457,351,1,1,1,272,402,1
1458,pC1d,1459,439,2,2,70,317,588,2


In [218]:
ride_or_die[ride_or_die['celltype'] == 'FS1A']  

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0


In [219]:
ride_or_die_plot= ride_or_die[['0.0', '0.05', '0.1', '0.5', '0.75', '1.0']].copy()
ride_or_die_plot

Unnamed: 0,0.0,0.05,0.1,0.5,0.75,1.0
0,1,1,1,1,1,1
2,2,2,2,3,3,3
7,2,2,7,8,8,8
8,2,2,2,9,9,9
13,4,4,6,13,14,14
...,...,...,...,...,...,...
1453,5,2,8,300,398,596
1455,1,1,1,348,488,552
1456,1,1,1,272,351,402
1458,2,2,2,317,439,588


In [220]:
create_sankey(ride_or_die_plot, width=1000, height=600, title='Ride or Die Modules')

In [221]:
# Filter cluster 3 of ride or die
ride_or_die_cluster3 = ride_or_die[ride_or_die['0.0'] == 3]
ride_or_die_cluster3

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
19,AOTU030,20,20,3,10,11,18,20,3
26,ATL001,27,25,3,7,10,21,26,3
27,ATL002,28,26,3,7,10,22,27,3
33,ATL012,34,25,3,7,10,21,26,3
46,ATL038,47,25,3,7,10,21,26,3
...,...,...,...,...,...,...,...,...,...
1247,SMP419,1248,497,3,10,11,353,572,3
1268,SMP447,1269,497,3,10,11,353,572,3
1370,SMP568_a,1371,448,3,10,11,118,503,3
1439,WEDPN17_a,1440,309,4,19,54,242,354,3


There are alot more nodes when filtering at 1.0. Each grouping seems to include some celltypes of interest. 

# Ride or die at 0.75 max res

In [222]:
# max res to use
max_res = '0.75'

# df to use
cell_test = ovi_cell[['0.0','0.05', '0.1', '0.5', '0.75']].copy()

# Get list of module ids
module_ids = cell_test[max_res].unique().tolist()

# create an empty list to store mod ids of riders
the_homies = []

# loop through each module id
for i in module_ids:
    # Grab the rows that have module id 
    temp_df = cell_test[cell_test[max_res] == i]

    # test whether there ismore than one row since singletons dont count
    #if temp_df.shape[0] > 1:
        
        # test for number of unique rows
    if temp_df.drop_duplicates().shape[0] == 1:
        the_homies.append(i)
# Retrieve rows of the original dataframe that have ride or die modules
ride_or_die = ovi_cell[ovi_cell[max_res].isin(the_homies)]
ride_or_die_plot= ride_or_die[['0.0', '0.05', '0.1', '0.5', '0.75', '1.0']].copy()
ride_or_die_plot

Unnamed: 0,0.0,0.05,0.1,0.5,0.75,1.0
0,1,1,1,1,1,1
2,2,2,2,3,3,3
6,4,4,6,7,7,7
8,2,2,2,9,9,9
13,4,4,6,13,14,14
...,...,...,...,...,...,...
1451,4,4,6,200,255,286
1455,1,1,1,348,488,552
1456,1,1,1,272,351,402
1458,2,2,2,317,439,588


In [223]:
ride_or_die

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0
0,,1,1,1,1,1,1,1,1
2,5-HTPMPV01,3,3,2,2,3,3,3,2
6,AOTU004,7,7,4,6,6,7,7,4
8,AOTU008_a,9,9,2,2,3,9,9,2
13,AOTU019,14,14,4,6,8,13,14,4
...,...,...,...,...,...,...,...,...,...
1451,mALD1,1452,255,4,6,53,200,286,4
1455,pC1a,1456,488,1,1,59,348,552,1
1456,pC1b,1457,351,1,1,1,272,402,1
1458,pC1d,1459,439,2,2,70,317,588,2


In [224]:
ride_or_die[ride_or_die['celltype'] == 'FS1A']  

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0


In [225]:
create_sankey(ride_or_die_plot, width=1000, height=600, title='Ride or Die Modules')

# 0.5

In [None]:
# max res to use
max_res = '0.5'

# df to use
cell_test = ovi_cell[['0.0','0.05', '0.1', '0.5']].copy()

# Get list of module ids
module_ids = cell_test[max_res].unique().tolist()

# create an empty list to store mod ids of riders
the_homies = []

# loop through each module id
for i in module_ids:
    # Grab the rows that have module id 
    temp_df = cell_test[cell_test[max_res] == i]

    # test whether there ismore than one row since singletons dont count
    #if temp_df.shape[0] > 1:
        
        # test for number of unique rows
    if temp_df.drop_duplicates().shape[0] == 1:
        the_homies.append(i)
# Retrieve rows of the original dataframe that have ride or die modules
ride_or_die = ovi_cell[ovi_cell[max_res].isin(the_homies)]
ride_or_die_plot= ride_or_die[['0.0', '0.05', '0.1', '0.5', '0.75', '1.0']].copy()
ride_or_die_plot

Unnamed: 0,0.0,0.05,0.1,0.5,0.75,1.0
1,1,1,1,2,2,2
2,2,2,2,3,3,3
6,4,4,6,7,7,7
13,4,4,6,13,14,14
14,4,4,6,14,15,15
...,...,...,...,...,...,...
1451,4,4,6,200,255,286
1455,1,1,1,348,488,552
1456,1,1,1,272,351,402
1458,2,2,2,317,439,588


In [227]:
# Sankey
create_sankey(ride_or_die_plot, width=1000, height=600, title='Ride or Die Modules at 0.5')

In [228]:
ride_or_die[ride_or_die['celltype'] == 'FS1A']  

Unnamed: 0,celltype,key,0.75,0.05,0.1,0.25,0.5,1.0,0.0


Debrief.... What is happening here other than getting rid of duplicates? Answer: we are finding unique paths for modules at the max resolution specified.