In [26]:
import os, json
from subprocess import Popen, PIPE, STDOUT
import pandas as pd
import numpy as np

In [58]:
class MorphologyData:
    def __init__(self, master_path='regions/', json_save_dir='morph_metadata/'):
        self.master_path = master_path
        self.json_save_dir = json_save_dir
        self.df = None
        self.morpho_df = None
        self.as_morph = None
        self.as_metadata = None

    @staticmethod
    def check_swc(curdir):
        for i in os.path.listdir(curdir):
            if i.endswith('swc'):
                return True
        return False

    def check_metadata_exists(self, name):
        for i in os.listdir(self.json_save_dir):
            if i == f'{name}.json':
                return True
        return False

    def fetch_metadata(self):
        '''
        STEP 1: Fetch metadata from neuromorpho.org using the name of the swc file (assumed its downloaded)
        See neuromorpho to download all the data you want, then process them here
        '''

        print(f'Extracting data from {self.master_path}')
        for i in os.listdir(f'{self.master_path}'):
            name = i.split('.')[0]
            if self.check_metadata_exists(name):
                continue
            command = f"sh get_from_name.sh {name} {self.json_save_dir}"
            process = Popen(command, shell=True, stdout=PIPE, stderr=STDOUT)

            with process.stdout:
                for line in iter(process.stdout.readline, b''):
                    print(line.decode("utf-8").strip())


    def combine_raw_metadata(self):
            '''
            STEP 2: After fetching metadata in separate json files, combine them into a single csv file before preprocessing
            '''
            dfs = []
            for file in os.listdir(morph_data.json_save_dir):
                try:
                    with open(f'{morph_data.json_save_dir}/{file}') as f:
                        json_data = pd.json_normalize(json.loads(f.read()))
                    dfs.append(json_data)
                except Exception as e:
                    print(f'{file}:{str(e)}')
                    
            df = pd.concat(dfs, sort=False)  # or sort=True depending on your needs
            df.reset_index(inplace=True, drop=True)
            #df.to_csv('morpho_metadata_raw.csv')
            return df

    def clean_metadata(self,df):
            '''
            STEP 3: Clean the metadata
            Reasonings for cleaning are explained in comments per section
            '''
            # experiment_condition
            ## Some column variables are depicted as lists although there is 1 element. Groupby does not like that so lets take the 1st elem
            df['experiment_condition'] = df['experiment_condition'].apply(lambda x: x[0])

            # (Skipped) Reference_doi: Some list columns have more than 1 value or No value. Lets examine those
            #refdoi = df.copy().reference_doi
            #refdoi.dropna(inplace=True)

            # Species 
            # Somehow morphoneuro name query returns non mouse morphologies. Filter them out
            # lets also keep track of what we have dropped
            shouldnt_be_in_df = df[df.species!='mouse']
            df.drop(df[df.species!='mouse'].index,inplace=True) # somehow some queries returned non mouse entries

            # Domain: remove all except dsa and dsna (n=3 cells only)
            temp_df = df[df.domain != 'Dendrites, Soma, Axon']
            temp_df = temp_df[temp_df.domain != 'Dendrites, Soma, No Axon']
            df.drop(temp_df.index,inplace=True)
            shouldnt_be_in_df = pd.concat([shouldnt_be_in_df,temp_df])

            # Experiment-Control Group Difference
            df_control_group = df[df.experiment_condition == 'Control']
            df_experiment_group = df[df.experiment_condition != 'Control']

            df = df.reset_index(drop=True)
            df2 = df.copy()

            # Brain regions: Convert from list to 5 level categories
            max_level_in_data = np.max([len(i) for i in df2['brain_region']])
            cols_brainreg = [f'brain_region_{i}' for i in range(1,max_level_in_data+1)]

            reg_df = pd.DataFrame(index=df2.index,columns=cols_brainreg)

            for idx,i in df2['brain_region'].items():
                len_i = len(i)
                for idx2,j in enumerate(i):
                    reg_df.iloc[idx,idx2] = j

            df2 = df2.join(reg_df)

            # Cell types : Similar to brain regions but we have to do it for each cell type
            max_level_in_cell_type = np.max([len(i) for i in df['cell_type']])
            cols_celltype = [f'cell_type_{i}' for i in range(1,max_level_in_cell_type+1)]
            cell_type_df = pd.DataFrame(index=df2.index,columns=cols_celltype)

            for idx,i in df2['cell_type'].items():
                for idx2,j in enumerate(i):
                    cell_type_df.iloc[idx,idx2] = j

            df2 = df2.join(cell_type_df)

            # Reorder columns
            new_col_order = ['neuron_id', 'neuron_name', 'archive', 'note', 'age_scale', 'gender',
                'age_classification', 'brain_region','brain_region_1', 'brain_region_2', 'brain_region_3', 'brain_region_4',
                'brain_region_5', 'cell_type','cell_type_1','cell_type_2','cell_type_3', 'species', 'strain',
                'scientific_name', 'stain', 'experiment_condition', 'protocol',
                'slicing_direction', 'reconstruction_software', 'objective_type',
                'original_format', 'domain', 'attributes', 'magnification',
                'upload_date', 'deposition_date', 'shrinkage_reported',
                'shrinkage_corrected', 'reported_value', 'reported_xy', 'reported_z',
                'corrected_value', 'corrected_xy', 'corrected_z', 'soma_surface',
                'surface', 'volume', 'slicing_thickness', 'min_age', 'max_age',
                'min_weight', 'max_weight', 'png_url', 'reference_pmid',
                'reference_doi', 'physical_Integrity', '_links.self.href',
                '_links.measurements.href', '_links.persistence_vector.href']
            df2 = df2[new_col_order]

            # filter out non-hippocampal regions
            shouldnt_be_in_df2 = df2[df2.brain_region_1=='retina']
            shouldnt_be_in_df2 = pd.concat([shouldnt_be_in_df2,df2[df2.brain_region_1=='neocortex']])

            df2.drop(shouldnt_be_in_df2.index,inplace=True)

            na_cols = df2.columns[df2.isna().all().values]
            df2.drop(na_cols,axis=1,inplace=True)

            # fill empty values with 'Unknown'
            df2.reset_index(inplace=True,drop=True)
            drop_idx = np.where(df2.shrinkage_corrected=='')[0]
            df2.loc[drop_idx,'shrinkage_corrected'] = 'Unknown'

            drop_idx2 = np.where(df2.shrinkage_reported=='')[0]
            df2.loc[drop_idx2,'shrinkage_reported'] = 'Unknown'

            df2.shrinkage_reported = df2.shrinkage_reported.str.title()
            df2.shrinkage_corrected = df2.shrinkage_corrected.str.title()

            # some morphologies are from control and some experiment group. 
            # Since each process might affect the morphology, we will keep them separate
            df2_control_group = df2[df2.experiment_condition == 'Control']
            df2_experiment_group = df2[df2.experiment_condition != 'Control']
            df2_control_group.reset_index(inplace=True,drop=True)
            df2_experiment_group.reset_index(inplace=True,drop=True)
            df2_control_group.to_csv('morpho_metadata_controls.csv')
            df2_experiment_group.to_csv('morpho_metadata_experiments.csv')

            self.cleaned_metadata = df2
            return df2


    def save(self,save_path='morpho_metadata_processed.csv'):
        '''
        STEP 4: Save the cleaned metadata
        '''
        self.cleaned_metadata.to_csv(save_path)

In [59]:
morph_data = MorphologyData(master_path='../data/morphologies/swc30/',json_save_dir='../data/metadata/morph_features/')
morph_data.fetch_metadata()

Extracting data from ../data/morphologies/swc30/
--2023-05-16 19:49:59--  http://neuromorpho.org/api/neuron/data/metadata/morph_features/
Resolving bbpproxy.epfl.ch (bbpproxy.epfl.ch)... 192.33.211.34
Connecting to bbpproxy.epfl.ch (bbpproxy.epfl.ch)|192.33.211.34|:80... connected.
Proxy request sent, awaiting response... 301 Moved Permanently
Location: https://neuromorpho.org/api/neuron/data/metadata/morph_features/ [following]
--2023-05-16 19:50:00--  https://neuromorpho.org/api/neuron/data/metadata/morph_features/
Connecting to bbpproxy.epfl.ch (bbpproxy.epfl.ch)|192.33.211.34|:80... connected.
Proxy request sent, awaiting response... 404 Not Found
2023-05-16 19:50:00 ERROR 404: Not Found.

get_from_name.sh: line 2: /../data/metadata/morph_features/.json: No such file or directory


In [60]:
df = morph_data.combine_raw_metadata()
# morph_data.clean_metadata()
# morph_data.save()

AA1257.json:Expecting value: line 1 column 1 (char 0)
AA1319.json:Expecting value: line 1 column 1 (char 0)
AA1182.json:Expecting value: line 1 column 1 (char 0)
AA1133.json:Expecting value: line 1 column 1 (char 0)
AA1304.json:Expecting value: line 1 column 1 (char 0)
AA1122.json:Expecting value: line 1 column 1 (char 0)
AA1249.json:Expecting value: line 1 column 1 (char 0)
AA1302.json:Expecting value: line 1 column 1 (char 0)
AA1136.json:Expecting value: line 1 column 1 (char 0)
AA1248.json:Expecting value: line 1 column 1 (char 0)
AA1286.json:Expecting value: line 1 column 1 (char 0)
AA1123.json:Expecting value: line 1 column 1 (char 0)
AA1143.json:Expecting value: line 1 column 1 (char 0)
AA1160.json:Expecting value: line 1 column 1 (char 0)


In [72]:
df.to_csv('../data/metadata/morpho_metadata_raw.csv')

In [74]:
df

Unnamed: 0,neuron_id,neuron_name,archive,note,age_scale,gender,age_classification,brain_region,cell_type,species,...,max_age,min_weight,max_weight,png_url,reference_pmid,reference_doi,physical_Integrity,_links.self.href,_links.measurements.href,_links.persistence_vector.href
0,85207,AA0088,MouseLight,imaged at ~0.3 x 0.3 x 1 micron resolution wit...,Month,Female,adult,"[hypothalamus, Paraventricular hypothalamic nu...","[principal cell, projection]",mouse,...,6.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[27862192],[10.1002/jnr.23978],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/85207,http://neuromorpho.org/api/morphometry/id/85207,http://neuromorpho.org/api/pvec/id/85207
1,260072,AA1067,MouseLight,,Month,Female,adult,"[hypothalamus, Posterior hypothalamic nucleus,...","[principal cell, projection]",mouse,...,2.0,25.0,25.0,,[30859571],[10.1101/537233],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/260072,http://neuromorpho.org/api/morphometry/id/260072,http://neuromorpho.org/api/pvec/id/260072
2,121606,AA0846,MouseLight,,Month,Female,adult,"[neocortex, frontal, secondary motor, layer 5]","[principal cell, pyramidal, projection]",mouse,...,2.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[-8],[10.2139/ssrn.3330557],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/121606,http://neuromorpho.org/api/morphometry/id/121606,http://neuromorpho.org/api/pvec/id/121606
3,85032,AA0198,MouseLight,imaged at ~0.3 x 0.3 x 1 micron resolution wit...,Month,Female,adult,"[hippocampus, dentate gyrus]","[principal cell, granule, projection]",mouse,...,6.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[27862192],[10.1002/jnr.23978],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/85032,http://neuromorpho.org/api/morphometry/id/85032,http://neuromorpho.org/api/pvec/id/85032
4,121561,AA0779,MouseLight,,Month,Female,adult,"[hypothalamus, lateral hypothalamic area]","[principal cell, projection]",mouse,...,2.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[-8],[10.2139/ssrn.3330557],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/121561,http://neuromorpho.org/api/morphometry/id/121561,http://neuromorpho.org/api/pvec/id/121561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,121671,AA0692,MouseLight,,Month,Female,adult,"[thalamus, Ventral anterior-lateral complex of...","[principal cell, projection]",mouse,...,2.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[-8],[10.2139/ssrn.3330557],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/121671,http://neuromorpho.org/api/morphometry/id/121671,http://neuromorpho.org/api/pvec/id/121671
1107,121729,AA0630,MouseLight,,Month,Female,adult,"[neocortex, frontal, primary motor, layer 6a]","[principal cell, projection]",mouse,...,2.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[-8],[10.2139/ssrn.3330557],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/121729,http://neuromorpho.org/api/morphometry/id/121729,http://neuromorpho.org/api/pvec/id/121729
1108,121670,AA0355,MouseLight,,Month,Female,adult,"[thalamus, Ventral anterior-lateral complex of...","[principal cell, projection]",mouse,...,2.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[-8],[10.2139/ssrn.3330557],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/121670,http://neuromorpho.org/api/morphometry/id/121670,http://neuromorpho.org/api/pvec/id/121670
1109,121883,AA0328,MouseLight,,Month,Female,adult,"[neocortex, frontal, secondary motor, layer 2-3]","[principal cell, projection]",mouse,...,2.0,25.0,25.0,http://neuromorpho.org/images/imageFiles/Mouse...,[-8],[10.2139/ssrn.3330557],"Dendrites Moderate, Axon Complete",http://neuromorpho.org/api/neuron/id/121883,http://neuromorpho.org/api/morphometry/id/121883,http://neuromorpho.org/api/pvec/id/121883


In [62]:
df[['neuron_name','cell_type']]

Unnamed: 0,neuron_name,cell_type
0,AA0088,"[principal cell, projection]"
1,AA1067,"[principal cell, projection]"
2,AA0846,"[principal cell, pyramidal, projection]"
3,AA0198,"[principal cell, granule, projection]"
4,AA0779,"[principal cell, projection]"
...,...,...
1106,AA0692,"[principal cell, projection]"
1107,AA0630,"[principal cell, projection]"
1108,AA0355,"[principal cell, projection]"
1109,AA0328,"[principal cell, projection]"


In [70]:
df[df.neuron_name=='AA0998'].cell_type

949    [principal cell, granule, projection]
Name: cell_type, dtype: object

In [66]:
np.unique(np.concatenate(df.cell_type),return_counts=True)

(array(['Purkinje', 'cervicothalamic tract', 'granule', 'interneuron',
        'principal cell', 'projection', 'pyramidal'], dtype='<U21'),
 array([  45,    1,   41,   17, 1094, 1092,  616]))

In [67]:
np.unique(df.cell_type,return_counts=True)

(array([list(['interneuron']),
        list(['principal cell', 'Purkinje', 'projection']),
        list(['principal cell', 'granule', 'projection']),
        list(['principal cell', 'projection']),
        list(['principal cell', 'projection', 'cervicothalamic tract']),
        list(['principal cell', 'projection', 'pyramidal']),
        list(['principal cell', 'pyramidal']),
        list(['principal cell', 'pyramidal', 'projection'])], dtype=object),
 array([ 17,  45,  41, 391,   1,   1,   2, 613]))

In [111]:
df2 = df[['neuron_name','cell_type','brain_region']]
df2 = df2.explode('cell_type')
df2 = df2.explode('brain_region')
df2_grouped = df2.groupby(['cell_type','brain_region'])

In [119]:
df2.groupby(['cell_type','brain_region']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,neuron_name
cell_type,brain_region,Unnamed: 2_level_1
Purkinje,Purkinje layer,4
Purkinje,ansiform lobule,4
Purkinje,anterior,2
Purkinje,central lobule,5
Purkinje,cerebellar cortex,4
Purkinje,cerebellum,45
Purkinje,copula pyramidis,4
Purkinje,culmen,2
Purkinje,declive VI,1
Purkinje,flocculus,2


In [116]:
df2_grouped.count().head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,neuron_name
cell_type,brain_region,Unnamed: 2_level_1
Purkinje,Purkinje layer,4
Purkinje,ansiform lobule,4
Purkinje,anterior,2
Purkinje,central lobule,5
Purkinje,cerebellar cortex,4
Purkinje,cerebellum,45
Purkinje,copula pyramidis,4
Purkinje,culmen,2
Purkinje,declive VI,1
Purkinje,flocculus,2


In [105]:
pd.set_option('display.max_rows', None)

In [113]:
df2_grouped['neuron_name'].apply(list).reset_index()

Unnamed: 0,cell_type,brain_region,neuron_name
0,Purkinje,Purkinje layer,"[AA0952, AA0977, AA0968, AA0964]"
1,Purkinje,ansiform lobule,"[AA1020, AA1019, AA1024, AA1025]"
2,Purkinje,anterior,"[AA0986, AA0969]"
3,Purkinje,central lobule,"[AA0972, AA0988, AA0971, AA0961, AA0983]"
4,Purkinje,cerebellar cortex,"[AA0952, AA0977, AA0968, AA0964]"
5,Purkinje,cerebellum,"[AA0970, AA0965, AA0979, AA0993, AA0952, AA097..."
6,Purkinje,copula pyramidis,"[AA0965, AA0981, AA0966, AA0976]"
7,Purkinje,culmen,"[AA0986, AA0969]"
8,Purkinje,declive VI,[AA1026]
9,Purkinje,flocculus,"[AA0963, AA0962]"


In [47]:
max([len(i) for i in df['cell_type']])

3

In [54]:
max([len(i) for i in df['brain_region']])

6

In [52]:
max_level_in_cell_type = np.max([len(i) for i in df['cell_type']])
max_level_in_cell_type

3