# Clustering neuron data
## Electrophysiology and morphology data from the Allen Brain data portal

### Purpose of this notebook
1. Download fresh data from the Allen data portal.
1. Extract the necessary features (by deleting unneeded columns).
1. Tag each cell with a binary class (E vs. I, i.e. excitatory vs. inhibitory) based on its Cre line.
1. Save the results as CSV files.

## This notebook was created in February 2018. If you rerun the cells, you will get different results because the Allen database has since been updated.
## New notebook can be checked [here](./allen_data_download_2018JUL.ipynb)


Requirements for this notebook
* __allensdk__ for python2.7
* [Reference notebook](http://alleninstitute.github.io/AllenSDK/_static/examples/nb/cell_types.html)

In [1]:
from allensdk.core.cell_types_cache import CellTypesCache
from allensdk.api.queries.cell_types_api import CellTypesApi
from allensdk.core.cell_types_cache import ReporterStatus as RS
import pandas as pd

In [2]:
# Cell Types cache object; the manifest path is relative to the directory
# the notebook is run from.
ctc = CellTypesCache(
    manifest_file='cell_types/manifest.json',
)

In [3]:
# Fetch the records for all cells (get_cells is called without arguments)
# and report how many were returned.
cells = ctc.get_cells()
print("All cells: %d" % len(cells))

All cells: 1337


In [4]:
# Pull the electrophysiology feature records from the cache and tabulate
# them as a DataFrame.
ephys_features = ctc.get_ephys_features()
ef_df = pd.DataFrame(data=ephys_features)
print(ef_df.shape[0])  # number of rows in the feature table
ef_df.head()

1337


Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_t_ramp,trough_t_short_square,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest
0,0.093021,81.49875,-6.184375,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,...,2.804613,1.333809,-56.531254,-56.812502,-69.306253,3.904747,4.13113,3.899439,-85.156258,-68.233376
1,0.064384,76.283333,26.135,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,...,4.005453,1.344913,-48.843754,-50.093753,-67.416669,4.231335,4.299535,3.939083,-78.59375,-66.085068
2,,,-4.600001,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,...,9.10336,1.197772,-49.53125,-49.968753,-75.656255,2.154281,1.486651,2.400687,-87.15625,-75.141449
3,0.048709,118.397143,-90.575616,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,...,7.204045,1.233473,-46.156254,-49.218753,-66.318753,2.026,1.827572,1.818158,-82.40625,-65.503044
4,0.042215,81.394545,13.65375,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,...,2.361778,1.691744,-52.09375,-57.614586,-73.3375,4.208485,4.274151,4.096372,-99.093758,-72.174675


In [5]:
# Lookup table from cell id to its full cell record, used below to attach
# cell metadata to the ephys feature rows.
cell_index = dict((cell['id'], cell) for cell in cells)

In [6]:
# Reporter status of each ephys row, looked up by the row's specimen_id
# (same order as ef_df).
reporter_status = [
    cell_index[specimen_id]['reporter_status']
    for specimen_id in ef_df['specimen_id']
]

In [7]:
# Transgenic line of each ephys row, looked up by the row's specimen_id
# (same order as ef_df).
tg_line = [
    cell_index[specimen_id]['transgenic_line']
    for specimen_id in ef_df['specimen_id']
]

In [8]:
# Hemisphere of each ephys row, looked up by the row's specimen_id
# (same order as ef_df).
hemisphere = [
    cell_index[specimen_id]['hemisphere']
    for specimen_id in ef_df['specimen_id']
]

In [9]:
# Attach the transgenic line as a new column; the Series reuses ef_df's own
# index so values line up row-for-row.
ef_df['transgenic_line'] = pd.Series(data=tg_line, index=ef_df.index)
ef_df.head()

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_t_short_square,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line
0,0.093021,81.49875,-6.184375,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,...,1.333809,-56.531254,-56.812502,-69.306253,3.904747,4.13113,3.899439,-85.156258,-68.233376,Cux2-CreERT2
1,0.064384,76.283333,26.135,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,...,1.344913,-48.843754,-50.093753,-67.416669,4.231335,4.299535,3.939083,-78.59375,-66.085068,Chrna2-Cre_OE25
2,,,-4.600001,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,...,1.197772,-49.53125,-49.968753,-75.656255,2.154281,1.486651,2.400687,-87.15625,-75.141449,
3,0.048709,118.397143,-90.575616,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,...,1.233473,-46.156254,-49.218753,-66.318753,2.026,1.827572,1.818158,-82.40625,-65.503044,Sst-IRES-Cre
4,0.042215,81.394545,13.65375,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,...,1.691744,-52.09375,-57.614586,-73.3375,4.208485,4.274151,4.096372,-99.093758,-72.174675,Cux2-CreERT2


In [10]:
# Attach the reporter status as a new column; the Series reuses ef_df's own
# index so values line up row-for-row.
ef_df['reporter_status'] = pd.Series(data=reporter_status, index=ef_df.index)
ef_df.head()

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status
0,0.093021,81.49875,-6.184375,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,...,-56.531254,-56.812502,-69.306253,3.904747,4.13113,3.899439,-85.156258,-68.233376,Cux2-CreERT2,cre reporter negative
1,0.064384,76.283333,26.135,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,...,-48.843754,-50.093753,-67.416669,4.231335,4.299535,3.939083,-78.59375,-66.085068,Chrna2-Cre_OE25,cre reporter positive
2,,,-4.600001,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,...,-49.53125,-49.968753,-75.656255,2.154281,1.486651,2.400687,-87.15625,-75.141449,,not applicable
3,0.048709,118.397143,-90.575616,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,...,-46.156254,-49.218753,-66.318753,2.026,1.827572,1.818158,-82.40625,-65.503044,Sst-IRES-Cre,cre reporter positive
4,0.042215,81.394545,13.65375,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,...,-52.09375,-57.614586,-73.3375,4.208485,4.274151,4.096372,-99.093758,-72.174675,Cux2-CreERT2,cre reporter positive


In [11]:
# Attach the hemisphere as a new column; the Series reuses ef_df's own index
# so values line up row-for-row.
ef_df['hemisphere'] = pd.Series(data=hemisphere, index=ef_df.index)
ef_df.head()

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status,hemisphere
0,0.093021,81.49875,-6.184375,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,...,-56.812502,-69.306253,3.904747,4.13113,3.899439,-85.156258,-68.233376,Cux2-CreERT2,cre reporter negative,left
1,0.064384,76.283333,26.135,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,...,-50.093753,-67.416669,4.231335,4.299535,3.939083,-78.59375,-66.085068,Chrna2-Cre_OE25,cre reporter positive,left
2,,,-4.600001,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,...,-49.968753,-75.656255,2.154281,1.486651,2.400687,-87.15625,-75.141449,,not applicable,left
3,0.048709,118.397143,-90.575616,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,...,-49.218753,-66.318753,2.026,1.827572,1.818158,-82.40625,-65.503044,Sst-IRES-Cre,cre reporter positive,right
4,0.042215,81.394545,13.65375,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,...,-57.614586,-73.3375,4.208485,4.274151,4.096372,-99.093758,-72.174675,Cux2-CreERT2,cre reporter positive,right


In [12]:
# Summarise column dtypes and non-null counts of the full table, including
# the three metadata columns added above.
ef_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 58 columns):
adaptation                                1096 non-null float64
avg_isi                                   1216 non-null float64
electrode_0_pa                            1329 non-null float64
f_i_curve_slope                           1337 non-null float64
fast_trough_t_long_square                 1337 non-null float64
fast_trough_t_ramp                        1301 non-null float64
fast_trough_t_short_square                1337 non-null float64
fast_trough_v_long_square                 1337 non-null float64
fast_trough_v_ramp                        1301 non-null float64
fast_trough_v_short_square                1337 non-null float64
has_burst                                 1337 non-null bool
has_delay                                 1337 non-null bool
has_pause                                 1337 non-null bool
id                                        1337 non-null int64
input_re

In [13]:
# Column labels removed from the feature table in the next step.
no_need_features = [
    'electrode_0_pa',
    'has_burst',
    'has_delay',
    'has_pause',
    'id',
    'rheobase_sweep_id',
    'rheobase_sweep_number',
    'seal_gohm',
    'specimen_id',
    'thumbnail_sweep_id',
    'slow_trough_t_long_square',
    'slow_trough_t_ramp',
    'slow_trough_v_long_square',
    'slow_trough_v_ramp',
]

In [14]:
# Build the working table: a new frame without the unneeded columns
# (ef_df itself is left unchanged).
ef_df_extracted = ef_df.drop(labels=no_need_features, axis='columns')

In [15]:
# Check which columns remain after the drop and how many non-null values
# each one has.
ef_df_extracted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 44 columns):
adaptation                                1096 non-null float64
avg_isi                                   1216 non-null float64
f_i_curve_slope                           1337 non-null float64
fast_trough_t_long_square                 1337 non-null float64
fast_trough_t_ramp                        1301 non-null float64
fast_trough_t_short_square                1337 non-null float64
fast_trough_v_long_square                 1337 non-null float64
fast_trough_v_ramp                        1301 non-null float64
fast_trough_v_short_square                1337 non-null float64
input_resistance_mohm                     1337 non-null float64
latency                                   1337 non-null float64
peak_t_long_square                        1337 non-null float64
peak_t_ramp                               1301 non-null float64
peak_t_short_square                       1337 non-null float

In [16]:
# Derived feature: peak voltage minus fast-trough voltage, both taken from
# the short-square columns.
ef_df_extracted['height'] = (
    ef_df_extracted['peak_v_short_square']
    - ef_df_extracted['fast_trough_v_short_square']
)
# Derived feature: reciprocal of the average inter-spike interval, scaled by
# 1000 (presumably avg_isi is in ms, making this Hz -- TODO confirm units).
# Rows with a missing avg_isi stay missing.
ef_df_extracted['firing_rate'] = (1 / ef_df_extracted['avg_isi']) * 1000

In [17]:
# Preview the table with the new 'height' and 'firing_rate' columns.
ef_df_extracted.head()

Unnamed: 0,adaptation,avg_isi,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,input_resistance_mohm,...,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status,hemisphere,height,firing_rate
0,0.093021,81.49875,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,180.243392,...,3.904747,4.13113,3.899439,-85.156258,-68.233376,Cux2-CreERT2,cre reporter negative,left,98.450006,12.270127
1,0.064384,76.283333,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,134.67648,...,4.231335,4.299535,3.939083,-78.59375,-66.085068,Chrna2-Cre_OE25,cre reporter positive,left,105.031258,13.109023
2,,,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,189.80176,...,2.154281,1.486651,2.400687,-87.15625,-75.141449,,not applicable,left,89.918754,
3,0.048709,118.397143,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,164.745728,...,2.026,1.827572,1.818158,-82.40625,-65.503044,Sst-IRES-Cre,cre reporter positive,right,81.287503,8.44615
4,0.042215,81.394545,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,289.12208,...,4.208485,4.274151,4.096372,-99.093758,-72.174675,Cux2-CreERT2,cre reporter positive,right,84.887503,12.285836


In [18]:
# Cre driver lines that are labelled 'Inhibitory'; every other value is
# labelled 'Excitatory'.
INHIBITORY_CRE_LINES = frozenset([
    'Pvalb-IRES-Cre',
    'Sst-IRES-Cre',
    'Htr3a-Cre_NO152',
    'Vip-IRES-Cre',
    'Gad2-IRES-Cre',
    'Chat-IRES-Cre-neo',
    'Chrna2-Cre_OE25',
    'Nkx2-1-CreERT2',
])


def dis_EI(cre_line, inhibitory_lines=INHIBITORY_CRE_LINES):
    """Label each Cre line as excitatory or inhibitory.

    Parameters
    ----------
    cre_line : iterable
        Transgenic (Cre) line names, one per cell.
    inhibitory_lines : collection, optional
        Line names to label 'Inhibitory'. Defaults to INHIBITORY_CRE_LINES.

    Returns
    -------
    list of str
        'Inhibitory' or 'Excitatory' for every input item, in input order.

    Notes
    -----
    Anything not listed in ``inhibitory_lines`` -- including missing values
    such as None or NaN -- is labelled 'Excitatory'.
    """
    # A single comprehension with a membership test replaces repeated list
    # concatenation, which copied the whole list on every iteration.
    return [
        'Inhibitory' if item in inhibitory_lines else 'Excitatory'
        for item in cre_line
    ]
# Assign the labels as a plain list so they are placed positionally, one per
# row. Wrapping them in a DataFrame would align on a fresh 0..n-1 index and
# give NaN for any row whose index label falls outside that range.
ef_df_extracted['binary_neuron'] = dis_EI(ef_df_extracted['transgenic_line'])

In [19]:
# Inspect the last rows to check the new 'binary_neuron' column.
ef_df_extracted.tail()

Unnamed: 0,adaptation,avg_isi,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,input_resistance_mohm,...,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status,hemisphere,height,firing_rate,binary_neuron
1332,0.033096,127.26,0.141466,1.30392,2.45347,1.027433,-51.062504,-51.468752,-51.916668,152.173744,...,4.406138,4.282831,-88.031258,-63.121693,,not applicable,left,102.000005,7.857929,Excitatory
1333,0.021331,55.8575,0.12619,1.14522,4.860593,1.024965,-46.09375,-47.020835,-48.49219,251.63056,...,3.710321,3.378424,-96.78125,-78.259277,Scnn1a-Tg3-Cre,cre reporter positive,left,89.195316,17.902699,Excitatory
1334,0.047917,112.735625,0.079324,1.16643,7.605778,1.025228,-57.000004,-57.572918,-58.762502,66.813688,...,3.010267,3.39661,-89.71875,-73.232635,,not applicable,left,99.037505,8.870311,Excitatory
1335,0.028293,84.403636,0.122825,1.13298,7.115233,1.025576,-44.125,-48.250003,-51.337503,165.862832,...,3.481667,4.269691,-93.0,-79.386765,Rorb-IRES2-Cre,cre reporter positive,left,92.687505,11.847831,Excitatory
1336,,6.26,0.010326,1.13888,15.9492,1.02512,-52.125004,-58.218753,-55.331253,62.743816,...,3.705363,3.634396,-78.687508,-72.151871,,not applicable,right,106.300006,159.744409,Excitatory


In [20]:
# Keep only the rows whose reporter status is 'cre reporter positive', then
# show how many remain.
ef_df_cre = ef_df_extracted.loc[
    ef_df_extracted['reporter_status'] == 'cre reporter positive'
]
len(ef_df_cre)

969

In [21]:
# Number of Cre-positive cells per transgenic line, shown as a one-column table.
ef_df_cre['transgenic_line'].value_counts().to_frame()

Unnamed: 0,transgenic_line
Pvalb-IRES-Cre,142
Rorb-IRES2-Cre,120
Sst-IRES-Cre,111
Htr3a-Cre_NO152,89
Cux2-CreERT2,73
Rbp4-Cre_KL100,72
Nr5a1-Cre,69
Scnn1a-Tg3-Cre,60
Ntsr1-Cre_GN220,48
Scnn1a-Tg2-Cre,36


In [22]:
# Number of Cre-positive cells per excitatory/inhibitory label, shown as a
# one-column table.
ef_df_cre['binary_neuron'].value_counts().to_frame()

Unnamed: 0,binary_neuron
Excitatory,522
Inhibitory,447


In [23]:
# Write the Cre-positive subset to CSV, leaving out the DataFrame index.
ef_df_cre.to_csv(
    'ephys_data_cre_hemisphere.csv',
    index=False,
)

In [24]:
# Write the full extracted table (all reporter statuses) to CSV, leaving out
# the DataFrame index.
ef_df_extracted.to_csv(
    'ephys_data_hemisphere.csv',
    index=False,
)

* `ephys_data_cre.csv`
- This file lacks the hemisphere column; it is the file used in further data processing.
- The hemisphere feature was tested only as a pilot.