# Allen data download and visualization

## Electrophysiology and morphology data from the Allen Brain data portal

### Purpose of this notebook

1. Download fresh data from the Allen data portal.
1. Extract the necessary features (by deleting unneeded columns).
1. Convert the Cre line / reporter status tagging to a binary label (E vs. I, i.e. excitatory vs. inhibitory).
1. Save a CSV file.
1. Visualize some features for verification.

### Requirements for this notebook
* __allensdk__ for python2.7
* [Reference notebook](http://alleninstitute.github.io/AllenSDK/_static/examples/nb/cell_types.html)

In [1]:
from allensdk.core.cell_types_cache import CellTypesCache
from allensdk.api.queries.cell_types_api import CellTypesApi
from allensdk.core.cell_types_cache import ReporterStatus as RS
import pandas as pd

In [2]:
# Create the Cell Types cache object; its manifest file is ./Allen_meta/manifest.json
# (presumably downloaded data is stored alongside the manifest -- confirm against AllenSDK docs).
ctc = CellTypesCache(manifest_file='./Allen_meta/manifest.json')

In [3]:
# Retrieve the metadata records for all cells (no filters are passed)
# and report how many came back.
cells = ctc.get_cells()
cell_count = len(cells)
print("All cells: %d" % cell_count)

All cells: 2228


In [4]:
# Fetch the electrophysiology feature records and put them in a table.
ephys_features = ctc.get_ephys_features()
ef_df = pd.DataFrame(data=ephys_features)

# Row count first, then a preview of the leading rows.
print(ef_df.shape[0])
ef_df.head()

2228


Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_t_ramp,trough_t_short_square,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest
0,,134.7,22.697498,0.08335459,1.18768,13.2952,1.025916,-56.375004,-57.38542,-57.431251,...,13.29568,1.13478,-56.593754,-57.739586,-74.143753,3.029695,3.061646,2.969821,-80.46875,-73.553391
1,,,-24.887498,-3.9136299999999995e-19,1.09984,20.650105,1.02546,-54.0,-54.828129,-54.656254,...,20.650735,1.16094,-55.406254,-55.242191,-73.5,2.441895,2.245653,2.231575,-84.406258,-73.056595
2,0.00977,39.0448,-46.765002,0.5267857,1.15784,2.55131,1.025387,-59.5,-58.234378,-59.940975,...,2.55196,1.089851,-60.0625,-58.570314,-61.371531,2.023762,2.162878,2.006406,-93.375008,-60.277321
3,-0.007898,117.816429,5.99625,0.1542553,1.989165,9.572025,1.028733,-47.53125,-50.359375,-65.5,...,9.576308,1.423229,-49.406254,-52.718752,-75.273443,3.105931,3.491663,1.733896,-87.65625,-75.205559
4,0.022842,68.321429,14.91,0.1714041,1.08198,2.46288,1.02562,-48.437504,-46.520837,-51.406253,...,2.490433,1.47969,-53.000004,-54.645837,-64.250003,3.28576,3.363504,4.234701,-81.625008,-63.474991


In [6]:
# Index the cell metadata records by their 'id' so each ephys row can be
# matched to its cell through 'specimen_id'.
cell_index = {cell['id']: cell for cell in cells}
matched_cells = [cell_index[specimen_id] for specimen_id in ef_df['specimen_id']]

# Per-row metadata lists, in the same order as the rows of ef_df.
reporter_status = [cell['reporter_status'] for cell in matched_cells]
tg_line = [cell['transgenic_line'] for cell in matched_cells]
hemisphere = [cell['hemisphere'] for cell in matched_cells]

# Only the transgenic line is attached here; the other two lists are
# added as columns in later cells.
ef_df['transgenic_line'] = pd.Series(tg_line, index=ef_df.index)
ef_df.head()

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_t_short_square,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line
0,,134.7,22.697498,0.08335459,1.18768,13.2952,1.025916,-56.375004,-57.38542,-57.431251,...,1.13478,-56.593754,-57.739586,-74.143753,3.029695,3.061646,2.969821,-80.46875,-73.553391,
1,,,-24.887498,-3.9136299999999995e-19,1.09984,20.650105,1.02546,-54.0,-54.828129,-54.656254,...,1.16094,-55.406254,-55.242191,-73.5,2.441895,2.245653,2.231575,-84.406258,-73.056595,
2,0.00977,39.0448,-46.765002,0.5267857,1.15784,2.55131,1.025387,-59.5,-58.234378,-59.940975,...,1.089851,-60.0625,-58.570314,-61.371531,2.023762,2.162878,2.006406,-93.375008,-60.277321,Ndnf-IRES2-dgCre
3,-0.007898,117.816429,5.99625,0.1542553,1.989165,9.572025,1.028733,-47.53125,-50.359375,-65.5,...,1.423229,-49.406254,-52.718752,-75.273443,3.105931,3.491663,1.733896,-87.65625,-75.205559,Htr3a-Cre_NO152
4,0.022842,68.321429,14.91,0.1714041,1.08198,2.46288,1.02562,-48.437504,-46.520837,-51.406253,...,1.47969,-53.000004,-54.645837,-64.250003,3.28576,3.363504,4.234701,-81.625008,-63.474991,Scnn1a-Tg3-Cre


In [7]:
# Attach the per-specimen reporter status list as a new column, positioned on ef_df's own index
ef_df['reporter_status'] = pd.Series(reporter_status, index=ef_df.index)
# Preview the first rows to check the new column
ef_df.head()

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status
0,,134.7,22.697498,0.08335459,1.18768,13.2952,1.025916,-56.375004,-57.38542,-57.431251,...,-56.593754,-57.739586,-74.143753,3.029695,3.061646,2.969821,-80.46875,-73.553391,,not applicable
1,,,-24.887498,-3.9136299999999995e-19,1.09984,20.650105,1.02546,-54.0,-54.828129,-54.656254,...,-55.406254,-55.242191,-73.5,2.441895,2.245653,2.231575,-84.406258,-73.056595,,not applicable
2,0.00977,39.0448,-46.765002,0.5267857,1.15784,2.55131,1.025387,-59.5,-58.234378,-59.940975,...,-60.0625,-58.570314,-61.371531,2.023762,2.162878,2.006406,-93.375008,-60.277321,Ndnf-IRES2-dgCre,positive
3,-0.007898,117.816429,5.99625,0.1542553,1.989165,9.572025,1.028733,-47.53125,-50.359375,-65.5,...,-49.406254,-52.718752,-75.273443,3.105931,3.491663,1.733896,-87.65625,-75.205559,Htr3a-Cre_NO152,positive
4,0.022842,68.321429,14.91,0.1714041,1.08198,2.46288,1.02562,-48.437504,-46.520837,-51.406253,...,-53.000004,-54.645837,-64.250003,3.28576,3.363504,4.234701,-81.625008,-63.474991,Scnn1a-Tg3-Cre,positive


In [8]:
# Attach the per-specimen hemisphere list as a new column, positioned on ef_df's own index
ef_df['hemisphere'] = pd.Series(hemisphere, index = ef_df.index)
# Preview the first rows to check the new column
ef_df.head()

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status,hemisphere
0,,134.7,22.697498,0.08335459,1.18768,13.2952,1.025916,-56.375004,-57.38542,-57.431251,...,-57.739586,-74.143753,3.029695,3.061646,2.969821,-80.46875,-73.553391,,not applicable,left
1,,,-24.887498,-3.9136299999999995e-19,1.09984,20.650105,1.02546,-54.0,-54.828129,-54.656254,...,-55.242191,-73.5,2.441895,2.245653,2.231575,-84.406258,-73.056595,,not applicable,right
2,0.00977,39.0448,-46.765002,0.5267857,1.15784,2.55131,1.025387,-59.5,-58.234378,-59.940975,...,-58.570314,-61.371531,2.023762,2.162878,2.006406,-93.375008,-60.277321,Ndnf-IRES2-dgCre,positive,right
3,-0.007898,117.816429,5.99625,0.1542553,1.989165,9.572025,1.028733,-47.53125,-50.359375,-65.5,...,-52.718752,-75.273443,3.105931,3.491663,1.733896,-87.65625,-75.205559,Htr3a-Cre_NO152,positive,left
4,0.022842,68.321429,14.91,0.1714041,1.08198,2.46288,1.02562,-48.437504,-46.520837,-51.406253,...,-54.645837,-64.250003,3.28576,3.363504,4.234701,-81.625008,-63.474991,Scnn1a-Tg3-Cre,positive,left


In [9]:
# Summarize the columns of the combined table (non-null counts and dtypes)
ef_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2228 entries, 0 to 2227
Data columns (total 58 columns):
adaptation                                1915 non-null float64
avg_isi                                   2085 non-null float64
electrode_0_pa                            2219 non-null float64
f_i_curve_slope                           2228 non-null float64
fast_trough_t_long_square                 2228 non-null float64
fast_trough_t_ramp                        2169 non-null float64
fast_trough_t_short_square                2228 non-null float64
fast_trough_v_long_square                 2228 non-null float64
fast_trough_v_ramp                        2169 non-null float64
fast_trough_v_short_square                2228 non-null float64
has_burst                                 2228 non-null bool
has_delay                                 2228 non-null bool
has_pause                                 2228 non-null bool
id                                        2228 non-null int64
input_re

In [10]:
# Columns that are dropped from the ephys feature table before analysis.
no_need_features = [
    'electrode_0_pa',
    'has_burst',
    'has_delay',
    'has_pause',
    'id',
    'rheobase_sweep_id',
    'rheobase_sweep_number',
    'seal_gohm',
    'specimen_id',
    'thumbnail_sweep_id',
    'slow_trough_t_long_square',
    'slow_trough_t_ramp',
    'slow_trough_v_long_square',
    'slow_trough_v_ramp',
]

In [12]:
# Build the reduced table by dropping the columns named in no_need_features
# (axis=1 means the labels refer to columns, not rows).
ef_df_extracted = ef_df.drop(no_need_features, axis=1)
# Check which columns remain
ef_df_extracted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2228 entries, 0 to 2227
Data columns (total 44 columns):
adaptation                                1915 non-null float64
avg_isi                                   2085 non-null float64
f_i_curve_slope                           2228 non-null float64
fast_trough_t_long_square                 2228 non-null float64
fast_trough_t_ramp                        2169 non-null float64
fast_trough_t_short_square                2228 non-null float64
fast_trough_v_long_square                 2228 non-null float64
fast_trough_v_ramp                        2169 non-null float64
fast_trough_v_short_square                2228 non-null float64
input_resistance_mohm                     2228 non-null float64
latency                                   2228 non-null float64
peak_t_long_square                        2228 non-null float64
peak_t_ramp                               2169 non-null float64
peak_t_short_square                       2228 non-null float

In [13]:
# Derived feature 'height': peak voltage minus fast-trough voltage,
# both taken from the short-square columns.
peak_v = ef_df_extracted['peak_v_short_square']
fast_trough_v = ef_df_extracted['fast_trough_v_short_square']
ef_df_extracted['height'] = peak_v - fast_trough_v

# Derived feature 'firing_rate': reciprocal of the average inter-spike
# interval, scaled by 1000 (presumably ms -> Hz; confirm avg_isi units).
# Rows with a missing avg_isi stay missing.
avg_isi = ef_df_extracted['avg_isi']
ef_df_extracted['firing_rate'] = (1 / avg_isi) * 1000
ef_df_extracted.head()

Unnamed: 0,adaptation,avg_isi,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,input_resistance_mohm,...,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status,hemisphere,height,firing_rate
0,,134.7,0.08335459,1.18768,13.2952,1.025916,-56.375004,-57.38542,-57.431251,54.894264,...,3.029695,3.061646,2.969821,-80.46875,-73.553391,,not applicable,left,105.506253,7.423905
1,,,-3.9136299999999995e-19,1.09984,20.650105,1.02546,-54.0,-54.828129,-54.656254,103.684656,...,2.441895,2.245653,2.231575,-84.406258,-73.056595,,not applicable,right,91.312508,
2,0.00977,39.0448,0.5267857,1.15784,2.55131,1.025387,-59.5,-58.234378,-59.940975,224.580336,...,2.023762,2.162878,2.006406,-93.375008,-60.277321,Ndnf-IRES2-dgCre,positive,right,84.329864,25.611605
3,-0.007898,117.816429,0.1542553,1.989165,9.572025,1.028733,-47.53125,-50.359375,-65.5,151.193344,...,3.105931,3.491663,1.733896,-87.65625,-75.205559,Htr3a-Cre_NO152,positive,left,75.828125,8.487781
4,0.022842,68.321429,0.1714041,1.08198,2.46288,1.02562,-48.437504,-46.520837,-51.406253,171.530176,...,3.28576,3.363504,4.234701,-81.625008,-63.474991,Scnn1a-Tg3-Cre,positive,left,88.476567,14.636696


In [14]:
def dis_EI(cre_line):
    """Label each Cre line as 'Inhibitory' or 'Excitatory'.

    Parameters
    ----------
    cre_line : iterable
        Transgenic (Cre) line names, e.g. a list or a pandas Series.

    Returns
    -------
    list of str
        One label per input item, in input order: 'Inhibitory' when the
        item equals one of the lines listed below, otherwise 'Excitatory'.

    Notes
    -----
    Anything that is not in the inhibitory list is labelled 'Excitatory',
    including missing values (None / NaN) and unknown line names. Callers
    that need a clean E/I split should filter such rows themselves.
    """
    # Cre lines treated as inhibitory; a tuple keeps the plain equality
    # comparison of the original chained `==` tests.
    inhibitory_lines = (
        'Pvalb-IRES-Cre', 'Sst-IRES-Cre',
        'Htr3a-Cre_NO152', 'Vip-IRES-Cre',
        'Gad2-IRES-Cre', 'Chat-IRES-Cre-neo',
        'Chrna2-Cre_OE25', 'Nkx2-1-CreERT2',
    )
    # Build the result in a single pass instead of re-allocating the list
    # with `output = output + [...]` on every item.
    return ['Inhibitory' if item in inhibitory_lines else 'Excitatory'
            for item in cre_line]
# Attach the E/I label as a Series placed explicitly on the frame's own index
# (the same pattern used for the other metadata columns), rather than as a
# fresh DataFrame that would be aligned on a new 0..n-1 index.
ef_df_extracted['binary_neuron'] = pd.Series(
    dis_EI(ef_df_extracted['transgenic_line']), index=ef_df_extracted.index)
ef_df_extracted.tail()

Unnamed: 0,adaptation,avg_isi,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,input_resistance_mohm,...,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status,hemisphere,height,firing_rate,binary_neuron
2223,0.003986,20.891111,0.558727,1.256315,14.493233,1.024111,-59.406254,-58.531254,-61.912501,101.41624,...,1.354347,1.47502,-85.468758,-78.499512,Rorb-IRES2-Cre,negative,left,82.393752,47.867248,Excitatory
2224,0.179194,245.906667,0.057143,1.19876,9.888127,1.026348,-47.0,-44.468754,-47.493754,66.745424,...,3.817988,4.980603,-84.218758,-72.547661,,not applicable,left,93.737504,4.066584,Excitatory
2225,0.064075,98.066667,0.179114,1.180315,3.36348,1.026457,-47.593754,-47.020835,-52.979168,201.156832,...,3.982813,3.957265,-94.96875,-72.988518,Cux2-CreERT2,positive,left,92.69792,10.197145,Excitatory
2226,0.015452,52.414667,0.17621,1.21698,2.176593,1.025509,-46.34375,-46.687503,-50.450895,283.386752,...,3.000039,2.261698,-98.875,-59.984798,Vip-IRES-Cre,positive,right,61.995538,19.078629,Inhibitory
2227,0.227778,72.46,0.069643,1.10916,5.296718,1.024698,-50.281254,-49.906253,-52.937501,116.282496,...,3.337449,3.804038,-82.343758,-69.373192,Cux2-CreERT2,positive,right,97.135419,13.800718,Excitatory


In [16]:
# Keep only the rows whose reporter status is 'positive', then count them.
is_reporter_positive = ef_df_extracted['reporter_status'] == 'positive'
ef_df_cre = ef_df_extracted[is_reporter_positive]
len(ef_df_cre)

1813

In [17]:
# Number of reporter-positive cells per transgenic line, shown as a one-column table.
ef_df_cre['transgenic_line'].value_counts().to_frame()

Unnamed: 0,transgenic_line
Pvalb-IRES-Cre,216
Htr3a-Cre_NO152,161
Rorb-IRES2-Cre,132
Sst-IRES-Cre,122
Vip-IRES-Cre,94
Ndnf-IRES2-dgCre,93
Scnn1a-Tg3-Cre,87
Rbp4-Cre_KL100,83
Nr5a1-Cre,82
Cux2-CreERT2,78


In [18]:
# Number of reporter-positive cells per E/I label, shown as a one-column table.
ef_df_cre['binary_neuron'].value_counts().to_frame()

Unnamed: 0,binary_neuron
Excitatory,1017
Inhibitory,796


## Caution!!

* Because the Allen database has since updated its cell types, the data no longer match our current models. We built our models in early 2018 (Feb–Mar). So as not to disturb the models already built, the updated data was only checked here, not downloaded for use.
* For the current model, we keep using the previously downloaded data (1802.. or ephys_data, cre_...csv).