# Clustering neuron data
## Electrophysiology and morphology data from the Allen Brain data portal

### Purpose of this notebook
1. Download fresh data from the Allen data portal.
1. Explore the data:
  1. density plot (vs Cre line)
  1. Box plot (vs Cre line)
1. Cluster neurons using unsupervised ML methods:
  1. hierarchical tree
  1. k-nearest
  1. etc
1. Compare the clustered neurons with their Cre_line property.

Requirements for this notebook
* __allensdk__ for python2.7
* [Reference notebook](http://alleninstitute.github.io/AllenSDK/_static/examples/nb/cell_types.html)

In [1]:
from allensdk.core.cell_types_cache import CellTypesCache
from allensdk.api.queries.cell_types_api import CellTypesApi
from allensdk.core.cell_types_cache import ReporterStatus as RS
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
# Create the cache object that every later cell queries. The manifest path is
# relative, so it resolves against the notebook's working directory.
ctc = CellTypesCache(
    manifest_file='cell_types/manifest.json',
)

In [3]:
# Fetch the metadata records for all cells (no filters passed) and report how
# many came back.
cells = ctc.get_cells()
print("All cells: %d" % (len(cells),))

All cells: 1337


In [4]:
# Load the electrophysiology feature records from the cache and wrap them in a
# DataFrame for the steps that follow.
ephys_features = ctc.get_ephys_features()
ef_df = pd.DataFrame(data=ephys_features)

# Row count first, then a preview of the first five rows.
print(ef_df.shape[0])
ef_df.head(5)

1337


Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_t_ramp,trough_t_short_square,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest
0,0.093021,81.49875,-6.184375,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,...,2.804613,1.333809,-56.531254,-56.812502,-69.306253,3.904747,4.13113,3.899439,-85.156258,-68.233376
1,0.064384,76.283333,26.135,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,...,4.005453,1.344913,-48.843754,-50.093753,-67.416669,4.231335,4.299535,3.939083,-78.59375,-66.085068
2,,,-4.600001,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,...,9.10336,1.197772,-49.53125,-49.968753,-75.656255,2.154281,1.486651,2.400687,-87.15625,-75.141449
3,0.048709,118.397143,-90.575616,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,...,7.204045,1.233473,-46.156254,-49.218753,-66.318753,2.026,1.827572,1.818158,-82.40625,-65.503044
4,0.042215,81.394545,13.65375,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,...,2.361778,1.691744,-52.09375,-57.614586,-73.3375,4.208485,4.274151,4.096372,-99.093758,-72.174675


In [5]:
# Lookup table from a cell record's 'id' to the full record, so per-cell
# metadata can be fetched by id in later cells.
cell_index = dict((record['id'], record) for record in cells)

In [7]:
# For each row of ef_df (in row order), look up the matching cell record by
# specimen_id and collect its 'reporter_status' value.
reporter_status = list(
    cell_index[specimen_id]['reporter_status']
    for specimen_id in ef_df['specimen_id']
)

In [8]:
# For each row of ef_df (in row order), look up the matching cell record by
# specimen_id and collect its 'transgenic_line' value.
tg_line = [
    cell_index[specimen_id]['transgenic_line']
    for specimen_id in ef_df['specimen_id']
]

In [9]:
# Attach the transgenic line values as a new column. The Series is built on
# ef_df's own index so each value lands on the row it was looked up for.
ef_df['transgenic_line'] = pd.Series(data=tg_line, index=ef_df.index)
ef_df.head(5)

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_t_short_square,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line
0,0.093021,81.49875,-6.184375,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,...,1.333809,-56.531254,-56.812502,-69.306253,3.904747,4.13113,3.899439,-85.156258,-68.233376,Cux2-CreERT2
1,0.064384,76.283333,26.135,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,...,1.344913,-48.843754,-50.093753,-67.416669,4.231335,4.299535,3.939083,-78.59375,-66.085068,Chrna2-Cre_OE25
2,,,-4.600001,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,...,1.197772,-49.53125,-49.968753,-75.656255,2.154281,1.486651,2.400687,-87.15625,-75.141449,
3,0.048709,118.397143,-90.575616,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,...,1.233473,-46.156254,-49.218753,-66.318753,2.026,1.827572,1.818158,-82.40625,-65.503044,Sst-IRES-Cre
4,0.042215,81.394545,13.65375,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,...,1.691744,-52.09375,-57.614586,-73.3375,4.208485,4.274151,4.096372,-99.093758,-72.174675,Cux2-CreERT2


In [10]:
# Attach the reporter status values as a new column. The Series is built on
# ef_df's own index so each value lands on the row it was looked up for.
ef_df['reporter_status'] = pd.Series(data=reporter_status, index=ef_df.index)
ef_df.head(5)

Unnamed: 0,adaptation,avg_isi,electrode_0_pa,f_i_curve_slope,fast_trough_t_long_square,fast_trough_t_ramp,fast_trough_t_short_square,fast_trough_v_long_square,fast_trough_v_ramp,fast_trough_v_short_square,...,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,transgenic_line,reporter_status
0,0.093021,81.49875,-6.184375,0.205899,1.126905,2.7687,1.025464,-51.187504,-50.328125,-55.312504,...,-56.531254,-56.812502,-69.306253,3.904747,4.13113,3.899439,-85.156258,-68.233376,Cux2-CreERT2,cre reporter negative
1,0.064384,76.283333,26.135,0.195247,1.10266,4.005247,1.024873,-48.71875,-49.875001,-51.500004,...,-48.843754,-50.093753,-67.416669,4.231335,4.299535,3.939083,-78.59375,-66.085068,Chrna2-Cre_OE25,cre reporter positive
2,,,-4.600001,0.055443,1.10482,9.092507,1.025024,-48.09375,-49.541669,-50.312503,...,-49.53125,-49.968753,-75.656255,2.154281,1.486651,2.400687,-87.15625,-75.141449,,not applicable
3,0.048709,118.397143,-90.575616,0.307589,1.06977,7.203945,1.024593,-45.968754,-49.093754,-53.800002,...,-46.156254,-49.218753,-66.318753,2.026,1.827572,1.818158,-82.40625,-65.503044,Sst-IRES-Cre,cre reporter positive
4,0.042215,81.394545,13.65375,0.162729,1.08975,2.336427,1.027302,-48.90625,-51.083335,-55.800002,...,-52.09375,-57.614586,-73.3375,4.208485,4.274151,4.096372,-99.093758,-72.174675,Cux2-CreERT2,cre reporter positive


In [11]:
# Keep only the rows whose reporter_status is exactly 'cre reporter positive',
# then show how many rows remain.
ef_df_cre = ef_df.loc[ef_df['reporter_status'].eq('cre reporter positive')]
ef_df_cre.shape[0]

969

In [12]:
# Save the filtered frame to a CSV file in the working directory; the
# DataFrame index is not written to the file.
ef_df_cre.to_csv(path_or_buf='ephys_data_cre.csv', index=False)