In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from caveclient import CAVEclient
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = CAVEclient('minnie65_public')
client.materialize.version = 661

# Load data on proofread neurons and brain area

In [3]:
funcm = pd.read_pickle('../../data_full/funcmatch_brain_area.pkl')

#Drop unlabelled neurons
funcm = funcm[funcm['pt_root_id']!=0]

#Drop neurons recorded in more than one scan
funcm = funcm.drop_duplicates(subset='pt_root_id', keep = 'first')

In [4]:
funcm

Unnamed: 0,id,pt_root_id,session,scan_idx,unit_id,pt_position,brain_area
0,13434,864691135738685297,7,5,4909,"[338720, 125232, 19589]",AL
1,3045,864691135614842827,7,4,9575,"[136400, 170640, 17951]",V1
2,1474,864691135502985397,9,4,8066,"[189472, 119296, 26037]",V1
3,7915,864691135387371905,6,7,6413,"[209328, 174304, 20004]",V1
4,11589,864691135661410544,9,3,7748,"[185088, 157776, 15691]",V1
...,...,...,...,...,...,...,...
13918,12362,864691135491645535,9,4,4696,"[194400, 116928, 24042]",V1
13919,693,864691135952938019,9,4,5187,"[162064, 116864, 26169]",V1
13920,2323,864691136144304948,9,4,4693,"[194528, 118128, 23566]",V1
13922,2840,864691135394864117,9,3,2456,"[240528, 154032, 25770]",V1


In [4]:
target_classes = list(set(funcm['brain_area']))

print(f'Target classes to categorise {len(target_classes)}: {target_classes}')

Target classes to categorise 4: ['LM', 'V1', 'AL', 'RL']


# Identify inhibitory neurons of interest

In [5]:
in_ex = client.materialize.query_table('baylor_log_reg_cell_type_coarse_v1')
in_ex.head(2)

Unnamed: 0,id,created,valid,target_id,classification_system,cell_type,id_ref,created_ref,valid_ref,volume,pt_supervoxel_id,pt_root_id,pt_position,bb_start_position,bb_end_position
0,25718,2023-03-22 18:05:52.744496+00:00,t,17115,baylor_log_reg_cell_type_coarse,inhibitory,17115,2020-09-28 22:41:18.237823+00:00,t,268.646482,75934403318291307,864691135635239593,"[80992, 109360, 15101]","[nan, nan, nan]","[nan, nan, nan]"
1,25581,2023-03-22 18:05:52.650844+00:00,t,17816,baylor_log_reg_cell_type_coarse,inhibitory,17816,2020-09-28 22:42:54.932823+00:00,t,264.795587,75090047309035210,864691135618175635,"[74880, 110032, 16883]","[nan, nan, nan]","[nan, nan, nan]"


In [6]:
#Select only inhibitory neurons
inhibitory = in_ex[in_ex['cell_type'] == 'inhibitory']
#Only keep columns of interest
inhibitory_clean = inhibitory[['id', 'pt_root_id', 'pt_position', 'cell_type']].copy()
#Drop neurons recorded more than once
inhibitory_clean = inhibitory_clean.drop_duplicates(subset='pt_root_id', keep = 'first')

# Build train, validation, test and inference datasets

In [7]:
#Identify neurons that are in both sets of functionally matched and labelled as inhibitory (potentially a labelling error)
intersec = set(funcm['pt_root_id']).intersection(set(inhibitory_clean['pt_root_id']))

#Generate set of neurons exclding these with labelling uncertainty
func_set = set(funcm['pt_root_id']).difference(intersec)
inhib_set = set(inhibitory_clean['pt_root_id']).difference(intersec)

#Clean the functional and inhibitory data 
funcm_data = funcm[funcm['pt_root_id'].isin(func_set)]
inhibitory_data = inhibitory_clean[inhibitory_clean['pt_root_id'].isin(inhib_set)]

In [18]:
#Encoder for target categorical labels
encoder = LabelEncoder()
encoder.fit(['LM', 'V1', 'AL', 'RL'])


In [51]:
#Check class distribution
funcm_data.groupby('brain_area').count()

Unnamed: 0_level_0,id,pt_root_id,session,scan_idx,unit_id,pt_position
brain_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL,870,870,870,870,870,870
LM,29,29,29,29,29,29
RL,3158,3158,3158,3158,3158,3158
V1,7913,7913,7913,7913,7913,7913


In [33]:
#Define features and outputs
X = np.array(list(funcm_data['pt_position']))
y = encoder.transform(funcm_data['brain_area'])

#Split in trian and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Train and test models

In [52]:
#SVM with nonlinear kernel
svm = SVC(kernel = 'poly')
svm.fit(X_train, y_train)

In [55]:
pred = svm.predict(X_test)

In [56]:
print(classification_report(y_test, pred, target_names=encoder.classes_, zero_division= 0))

              precision    recall  f1-score   support

          AL       0.97      0.94      0.95       171
          LM       0.00      0.00      0.00         7
          RL       0.95      0.98      0.96       626
          V1       0.99      0.99      0.99      1590

    accuracy                           0.98      2394
   macro avg       0.73      0.73      0.73      2394
weighted avg       0.98      0.98      0.98      2394



Classes are quite imbalanced so lets try with a model that handles that better like random forest

In [57]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train)

In [58]:
forestpreds = forest.predict(X_test)

In [59]:
print(classification_report(y_test, forestpreds, target_names=encoder.classes_))

              precision    recall  f1-score   support

          AL       0.96      0.97      0.97       171
          LM       0.86      0.86      0.86         7
          RL       0.98      0.98      0.98       626
          V1       1.00      1.00      1.00      1590

    accuracy                           0.99      2394
   macro avg       0.95      0.95      0.95      2394
weighted avg       0.99      0.99      0.99      2394



# Inference on inhibitory neurons

In [61]:
#Input features
X_inhib = np.array(list(inhibitory_data['pt_position']))

In [64]:
#Predict the labels
preds_inhib = forest.predict(X_inhib)

In [66]:
#Convert them back to the ctaegorical lables
inhib_brain_areas = encoder.inverse_transform(preds_inhib)

In [67]:
#Format the predictions to the rest of the data
inhib_data_ba = inhibitory_data.copy()
inhib_data_ba['brain_area'] = inhib_brain_areas

In [69]:
#Distributions look like what we saw in the previous data
inhib_data_ba.groupby('brain_area').count()

Unnamed: 0_level_0,id,pt_root_id,pt_position,cell_type
brain_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AL,672,672,672,672
LM,49,49,49,49
RL,1510,1510,1510,1510
V1,3494,3494,3494,3494


In [70]:
#Save data
inhib_data_ba.to_pickle('../con-con-models/Data/inhibitory_nurons_ba.pkl')