In [30]:
# libraries used
import boto3
import diff_classifier.aws as aws
import pandas as pd
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt
from diff_classifier.features import calculate_features
from os import listdir, getcwd, chdir
from os.path import isfile, join
import os

from matplotlib import colors as plt_colors

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from kneed import KneeLocator

from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

import shap

In [2]:
workbookDir = getcwd()
print('Current Notebook Dir: ' + workbookDir)
chdir(workbookDir) # Go to current workbook Dir
chdir('..')        # Go up one
print(f'Using current directory for loading data: {getcwd()}')
workbookDir = getcwd()

Current Notebook Dir: /Users/nelsschimek/Documents/Nance Lab/diff_predictor/notebooks
Using current directory for loading data: /Users/nelsschimek/Documents/Nance Lab/diff_predictor


In [3]:
dataset_path = workbookDir + '/region_feature_folder/'
filelist = [f for f in listdir(dataset_path) if isfile(join(dataset_path, f)) and 'feat' in f]
filelist

['feat_NT_slice_2_striatum_vid_4.csv',
 'feat_NT_slice_2_striatum_vid_5.csv',
 'feat_NT_slice_1_striatum_vid_1.csv',
 'feat_NT_slice_2_cortex_vid_4.csv',
 'feat_NT_slice_1_striatum_vid_3.csv',
 'feat_NT_slice_1_cortex_vid_10.csv',
 'feat_NT_slice_1_striatum_vid_2.csv',
 'feat_NT_slice_2_cortex_vid_5.csv',
 'feat_NT_slice_2_cortex_vid_1.csv',
 'feat_NT_slice_2_hippocampus_vid_2.csv',
 'feat_NT_slice_2_striatum_vid_2.csv',
 'feat_NT_slice_2_striatum_vid_3.csv',
 'feat_NT_slice_2_hippocampus_vid_3.csv',
 'feat_NT_slice_2_cortex_vid_2.csv',
 'feat_NT_slice_1_cortex_vid_8.csv',
 'feat_NT_slice_1_striatum_vid_5.csv',
 'feat_NT_slice_2_hippocampus_vid_1.csv',
 'feat_NT_slice_2_striatum_vid_1.csv',
 'feat_NT_slice_1_striatum_vid_4.csv',
 'feat_NT_slice_1_cortex_vid_9.csv',
 'feat_NT_slice_2_cortex_vid_3.csv',
 'feat_NT_slice_1_hippocampus_vid_1.csv',
 'feat_NT_slice_1_hippocampus_vid_3.csv',
 'feat_NT_slice_1_hippocampus_vid_2.csv',
 'feat_NT_slice_2_ganglia_vid_3.csv',
 'feat_NT_slice_2_gangl

In [4]:
fstats_tot = None
video_num = 0
for filename in filelist:
#     try:
        fstats = pd.read_csv(dataset_path + filename, encoding = "ISO-8859-1", index_col='Unnamed: 0')
        print('{} size: {}'.format(filename, fstats.shape))
        if 'cortex' in filename:
            fstats['region'] = pd.Series(fstats.shape[0]*['cortex'], index=fstats.index)
        elif 'striatum' in filename:
            fstats['region'] = pd.Series(fstats.shape[0]*['striatum'], index=fstats.index)
        elif 'ganglia' in filename:
            fstats['region'] = pd.Series(fstats.shape[0]*['ganglia'], index=fstats.index)
        elif 'thalamus' in filename:
            fstats['region'] = pd.Series(fstats.shape[0]*['thalamus'], index=fstats.index)
        elif 'hippocampus' in filename:
            fstats['region'] = pd.Series(fstats.shape[0]*['hippocampus'], index=fstats.index)
        else:
            print('Error, no target')
        fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
        if fstats_tot is None:
            fstats_tot = fstats
        else:
            fstats_tot = fstats_tot.append(fstats, ignore_index=True)
        video_num += 1

feat_NT_slice_2_striatum_vid_4.csv size: (10237, 67)
feat_NT_slice_2_striatum_vid_5.csv size: (13938, 67)
feat_NT_slice_1_striatum_vid_1.csv size: (2431, 67)
feat_NT_slice_2_cortex_vid_4.csv size: (1429, 67)
feat_NT_slice_1_striatum_vid_3.csv size: (1536, 67)
feat_NT_slice_1_cortex_vid_10.csv size: (4832, 67)
feat_NT_slice_1_striatum_vid_2.csv size: (2240, 67)
feat_NT_slice_2_cortex_vid_5.csv size: (2210, 67)
feat_NT_slice_2_cortex_vid_1.csv size: (1388, 67)
feat_NT_slice_2_hippocampus_vid_2.csv size: (46, 67)
feat_NT_slice_2_striatum_vid_2.csv size: (10500, 67)
feat_NT_slice_2_striatum_vid_3.csv size: (11355, 67)
feat_NT_slice_2_hippocampus_vid_3.csv size: (307, 67)
feat_NT_slice_2_cortex_vid_2.csv size: (1784, 67)
feat_NT_slice_1_cortex_vid_8.csv size: (1984, 67)
feat_NT_slice_1_striatum_vid_5.csv size: (2169, 67)
feat_NT_slice_2_hippocampus_vid_1.csv size: (250, 67)
feat_NT_slice_2_striatum_vid_1.csv size: (8314, 67)
feat_NT_slice_1_striatum_vid_4.csv size: (2177, 67)
feat_NT_slice_

In [5]:
fstats_tot.columns
fstats_tot.head()

Unnamed: 0,Track_ID,alpha,D_fit,kurtosis,asymmetry1,asymmetry2,asymmetry3,AR,elongation,boundedness,...,Mean Mean_Intensity,Std Mean_Intensity,Mean SN_Ratio,Std SN_Ratio,Mean Deff1,Std Deff1,Mean Deff2,Std Deff2,region,Video Number
0,0.0,0.01611765,26.711181,3.288428,0.945602,0.118247,0.372335,2.366365,0.577411,0.097441,...,,,0.767566,0.170583,0.959069,1.306709,0.100702,0.109127,striatum,0
1,1.0,1.567008e-08,0.619012,2.837829,0.300884,0.539952,0.04565,1.128631,0.11397,0.251949,...,,,0.944781,0.389032,0.196385,0.214294,0.018856,0.020472,striatum,0
2,2.0,0.3825253,0.025727,3.874284,0.03858,0.819546,0.00493,1.09121,0.083586,0.058152,...,,,0.668841,0.169089,1.263415,2.049538,0.255571,0.469424,striatum,0
3,3.0,,,32.947937,0.981209,0.068865,0.477143,6.559554,0.847551,0.017961,...,,,0.668841,0.169089,1.263415,2.049538,0.255571,0.469424,striatum,0
4,4.0,0.4294176,0.240604,2.30645,0.156974,0.657617,0.021563,1.523668,0.343689,0.013608,...,,,0.944781,0.389032,0.196385,0.214294,0.018856,0.020472,striatum,0


In [18]:
fstats_tot
features = [
    'alpha', # Fitted anomalous diffusion alpha exponenet
    'D_fit', # Fitted anomalous diffusion coefficient
    'kurtosis', # Kurtosis of track
    'asymmetry1', # Asymmetry of trajecory (0 for circular symmetric, 1 for linear)
    'asymmetry2', # Ratio of the smaller to larger principal radius of gyration
    'asymmetry3', # An asymmetric feature that accnts for non-cylindrically symmetric pt distributions
    'AR', # Aspect ratio of long and short side of trajectory's minimum bounding rectangle
    'elongation', # Est. of amount of extension of trajectory from centroid
    'boundedness', # How much a particle with Deff is restricted by a circular confinement of radius r
    'fractal_dim', # Measure of how complicated a self similar figure is
    'trappedness', # Probability that a particle with Deff is trapped in a region
    'efficiency', # Ratio of squared net displacement to the sum of squared step lengths
    'straightness', # Ratio of net displacement to the sum of squared step lengths
    'MSD_ratio', # MSD ratio of the track
    'frames', # Number of frames the track spans
    'Deff1', # Effective diffusion coefficient at 0.33 s
    'Deff2', # Effective diffusion coefficient at 3.3 s
    #'angle_mean', # Mean turning angle which is counterclockwise angle from one frame point to another
    #'angle_mag_mean', # Magnitude of the turning angle mean
    #'angle_var', # Variance of the turning angle
    #'dist_tot', # Total distance of the trajectory
   # 'dist_net', # Net distance from first point to last point
    #'progression', # Ratio of the net distance traveled and the total distance
    'Mean alpha', 
    'Mean D_fit', 
    'Mean kurtosis', 
    'Mean asymmetry1', 
    'Mean asymmetry2',
    'Mean asymmetry3', 
    'Mean AR',
    'Mean elongation', 
    'Mean boundedness',
    'Mean fractal_dim', 
    'Mean trappedness', 
    'Mean efficiency',
    'Mean straightness', 
    'Mean MSD_ratio', 
    'Mean Deff1', 
    'Mean Deff2',
    ]

target = 'region'           # prediction target (y)


linear_features = [
    'Mean Deff1',
    'Mean D_fit',
    'Mean fractal_dim',
    'Mean MSD_ratio',
    'Mean kurtosis',
    'Mean straightness'
    ]
    
ecm = fstats_tot
ecm = fstats_tot[features + [target] + ['X'] + ['Y']]
ecm = ecm[~ecm.isin([np.nan, np.inf, -np.inf]).any(1)] 

In [19]:
#--------------NOT-ADDED-----------------------------
def balance_data(df, target, **kwargs):
    if 'random_state' not in kwargs:
        random_state = 1
    else:
        random_state = kwargs['random_state']
    if isinstance(target, list):
        target = target[0]
    df_target = []
    bal_df = []
    for name in df[target].unique():
        df_target.append((name, df[df[target] == name]))
    print(f"Ratio before data balance ({':'.join([str(i[0]) for i in df_target])}) = {':'.join([str(len(i[1])) for i in df_target])}")
    for i in range(len(df_target)):
        ratio = min([len(i[1]) for i in df_target])/len(df_target[i][1])
        bal_df.append(df_target[i][1].sample(frac=ratio, random_state=random_state))
    print(f"Ratio after balance ({':'.join([str(i[0]) for i in df_target])}) = {':'.join([str(len(i)) for i in bal_df])}")
    return pd.concat(bal_df)
bal_ecm = balance_data(ecm, target, random_state=1)

Ratio before data balance (striatum:cortex:hippocampus:ganglia:thalamus) = 17401:6575:124:233:186
Ratio after balance (striatum:cortex:hippocampus:ganglia:thalamus) = 124:124:124:124:124


In [20]:
resolution = 128
assert not 2048%resolution and resolution >= 128, "resolution needs to be a factor of 2048 and > 128"
bins = list(range(0, 2048+1, resolution))
bin_labels = [int(i/resolution) for i in bins][:-1]
bal_ecm['binx'] = pd.cut(bal_ecm.X, bins, labels=bin_labels, include_lowest=True)
bal_ecm['biny'] = pd.cut(bal_ecm.Y, bins, labels=bin_labels, include_lowest=True)
bal_ecm['bins'] = (len(bins)-1)*bal_ecm['binx'].astype(np.int32) + bal_ecm['biny'].astype(np.int32)
bal_ecm = bal_ecm[np.isfinite(bal_ecm['bins'])]
bal_ecm['bins'] = bal_ecm['bins'].astype(int)

In [21]:
label_df = bal_ecm[target]
df2 = bal_ecm[features]
result = pd.concat([df2, label_df], axis=1)
#result_small = result.sample(5000)
result['region'].unique()

array(['striatum', 'cortex', 'hippocampus', 'ganglia', 'thalamus'],
      dtype=object)

In [22]:
result_cleaned = result[~result.isin([np.nan, np.inf, -np.inf]).any(1)]
X = result_cleaned[features]
print(len(result))
print(len(result_cleaned))
print(len(result_cleaned['region']))
print(len(X))
print()

ss = StandardScaler()
scaled_features = ss.fit_transform(X.values)
kmean = KMeans(n_clusters=5, init='k-means++').fit(scaled_features)
centroids = kmean.cluster_centers_

print('SSE value: ', kmean.inertia_)
print('Num iterations for convergence: ', kmean.n_iter_)
#spec = SpectralClustering(n_clusters=4).fit(X)

620
620
620
620

SSE value:  13473.2045886611
Num iterations for convergence:  21


In [23]:
def get_cluster_distributions(cluster_labels, df_with_target, n_clusters):

    for region in result_cleaned['region'].unique():
        idx = np.array(df_with_target['region'] == region)
        clust_labels = cluster_labels[idx]
        true_label = result_cleaned.iloc[idx]
        print('Region:', region)
        for i in range(0, n_clusters):
            ct = (clust_labels == i).sum()
            percent = '{:.2%}'.format(ct/len(clust_labels))
            print('Percent in cluster', i,'=', percent)
        print()
get_cluster_distributions(kmean.labels_, result_cleaned, 5)

Region: striatum
Percent in cluster 0 = 12.90%
Percent in cluster 1 = 33.87%
Percent in cluster 2 = 53.23%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 0.00%

Region: cortex
Percent in cluster 0 = 15.32%
Percent in cluster 1 = 9.68%
Percent in cluster 2 = 69.35%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 5.65%

Region: hippocampus
Percent in cluster 0 = 46.77%
Percent in cluster 1 = 12.90%
Percent in cluster 2 = 16.13%
Percent in cluster 3 = 1.61%
Percent in cluster 4 = 22.58%

Region: ganglia
Percent in cluster 0 = 20.97%
Percent in cluster 1 = 41.13%
Percent in cluster 2 = 29.03%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 8.87%

Region: thalamus
Percent in cluster 0 = 44.35%
Percent in cluster 1 = 11.29%
Percent in cluster 2 = 16.13%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 28.23%



In [24]:
agg = AgglomerativeClustering(n_clusters=5).fit(scaled_features)
get_cluster_distributions(agg.labels_, result_cleaned, 5)

Region: striatum
Percent in cluster 0 = 0.00%
Percent in cluster 1 = 0.81%
Percent in cluster 2 = 27.42%
Percent in cluster 3 = 62.90%
Percent in cluster 4 = 8.87%

Region: cortex
Percent in cluster 0 = 6.45%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 20.97%
Percent in cluster 3 = 71.77%
Percent in cluster 4 = 0.81%

Region: hippocampus
Percent in cluster 0 = 42.74%
Percent in cluster 1 = 4.03%
Percent in cluster 2 = 30.65%
Percent in cluster 3 = 17.74%
Percent in cluster 4 = 4.84%

Region: ganglia
Percent in cluster 0 = 12.90%
Percent in cluster 1 = 0.81%
Percent in cluster 2 = 28.23%
Percent in cluster 3 = 32.26%
Percent in cluster 4 = 25.81%

Region: thalamus
Percent in cluster 0 = 58.87%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 32.26%
Percent in cluster 3 = 8.06%
Percent in cluster 4 = 0.81%



In [25]:
spec = SpectralClustering(n_clusters=5).fit(scaled_features)
get_cluster_distributions(spec.labels_, result_cleaned, 5)

Region: striatum
Percent in cluster 0 = 100.00%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 0.00%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 0.00%

Region: cortex
Percent in cluster 0 = 100.00%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 0.00%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 0.00%

Region: hippocampus
Percent in cluster 0 = 99.19%
Percent in cluster 1 = 0.81%
Percent in cluster 2 = 0.00%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 0.00%

Region: ganglia
Percent in cluster 0 = 100.00%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 0.00%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 0.00%

Region: thalamus
Percent in cluster 0 = 100.00%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 0.00%
Percent in cluster 3 = 0.00%
Percent in cluster 4 = 0.00%



In [26]:
guass = GaussianMixture(n_components=5).fit_predict(scaled_features)
get_cluster_distributions(guass, result_cleaned, 5)

Region: striatum
Percent in cluster 0 = 30.65%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 56.45%
Percent in cluster 3 = 6.45%
Percent in cluster 4 = 6.45%

Region: cortex
Percent in cluster 0 = 6.45%
Percent in cluster 1 = 18.55%
Percent in cluster 2 = 40.32%
Percent in cluster 3 = 29.84%
Percent in cluster 4 = 4.84%

Region: hippocampus
Percent in cluster 0 = 25.00%
Percent in cluster 1 = 42.74%
Percent in cluster 2 = 5.65%
Percent in cluster 3 = 7.26%
Percent in cluster 4 = 19.35%

Region: ganglia
Percent in cluster 0 = 30.65%
Percent in cluster 1 = 22.58%
Percent in cluster 2 = 34.68%
Percent in cluster 3 = 0.81%
Percent in cluster 4 = 11.29%

Region: thalamus
Percent in cluster 0 = 9.68%
Percent in cluster 1 = 60.48%
Percent in cluster 2 = 4.84%
Percent in cluster 3 = 1.61%
Percent in cluster 4 = 23.39%



In [32]:
bay_gauss = BayesianGaussianMixture(n_components=5).fit_predict(scaled_features)
get_cluster_distributions(bay_gauss, result_cleaned, 5)

Region: striatum
Percent in cluster 0 = 41.13%
Percent in cluster 1 = 0.00%
Percent in cluster 2 = 4.03%
Percent in cluster 3 = 10.48%
Percent in cluster 4 = 44.35%

Region: cortex
Percent in cluster 0 = 13.71%
Percent in cluster 1 = 5.65%
Percent in cluster 2 = 18.55%
Percent in cluster 3 = 2.42%
Percent in cluster 4 = 59.68%

Region: hippocampus
Percent in cluster 0 = 16.13%
Percent in cluster 1 = 8.87%
Percent in cluster 2 = 56.45%
Percent in cluster 3 = 8.06%
Percent in cluster 4 = 10.48%

Region: ganglia
Percent in cluster 0 = 38.71%
Percent in cluster 1 = 1.61%
Percent in cluster 2 = 29.84%
Percent in cluster 3 = 6.45%
Percent in cluster 4 = 23.39%

Region: thalamus
Percent in cluster 0 = 7.26%
Percent in cluster 1 = 8.06%
Percent in cluster 2 = 75.00%
Percent in cluster 3 = 6.45%
Percent in cluster 4 = 3.23%



In [29]:
affinity = AffinityPropagation().fit(scaled_features)
affinity.cluster_centers_.shape

(63, 33)