In [None]:
from diff_predictor import data_process, predxgboost, spatial
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 

from os import listdir, getcwd, chdir
from os.path import isfile, join
import os
from sklearn.preprocessing import scale, StandardScaler
from numpy.random import permutation


from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, recall_score, precision_score, f1_score
import operator
import xgboost as xgb
import shap
from xgboost.training import CVPack
from xgboost import callback
from xgboost.core import CallbackEnv
from xgboost.core import EarlyStopException
from xgboost.core import STRING_TYPES

from diff_classifier.features import alpha_calc, unmask_track
from diff_predictor.utils import plot_msd_comparisons, plot_individual_msds, plot_particles_in_frame

import scipy.stats as stats
from scipy.optimize import curve_fit
import numpy.ma as ma

In [None]:
# Report the directory the notebook kernel started in.
workbookDir = getcwd()
print('Current Notebook Dir: ' + workbookDir)

# Move two levels up from the notebook directory; every data path below is
# built relative to that location. Re-running this cell climbs two more levels.
chdir(workbookDir)
for _level in range(2):
    chdir('..')

workbookDir = getcwd()
print(f'Using current directory for loading data: {workbookDir}')

In [None]:
#load paths to data

# Age dataset: per-video feature CSVs (file names containing 'feat').
age_feature_path = workbookDir + '/data/raw_data_age/'
age_feature_filelist = [f for f in listdir(age_feature_path) if isfile(join(age_feature_path, f)) and 'feat' in f]
print(len(age_feature_filelist))

# Age dataset: MSD CSVs (file names containing 'msd'). The curve-fit cell further
# down reads age_msd_path / age_msd_filelist, so they must be defined here.
# NOTE(review): assumes the msd files live next to the feature files — confirm the folder.
age_msd_path = age_feature_path
age_msd_filelist = [f for f in listdir(age_msd_path) if isfile(join(age_msd_path, f)) and 'msd' in f]
print(len(age_msd_filelist))

# Region dataset: per-video feature CSVs.
region_dataset_path = workbookDir + '/data/region_feature_folder/'
region_filelist = [f for f in listdir(region_dataset_path) if isfile(join(region_dataset_path, f)) and 'feat' in f]
print(len(region_filelist))

# treatment_dataset_path = workbookDir + '/data/ecm_feature_folder/'
# treatment_filelist = [f for f in listdir(treatment_dataset_path) if isfile(join(treatment_dataset_path, f)) and 'msd' in f]
# print(len(treatment_filelist))

# Analysis of age dataset

In [None]:
# Build one combined table from all age feature files; the list names the age
# groups and 'age' is the name of the label being attached.
# NOTE(review): how labels are matched to files is decided inside
# data_process.generate_fullstats — confirm there.
fstats_tot_age = data_process.generate_fullstats(age_feature_path, age_feature_filelist, ['P14','P35', 'P70'], 'age')

In [None]:
# Columns fed to the models: per-track features followed by their 'Mean ...'
# counterparts. Commented-out entries are deliberately excluded.
feature_list = [
    'alpha', # Fitted anomalous diffusion alpha exponent
    'D_fit', # Fitted anomalous diffusion coefficient
    'kurtosis', # Kurtosis of track
    'asymmetry1', # Asymmetry of trajectory (0 for circular symmetric, 1 for linear)
    'asymmetry2', # Ratio of the smaller to larger principal radius of gyration
    'asymmetry3', # An asymmetric feature that accounts for non-cylindrically symmetric point distributions
    'AR', # Aspect ratio of long and short side of trajectory's minimum bounding rectangle
    'elongation', # Est. of amount of extension of trajectory from centroid
    'boundedness', # How much a particle with Deff is restricted by a circular confinement of radius r
    'fractal_dim', # Measure of how complicated a self similar figure is
    'trappedness', # Probability that a particle with Deff is trapped in a region
    'efficiency', # Ratio of squared net displacement to the sum of squared step lengths
    'straightness', # Ratio of net displacement to the sum of squared step lengths
    'MSD_ratio', # MSD ratio of the track
#     'frames', # Number of frames the track spans
    'Deff1', # Effective diffusion coefficient at 0.33 s
    'Deff2', # Effective diffusion coefficient at 3.3 s
    #'angle_mean', # Mean turning angle which is counterclockwise angle from one frame point to another
    #'angle_mag_mean', # Magnitude of the turning angle mean
    #'angle_var', # Variance of the turning angle
    #'dist_tot', # Total distance of the trajectory
    #'dist_net', # Net distance from first point to last point
    #'progression', # Ratio of the net distance traveled and the total distance
    'Mean alpha', 
    'Mean D_fit', 
    'Mean kurtosis', 
    'Mean asymmetry1', 
    'Mean asymmetry2',
    'Mean asymmetry3', 
    'Mean AR',
    'Mean elongation', 
    'Mean boundedness',
    'Mean fractal_dim', 
    'Mean trappedness', 
    'Mean efficiency',
    'Mean straightness', 
    'Mean MSD_ratio', 
    'Mean Deff1', 
    'Mean Deff2',
    ]

# Name of the label column for the age analysis.
target = 'age'

In [None]:
# Keep the model features plus the label and the bookkeeping columns that the
# binning/splitting steps below rely on.
ecm = fstats_tot_age[feature_list + [target, 'Track_ID', 'X', 'Y']] #dont think i need these rn
print(ecm.shape)

# Removing nan and inf data points. The two Deff2 columns are excluded from the
# check, so rows that are only missing Deff2 are kept.
checked_cols = list(set(feature_list) - set(['Deff2', 'Mean Deff2']))
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
ecm = ecm[~ecm[checked_cols].isin([np.nan, np.inf, -np.inf]).any(axis=1)]
ecm.shape

In [None]:
# Balance the classes, then attach the spatial bin columns used for splitting.
bal_ecm = data_process.bin_data(
    data_process.balance_data(ecm, target, random_state=1),
    resolution=128,
)
label_df = bal_ecm[target]
non_feature_cols = [target, 'Track_ID', 'X', 'Y', 'binx', 'biny', 'bins']
features_df = bal_ecm.drop(columns=non_feature_cols)
features = features_df.columns

# Regular split
seed = 1234
train_split = 0.7
test_split = 0.5
np.random.seed(seed)

le = preprocessing.LabelEncoder()
bal_ecm['encoded_target'] = le.fit_transform(bal_ecm[target])

# Whole bins go to training; the leftover rows are split row-wise into val/test.
unique_bins = bal_ecm.bins.unique()
training_bins = np.random.choice(unique_bins, int(len(unique_bins)*train_split), replace=False)

in_training = bal_ecm.bins.isin(training_bins)
X_train = bal_ecm[in_training]
X_test_val = bal_ecm[~in_training]
X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

y_train, y_test, y_val = (part['encoded_target'] for part in (X_train, X_test, X_val))

dtrain, dtest, dval = (
    xgb.DMatrix(part[features], label=part_labels)
    for part, part_labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
)

In [None]:
def full_preprocess(ecm, balanced=True, y_scramble=False, target=None):
    """Bin, label-encode and split a feature table into train/val/test sets.

    A random state is drawn from NumPy's global RNG on every call, so repeated
    calls give different balancing and splits unless the caller seeds
    ``np.random`` beforehand.

    Parameters
    ----------
    ecm : pandas.DataFrame
        Table holding the feature columns plus ``target``, 'X', 'Y' and
        'Track_ID'.
    balanced : bool
        If True, pass the table through ``data_process.balance_data`` before
        binning; otherwise bin it as-is.
    y_scramble : bool
        If True, randomly permute the labels before encoding them.
    target : str
        Name of the label column. Required despite the ``None`` default.

    Returns
    -------
    tuple
        ``(dtrain, dtest, dval, X_train, X_test, y_train, y_test, le)`` where
        the ``d*`` entries are ``xgb.DMatrix`` objects and ``le`` is the fitted
        ``LabelEncoder``. The validation frame and labels are not returned.

    Raises
    ------
    ValueError
        If ``target`` is not given.
    """
    if target is None:
        raise ValueError("full_preprocess requires the name of the target column")

    rand_state = np.random.randint(1, 2000)
    if balanced:
        bal_ecm = data_process.balance_data(ecm, target, random_state=rand_state)
        sampled_df = data_process.bin_data(bal_ecm.reset_index(drop=True))
    else:
        sampled_df = data_process.bin_data(ecm)

    # Feature columns are fixed before 'encoded_target' is added below.
    features = sampled_df.drop([target, 'X', 'Y', 'binx', 'biny', 'bins', 'Track_ID'], axis=1).columns

    # Work on a plain array so the shuffle is positional. Indexing the Series
    # with the permutation would be a label lookup and fails whenever the
    # index is not exactly 0..n-1 (e.g. the unbalanced, row-filtered table).
    labels = sampled_df[target].to_numpy()
    if y_scramble:
        labels = labels[permutation(len(labels))]
    le = preprocessing.LabelEncoder()
    sampled_df['encoded_target'] = le.fit_transform(labels)

    seed = rand_state
    np.random.seed(seed)
    train_split = 0.7
    test_split = 0.5

    # Whole bins are assigned to training; the remaining rows are then split
    # row-wise into validation and test.
    unique_bins = sampled_df['bins'].unique()
    training_bins = np.random.choice(unique_bins, int(len(unique_bins)*train_split), replace=False)

    in_training = sampled_df['bins'].isin(training_bins)
    X_train = sampled_df[in_training]
    X_test_val = sampled_df[~in_training]
    X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

    y_train = X_train['encoded_target']
    y_test = X_test['encoded_target']
    y_val = X_val['encoded_target']

    dtrain = xgb.DMatrix(X_train[features], label=y_train)
    dtest = xgb.DMatrix(X_test[features], label=y_test)
    dval = xgb.DMatrix(X_val[features], label=y_val)
    return dtrain, dtest, dval, X_train, X_test, y_train, y_test, le



In [None]:
spatial.get_lengths(bal_ecm, X_train, X_test, X_val)

In [None]:
# Starting hyper-parameters for the first training run and for the parameter search below.
# NOTE(review): 'silent' sits alongside 'verbosity'; check whether the installed
# xgboost version still recognises it.
param = {'max_depth': 3,
         'eta': 0.005,
         'min_child_weight': 0,
         'verbosity': 0,
         'objective': 'multi:softprob',
         'num_class': 3,
         'silent': 'True',
         'gamma': 5,
         'subsample': 0.15,
         'colsample_bytree': 0.8,
         'eval_metric': "mlogloss",
#          # GPU integration will cut time in ~half:
#          'gpu_id' : 0,
#          'tree_method': 'gpu_hist',
#          'predictor': 'gpu_predictor'
         }

In [None]:
predxgboost.train(param, dtrain, dtest, dval)

In [None]:
(best_model, best_param, best_eval, best_boost_rounds) = predxgboost.xgb_paramsearch(X_train=X_train, y_train=X_train['encoded_target'], features=features, init_params=param)

In [None]:
best_param = {'max_depth': 4, 'eta': 0.1, 'min_child_weight': 1, 'verbosity': 0, 'objective': 'multi:softprob', 'num_class': 3, 'silent': 'True', 'gamma': 1.0, 'subsample': 0.5, 'colsample_bytree': 0.6, 'eval_metric': 'mlogloss'}

In [None]:
# Hyper-parameters used by the age models below; this assignment replaces the
# best_param set in the previous cell.
# num_class matches the three age groups loaded above (P14, P35, P70). With the
# former value of 5 the booster emits five outputs per sample, which does not
# line up with the three class names / three colours used by the SHAP cells.
best_param = {'max_depth': 4, 'eta': 0.01, 'min_child_weight': 2, 'verbosity': 0, 'objective': 'multi:softprob', 'num_class': 3, 'silent': 'True', 'gamma': 0.2, 'subsample': 0.6, 'colsample_bytree': 0.5, 'eval_metric': 'mlogloss'}

In [None]:
def get_multimodel_averages(target_column, classes, data, params, num_boost_rounds, balanced=True, y_scramble=False, models_to_run=50):
    """Train several boosters on fresh random splits and collect test metrics.

    Each iteration calls ``full_preprocess`` on ``data`` (new random balance /
    split), trains with ``predxgboost.train`` using ``params`` and stores the
    overall and per-class scores.

    Parameters
    ----------
    target_column : str
        Name of the label column in ``data``.
    classes : iterable of str
        Class names to report per-class metrics for; each must be one of the
        label encoder's classes.
    data : pandas.DataFrame
        Feature table handed to ``full_preprocess``.
    params : dict
        XGBoost parameters handed to ``predxgboost.train``.
    num_boost_rounds : int
        Number of boosting rounds per model.
    balanced, y_scramble : bool
        Forwarded to ``full_preprocess``.
    models_to_run : int
        Number of independent models to train.

    Returns
    -------
    dict
        ``'tot_{acc,prec,rec,f1}_vals'`` and ``'<class>_{acc,prec,rec,f1}_vals'``
        map to arrays of length ``models_to_run``; ``'booster_list'``,
        ``'truelabels_list'``, ``'preds_list'`` and ``'xtest_list'`` map to lists
        with one entry per model. The per-class ``acc`` value is the accuracy
        over the test samples whose true label is that class (NaN if there are
        none).

    Raises
    ------
    ValueError
        If a name in ``classes`` is not known to the label encoder.
    """
    metric_names = ('acc', 'prec', 'rec', 'f1')

    results_dict = {f'tot_{metric}_vals': np.zeros(models_to_run) for metric in metric_names}
    for list_key in ('booster_list', 'truelabels_list', 'preds_list', 'xtest_list'):
        results_dict[list_key] = [None] * models_to_run
    for class_name in classes:
        for metric in metric_names:
            results_dict[f'{class_name}_{metric}_vals'] = np.zeros(models_to_run)

    for i in range(models_to_run):
        print(i)

        # Use the arguments, not notebook globals, so the function works for any dataset.
        dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(
            data, balanced=balanced, target=target_column, y_scramble=y_scramble)
        booster, acc, true_label, preds = predxgboost.train(
            params, dtrain, dtest, dval,
            evals=[(dtrain, 'train'), (dval, 'eval')],
            num_round=num_boost_rounds, verbose=False)

        true_label = np.asarray(true_label)
        preds = np.asarray(preds)

        results_dict['tot_acc_vals'][i] = accuracy_score(true_label, preds)
        results_dict['tot_prec_vals'][i] = precision_score(true_label, preds, average='macro')
        results_dict['tot_rec_vals'][i] = recall_score(true_label, preds, average='macro')
        results_dict['tot_f1_vals'][i] = f1_score(true_label, preds, average='macro')

        results_dict['xtest_list'][i] = X_test
        results_dict['booster_list'][i] = booster
        results_dict['truelabels_list'][i] = true_label
        results_dict['preds_list'][i] = preds

        # Pin the label order so position k always refers to encoded class k,
        # even when a class happens to be absent from this test split.
        prec, rec, f1, sup = precision_recall_fscore_support(
            true_label, preds, labels=np.arange(len(le.classes_)))

        for class_name in classes:
            matches = np.flatnonzero(le.classes_ == class_name)
            if matches.size == 0:
                raise ValueError(f'class {class_name!r} is not among the encoded classes {list(le.classes_)}')
            class_code = int(matches[0])
            in_class = true_label == class_code

            if in_class.any():
                class_acc = accuracy_score(true_label[in_class], preds[in_class])
            else:
                class_acc = np.nan
            results_dict[f'{class_name}_acc_vals'][i] = class_acc
            results_dict[f'{class_name}_prec_vals'][i] = prec[class_code]
            results_dict[f'{class_name}_rec_vals'][i] = rec[class_code]
            results_dict[f'{class_name}_f1_vals'][i] = f1[class_code]

    return results_dict

In [None]:
# Repeated train/test runs on the age data with 767 boosting rounds each.
# NOTE(review): the variable name says 100 models but models_to_run=10 here.
age_results_dict_100models = get_multimodel_averages('age', ecm['age'].unique(), ecm, best_param, 767, models_to_run=10, y_scramble=False)

In [None]:
from scipy import stats


In [None]:
# Print the median and inter-quartile range of every per-model metric array.
for metric_name, metric_vals in age_results_dict_100models.items():
    if not isinstance(metric_vals, (np.ndarray, np.generic)):
        continue  # list-valued entries (boosters, labels, ...) are skipped
    print(f'age, {metric_name}')
    print(np.median(metric_vals))
    print(stats.iqr(metric_vals, interpolation='midpoint'))
    print()

In [None]:
# Histogram of every per-model metric array from the age runs.
# The results live in `age_results_dict_100models`; the name `age_results_dict`
# is never assigned in this notebook.
for key, value in age_results_dict_100models.items():
    if isinstance(value, (np.ndarray, np.generic)):
        fig = plt.figure()
        plt.hist(value, bins=25)
        plt.title(f'age, {key}')

In [None]:
age_yscramb_results_dict = get_multimodel_averages('age', ecm['age'].unique(), ecm, best_param, 804, models_to_run=50, y_scramble=True)

In [None]:
for key in age_yscramb_results_dict.keys():
    value = age_yscramb_results_dict[key]
    if isinstance(value, (np.ndarray, np.generic)):
        fig = plt.figure()
        plt.hist(value, bins=25)

In [None]:
# def get_multimodel_averages(target, models_to_run=50, )

#     age_tot_acc_vals = np.zeros(models_to_run)
#     age_tot_prec_vals = np.zeros(models_to_run)
#     age_tot_rec_vals = np.zeros(models_to_run)
#     age_tot_f1_vals = np.zeros(models_to_run)

#     age_booster_list = list(range(models_to_run))
#     age_truelabels_list = list(range(models_to_run))
#     age_preds_list = list(range(models_to_run))
#     age_xtest_list = list(range(models_to_run))

#     P14_acc_vals = np.zeros(models_to_run)
#     P35_acc_vals = np.zeros(models_to_run)
#     P70_acc_vals = np.zeros(models_to_run)

#     P14_prec_vals = np.zeros(models_to_run)
#     P35_prec_vals = np.zeros(models_to_run)
#     P70_prec_vals = np.zeros(models_to_run)

#     P14_rec_vals = np.zeros(models_to_run)
#     P35_rec_vals = np.zeros(models_to_run)
#     P70_rec_vals = np.zeros(models_to_run)

#     P14_f1_vals = np.zeros(models_to_run)
#     P35_f1_vals = np.zeros(models_to_run)
#     P70_f1_vals = np.zeros(models_to_run)

#     P14_sup_vals = np.zeros(models_to_run)
#     P35_sup_vals = np.zeros(models_to_run)
#     P70_sup_vals = np.zeros(models_to_run)

#     for i in range(models_to_run):
#         print(i)


#         dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, balanced=True, target=target)
#         booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=804, verbose=False)

#         preds = np.array(preds)

#         age_tot_acc_vals[i] = accuracy_score(true_label, preds)
#         age_tot_prec_vals[i] = precision_score(true_label, preds, average='macro')
#         age_tot_rec_vals[i] = recall_score(true_label, preds, average='macro')
#         age_tot_f1_vals[i] = f1_score(true_label, preds, average='macro')

#         age_xtest_list[i] = X_test

#         age_booster_list[i] = booster
#         age_truelabels_list[i] = true_label
#         age_preds_list[i] = preds

#         prec, rec, f1, sup = precision_recall_fscore_support(true_label, preds)

#         p14_idx = np.where(le.classes_=='P14')
#         p35_idx = np.where(le.classes_=='P35')
#         p70_idx = np.where(le.classes_=='P70')

#         p14_locs = np.where(true_label==p14_idx[0])
#         p35_locs = np.where(true_label==p35_idx[0])
#         p70_locs = np.where(true_label==p70_idx[0])

#         P14_acc_vals[i] = accuracy_score(true_label[p14_locs], preds[p14_locs])
#         P35_acc_vals[i] = accuracy_score(true_label[p35_locs], preds[p35_locs])
#         P70_acc_vals[i] = accuracy_score(true_label[p70_locs], preds[p70_locs])


#         P14_prec_vals[i] = prec[p14_idx]
#         P35_prec_vals[i] = prec[p35_idx]
#         P70_prec_vals[i] = prec[p70_idx]

#         P14_rec_vals[i] = rec[p14_idx]
#         P35_rec_vals[i] = rec[p35_idx]
#         P70_rec_vals[i] = rec[p70_idx]

#         P14_f1_vals[i] = f1[p14_idx]
#         P35_f1_vals[i] = f1[p35_idx]
#         P70_f1_vals[i] = f1[p70_idx]

#         P14_sup_vals[i] = sup[p14_idx]
#         P35_sup_vals[i] = sup[p35_idx]
#         P70_sup_vals[i] = sup[p70_idx]

In [None]:
# NOTE(review): `p14_locs` is only assigned inside the commented-out loop above,
# so this cell raises NameError on a fresh kernel.
true_label[p14_locs]

In [None]:
# P70_acc_vals used to come from the commented-out loop; read it from the results dict instead.
P70_acc_vals = age_results_dict_100models['P70_acc_vals']
print(P70_acc_vals.mean())
print(P70_acc_vals.std())

In [None]:
# Overall accuracy across the repeated age models (taken from the results dict).
age_tot_acc_vals = age_results_dict_100models['tot_acc_vals']
print(age_tot_acc_vals.mean())
print(age_tot_acc_vals.std())

In [None]:
# Macro precision across the repeated age models (taken from the results dict).
age_tot_prec_vals = age_results_dict_100models['tot_prec_vals']
print(age_tot_prec_vals.mean())
print(age_tot_prec_vals.std())

In [None]:
# Macro recall across the repeated age models (taken from the results dict).
age_tot_rec_vals = age_results_dict_100models['tot_rec_vals']
print(age_tot_rec_vals.mean())
print(age_tot_rec_vals.std())

In [None]:
# Macro F1 across the repeated age models (taken from the results dict).
age_tot_f1_vals = age_results_dict_100models['tot_f1_vals']
print(age_tot_f1_vals.mean())
print(age_tot_f1_vals.std())

In [None]:
# Support (number of true P14 samples) per model. Support is not stored in the
# results dict, so it is recounted from the saved true labels.
# NOTE(review): relies on the global `le` using the same P14 code as the
# encoders fitted inside full_preprocess — confirm.
p14_code = np.flatnonzero(le.classes_ == 'P14')[0]
P14_sup_vals = np.array([
    np.sum(np.asarray(labels) == p14_code)
    for labels in age_results_dict_100models['truelabels_list']
])
print(P14_sup_vals.mean())
print(P14_sup_vals.std())

In [None]:
# Per-class F1 for P35 across the repeated age models (taken from the results dict).
P35_f1_vals = age_results_dict_100models['P35_f1_vals']
print(P35_f1_vals.mean())
print(P35_f1_vals.std())

In [None]:
# Per-class F1 for P70 across the repeated age models (taken from the results dict).
P70_f1_vals = age_results_dict_100models['P70_f1_vals']
print(P70_f1_vals.mean())
print(P70_f1_vals.std())

In [None]:
# Number of models that were run (length of any per-model metric array).
P14_acc_vals = age_results_dict_100models['P14_acc_vals']
len(P14_acc_vals)

In [None]:
# Index of the model whose overall accuracy is the (nearest-rank) median.
age_tot_acc_vals = age_results_dict_100models['tot_acc_vals']
medIdx = list(age_tot_acc_vals).index(np.percentile(age_tot_acc_vals,50,interpolation='nearest'))
medIdx

In [None]:
med = np.argsort(age_yscramb_results_dict['tot_acc_vals'])[len(age_yscramb_results_dict['tot_acc_vals'])//2]
med

In [None]:
# Indices of the worst and best models by overall accuracy.
age_tot_acc_vals = age_results_dict_100models['tot_acc_vals']
acc_order = np.argsort(age_tot_acc_vals)
min_idx = acc_order[0]
max_idx = acc_order[-1]
print(min_idx)
print(max_idx)

In [None]:
age_yscramb_results_dict['booster_list']

In [None]:
booster = age_yscramb_results_dict['booster_list'][med]
class_names = le.classes_




In [None]:
age_yscramb_results_dict['xtest_list'][med]

In [None]:
age_xtest_list

In [None]:
from matplotlib import colors as plt_colors

# Explain the median-accuracy booster on the test rows it was evaluated on.
# Only the model's feature columns are used: the stored test frame also carries
# the label, ID, position, bin and encoded-target columns. The same frame is
# given to the plot so SHAP rows and feature rows line up.
X_test_med = age_yscramb_results_dict['xtest_list'][med][feature_list]

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X_test_med)
c_NT = '#E69F00'
c_HYase = '#56B4E9'
c_ChABC = '#009E73'

colors = [c_NT, c_HYase, c_ChABC]
# Order classes by mean |SHAP| and reorder the palette to match.
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, X_test_med, class_names=np.array(class_names), max_display=15, title='Total SHAP Values', color=cmap)

In [None]:
class_names

In [None]:
# One top-5 summary plot per encoded class. Iterating over le.classes_ keeps the
# loop within the classes that exist (a fixed range(5) overruns a 3-class encoder).
# The feature rows come from the same stored test frame that `shap_values` was computed on.
X_test_med = age_yscramb_results_dict['xtest_list'][med][feature_list]
figsize = (7.5, 5)
for i, class_name in enumerate(le.classes_):
    fig = plt.figure(figsize=figsize)
    ax = fig.gca()
    shap.summary_plot(shap_values[i], X_test_med, max_display=5, show=False)
    ax.set_title(f'Top 5 Features for {class_name}')

In [None]:
# Currently using parameters found in the diff_mode analysis notebook for age
booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=804, verbose=False)


In [None]:
class_names = le.classes_
class_results = classification_report(true_label, preds, digits=4, target_names = class_names)
print(str(class_results))

In [None]:
metrics.confusion_matrix(y_test, preds)
plt.figure(figsize=(12,10))
cm_array = metrics.confusion_matrix(true_label, preds)
df_cm = pd.DataFrame(cm_array, index = class_names, columns = class_names)

sns.set(font_scale=1.4) # for label size
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="YlGnBu")
ax.set(xlabel='Predicted', ylabel='Actual')

plt.show()

In [None]:
# Needed bc of this issue: https://github.com/slundberg/shap/issues/1215

# model_bytearray = booster.save_raw()[4:]
# def myfun(self=None):
#     return model_bytearray

# booster.save_raw = myfun

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X_test[features])

In [None]:
from matplotlib import colors as plt_colors

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X_test[features])
c_NT = '#E69F00'
c_HYase = '#56B4E9'
c_ChABC = '#009E73'

colors = [c_NT, c_HYase, c_ChABC]
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, X_test[features], class_names=np.array(class_names), max_display=15, title='Total SHAP Values', color=cmap)

In [None]:
shap.summary_plot(shap_values, X_test[features], class_names=np.array(class_names), max_display=15, title='Total SHAP Values')

### Outlier detection

In [None]:
scaler = StandardScaler()
features_df_filled = ecm[features].fillna(0)
scaled_df = scaler.fit_transform(features_df_filled)
scaled_df = pd.DataFrame(scaled_df, columns=features)

In [None]:
features_df_filled['alpha']

In [None]:
# Decompose the scaled features with r_pca.R_pca; the later cells measure outliers from S.
# NOTE(review): `r_pca` is not imported anywhere in the visible cells — this
# raises NameError on a fresh kernel unless it is imported elsewhere.
L, S = r_pca.R_pca(scaled_df).fit(max_iter=5000)

In [None]:
# Per-row magnitude of S: the sum of absolute values across its columns.
# Computed in one vectorised reduction instead of a Python loop over rows.
S_array = np.absolute(S.values)
S_magnitudes = S_array.sum(axis=1)

In [None]:
plt.hist(S_magnitudes, bins=5000)

In [None]:
plt.hist((S_magnitudes), bins=5000)
plt.vlines((S_magnitudes).mean()+((S_magnitudes).std()*3), ymin=0, ymax=40, color='r')
plt.vlines((S_magnitudes).mean()-((S_magnitudes).std()*3), ymin=0, ymax=40, color='r')
plt.title('Distrubution of Sparse Matrix Magnitudes')

In [None]:
plt.hist(np.log(S_magnitudes), bins=5000)
plt.vlines(np.log(S_magnitudes).mean()+(np.log(S_magnitudes).std()*4), ymin=0, ymax=40, color='r')
plt.vlines(np.log(S_magnitudes).mean()-(np.log(S_magnitudes).std()*4), ymin=0, ymax=40, color='r')
plt.title('Distrubution of Sparse Matrix Magnitudes')

In [None]:
# NOTE(review): `outlier_df` is first assigned in the next cell, so this cell
# fails when the notebook is run top to bottom.
outlier_df


In [None]:
# Flag rows whose log-magnitude lies more than 5 standard deviations above the mean.
S_mag_log = np.log(S_magnitudes)
upper_outlier_cutoff = S_mag_log.mean()+(S_mag_log.std()*5)

above_cutoff = S_mag_log > upper_outlier_cutoff
outliers = S_mag_log[above_cutoff]
print(len(outliers))

# Row positions in S_magnitudes are used as row positions in `ecm`.
outlier_inds = np.where(above_cutoff)
normal_inds = np.where(S_mag_log <= upper_outlier_cutoff)
outlier_df = ecm.iloc[outlier_inds[0]]
normal_df = ecm.iloc[normal_inds[0]]

In [None]:
bal_ecm = data_process.balance_data(normal_df, target, random_state=1)
bal_ecm = data_process.bin_data(bal_ecm, resolution=128)

In [None]:
# Regular split
seed = 1234
train_split = 0.5
test_split = 0.5
np.random.seed(seed)

le = preprocessing.LabelEncoder()
bal_ecm['encoded_target'] = le.fit_transform(bal_ecm[target])

# Whole bins go to training; the leftover rows are split row-wise into val/test.
unique_bins = bal_ecm.bins.unique()
training_bins = np.random.choice(unique_bins, int(len(unique_bins)*train_split), replace=False)

in_training = bal_ecm.bins.isin(training_bins)
X_train = bal_ecm[in_training]
X_test_val = bal_ecm[~in_training]
X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

y_train, y_test, y_val = (part['encoded_target'] for part in (X_train, X_test, X_val))

dtrain, dtest, dval = (
    xgb.DMatrix(part[features], label=part_labels)
    for part, part_labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
)

In [None]:
# Currently using parameters found in the diff_mode analysis notebook for age
booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=96, verbose=False)


In [None]:
class_names = le.classes_
class_results = classification_report(y_test, preds, digits=4, target_names = class_names)
print(str(class_results))

In [None]:
metrics.confusion_matrix(y_test, preds)
plt.figure(figsize=(12,10))
cm_array = metrics.confusion_matrix(y_test, preds)
df_cm = pd.DataFrame(cm_array, index = class_names, columns = class_names)

sns.set(font_scale=1.4) # for label size
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="YlGnBu")
ax.set(xlabel='Predicted', ylabel='Actual')

plt.show()

In [None]:
from matplotlib import colors as plt_colors

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X_test[features])
c_NT = '#E69F00'
c_HYase = '#56B4E9'
c_ChABC = '#009E73'

colors = [c_NT, c_HYase, c_ChABC]
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, X_test[features], class_names=np.array(class_names), max_display=15, title='Total SHAP Values', color=cmap)

In [None]:
# Encode the outlier labels with the encoder that produced the model's training
# labels, so class codes agree even if some class has no outliers. A freshly
# fitted encoder would renumber the classes in that case.
le_out = le
# .assign builds a new frame instead of writing into a slice of `ecm`.
outlier_df = outlier_df.assign(encoded_target=le_out.transform(outlier_df['age']))

In [None]:

y_outlier = outlier_df['encoded_target']
d_outliers = xgb.DMatrix(outlier_df[features], label=y_outlier)

In [None]:
# Score the booster on the outlier rows: predicted class = first highest-scoring output.
true_label = d_outliers.get_label()
ypred = booster.predict(d_outliers)
preds = list(np.argmax(ypred, axis=1))
acc = accuracy_score(true_label, preds)
print("Accuracy:",acc)

In [None]:
preds
true_label

In [None]:
class_names = le.classes_
class_results = classification_report(y_outlier, preds, digits=4, target_names=le.classes_)
print(str(class_results))

In [None]:
metrics.confusion_matrix(y_outlier, preds)
plt.figure(figsize=(12,10))
cm_array = metrics.confusion_matrix(y_outlier, preds)
df_cm = pd.DataFrame(cm_array, index = class_names, columns = class_names)

sns.set(font_scale=1.4) # for label size
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="YlGnBu")
ax.set(xlabel='Predicted', ylabel='Actual')

plt.show()

In [None]:
from matplotlib import colors as plt_colors

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(outlier_df[features])
c_NT = '#E69F00'
c_HYase = '#56B4E9'
c_ChABC = '#009E73'


colors = [c_NT, c_HYase, c_ChABC]
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, outlier_df[features], class_names=np.array(class_names), max_display=15, title='Total SHAP Values', color=cmap)

In [None]:
def msd_alpha(xpos, alph, dcoef):
    """Power-law MSD model: 4 * dcoef * xpos**alph."""
    return 4*dcoef*(xpos**alph)


# Standard errors of the fitted exponent and coefficient, one entry per track
# whose fit converged.
perr_alph = []
perr_dcoef = []

for msd_filename in age_msd_filelist:
    msd_df = pd.read_csv(age_msd_path + msd_filename)

    for track_id in msd_df['Track_ID'].unique():
        single_track_masked = (
            msd_df.loc[msd_df['Track_ID'] == track_id]
            .sort_values(['Track_ID', 'Frame'], ascending=[1,1])
            .reset_index(drop=True)
        )
        single_track = unmask_track(single_track_masked)

        # Frame is the independent variable and MSD the quantity being modelled;
        # fitting them the other way round yields meaningless parameters.
        xpos = single_track['Frame']
        ypos = single_track['MSDs']

        try:
            popt, pcov = curve_fit(msd_alpha, xpos, ypos)
        except RuntimeError:
            # No converged fit for this track: nothing is appended to either list.
            print('Optimal parameters not found. Print NaN instead.')
            continue

        # One standard deviation of each parameter, from the covariance diagonal.
        perr = np.sqrt(np.diag(pcov))
        perr_alph.append(perr[0])
        perr_dcoef.append(perr[1])
        

In [None]:
# NOTE(review): `perr_alph_arr` is first assigned in the next cell, so this cell
# fails when the notebook is run top to bottom.
perr_alph_arr.min()

In [None]:
# Distribution of the alpha standard errors, restricted to finite values.
# np.isfinite drops both inf and NaN; a `!= np.nan` comparison is always True
# and therefore never removes anything.
perr_alph_arr = np.array(perr_alph)
perr_alph_arr = perr_alph_arr[np.isfinite(perr_alph_arr)]

plt.hist(perr_alph_arr)

In [None]:
# Distribution of the coefficient standard errors, restricted to finite values
# (inf and NaN are both dropped, as for the alpha errors).
perr_dcoef_arr = np.array(perr_dcoef)
perr_dcoef_arr = perr_dcoef_arr[np.isfinite(perr_dcoef_arr)]
plt.hist(perr_dcoef_arr)

# Region data

In [None]:
fstats_tot_region = data_process.generate_fullstats(region_dataset_path, region_filelist, ['cortex', 'striatum', 'hippocampus',], 'region')

In [None]:
# Switch the analysis to the region label and rebuild the working table.
target = 'region'
ecm = fstats_tot_region[feature_list + [target, 'Track_ID', 'X', 'Y']] #dont think i need these rn
print(ecm.shape)

# Removing nan and inf data points. The two Deff2 columns are excluded from the check.
checked_cols = list(set(feature_list) - set(['Deff2', 'Mean Deff2']))
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
ecm = ecm[~ecm[checked_cols].isin([np.nan, np.inf, -np.inf]).any(axis=1)]
ecm.shape

In [None]:
region_results_dict = get_multimodel_averages('region', ecm['region'].unique(), ecm, best_param, 1157)

In [None]:
for key in region_results_dict.keys():
    value = region_results_dict[key]
    if isinstance(value, (np.ndarray, np.generic) ):
        fig = plt.figure()
        plt.hist(value)
        plt.title(f'region, {key}')

In [None]:
for key in region_results_dict.keys():
    value = region_results_dict[key]
    if isinstance(value, (np.ndarray, np.generic) ):
        print(f'region, {key}')
        print(np.median(value))
        print(stats.iqr(value, interpolation= 'midpoint'))
        print()

In [None]:
region_yscramb_results_dict = get_multimodel_averages('region', ecm['region'].unique(), ecm, best_param, 200, True, True)

In [None]:
for key in region_yscramb_results_dict.keys():
    value = region_yscramb_results_dict[key]
    if isinstance(value, (np.ndarray, np.generic) ):
        fig = plt.figure()
        plt.hist(value, bins=25)

In [None]:
# NOTE(review): `results_dict` only exists as a local inside
# get_multimodel_averages; at notebook level this raises NameError, and if it
# did resolve it would overwrite the region results computed above.
region_results_dict = results_dict

In [None]:
plt.hist(region_results_dict['tot_acc_vals'], bins=15)

In [None]:
region_results_dict

In [None]:
bal_ecm = data_process.balance_data(ecm, target, random_state=1)
bal_ecm = data_process.bin_data(bal_ecm, resolution=128)
bal_ecm = bal_ecm.reset_index(drop=True)

In [None]:
label_df = bal_ecm[target].copy()
features_df = bal_ecm.drop([target, 'Track_ID', 'X', 'Y', 'binx', 'biny', 'bins'], axis=1)
features = features_df.columns

In [None]:
# Regular split
seed = 1234
train_split = 0.5
test_split = 0.5
np.random.seed(seed)

le = preprocessing.LabelEncoder()
bal_ecm['encoded_target'] = le.fit_transform(label_df)

# Whole bins go to training; the leftover rows are split row-wise into val/test.
unique_bins = bal_ecm.bins.unique()
training_bins = np.random.choice(unique_bins, int(len(unique_bins)*train_split), replace=False)

in_training = bal_ecm.bins.isin(training_bins)
X_train = bal_ecm[in_training]
X_test_val = bal_ecm[~in_training]
X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

y_train, y_test, y_val = (part['encoded_target'] for part in (X_train, X_test, X_val))

dtrain, dtest, dval = (
    xgb.DMatrix(part[features], label=part_labels)
    for part, part_labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
)

In [None]:
# Hyper-parameters for the single region model trained in the next cell.
# num_class matches the three regions loaded above (cortex, striatum,
# hippocampus). With the former value of 5 the booster emits five outputs per
# sample, which does not line up with the three class names / colours used below.
best_param = {'max_depth': 4,
 'eta': 0.005,
 'min_child_weight': 0,
 'verbosity': 0,
 'objective': 'multi:softprob',
 'num_class': 3,
 'silent': 'True',
 'gamma': 5,
 'subsample': 0.6,
 'colsample_bytree': 0.7,
 'eval_metric': 'mlogloss'}

In [None]:
booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=1157, verbose=False)

In [None]:
class_names = le.classes_
class_results = classification_report(y_test, preds, digits=4, target_names = class_names)
print(str(class_results))

In [None]:
metrics.confusion_matrix(y_test, preds)
plt.figure(figsize=(12,10))
cm_array = metrics.confusion_matrix(y_test, preds)
df_cm = pd.DataFrame(cm_array, index = class_names, columns = class_names)

sns.set(font_scale=1.4) # for label size
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap="YlGnBu")
# ax.set(xlabel='Predicted', ylabel='Actual')

plt.show()

In [None]:
# SHAP summary plot for the booster trained above, evaluated on the test rows.
from matplotlib import colors as plt_colors

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X_test[features])
# Hex colours named after three treatment groups.
c_NT = '#E69F00'
c_HYase = '#56B4E9'
c_ChABC = '#009E73'

colors = [c_NT, c_HYase, c_ChABC]
# Order the entries of shap_values by decreasing mean |SHAP|.
# NOTE(review): assumes shap_values is indexable per class - confirm for the installed shap version.
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
# NOTE(review): `colors` has 3 entries but best_param sets num_class to 5; if shap_values
# has more than 3 entries this indexing raises IndexError - confirm.
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
# NOTE(review): `cmap` is not passed to summary_plot here, so the default colours are used.
shap.summary_plot(shap_values, X_test[features], class_names=np.array(class_names), max_display=15, title='Total SHAP Values')


## Outlier detection

In [None]:
# Standardise the feature columns of ecm (missing values replaced by 0 first)
# and wrap the result back into a DataFrame with the feature names.
scaler = StandardScaler()
features_df_filled = ecm[features].fillna(0)
scaled_df = pd.DataFrame(
    scaler.fit_transform(features_df_filled),
    columns=features,
)

In [None]:
# Fit r_pca.R_pca on the standardised features (at most 5000 iterations).
# The fit returns two results, bound to L and S.
# NOTE(review): presumably L is the low-rank part and S the sparse part of a
# robust-PCA decomposition - confirm against the r_pca module. `r_pca` must be
# imported by an earlier cell.
L, S = r_pca.R_pca(scaled_df).fit(max_iter=5000)

In [None]:
# Per-row magnitude of S: the sum of absolute values across each row.
# Computed as a single vectorised row sum instead of a Python loop; the cast
# keeps the float dtype the original np.zeros buffer had.
S_array = np.absolute(S.values)
S_magnitudes = S_array.sum(axis=1).astype(float)

In [None]:
# Histogram of the raw (untransformed) S_magnitudes values, using 5000 bins.
plt.hist(S_magnitudes, bins=5000)

In [None]:
# Histogram of log(S_magnitudes) with red markers at mean +/- 3 standard deviations.
# The log is taken once and reused (the original recomputed it five times).
with np.errstate(divide='ignore'):
    log_magnitudes = np.log(S_magnitudes)
# A row with zero magnitude gives log(0) = -inf, which would make the mean/std
# non-finite and break the histogram range; such rows are left out of the plot.
log_magnitudes = log_magnitudes[np.isfinite(log_magnitudes)]
log_mean = log_magnitudes.mean()
log_std = log_magnitudes.std()

plt.hist(log_magnitudes, bins=5000)
plt.vlines(log_mean + (log_std * 3), ymin=0, ymax=40, color='r')
plt.vlines(log_mean - (log_std * 3), ymin=0, ymax=40, color='r')
plt.title('Distribution of Sparse Matrix Magnitudes')

In [None]:
# Split ecm into outlier / normal rows using the log of S_magnitudes:
# a row is an outlier when its log magnitude exceeds mean + 5 standard deviations.
if len(S_magnitudes) != len(ecm):
    # Positional indices derived from S_magnitudes are applied to ecm below,
    # so the two must describe the same rows.
    raise ValueError(
        f"S_magnitudes has {len(S_magnitudes)} rows but ecm has {len(ecm)}; "
        "recompute S_magnitudes for the current ecm"
    )

with np.errstate(divide='ignore'):
    S_mag_log = np.log(S_magnitudes)
# Zero magnitudes give -inf; keep them out of the mean/std so the cutoff stays
# finite. They still compare <= cutoff and therefore land in normal_df.
finite_log = S_mag_log[np.isfinite(S_mag_log)]
upper_outlier_cutoff = finite_log.mean()+(finite_log.std()*5)

outliers = S_mag_log[S_mag_log > upper_outlier_cutoff]
print(len(outliers))
outlier_inds = np.where(S_mag_log > upper_outlier_cutoff)
normal_inds = np.where(S_mag_log <= upper_outlier_cutoff)
# Copies, because later cells add columns to these frames.
outlier_df = ecm.iloc[outlier_inds[0]].copy()
normal_df = ecm.iloc[normal_inds[0]].copy()

In [None]:
# Rebuild bal_ecm from the non-outlier rows: balance on the target column,
# then add bin columns via data_process.bin_data (resolution 128).
balanced_normal = data_process.balance_data(normal_df, target, random_state=1)
bal_ecm = data_process.bin_data(balanced_normal, resolution=128)

In [None]:
# Regular split
# Half of the distinct `bins` values are drawn for training; all rows in those
# bins form the training set. The remaining rows are divided at random between
# validation and test.

seed = 1234
train_split = 0.5
test_split = 0.5
np.random.seed(seed)  # makes the bin draw below reproducible

le = preprocessing.LabelEncoder()
bal_ecm['encoded_target'] = le.fit_transform(bal_ecm[target])

unique_bins = bal_ecm.bins.unique()
n_training_bins = int(len(unique_bins) * train_split)
training_bins = np.random.choice(unique_bins, n_training_bins, replace=False)

in_training = bal_ecm.bins.isin(training_bins)
X_train = bal_ecm[in_training]
X_test_val = bal_ecm[~in_training]
X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

y_train, y_val, y_test = (part['encoded_target'] for part in (X_train, X_val, X_test))

# DMatrix inputs contain only the feature columns; encoded labels ride along.
dtrain, dval, dtest = (
    xgb.DMatrix(part[features], label=part['encoded_target'])
    for part in (X_train, X_val, X_test)
)

In [None]:
# Retrain on the outlier-free split.
# NOTE(review): best_param and best_boost_rounds are whatever earlier cells
# left in the kernel - confirm they are the values intended for this dataset.
eval_sets = [(dtrain, 'train'), (dval, 'eval')]
booster, acc, true_label, preds = predxgboost.train(
    best_param,
    dtrain,
    dtest,
    dval,
    evals=eval_sets,
    num_round=best_boost_rounds,
    verbose=False,
)

In [None]:
# Per-class precision/recall/F1 on the test split, labelled with the
# encoder's original class names.
class_names = le.classes_
class_results = classification_report(
    y_test,
    preds,
    target_names=class_names,
    digits=4,
)
print(class_results)

In [None]:
# Encode the outlier labels with `le`, the encoder already fitted on the
# training data, so the integer codes match the class indices the booster was
# trained on. Fitting a fresh encoder on the outlier subset would shift the
# codes whenever a class is missing from that subset.
le_out = le  # kept as an alias in case a later cell still refers to le_out
# assign() returns a new frame, so no column is written into a slice of ecm.
outlier_df = outlier_df.assign(encoded_target=le.transform(outlier_df[target]))

In [None]:

# Package the outlier rows (feature columns + encoded labels) for the booster.
y_outlier = outlier_df['encoded_target']
outlier_features = outlier_df[features]
d_outliers = xgb.DMatrix(outlier_features, label=y_outlier)

In [None]:
# Score the trained booster on the outlier rows.
true_label = d_outliers.get_label()
ypred = booster.predict(d_outliers)
# Each row of ypred holds one score per class; the predicted class is the
# index of the first maximum, as in the original per-row search.
preds = np.argmax(ypred, axis=1)
acc = accuracy_score(true_label, preds)
print("Accuracy:",acc)

In [None]:
# Per-class report for the outlier rows.
class_names = le.classes_
# The outlier subset may not contain every class. Passing `labels` explicitly
# keeps the report aligned with target_names instead of raising when the
# number of classes present differs from len(class_names).
class_results = classification_report(
    true_label,
    preds,
    digits=4,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)
print(class_results)

# Treatment Data

In [None]:
# Build the combined treatment table with data_process.generate_fullstats.
# treatment_dataset_path / treatment_filelist must have been defined by an earlier cell.
treatment_classes = ['NT', 'ChABC']
fstats_tot_treatment = data_process.generate_fullstats(
    treatment_dataset_path,
    treatment_filelist,
    treatment_classes,
    'treatment',
)

In [None]:
# Select the modelling columns for the treatment dataset and drop invalid rows.
target = 'treatment'
# Features plus the target and the track identifier / coordinate columns.
ecm = fstats_tot_treatment[feature_list + [target, 'Track_ID', 'X', 'Y']]
print(ecm.shape)
# Remove rows with NaN or +/-inf in any feature; 'Deff2' and 'Mean Deff2' are
# excluded from this check.
checked_cols = [col for col in feature_list if col not in ('Deff2', 'Mean Deff2')]
has_invalid = ecm[checked_cols].isin([np.nan, np.inf, -np.inf])
# `axis` is passed by keyword: the positional form `.any(1)` is rejected by pandas >= 2.0.
ecm = ecm[~has_invalid.any(axis=1)]
ecm.shape

In [None]:
# XGBoost parameters and boosting-round count for the two-class treatment model.
# NOTE(review): 'silent' looks like a legacy flag that newer XGBoost releases
# replace with 'verbosity' (already set here) - confirm it is still needed.
best_param = {
    'max_depth': 5,
    'eta': 0.05,
    'min_child_weight': 0,
    'verbosity': 0,
    'objective': 'multi:softprob',
    'num_class': 2,
    'silent': 'True',
    'gamma': 2.0,
    'subsample': 0.15,
    'colsample_bytree': 0.8,
    'eval_metric': 'mlogloss',
}
best_boost_rounds = 57

In [None]:
# Run get_multimodel_averages on the treatment data.
# The round count comes from best_boost_rounds rather than a repeated literal,
# so it cannot drift from the value set with best_param.
treatment_results_dict = get_multimodel_averages(
    'treatment',
    ecm['treatment'].unique(),
    ecm,
    best_param,
    best_boost_rounds,
)

In [None]:
# For every array-valued (or NumPy scalar) entry, print its key, its median
# and its interquartile range (midpoint interpolation), then a blank line.
for key, value in treatment_results_dict.items():
    if not isinstance(value, (np.ndarray, np.generic)):
        continue
    print(key)
    print(np.median(value))
    print(stats.iqr(value, interpolation='midpoint'))
    print()

In [None]:
# NOTE(review): this call takes the same arguments as the one that produces
# treatment_results_dict - confirm whether a separate "binary" run is intended.
# The round count comes from best_boost_rounds rather than a repeated literal.
treatment_results_dict_binary = get_multimodel_averages(
    'treatment',
    ecm['treatment'].unique(),
    ecm,
    best_param,
    best_boost_rounds,
)

In [None]:
# Same helper with three extra positional arguments (True, True, 50).
# NOTE(review): judging by the result name these presumably enable a
# label-scrambling control run - confirm against get_multimodel_averages.
# The round count comes from best_boost_rounds rather than a repeated literal.
treatment_yscramb_results_dict = get_multimodel_averages(
    'treatment',
    ecm['treatment'].unique(),
    ecm,
    best_param,
    best_boost_rounds,
    True,
    True,
    50,
)

In [None]:
# Show which entries treatment_results_dict contains.
treatment_results_dict.keys()

In [None]:
# For every array-valued (or NumPy scalar) entry, print its key, its median
# and its interquartile range (midpoint interpolation), then a blank line.
for key, value in treatment_yscramb_results_dict.items():
    if not isinstance(value, (np.ndarray, np.generic)):
        continue
    print(key)
    print(np.median(value))
    print(stats.iqr(value, interpolation='midpoint'))
    print()

In [None]:
# One histogram per array-valued (or NumPy scalar) entry of the results.
# Each figure is titled with its key; untitled figures could not be told apart.
for key, value in treatment_yscramb_results_dict.items():
    if isinstance(value, (np.ndarray, np.generic)):
        fig = plt.figure()
        plt.hist(value, bins=25)
        plt.title(str(key))

In [None]:
# Balance the treatment data on the target, add bin columns (resolution 128),
# then separate the labels from the predictor columns.
balanced_ecm = data_process.balance_data(ecm, target, random_state=1)
bal_ecm = data_process.bin_data(balanced_ecm, resolution=128)
non_feature_cols = [target, 'Track_ID', 'X', 'Y', 'binx', 'biny', 'bins']
label_df = bal_ecm[target]
features_df = bal_ecm.drop(columns=non_feature_cols)
features = features_df.columns

# Regular split
# Half of the distinct `bins` values are drawn for training; all rows in those
# bins form the training set. The remaining rows are divided at random between
# validation and test.

seed = 1234
train_split = 0.5
test_split = 0.5
np.random.seed(seed)  # makes the bin draw below reproducible

le = preprocessing.LabelEncoder()
bal_ecm['encoded_target'] = le.fit_transform(bal_ecm[target])

unique_bins = bal_ecm.bins.unique()
n_training_bins = int(len(unique_bins) * train_split)
training_bins = np.random.choice(unique_bins, n_training_bins, replace=False)

in_training = bal_ecm.bins.isin(training_bins)
X_train = bal_ecm[in_training]
X_test_val = bal_ecm[~in_training]
X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

y_train, y_val, y_test = (part['encoded_target'] for part in (X_train, X_val, X_test))

# DMatrix inputs contain only the feature columns; encoded labels ride along.
dtrain, dval, dtest = (
    xgb.DMatrix(part[features], label=part['encoded_target'])
    for part in (X_train, X_val, X_test)
)

In [None]:
# Row counts of the train, test and validation splits, in that order.
for split_frame in (X_train, X_test, X_val):
    print(len(split_frame))

In [None]:
# Train the treatment model with predxgboost.train; the call yields four
# values, bound here to the booster, an accuracy figure, the true labels and
# the predictions.
eval_sets = [(dtrain, 'train'), (dval, 'eval')]
booster, acc, true_label, preds = predxgboost.train(
    best_param,
    dtrain,
    dtest,
    dval,
    evals=eval_sets,
    num_round=best_boost_rounds,
    verbose=False,
)

In [None]:
# Per-class precision/recall/F1 on the test split, labelled with the
# encoder's original class names.
class_names = le.classes_
class_results = classification_report(
    y_test,
    preds,
    target_names=class_names,
    digits=4,
)
print(class_results)

In [None]:
# Confusion-matrix heatmap for the treatment test split.
# The matrix is computed once (the original computed it twice and discarded
# the first result).
plt.figure(figsize=(12,10))
cm_array = metrics.confusion_matrix(y_test, preds)
df_cm = pd.DataFrame(cm_array, index = class_names, columns = class_names)

sns.set(font_scale=1.4) # for label size
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="YlGnBu")
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted', ylabel='Actual')

plt.show()

In [None]:
from matplotlib import colors as plt_colors

# Hex colours for the two treatment groups.
c_NT = '#fc8d59'
c_ChABC = '#91bfdb'
colors = [c_ChABC, c_NT]

# SHAP values of the trained booster on the test rows.
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X_test[features])

# Rank the entries of shap_values by decreasing mean |SHAP| and reorder the
# palette to match, so the colormap follows that ranking.
mean_abs_shap = [np.abs(shap_values[i]).mean() for i in range(len(shap_values))]
class_inds = np.argsort(-np.array(mean_abs_shap))
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])

shap.summary_plot(
    shap_values,
    X_test[features],
    class_names=np.array(class_names),
    max_display=15,
    title='Total SHAP Values',
    color=cmap,
)


In [None]:
# Recompute the per-row magnitudes for the CURRENT `ecm` before plotting.
# Without this, S_magnitudes still describes the dataset analysed in the
# earlier outlier section, and the indices derived from it would be applied
# to unrelated rows of the treatment data.
scaled_df = pd.DataFrame(
    StandardScaler().fit_transform(ecm[features].fillna(0)),
    columns=features,
)
L, S = r_pca.R_pca(scaled_df).fit(max_iter=5000)
# Per-row magnitude of S: the sum of absolute values across each row.
S_magnitudes = np.absolute(S.values).sum(axis=1).astype(float)

# Histogram of log(S_magnitudes) with red markers at mean +/- 3 standard deviations.
with np.errstate(divide='ignore'):
    log_magnitudes = np.log(S_magnitudes)
# A row with zero magnitude gives log(0) = -inf, which would make the mean/std
# non-finite and break the histogram range; such rows are left out of the plot.
log_magnitudes = log_magnitudes[np.isfinite(log_magnitudes)]
log_mean = log_magnitudes.mean()
log_std = log_magnitudes.std()

plt.hist(log_magnitudes, bins=5000)
plt.vlines(log_mean + (log_std * 3), ymin=0, ymax=40, color='r')
plt.vlines(log_mean - (log_std * 3), ymin=0, ymax=40, color='r')
plt.title('Distribution of Sparse Matrix Magnitudes')

In [None]:
# Split ecm into outlier / normal rows using the log of S_magnitudes:
# a row is an outlier when its log magnitude exceeds mean + 5.7 standard deviations.
if len(S_magnitudes) != len(ecm):
    # Positional indices derived from S_magnitudes are applied to ecm below,
    # so the two must describe the same rows.
    raise ValueError(
        f"S_magnitudes has {len(S_magnitudes)} rows but ecm has {len(ecm)}; "
        "recompute S_magnitudes for the current ecm"
    )

with np.errstate(divide='ignore'):
    S_mag_log = np.log(S_magnitudes)
# Zero magnitudes give -inf; keep them out of the mean/std so the cutoff stays
# finite. They still compare <= cutoff and therefore land in normal_df.
finite_log = S_mag_log[np.isfinite(S_mag_log)]
upper_outlier_cutoff = finite_log.mean()+(finite_log.std()*5.7)

outliers = S_mag_log[S_mag_log > upper_outlier_cutoff]
print(len(outliers))
outlier_inds = np.where(S_mag_log > upper_outlier_cutoff)
normal_inds = np.where(S_mag_log <= upper_outlier_cutoff)
# Copies, because later cells add columns to these frames.
outlier_df = ecm.iloc[outlier_inds[0]].copy()
normal_df = ecm.iloc[normal_inds[0]].copy()

In [None]:
# Rebuild bal_ecm from the non-outlier rows: balance on the target column,
# then add bin columns via data_process.bin_data (resolution 128).
balanced_normal = data_process.balance_data(normal_df, target, random_state=1)
bal_ecm = data_process.bin_data(balanced_normal, resolution=128)

In [None]:
# Regular split
# Half of the distinct `bins` values are drawn for training; all rows in those
# bins form the training set. The remaining rows are divided at random between
# validation and test.

seed = 1234
train_split = 0.5
test_split = 0.5
np.random.seed(seed)  # makes the bin draw below reproducible

le = preprocessing.LabelEncoder()
bal_ecm['encoded_target'] = le.fit_transform(bal_ecm[target])

unique_bins = bal_ecm.bins.unique()
n_training_bins = int(len(unique_bins) * train_split)
training_bins = np.random.choice(unique_bins, n_training_bins, replace=False)

in_training = bal_ecm.bins.isin(training_bins)
X_train = bal_ecm[in_training]
X_test_val = bal_ecm[~in_training]
X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

y_train, y_val, y_test = (part['encoded_target'] for part in (X_train, X_val, X_test))

# DMatrix inputs contain only the feature columns; encoded labels ride along.
dtrain, dval, dtest = (
    xgb.DMatrix(part[features], label=part['encoded_target'])
    for part in (X_train, X_val, X_test)
)

In [None]:
# Encode the outlier labels with the existing `le` (fitted on the training
# data) instead of replacing it with an encoder fitted on the outlier subset.
# Refitting would overwrite `le` and could assign different integer codes than
# the ones the booster is trained on whenever a class is missing from the outliers.
# assign() returns a new frame, so no column is written into a slice of ecm.
outlier_df = outlier_df.assign(encoded_target=le.transform(outlier_df[target]))

In [None]:

# Package the outlier rows (feature columns + encoded labels) for the booster.
y_outlier = outlier_df['encoded_target']
outlier_features = outlier_df[features]
d_outliers = xgb.DMatrix(outlier_features, label=y_outlier)

In [None]:
# Retrain on the outlier-free treatment split; the call yields four values,
# bound here to the booster, an accuracy figure, the true labels and the predictions.
eval_sets = [(dtrain, 'train'), (dval, 'eval')]
booster, acc, true_label, preds = predxgboost.train(
    best_param,
    dtrain,
    dtest,
    dval,
    evals=eval_sets,
    num_round=best_boost_rounds,
    verbose=False,
)

In [None]:
# Score the trained booster on the outlier rows.
true_label = d_outliers.get_label()
ypred = booster.predict(d_outliers)
# Each row of ypred holds one score per class; the predicted class is the
# index of the first maximum, as in the original per-row search.
preds = np.argmax(ypred, axis=1)
acc = accuracy_score(true_label, preds)
print("Accuracy:",acc)

In [None]:
# Display the current value of class_names.
# NOTE(review): at this point class_names was last assigned before the most
# recent split/encoding cells ran - confirm it still reflects the active encoder.
class_names

In [None]:
# Per-class report for the outlier rows.
class_names = le.classes_
# The outlier subset may not contain every class. Passing `labels` explicitly
# keeps the report aligned with target_names instead of raising when the
# number of classes present differs from len(class_names).
class_results = classification_report(
    true_label,
    preds,
    digits=4,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)
print(class_results)