In [None]:
from diff_predictor import data_process, predxgboost
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 

from os import listdir, getcwd, chdir
from os.path import isfile, join
import os

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import operator
import xgboost as xgb
import shap
from xgboost.training import CVPack
from xgboost import callback
from xgboost.core import CallbackEnv
from xgboost.core import EarlyStopException
from xgboost.core import STRING_TYPES

In [None]:
def ecdf(data):
    """Return the empirical CDF of ``data`` as a pair ``(x, y)``.

    ``x`` is ``data`` sorted ascending; ``y`` holds the matching cumulative
    fractions ``1/n, 2/n, ..., 1`` with ``n = len(data)``.
    """
    n_points = len(data)
    cumulative_fraction = np.arange(1, n_points + 1) / n_points
    sorted_values = np.sort(data)
    return sorted_values, cumulative_fraction

In [None]:
# Make the parent of the notebook directory the working directory and remember
# it as `workbookDir` (used below to build all data paths).
# The notebook directory is captured only the first time this cell runs, so
# re-running the cell no longer climbs one more directory level each time.
if '_notebook_dir' not in globals():
    _notebook_dir = getcwd()

print('Current Notebook Dir: ' + _notebook_dir)
chdir(os.path.dirname(_notebook_dir))  # Go up one, always relative to the notebook dir
print(f'Using current directory for loading data: {getcwd()}')
workbookDir = getcwd()

In [None]:
def _list_files(folder, token=None):
    """Return the names of regular files in ``folder``.

    When ``token`` is given, only names containing that substring are kept.
    """
    return [f for f in listdir(folder)
            if isfile(join(folder, f)) and (token is None or token in f)]


# Paths keep their trailing '/' because later cells build file paths by plain
# string concatenation (path + file name).
age_feature_path = workbookDir + '/data/raw_data_age/'
age_feature_filelist = _list_files(age_feature_path, 'feat')
print(len(age_feature_filelist))

# NOTE(review): this used to be workbookDir + '/raw_data_age/' (no 'data/' segment),
# unlike every other dataset path in this cell. Assumed to be a typo; confirm the
# MSD CSVs live in the same folder as the feature CSVs.
age_msd_path = workbookDir + '/data/raw_data_age/'
age_msd_filelist = _list_files(age_msd_path, 'msd')
print(len(age_msd_filelist))

region_dataset_path = workbookDir + '/data/region_feature_folder/'
region_filelist = _list_files(region_dataset_path, 'feat')
print(len(region_filelist))

# The treatment folder is listed without a name filter.
treatment_dataset_path = workbookDir + '/data/ecm_feature_folder/'
treatment_filelist = _list_files(treatment_dataset_path)
print(len(treatment_filelist))

In [None]:
# Inspect the file names found in the treatment (ECM) feature folder.
treatment_filelist

# Deep dive into Age Dataset

In [None]:
# Build the combined age feature table with the package helper.
# NOTE(review): the following cell rebuilds `fstats_tot_age` by hand and overwrites
# this result; confirm which of the two is meant to be kept.
fstats_tot_age = data_process.generate_fullstats(age_feature_path, age_feature_filelist, ['P14', 'P35', 'P70'], 'age')

In [None]:
# Manually assemble the age feature table: one feature CSV (plus its matching MSD
# CSV) per file, tagged with the age label parsed from the file name.
targets = ['P14', 'P35', 'P70']
total_particles_count = 0    # rows in all appended feature tables
subset_particles_count = 0   # rows whose track has a non-null MSD at frame 16

# NOTE(review): frame_counter is never incremented, so the Track_ID offsets below
# are no-ops and Track_IDs from different files can collide. Confirm whether a
# running offset was intended.
frame_counter = 0

# Collect per-file tables and concatenate once at the end (DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated table on every call).
age_feature_frames = []

for file_name in age_feature_filelist:
    if not any(substring in file_name for substring in targets):
        continue

    file_indicator = file_name[8:] # grabs the unique part of the file name only
    msd_filename = 'msd' + file_indicator
    age_label = file_name[9:12]

    features_df = pd.read_csv(age_feature_path + file_name)
    msd_df = pd.read_csv(age_msd_path + msd_filename)

    msd_df['Track_ID'] = msd_df['Track_ID'] + frame_counter
    # Identify particles that are in frame for at least one second (tracks with a
    # non-null MSD at frame 16).
    # NOTE(review): this subset is only counted; the unfiltered features_df is what
    # gets appended. Confirm whether the filtered table should be used instead.
    cutoff_df = msd_df[(msd_df['Frame'] == 16) & (msd_df['MSDs'].notna())]
    msd_df = msd_df[msd_df['Track_ID'].isin(set(cutoff_df['Track_ID'].unique()))]
    features_subset_df = features_df[features_df['Track_ID'].isin(set(msd_df['Track_ID'].unique()))]

    features_df['age'] = age_label
    features_df['Track_ID'] = features_df['Track_ID'] + frame_counter

    age_feature_frames.append(features_df)
    total_particles_count += len(features_df)
    subset_particles_count += len(features_subset_df)

# pd.concat raises on an empty list, so fall back to an empty frame as before.
fstats_tot_age = pd.concat(age_feature_frames) if age_feature_frames else pd.DataFrame()

In [None]:
# Columns used as model inputs: per-track features followed by their 'Mean ...'
# counterparts. Commented-out entries are currently excluded.
feature_list = [
    'alpha', # Fitted anomalous diffusion alpha exponent
    'D_fit', # Fitted anomalous diffusion coefficient
    'kurtosis', # Kurtosis of track
    'asymmetry1', # Asymmetry of trajectory (0 for circular symmetric, 1 for linear)
    'asymmetry2', # Ratio of the smaller to larger principal radius of gyration
    'asymmetry3', # An asymmetric feature that accounts for non-cylindrically symmetric pt distributions
    'AR', # Aspect ratio of long and short side of trajectory's minimum bounding rectangle
    'elongation', # Est. of amount of extension of trajectory from centroid
    'boundedness', # How much a particle with Deff is restricted by a circular confinement of radius r
    'fractal_dim', # Measure of how complicated a self similar figure is
    'trappedness', # Probability that a particle with Deff is trapped in a region
    'efficiency', # Ratio of squared net displacement to the sum of squared step lengths
    'straightness', # Ratio of net displacement to the sum of squared step lengths
    'MSD_ratio', # MSD ratio of the track
#     'frames', # Number of frames the track spans
    'Deff1', # Effective diffusion coefficient at 0.33 s
    'Deff2', # Effective diffusion coefficient at 3.3 s
    #'angle_mean', # Mean turning angle which is counterclockwise angle from one frame point to another
    #'angle_mag_mean', # Magnitude of the turning angle mean
    #'angle_var', # Variance of the turning angle
    #'dist_tot', # Total distance of the trajectory
    #'dist_net', # Net distance from first point to last point
    #'progression', # Ratio of the net distance traveled and the total distance
    'Mean alpha', 
    'Mean D_fit', 
    'Mean kurtosis', 
    'Mean asymmetry1', 
    'Mean asymmetry2',
    'Mean asymmetry3', 
    'Mean AR',
    'Mean elongation', 
    'Mean boundedness',
    'Mean fractal_dim', 
    'Mean trappedness', 
    'Mean efficiency',
    'Mean straightness', 
    'Mean MSD_ratio', 
    'Mean Deff1', 
    'Mean Deff2',
    ]

# Name of the label column used by the age analysis cells.
target = 'age'

In [None]:
# Display the assembled age feature table.
fstats_tot_age

In [None]:
# Keep the model features plus the label and bookkeeping columns.
ecm = fstats_tot_age[feature_list + [target, 'Track_ID', 'X', 'Y']] #dont think i need these rn
print(ecm.shape)

# Removing nan and inf data points. 'Deff2' and 'Mean Deff2' are exempt from the
# check, so rows that are only missing those two columns are kept.
required_features = [name for name in feature_list if name not in ('Deff2', 'Mean Deff2')]
# `axis` must be passed by keyword: positional `.any(1)` fails on pandas >= 2.0.
has_bad_value = ecm[required_features].isin([np.nan, np.inf, -np.inf]).any(axis=1)
ecm = ecm[~has_bad_value]
ecm.shape

In [None]:
# List the class labels present in the cleaned table.
ecm[target].unique()

## Here are the cutoffs for different motion types

### alpha > 1.25: Directed Motion
### 1.25 ≥ alpha ≥ 0.75: Normal Diffusion
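### 0.75 > alpha: Anomalous Diffusion/Confined Diffusion

**Note:** several cells below classify tracks with narrower cutoffs instead (alpha > 1.1 superdiffusive, 0.9 ≤ alpha ≤ 1.1 Brownian, alpha < 0.9 subdiffusive).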

In [None]:
# One bar chart per class showing the fraction of tracks in each diffusion mode,
# classified by the fitted alpha exponent.
age_classes = ecm[target].unique()
# One panel per class actually present (was hard-coded to 3); squeeze=False keeps
# `axes` indexable even when there is a single class.
fig, axes = plt.subplots(1, len(age_classes), sharey=True, figsize=(8,6), squeeze=False)
axes = axes.ravel()

# Four bars are drawn per panel, so four labels are needed. The 'immobilized'
# entry was missing, which left the last bar unlabeled.
labels = ['superdiffusive', 'brownian', 'subdiffusive', 'immobilized']

for i, unique_class in enumerate(age_classes):
    df = ecm[ecm[target] == unique_class]

    directed_df = df[df['alpha'] > 1.1]
    normal_df = df[(df['alpha'] <= 1.1) & (df['alpha'] >= 0.9)]
    constrained_df = df[(df['alpha'] < 0.9) & (df['alpha'] > 0.1)]
    immobilized_df = df[(df['alpha'] <= 0.1)]

    # Fractions in the same order as `labels`.
    percentages = [
        len(directed_df)/len(df),
        len(normal_df)/len(df),
        len(constrained_df)/len(df),
        len(immobilized_df)/len(df),
    ]

    positions = np.arange(len(percentages))
    axes[i].bar(positions, percentages)
    # Fix the tick positions first, then attach the labels to them.
    axes[i].set_xticks(positions)
    axes[i].set_xticklabels(labels)
    axes[i].set_title(unique_class)

for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation=45)

In [None]:
# Stacked bar chart: per class, the share of tracks that are subdiffusive
# (alpha < 0.9), Brownian (0.9 <= alpha <= 1.1) and superdiffusive (alpha > 1.1).
fig = plt.figure(figsize=(4,8))

labels = ecm[target].unique()
labels.sort()

n_labels = len(labels)
directed_percent, normal_percent, constrained_percent, immobilized_percent = (
    np.zeros(n_labels) for _ in range(4)
)  # immobilized_percent stays all zeros; it only enters the stacking offsets

for i, unique_class in enumerate(labels):
    print(unique_class)
    df = ecm[ecm[target] == unique_class]
    n_tracks = len(df)
    alpha_values = df['alpha']

    directed_df = df[alpha_values > 1.1]
    directed_percent[i] = len(directed_df) / n_tracks
    print(directed_percent[i])

    normal_df = df[(alpha_values >= 0.9) & (alpha_values <= 1.1)]
    normal_percent[i] = len(normal_df) / n_tracks
    print(normal_percent[i])

    constrained_df = df[alpha_values < 0.9]
    constrained_percent[i] = len(constrained_df) / n_tracks
    print(constrained_percent[i])
    print()

bar_w = 0.5
bold_font = dict(fontsize=15, fontname='Arial', fontweight='bold')
stack_base = constrained_percent + immobilized_percent

plt.bar(labels, constrained_percent, label='Subdiffusive', width=bar_w, color='#b7a57a')
plt.bar(labels, normal_percent, bottom=stack_base, color='#999999', label='Brownian', width=bar_w)
plt.bar(labels, directed_percent, bottom=stack_base + normal_percent, color='#4b2e83', label='Superdiffusive', width=bar_w)
plt.legend(loc='lower right', fontsize=14)
plt.title('Percentage of Diffusion Modes per Age', **bold_font)
plt.ylim([0,1])
plt.xticks(**bold_font)
plt.yticks(**bold_font)

In [None]:
# ECDF of alpha for the P70 group.
# `p70_ecm` was not defined anywhere in this notebook (NameError on a fresh
# kernel), so it is derived here from the cleaned age table.
# NOTE(review): assumes `ecm` is still the age table at this point and that the
# label column 'age' holds 'P70'; confirm.
p70_ecm = ecm[ecm['age'] == 'P70']
mx = np.array(p70_ecm['alpha'])
# ecdf() already sorts and builds the cumulative fractions; the duplicate inline
# computation was removed.
x, y = ecdf(mx)

In [None]:
from matplotlib.ticker import PercentFormatter
bin_num = 500

# The per-age subsets were not defined anywhere in this notebook (NameError on a
# fresh kernel), so they are derived here from the cleaned age table.
# NOTE(review): assumes `ecm` is still the age table with labels 'P14'/'P35'/'P70'; confirm.
age_groups = ['P14', 'P35', 'P70']
p14_ecm, p35_ecm, p70_ecm = (ecm[ecm['age'] == age] for age in age_groups)

# One alpha histogram per age group, titled so the figure stands on its own.
fig, ax = plt.subplots(1,3)
for axis, age, subset in zip(ax, age_groups, (p14_ecm, p35_ecm, p70_ecm)):
    axis.hist(np.array(subset['alpha']), bins=bin_num)
    axis.set_title(age)
#plt.gca().yaxis.set_major_formatter(PercentFormatter(1))


In [None]:
# Overlay the alpha ECDF of every frame in `age_df_list` and mark two cutoffs
# with vertical lines.
# NOTE(review): `age_df_list`, `directed` and `confined` are not defined anywhere
# in the visible notebook, so this cell raises NameError on a fresh kernel.
# Confirm where they are meant to come from and which cutoff values they hold.
fig = plt.figure(figsize=(12,8))
for df in age_df_list:
    mx = np.array(df['alpha'])
    # Same computation as ecdf(mx), written inline.
    x = np.sort(mx)
    y = np.arange(1, len(mx)+1) / len(mx)
    # Legend entry is the first 'age' value found in the frame.
    plt.scatter(x, y, alpha=0.7, s=1, label=df['age'].unique()[0])
plt.vlines([directed], 0, 1, label='normal diffusion cutoff', linestyles='dashed')
plt.vlines(confined, 0, 1, label='confined diffusion cutoff', linestyles='dotted')
plt.xlim([0,3])
plt.legend()

In [None]:
# confined_ecm = ecm[ecm['alpha'] < 0.75]
# normal_ecm = ecm[(ecm['alpha'] >= 0.75) & (ecm['alpha'] <= 1.25)]
# directed_ecm = ecm[ecm['alpha'] > 1.25]

# directed_normal_ecm = ecm[ecm['alpha'] >= 0.75]
# directed_confined_ecm = ecm[(ecm['alpha'] > 1.25) | (ecm['alpha'] < 0.75)]
# normal_confined_ecm = ecm[ecm['alpha'] <= 1.25]

# no_immobil = ecm[ecm['alpha'] > 0.2]

In [None]:
def full_preprocess(ecm, balanced=True, target=None, random_state=None):
    """Balance, bin, label-encode and split a feature table for XGBoost.

    Parameters
    ----------
    ecm : pandas.DataFrame
        Feature table. It must contain the ``target`` column plus 'X', 'Y' and
        'Track_ID'. After ``data_process.bin_data`` it is expected to also carry
        'binx', 'biny' and 'bins', since those columns are dropped below.
    balanced : bool, default True
        If True, pass the table through ``data_process.balance_data`` before binning.
    target : str
        Name of the label column. Required.
    random_state : int, optional
        Seed for balancing and for both splits. When omitted, a seed is drawn
        from the global NumPy RNG (range 1..1999), as before.

    Returns
    -------
    tuple
        ``(dtrain, dtest, dval, X_train, X_test, y_train, y_test, le)``: three
        ``xgb.DMatrix`` objects, the train/test frames (all columns, including
        'encoded_target'), the encoded train/test labels, and the fitted
        ``LabelEncoder``.

    Raises
    ------
    ValueError
        If ``target`` is None.
    """
    if target is None:
        raise ValueError("full_preprocess requires `target`, the name of the label column")

    seed = np.random.randint(1, 2000) if random_state is None else int(random_state)
    # Use a private generator rather than np.random.seed(seed). Reseeding the
    # global RNG made the next call's "random" seed a deterministic function of
    # this one, so repeated calls could fall into a short cycle of identical splits.
    rng = np.random.RandomState(seed)

    if balanced:
        bal_ecm = data_process.balance_data(ecm, target, random_state=seed)
        sampled_df = data_process.bin_data(bal_ecm)
    else:
        sampled_df = data_process.bin_data(ecm)

    # Model inputs: everything except the label and the spatial/bookkeeping columns.
    # Computed before 'encoded_target' is added, so that column is excluded too.
    features = sampled_df.drop([target, 'X', 'Y', 'binx', 'biny', 'bins', 'Track_ID'], axis=1).columns

    train_split = 0.8   # fraction of bins used for training
    test_split = 0.5    # share of the held-out rows that goes to the test set

    le = preprocessing.LabelEncoder()
    sampled_df['encoded_target'] = le.fit_transform(sampled_df[target])

    # Split by bin, so every row of a given bin lands on the same side of the
    # train / held-out boundary.
    unique_bins = sampled_df['bins'].unique()
    training_bins = rng.choice(unique_bins, int(len(unique_bins)*train_split), replace=False)
    in_training = sampled_df['bins'].isin(training_bins)

    X_train = sampled_df[in_training]
    X_test_val = sampled_df[~in_training]
    # The held-out rows are split row-wise (not bin-wise) into validation and test.
    X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

    y_train = X_train['encoded_target']
    y_test = X_test['encoded_target']
    y_val = X_val['encoded_target']

    dtrain = xgb.DMatrix(X_train[features], label=y_train)
    dtest = xgb.DMatrix(X_test[features], label=y_test)
    dval = xgb.DMatrix(X_val[features], label=y_val)
    return dtrain, dtest, dval, X_train, X_test, y_train, y_test, le



In [None]:
# Baseline XGBoost settings for a 3-class 'multi:softprob' model.
param = dict(
    max_depth=3,
    eta=0.005,
    min_child_weight=0,
    verbosity=0,
    objective='multi:softprob',
    num_class=3,
    silent='True',
    gamma=5,
    subsample=0.15,
    colsample_bytree=0.8,
    eval_metric="mlogloss",
    # GPU integration will cut time in ~half:
    # gpu_id=0,
    # tree_method='gpu_hist',
    # predictor='gpu_predictor',
)

In [None]:
# Split the cleaned tracks into diffusion-mode subsets by alpha.
# Tracks with alpha <= 0.1 fall into none of the three subsets.
alpha_values = ecm['alpha']

directed_ecm = ecm[alpha_values > 1.1]
normal_ecm = ecm[(alpha_values <= 1.1) & (alpha_values >= 0.9)]
confined_ecm = ecm[(alpha_values > 0.1) & (alpha_values < 0.9)]

In [None]:
# XGBoost settings used from here on; this replaces the baseline `param` above.
param = {
    'max_depth': 4,
    'eta': 0.1,
    'min_child_weight': 1,
    'verbosity': 0,
    'objective': 'multi:softprob',
    'num_class': 3,
    'silent': 'True',
    'gamma': 1.0,
    'subsample': 0.5,
    'colsample_bytree': 0.6,
    'eval_metric': 'mlogloss',
}

In [None]:
# Repeated-training experiment for the age label: 50 repetitions, each with a
# fresh random split, for the directed-only subset and for all tracks.
# NOTE(review): `best_param` and `best_param_all` are only assigned in cells
# further down the notebook, so this cell raises NameError under
# Restart & Run All. Confirm the intended cell order.
target = 'age'

# Per-repetition accuracies, one list per model variant.
normal_acc_list = []
confined_acc_list = []
directed_acc_list = []
all_acc_list = []
#hindered_acc_list = []
#immobilized_acc_list = []
#no_imm_acc = []

# Per-repetition SHAP values (only filled by the commented-out code below).
normal_shap_list = []
confined_shap_list = []
directed_shap_list = []
all_shap_list = []
#immobilized_shap_list = []

# Stand-in for booster.save_raw used by the commented-out SHAP code below.
# `model_bytearray` is only assigned in that commented-out code, so calling this
# as-is would raise NameError.
def myfun(self=None):
    return model_bytearray
    
for i in range(50):
    print(i)
    # dtrain, dtest, dval, X_train, X_test, y_train, y_test = full_preprocess(normal_ecm, balanced=True, target=target)
    # booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    # normal_acc_list.append(acc)
    #model_bytearray = booster.save_raw()[4:]
    #booster.save_raw = myfun
    #explainer = shap.TreeExplainer(booster)
    #shap_values = explainer.shap_values(X_test[features])
    #normal_shap_list.append(shap_values)

    # Directed-only model: trained with `best_param` for 76 rounds.
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(directed_ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=76, verbose=False)
    directed_acc_list.append(acc)
    #model_bytearray = booster.save_raw()[4:]
    #booster.save_raw = myfun
    # explainer = shap.TreeExplainer(booster)
    # shap_values = explainer.shap_values(X_test[features])
    # directed_shap_list.append(shap_values)
    

    # dtrain, dtest, dval, X_train, X_test, y_train, y_test = full_preprocess(confined_ecm, balanced=True, target=target)
    # booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    # confined_acc_list.append(acc)
    # model_bytearray = booster.save_raw()[4:]
    # booster.save_raw = myfun
    # explainer = shap.TreeExplainer(booster)
    # shap_values = explainer.shap_values(X_test[features])
    # confined_shap_list.append(shap_values)

    # All-modes model: trained with `best_param_all` for 200 rounds.
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(best_param_all, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    all_acc_list.append(acc)
    # model_bytearray = booster.save_raw()[4:]
    # booster.save_raw = myfun
    # explainer = shap.TreeExplainer(booster)
    # shap_values = explainer.shap_values(X_test[features])
    # all_shap_list.append(shap_values)

    # dtrain, dtest, dval, y_test= full_preprocess(hindered_ecm, balanced=True, target=target)
    # booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    # hindered_acc_list.append(acc)

    # dtrain, dtest, dval, X_train, X_test, y_train, y_test = full_preprocess(immobilized_ecm, balanced=True, target=target)
    # booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    # immobilized_acc_list.append(acc)
    # # model_bytearray = booster.save_raw()[4:]
    # # booster.save_raw = myfun
    # explainer = shap.TreeExplainer(booster)
    # shap_values = explainer.shap_values(X_test[features])
    # immobilized_shap_list.append(shap_values)

In [None]:
# Gather the per-repetition accuracies (one column per model variant) and save
# them to CSV. Only the directed-only and all-modes lists are included.
age_dict = dict(
    directed_acc=np.array(directed_acc_list),
    all_modes_acc=np.array(all_acc_list),
)
age_result_df = pd.DataFrame(age_dict)
age_result_df.to_csv('age_accuracies_v5.csv', index=False)


In [None]:
# Load a saved accuracy table for plotting.
# NOTE(review): this reads 'age_accuracies_v4.csv' whereas the preceding cell
# writes 'age_accuracies_v5.csv', so freshly computed results are replaced by
# the older file. Confirm this is intended.
age_result_df = pd.read_csv('age_accuracies_v4.csv')

In [None]:
# Preview the first rows of the loaded accuracy table.
age_result_df.head()

In [None]:
# ECDFs of model accuracy per diffusion mode, read from `age_result_df`
# (columns 'confined_acc', 'directed_acc', 'normal_acc', 'all_modes_acc').
bold_font = dict(fontsize=15, fontname='Arial', fontweight='bold')

plt.figure(figsize=(4,8))

(conf_x, conf_y), (dir_x, dir_y), (norm_x, norm_y), (all_x, all_y) = (
    ecdf(np.array(age_result_df[column]))
    for column in ('confined_acc', 'directed_acc', 'normal_acc', 'all_modes_acc')
)

# Per-mode models as points; the all-modes model as a line.
for xs, ys, mode_label, colour in (
    (dir_x, dir_y, 'Superdiffusive', '#4b2e83'),
    (conf_x, conf_y, 'Subdiffusive', '#b7a57a'),
    (norm_x, norm_y, 'Brownian', '#999999'),
):
    plt.scatter(xs, ys, label=mode_label, s=7, c=colour)
plt.plot(all_x, all_y, label='All modes', c='k')

plt.legend(loc='upper left', markerscale=2.0, fontsize=14)
plt.xlabel('Model Accuracy', **bold_font)
plt.ylabel('Percentage %', **bold_font)
plt.title('ECDF of Model Accuracy for different diffusion modes', **bold_font)
plt.xlim([0.8, 0.9])
plt.xticks(**bold_font)
plt.yticks(**bold_font)

In [None]:
# ECDFs of model accuracy per diffusion mode, taken from the in-memory
# accuracy lists rather than a saved CSV.
plt.figure()

(conf_x, conf_y), (dir_x, dir_y), (norm_x, norm_y), (all_x, all_y) = (
    ecdf(np.array(accuracies))
    for accuracies in (confined_acc_list, directed_acc_list, normal_acc_list, all_acc_list)
)

for xs, ys, mode_label in (
    (dir_x, dir_y, 'Superdiffusive'),
    (conf_x, conf_y, 'Subdiffusive'),
    (norm_x, norm_y, 'Brownian'),
    (all_x, all_y, 'All modes'),
):
    plt.scatter(xs, ys, label=mode_label, s=2)

plt.legend()
plt.xlabel('Model Accuracy')
plt.ylabel('Percentage %')
plt.title('ECDF of Model Accuracy for different diffusion modes')
plt.xlim([0.5, 1])

In [None]:
# Quick look at the directed-model ECDF on its own; relies on `dir_x`/`dir_y`
# left over from an earlier plotting cell.
plt.scatter(dir_x, dir_y, label='directed')

In [None]:
# Repeat the directed-only experiment 100 times, each on a fresh random split.
directed_acc_list = []
for i in range(100):
    print(i)
    # full_preprocess returns 8 values and needs the label column name; the old
    # 4-name unpack without `target` could not run.
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(directed_ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    directed_acc_list.append(acc)

In [None]:
# Repeat the all-modes experiment 100 times, each on a fresh random split.
all_acc_list = []
for i in range(100):
    print(i)
    # full_preprocess returns 8 values and needs the label column name; the old
    # 4-name unpack without `target` could not run.
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    all_acc_list.append(acc)

In [None]:
# Repeat the Brownian-only experiment 100 times, each on a fresh random split.
normal_acc_list = []
for i in range(100):
    print(i)
    # full_preprocess returns 8 values and needs the label column name; the old
    # 4-name unpack without `target` could not run.
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(normal_ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    normal_acc_list.append(acc)

In [None]:
# Check which class labels are present in the directed subset.
directed_ecm[target].unique()

In [None]:
# Show the hyperparameter dict currently bound to `param`.
param

In [None]:
# Hyperparameter search on a fresh split of the full table.
dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, target=target)
# `features` only exists inside full_preprocess, so it was undefined here.
# Recover the same column names from the training DMatrix, which full_preprocess
# builds from X_train[features].
features = pd.Index(dtrain.feature_names)
(best_model, best_param, best_eval, best_boost_rounds) = predxgboost.xgb_paramsearch(X_train=X_train, y_train=X_train['encoded_target'], features=features, init_params=param)

In [None]:
# Report the outcome of the parameter search, one value per line.
print(best_param, best_boost_rounds, best_eval, sep='\n')

In [None]:
# Report the outcome of the parameter search, one value per line.
print(best_param, best_boost_rounds, best_eval, sep='\n')

In [None]:
# Hyperparameters used for the all-modes model.
best_param_all = {
    'max_depth': 4,
    'eta': 0.1,
    'min_child_weight': 1,
    'verbosity': 0,
    'objective': 'multi:softprob',
    'num_class': 3,
    'silent': 'True',
    'gamma': 1.0,
    'subsample': 0.5,
    'colsample_bytree': 0.6,
    'eval_metric': 'mlogloss',
}

In [None]:
# Report the outcome of the parameter search, one value per line.
print(best_param, best_boost_rounds, best_eval, sep='\n')

In [None]:
# Display the object returned as `best_model` by the parameter search.
best_model

In [None]:
# Re-run the all-modes and directed-only experiments 100 times each with the
# searched hyperparameters and boosting-round count.
directed_acc_list = []
all_acc_list = []
for i in range(100):

    print(i)
    # full_preprocess returns 8 values; the old 5-name unpack raised ValueError.
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=best_boost_rounds, verbose=False)
    all_acc_list.append(acc)

    print(i)
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(directed_ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=best_boost_rounds, verbose=False)
    directed_acc_list.append(acc)

In [None]:
# ECDF of accuracies: directed-only model versus the all-modes model.
plt.figure()

(dir_x, dir_y), (no_imm_x, no_imm_y) = (
    ecdf(np.array(accuracies)) for accuracies in (directed_acc_list, all_acc_list)
)

plt.scatter(dir_x, dir_y, label='directed', s=2)
plt.scatter(no_imm_x, no_imm_y, label='all', s=0.5)

plt.legend()
plt.xlabel('Model Accuracy')
plt.ylabel('Percentage %')
plt.title('ECDF of Model Accuracy for different diffusion modes')

# Deep dive into region data

In [None]:
# Build the combined region feature table with the package helper, passing the
# three region names and 'region' as the label column name.
fstats_tot_region = data_process.generate_fullstats(region_dataset_path, region_filelist, ['cortex', 'hippocampus', 'striatum'], 'region')

In [None]:
# Switch the analysis to the region label. Note that `ecm` is rebound to the
# region table from here on.
target = 'region'
ecm = fstats_tot_region[feature_list + [target, 'Track_ID', 'X', 'Y']] #dont think i need these rn
print(ecm.shape)

# Removing nan and inf data points. 'Deff2' and 'Mean Deff2' are exempt from the
# check, so rows that are only missing those two columns are kept.
required_features = [name for name in feature_list if name not in ('Deff2', 'Mean Deff2')]
# `axis` must be passed by keyword: positional `.any(1)` fails on pandas >= 2.0.
has_bad_value = ecm[required_features].isin([np.nan, np.inf, -np.inf]).any(axis=1)
ecm = ecm[~has_bad_value]
ecm.shape

In [None]:
# Stacked bar chart: per region, the share of tracks that are subdiffusive
# (alpha < 0.9), Brownian (0.9 <= alpha <= 1.1) and superdiffusive (alpha > 1.1).
fig = plt.figure(figsize=(4,8))

labels = ecm[target].unique()
labels.sort()

directed_percent = np.zeros(len(labels))
normal_percent = np.zeros(len(labels))
constrained_percent = np.zeros(len(labels))
# Stays all zeros: the immobilized class is not split out here, but the array is
# kept in the stacking offsets below.
immobilized_percent = np.zeros(len(labels))

for i, unique_class in enumerate(labels):

    print(unique_class)
    df = ecm[ecm[target] == unique_class]

    directed_df = df[df['alpha'] > 1.1]
    directed_percent[i] = (len(directed_df)/len(df))
    print(directed_percent[i])

    normal_df = df[(df['alpha'] <= 1.1) & (df['alpha'] >= 0.9)]
    normal_percent[i] = (len(normal_df)/len(df))
    print(normal_percent[i])

    constrained_df = df[(df['alpha'] < 0.9)]
    constrained_percent[i] = (len(constrained_df)/len(df))
    print(constrained_percent[i])
    print()

bar_w = 0.5
plt.bar(labels, constrained_percent, color='#b7a57a', label='Subdiffusive', width=bar_w)
plt.bar(labels, normal_percent, bottom=constrained_percent+immobilized_percent, color='#999999', label='Brownian', width=bar_w)
plt.bar(labels, directed_percent, bottom=constrained_percent+immobilized_percent+normal_percent, color='#4b2e83', label='Superdiffusive', width=bar_w)
plt.ylim([0,1])
# rotation must be a number (or 'horizontal'/'vertical'); the string '45' is
# rejected by current matplotlib.
plt.xticks(rotation=45)
plt.legend(loc='lower right', fontsize=14)
plt.title('Percentage of Diffusion Modes per Region', fontsize=15, fontname='Arial', fontweight='bold')
plt.xticks(fontsize=15, fontname='Arial', fontweight='bold')
plt.yticks(fontsize=15, fontname='Arial', fontweight='bold')

In [None]:
# Per-region subsets of the cleaned table, followed by their row counts.
region_labels = ecm[target]

hippo_ecm = ecm[region_labels == 'hippocampus']
thala_ecm = ecm[region_labels == 'thalamus']
gangl_ecm = ecm[region_labels == 'ganglia']
cortex_ecm = ecm[region_labels == 'cortex']
striat_ecm = ecm[region_labels == 'striatum']

for region_subset in (hippo_ecm, thala_ecm, gangl_ecm, cortex_ecm, striat_ecm):
    print(len(region_subset))


In [None]:
# One bar chart per region showing the fraction of tracks in each of five
# alpha-based motion classes (cutoffs at 1.25, 0.75, 0.5 and 0.2).
region_classes = ecm[target].unique()
# squeeze=False keeps `axes` indexable even when only one region is present.
fig, axes = plt.subplots(1, len(region_classes), sharey=True, figsize=(8,6), squeeze=False)
axes = axes.ravel()

labels = ['directed', 'normal', 'confined', 'hindered', 'immobilized']

for i, unique_class in enumerate(region_classes):
    df = ecm[ecm[target] == unique_class]

    directed_df = df[df['alpha'] > 1.25]
    normal_df = df[(df['alpha'] <= 1.25) & (df['alpha'] >= 0.75)]
    confined_df = df[(df['alpha'] < 0.75) & (df['alpha'] > 0.5)]
    hindered_df = df[(df['alpha'] <= 0.5) & (df['alpha'] > 0.2)]
    immobilized_df = df[(df['alpha'] <= 0.2)]

    # Fractions in the same order as `labels`.
    percentages = [
        len(directed_df)/len(df),
        len(normal_df)/len(df),
        len(confined_df)/len(df),
        len(hindered_df)/len(df),
        len(immobilized_df)/len(df),
    ]

    positions = np.arange(len(percentages))
    axes[i].bar(positions, percentages)
    # Fix the tick positions first, then attach the labels to them. Setting
    # labels before ticks triggers a matplotlib warning and can misalign them.
    axes[i].set_xticks(positions)
    axes[i].set_xticklabels(labels)
    axes[i].set_title(unique_class)

for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation=45)

In [None]:
# Split the region tracks into diffusion-mode subsets by alpha.
# Here the confined subset has no lower bound on alpha.
alpha_values = ecm['alpha']

directed_ecm = ecm[alpha_values > 1.1]
normal_ecm = ecm[(alpha_values <= 1.1) & (alpha_values >= 0.9)]
confined_ecm = ecm[alpha_values < 0.9]

#best_ecm = ecm[(ecm['alpha'] > 1.25) | (ecm['alpha'] <= 0.2)]
#len(best_ecm)

In [None]:
# Repeated-training experiment for the region label: 50 repetitions, each
# training one model per diffusion-mode subset plus one on all tracks.
target = 'region'

normal_acc_list, confined_acc_list, directed_acc_list, all_acc_list = [], [], [], []
immobilized_acc_list = []

# (subset, accuracy list) pairs, in the order they are trained within a repetition.
region_runs = (
    (normal_ecm, normal_acc_list),
    (directed_ecm, directed_acc_list),
    (confined_ecm, confined_acc_list),
    (ecm, all_acc_list),
)

for i in range(50):
    print(i)
    for subset_df, acc_list in region_runs:
        dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(subset_df, balanced=True, target=target)
        booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
        acc_list.append(acc)

    # dtrain, dtest, dval, X_train, y_test = full_preprocess(hindered_ecm, balanced=True, target=target)
    # booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    # hindered_acc_list.append(acc)

    # dtrain, dtest, dval, X_train, y_test = full_preprocess(immobilized_ecm, balanced=True, target=target)
    # booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    # immobilized_acc_list.append(acc)

In [None]:
# Region experiment with the searched hyperparameters: directed-only subset
# versus all tracks, 50 repetitions each.
# NOTE(review): directed accuracies go into `dir_ecm_acc`, while the plotting
# cells read `directed_acc_list`. Confirm which list is meant to be plotted.
target = 'region'
dir_ecm_acc = []
all_acc_list = []
for i in range(50):
    print(i)
    # full_preprocess returns 8 values; the old 5-name unpack raised ValueError.
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(directed_ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(best_param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    dir_ecm_acc.append(acc)

    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, balanced=True, target=target)
    booster, acc, true_label, preds = predxgboost.train(best_param_all, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
    all_acc_list.append(acc)

In [None]:
# ECDF of region-model accuracies from the in-memory lists: directed-only model
# as points, all-modes model as a line.
plt.figure(figsize=(4,8))

(dir_x, dir_y), (all_x, all_y) = (
    ecdf(np.array(accuracies)) for accuracies in (directed_acc_list, all_acc_list)
)

plt.scatter(dir_x, dir_y, label='Superdiffusive', s=2)
plt.plot(all_x, all_y, label='all modes', c='r')

plt.legend()
plt.xlabel('Model Accuracy')
plt.ylabel('Percentage %')
plt.title('ECDF of Model Accuracy for different diffusion modes')
plt.xlim([0.85, 0.95])

In [None]:
# ECDFs of model accuracy per diffusion mode, read from `region_result_df`
# (columns 'confined_acc', 'directed_acc', 'normal_acc', 'all_modes_acc').
# NOTE(review): `region_result_df` is only created in cells further down, so this
# cell fails under Restart & Run All. Confirm the intended cell order.
bold_font = dict(fontsize=15, fontname='Arial', fontweight='bold')

plt.figure(figsize=(4,8))

(conf_x, conf_y), (dir_x, dir_y), (norm_x, norm_y), (all_x, all_y) = (
    ecdf(np.array(region_result_df[column]))
    for column in ('confined_acc', 'directed_acc', 'normal_acc', 'all_modes_acc')
)

# Per-mode models as points; the all-modes model as a line.
for xs, ys, mode_label, colour in (
    (dir_x, dir_y, 'Superdiffusive', '#4b2e83'),
    (conf_x, conf_y, 'Subdiffusive', '#b7a57a'),
    (norm_x, norm_y, 'Brownian', '#999999'),
):
    plt.scatter(xs, ys, label=mode_label, s=7, c=colour)
plt.plot(all_x, all_y, label='All modes', c='k')

plt.legend(loc='upper left', markerscale=2.0, fontsize=14)
plt.xlabel('Model Accuracy', **bold_font)
plt.ylabel('Percentage %', **bold_font)
plt.title('ECDF of Model Accuracy for different diffusion modes', **bold_font)
plt.xticks(**bold_font)
plt.yticks(**bold_font)
plt.xlim([0.8, 0.9])

In [None]:
# Collect the per-run accuracies (one column per diffusion mode) and
# persist them so the plots can be rebuilt without retraining.
region_dict = {
    'confined_acc': np.array(confined_acc_list),
    'directed_acc': np.array(directed_acc_list),
    'normal_acc': np.array(normal_acc_list),
    'all_modes_acc': np.array(all_acc_list),
}
region_result_df = pd.DataFrame(region_dict)
region_result_df.to_csv('region_accuracies_V3.csv', index=False)

In [None]:
# Reload previously saved region accuracies from disk.
# NOTE(review): this reads 'region_accuracies_V2.csv' while the cell above
# writes 'region_accuracies_V3.csv' — confirm V2 is the intended input.
region_result_df = pd.read_csv('region_accuracies_V2.csv')

In [None]:
# Display the region accuracy table (one column per diffusion mode).
region_result_df

In [None]:
# Starting XGBoost parameters for the 5-class (multi:softprob) search.
# GPU options that used to sit here ('gpu_id', 'tree_method': 'gpu_hist',
# 'predictor': 'gpu_predictor') are left out; add them back to train on GPU.
param = dict(
    max_depth=3,
    eta=0.005,
    min_child_weight=0,
    verbosity=0,
    objective='multi:softprob',
    num_class=5,
    silent='True',
    gamma=5,
    subsample=0.15,
    colsample_bytree=0.8,
    eval_metric="mlogloss",
)

In [None]:
# Preprocess the directed_ecm subset, then run the hyper-parameter search
# on its training split, starting from `param`.
dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(
    directed_ecm, target=target
)
best_model, best_param, best_eval, best_boost_rounds = predxgboost.xgb_paramsearch(
    X_train=X_train,
    y_train=X_train['encoded_target'],
    features=features,
    init_params=param,
)

In [None]:
# Show the parameter set returned by the search above.
best_param

In [None]:
# Hard-coded 5-class parameter set (note the trailing underscore in the name).
best_param_all_ = dict(
    max_depth=4,
    eta=0.005,
    min_child_weight=0,
    verbosity=0,
    objective='multi:softprob',
    num_class=5,
    silent='True',
    gamma=5,
    subsample=0.6,
    colsample_bytree=0.7,
    eval_metric='mlogloss',
)

# Deep dive into treatment data

In [None]:
# Build the combined feature table for the treatment dataset; the list
# names the two treatment groups and 'treatment' is the label column name.
fstats_tot_treatment = data_process.generate_fullstats(
    treatment_dataset_path,
    treatment_filelist,
    ['NT', 'ChABC'],
    'treatment',
)

In [None]:
target = 'treatment'

# Keep the model features plus the label and the per-track bookkeeping columns.
ecm = fstats_tot_treatment[feature_list + [target, 'Track_ID', 'X', 'Y']]
print(ecm.shape)

# Drop every row that has NaN or +/-inf in any feature, except 'Deff2' and
# 'Mean Deff2', which are deliberately left out of the check.
checked_features = [f for f in feature_list if f not in ('Deff2', 'Mean Deff2')]
# `axis` must be passed by keyword: the positional form `.any(1)` is
# deprecated in pandas 1.x and raises TypeError in pandas 2.x.
invalid_rows = ecm[checked_features].isin([np.nan, np.inf, -np.inf]).any(axis=1)
ecm = ecm[~invalid_rows]
ecm.shape

In [None]:
# Stacked bar chart: fraction of trajectories in each diffusion mode
# (classified by the 'alpha' column) for every value of ecm[target].
fig = plt.figure(figsize=(4,8))

labels = ecm[target].unique()

directed_percent = np.zeros(len(labels))
normal_percent = np.zeros(len(labels))
constrained_percent = np.zeros(len(labels))
# Stays all-zero: the immobilized class is currently disabled, but the array
# is still used in the `bottom=` offsets of the bars below.
immobilized_percent = np.zeros(len(labels))

# Iterate over the already-computed `labels` instead of calling unique() again.
for i, unique_class in enumerate(labels):

    print(unique_class)
    df = ecm[ecm[target] == unique_class]

    # alpha > 1.1 -> superdiffusive (directed)
    directed_df = df[df['alpha'] > 1.1]
    directed_percent[i] = (len(directed_df)/len(df))
    print(directed_percent[i])

    # 0.9 <= alpha <= 1.1 -> Brownian (normal)
    normal_df = df[(df['alpha'] <= 1.1) & (df['alpha'] >= 0.9)]
    normal_percent[i] = (len(normal_df)/len(df))
    print(normal_percent[i])

    # alpha < 0.9 -> subdiffusive (constrained)
    constrained_df = df[(df['alpha'] < 0.9)]
    constrained_percent[i] = (len(constrained_df)/len(df))
    print(constrained_percent[i])
    print()

bar_w = 0.5
plt.bar(labels, constrained_percent, color='#b7a57a', label='Subdiffusive', width=bar_w)
plt.bar(labels, normal_percent, bottom=constrained_percent+immobilized_percent, color='#999999', label='Brownian', width=bar_w)
plt.bar(labels, directed_percent, bottom=constrained_percent+immobilized_percent+normal_percent, color='#4b2e83', label='Superdiffusive', width=bar_w)
# The y limits were previously set twice with the same values; once is enough.
plt.ylim([0,1])
plt.legend(loc='lower right', fontsize=14)
plt.title('Percentage of Diffusion Modes per Treatment Group', fontsize=15, fontname='Arial', fontweight='bold')
plt.xticks(fontsize=15, fontname='Arial', fontweight='bold')
plt.yticks(fontsize=15, fontname='Arial', fontweight='bold')


In [None]:
# One bar chart per value of ecm[target], showing the fraction of
# trajectories in each of five alpha-based diffusion modes.
classes = ecm[target].unique()

# squeeze=False always returns a 2-D array of Axes, so indexing also works
# when there is only one class (plain subplots() would return a bare Axes).
fig, axes = plt.subplots(1, len(classes), sharey=True, figsize=(8,6), squeeze=False)
axes = axes.ravel()

for i, unique_class in enumerate(classes):
    labels = ['directed', 'normal', 'confined', 'hindered', 'immobilized']
    df = ecm[ecm[target] == unique_class]
    alpha = df['alpha']

    # Boolean masks in the same order as `labels`; thresholds are unchanged.
    mode_masks = [
        alpha > 1.25,                       # directed
        (alpha <= 1.25) & (alpha >= 0.75),  # normal
        (alpha < 0.75) & (alpha > 0.5),     # confined
        (alpha <= 0.5) & (alpha > 0.2),     # hindered
        alpha <= 0.2,                       # immobilized
    ]
    percentages = [int(mask.sum()) / len(df) for mask in mode_masks]

    positions = np.arange(len(percentages))
    axes[i].bar(positions, percentages)
    # Fix the tick positions BEFORE assigning labels; labelling first applies
    # them against the automatic locator and triggers matplotlib's
    # FixedFormatter/FixedLocator warning.
    axes[i].set_xticks(positions)
    axes[i].set_xticklabels(labels)
    axes[i].set_title(unique_class)

# Rotate the x tick labels on every panel so the mode names do not overlap.
for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation=45)

In [None]:
# Partition ecm by alpha: < 0.9 confined, [0.9, 1.1] normal, > 1.1 directed.
_alpha = ecm['alpha']
confined_ecm = ecm[_alpha < 0.9]
normal_ecm = ecm[_alpha.between(0.9, 1.1)]  # inclusive on both ends
directed_ecm = ecm[_alpha > 1.1]

#hindered_ecm = ecm[(ecm['alpha'] > 0.2) & (ecm['alpha'] <= 0.5)]
#immobilized_ecm = ecm[ecm['alpha'] <= 0.1]

In [None]:
# Display the cleaned treatment feature table.
ecm

In [None]:
target = 'treatment'

# Per-run accuracies, one list per training subset.
normal_acc_list = []
confined_acc_list = []
directed_acc_list = []
all_acc_list = []

# (training subset, list that collects its accuracy), in the order the models
# are trained within each repetition.
_runs = [
    (normal_ecm, normal_acc_list),
    (directed_ecm, directed_acc_list),
    (confined_ecm, confined_acc_list),
    (ecm, all_acc_list),
]

# 50 repetitions; each one re-runs full_preprocess (balanced=True) and trains
# a 200-round model per subset. `param` is whichever parameter dict was
# assigned most recently in the kernel.
for i in range(50):
    print(i)
    for _subset, _acc_list in _runs:
        dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(_subset, balanced=True, target=target)
        booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)
        _acc_list.append(acc)

In [None]:
# Quick-look ECDF of the per-run accuracies collected by the training loop.
plt.figure(figsize=(4, 8))

conf_x, conf_y = ecdf(np.array(confined_acc_list))
dir_x, dir_y = ecdf(np.array(directed_acc_list))
norm_x, norm_y = ecdf(np.array(normal_acc_list))
all_x, all_y = ecdf(np.array(all_acc_list))

# Per-mode runs as points; the all-modes runs as a red line.
plt.scatter(dir_x, dir_y, label='Superdiffusive', s=2)
plt.scatter(conf_x, conf_y, label='Subdiffusive', s=2)
plt.scatter(norm_x, norm_y, label='Brownian', s=2)
plt.plot(all_x, all_y, label='All modes', c='r')

plt.legend()
plt.title('ECDF of Model Accuracy for different diffusion modes')
plt.xlabel('Model Accuracy')
plt.ylabel('Percentage %')
plt.xlim([.65, .75])

In [None]:
# Collect the per-run treatment accuracies (one column per diffusion mode)
# and save them so the figures can be regenerated without retraining.
treatment_dict = {
    'confined_acc': np.array(confined_acc_list),
    'directed_acc': np.array(directed_acc_list),
    'normal_acc': np.array(normal_acc_list),
    'all_modes_acc': np.array(all_acc_list),
}
treatment_result_df = pd.DataFrame(treatment_dict)
treatment_result_df.to_csv('treatment_accuracies_v4.csv', index=False)

In [None]:
# Reload the saved treatment accuracies (the same file the cell above writes).
treatment_result_df = pd.read_csv('treatment_accuracies_v4.csv')

In [None]:
# Display the treatment accuracy table.
treatment_result_df

In [None]:
# Publication-style ECDF of the accuracies stored in treatment_result_df.
plt.figure(figsize=(4, 8))

# Shared text styling for axis labels, title and tick labels.
_bold_arial = {'fontsize': 15, 'fontname': 'Arial', 'fontweight': 'bold'}

conf_x, conf_y = ecdf(np.array(treatment_result_df['confined_acc']))
dir_x, dir_y = ecdf(np.array(treatment_result_df['directed_acc']))
norm_x, norm_y = ecdf(np.array(treatment_result_df['normal_acc']))
all_x, all_y = ecdf(np.array(treatment_result_df['all_modes_acc']))

# One scatter series per diffusion mode, plus a black line for all modes.
plt.scatter(dir_x, dir_y, label='Superdiffusive', s=7, c='#4b2e83')
plt.scatter(conf_x, conf_y, label='Subdiffusive', s=7, c='#b7a57a')
plt.scatter(norm_x, norm_y, label='Brownian', s=7, c='#999999')
plt.plot(all_x, all_y, label='All modes', c='k')

plt.xlim([0.65, 0.75])
plt.legend(loc='upper left', markerscale=2.0, fontsize=14)
plt.xlabel('Model Accuracy', **_bold_arial)
plt.ylabel('Percentage %', **_bold_arial)
plt.title('ECDF of Model Accuracy for different diffusion modes', **_bold_arial)
plt.xticks(**_bold_arial)
plt.yticks(**_bold_arial)

In [None]:
# Preview the first rows of the treatment accuracy table.
treatment_result_df.head()

In [None]:
# Starting XGBoost parameters for the binary treatment classifier
# ('binary:hinge', so no 'num_class' entry). The commented-out GPU options
# ('gpu_id', 'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor') are
# left out; add them back to train on GPU.
param = dict(
    max_depth=3,
    eta=0.005,
    min_child_weight=0,
    verbosity=0,
    objective='binary:hinge',
    silent='True',
    gamma=5,
    subsample=0.15,
    colsample_bytree=0.8,
    eval_metric="logloss",
)

In [None]:
# Preprocess the full treatment table; the returned matrices, splits and
# label encoder (`le`) are used by the training and reporting cells below.
dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, target=target)
# The hyper-parameter search is disabled here; `best_param` and
# `best_boost_rounds` are hard-coded in a cell further down instead.
#(best_model, best_param, best_eval, best_boost_rounds) = predxgboost.xgb_paramsearch(X_train=X_train, y_train=X_train['encoded_target'], features=feature_list, init_params=param, metrics=['error', 'logloss', 'auc'])

In [None]:
# Show the selected hyper-parameters and boosting-round count.
# NOTE(review): the parameter search in the cell above is commented out, so
# these names come from an earlier search or from the hard-coded cell below —
# confirm the intended run order.
print(best_param)
print(best_boost_rounds)

In [None]:
# Hard-coded hyper-parameters and boosting-round count for the treatment model.
best_boost_rounds = 57
best_param = dict(
    max_depth=5,
    eta=0.05,
    min_child_weight=0,
    verbosity=0,
    objective='binary:logitraw',
    silent='True',
    gamma=2,
    subsample=0.15,
    colsample_bytree=0.8,
    eval_metric='error',
)

In [None]:
# Train a single model on the current split for `best_boost_rounds` rounds,
# evaluating on the train and validation matrices.
# NOTE(review): this passes `param`, not the `best_param` that is defined
# alongside `best_boost_rounds` — confirm which parameter set is intended.
booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=best_boost_rounds, verbose=True)


In [None]:
# Per-class precision/recall/F1 for the test predictions, using the class
# names stored on the label encoder.
class_names = le.classes_
class_results = classification_report(
    y_test,
    preds,
    target_names=class_names,
    digits=4,
)
print(class_results)

In [None]:
# Confusion-matrix heatmap for the test predictions.
# The matrix is computed once; the earlier stand-alone
# metrics.confusion_matrix(...) call discarded its result and only repeated
# the work.
plt.figure(figsize=(12,10))
cm_array = metrics.confusion_matrix(y_test, preds)
# Rows are the actual classes, columns the predicted classes.
df_cm = pd.DataFrame(cm_array, index = class_names, columns = class_names)

sns.set(font_scale=1.4) # for label size
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="YlGnBu")
ax.set(xlabel='Predicted', ylabel='Actual')

plt.show()

In [None]:
# SHAP summary plot for the most recently trained `booster`.
# Alias for matplotlib's colour module (ListedColormap is used below).
from matplotlib import colors as plt_colors

explainer = shap.TreeExplainer(booster)
# NOTE(review): `features` is not assigned in any of the nearby cells (they
# use `feature_list`) — confirm it is defined earlier in the notebook.
shap_values = explainer.shap_values(X_test[features])
# Hex colours for the two treatment groups (the HYase colour is disabled).
c_NT = '#E69F00'
#c_HYase = '#56B4E9'
c_ChABC = '#009E73'

colors = [c_NT, c_ChABC]
# Sort the entries of shap_values by decreasing mean |SHAP| and reorder the
# colours to match.
# NOTE(review): this assumes shap_values is a per-class sequence with
# len(colors) entries; if the explainer returns a single 2-D array for this
# booster, the colour indexing below will not line up — TODO confirm.
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, X_test[features], class_names=np.array(class_names), max_display=15, title='Total SHAP Values', color=cmap)

In [None]:
# Hard-coded parameter set (two-class 'multi:softprob') and round count used
# for the all-data treatment runs.
best_param_alldata = dict(
    max_depth=5,
    eta=0.05,
    min_child_weight=0,
    verbosity=0,
    objective='multi:softprob',
    num_class=2,
    silent='True',
    gamma=2.0,
    subsample=0.15,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
)
best_boost_rounds_alldata = 57

In [None]:
# 50 repeated runs on the full treatment table with the hard-coded all-data
# parameter set. Only all_acc_list is reset here; directed_acc_list keeps
# whatever an earlier cell left in it.
all_acc_list = []
for i in range(50):
    print(i)
    dtrain, dtest, dval, X_train, X_test, y_train, y_test, le = full_preprocess(ecm, balanced=True, target=target)
    # Use the named round count defined with best_param_alldata instead of
    # repeating the literal 57, so the two cannot drift apart.
    booster, acc, true_label, preds = predxgboost.train(best_param_alldata, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=best_boost_rounds_alldata, verbose=False)
    all_acc_list.append(acc)

In [None]:
# Show the per-run accuracies collected by the loop above.
all_acc_list

In [None]:
# ECDF of the all-data accuracies against the 'Superdiffusive' accuracies.
# NOTE: the loop that fills all_acc_list does not refresh directed_acc_list,
# so the scatter series shows values left over from an earlier cell.
plt.figure()

# Leftover directed-subset runs, drawn as points.
dir_x, dir_y = ecdf(np.array(directed_acc_list))
plt.scatter(dir_x, dir_y, label='Superdiffusive', s=2)

# Freshly collected all-data runs, drawn as a red line.
all_x, all_y = ecdf(np.array(all_acc_list))
plt.plot(all_x, all_y, label='All modes', c='r')

plt.legend()
plt.title('ECDF of Model Accuracy for different diffusion modes')
plt.xlabel('Model Accuracy')
plt.ylabel('Percentage %')
plt.xlim([.65, .75])