In [19]:
from diff_predictor import data_process, predxgboost, spatial
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 

from os import listdir, getcwd, chdir
from os.path import isfile, join
import os
from sklearn.preprocessing import scale, StandardScaler
from numpy.random import permutation


from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, recall_score, precision_score, f1_score
import operator
import xgboost as xgb
import shap
from xgboost.training import CVPack
from xgboost import callback
from xgboost.core import CallbackEnv
from xgboost.core import EarlyStopException
from xgboost.core import STRING_TYPES

import mlflow
import warnings


In [2]:
workbookDir = getcwd()

print('Current Notebook Dir: ' + workbookDir)
chdir(workbookDir) # Go to current workbook Dir"
chdir('..')        # Go up one
print(f'Using current directory for loading data: {getcwd()}')
workbookDir = getcwd()

Current Notebook Dir: /Users/nelsschimek/Documents/nancelab/diff_predictor/notebooks
Using current directory for loading data: /Users/nelsschimek/Documents/nancelab/diff_predictor


In [8]:
rotenone_feature_path = workbookDir + '/data/Brendan_traj_data/Cortex_features/'
rotenone_feature_filelist = [f for f in listdir(rotenone_feature_path) if isfile(join(rotenone_feature_path, f)) and 'feat' in f and '7DIV' in f ]
print(len(rotenone_feature_filelist))
print(rotenone_feature_filelist)

8
['features_P10F_5uM_7DIV_40nm_slice_1_cortex_vid_1.csv', 'features_P10F_NT_7DIV_40nm_slice_1_cortex_vid_1.csv', 'features_P10F_NT_7DIV_40nm_slice_1_cortex_vid_2.csv', 'features_P10F_1uM_7DIV_40nm_slice_1_cortex_vid_1.csv', 'features_P10F_50nM_7DIV_40nm_slice_1_cortex_vid_2.csv', 'features_P10F_5uM_7DIV_40nm_slice_2_cortex_vid_2.csv', 'features_P10F_1uM_7DIV_40nm_slice_1_cortex_vid_2.csv', 'features_P10F_50nM_7DIV_40nm_slice_1_cortex_vid_1.csv']


In [13]:
fstats_tot_rotenone = data_process.generate_fullstats(rotenone_feature_path, rotenone_feature_filelist, ['NT','50nM', '1uM', '5uM'], 'dosage')

Adding file features_P10F_5uM_7DIV_40nm_slice_1_cortex_vid_1.csv size: (2405, 68)
Adding file features_P10F_NT_7DIV_40nm_slice_1_cortex_vid_1.csv size: (1738, 68)
Adding file features_P10F_NT_7DIV_40nm_slice_1_cortex_vid_2.csv size: (1190, 68)
Adding file features_P10F_1uM_7DIV_40nm_slice_1_cortex_vid_1.csv size: (638, 68)
Adding file features_P10F_50nM_7DIV_40nm_slice_1_cortex_vid_2.csv size: (1785, 68)
Adding file features_P10F_5uM_7DIV_40nm_slice_2_cortex_vid_2.csv size: (2829, 68)
Adding file features_P10F_1uM_7DIV_40nm_slice_1_cortex_vid_2.csv size: (863, 68)
Adding file features_P10F_50nM_7DIV_40nm_slice_1_cortex_vid_1.csv size: (2632, 68)


In [14]:
feature_list = [
    'alpha', # Fitted anomalous diffusion alpha exponenet
    'D_fit', # Fitted anomalous diffusion coefficient
    'kurtosis', # Kurtosis of track
    'asymmetry1', # Asymmetry of trajecory (0 for circular symmetric, 1 for linear)
    'asymmetry2', # Ratio of the smaller to larger principal radius of gyration
    'asymmetry3', # An asymmetric feature that accnts for non-cylindrically symmetric pt distributions
    'AR', # Aspect ratio of long and short side of trajectory's minimum bounding rectangle
    'elongation', # Est. of amount of extension of trajectory from centroid
    'boundedness', # How much a particle with Deff is restricted by a circular confinement of radius r
    'fractal_dim', # Measure of how complicated a self similar figure is
    'trappedness', # Probability that a particle with Deff is trapped in a region
    'efficiency', # Ratio of squared net displacement to the sum of squared step lengths
    'straightness', # Ratio of net displacement to the sum of squared step lengths
    'MSD_ratio', # MSD ratio of the track
#     'frames', # Number of frames the track spans
    'Deff1', # Effective diffusion coefficient at 0.33 s
    'Deff2', # Effective diffusion coefficient at 3.3 s
    #'angle_mean', # Mean turning angle which is counterclockwise angle from one frame point to another
    #'angle_mag_mean', # Magnitude of the turning angle mean
    #'angle_var', # Variance of the turning angle
    #'dist_tot', # Total distance of the trajectory
    #'dist_net', # Net distance from first point to last point
    #'progression', # Ratio of the net distance traveled and the total distance
    'Mean alpha', 
    'Mean D_fit', 
    'Mean kurtosis', 
    'Mean asymmetry1', 
    'Mean asymmetry2',
    'Mean asymmetry3', 
    'Mean AR',
    'Mean elongation', 
    'Mean boundedness',
    'Mean fractal_dim', 
    'Mean trappedness', 
    'Mean efficiency',
    'Mean straightness', 
    'Mean MSD_ratio', 
    'Mean Deff1', 
    'Mean Deff2',
    ]

target = 'dosage'

In [15]:
ecm = fstats_tot_rotenone[feature_list + [target, 'Track_ID', 'X', 'Y']] #dont think i need these rn
print(ecm.shape)
ecm = ecm[~ecm[list(set(feature_list) - set(['Deff2', 'Mean Deff2']))].isin([np.nan, np.inf, -np.inf]).any(1)]       # Removing nan and inf data points
ecm.shape

(14080, 36)


(12493, 36)

In [21]:
# Wine Quality Sample
def full_preprocess(ecm, balanced=True, y_scramble=False, target=None):

    rand_state = np.random.randint(1, 2000)
    if balanced:
        bal_ecm = data_process.balance_data(ecm, target, random_state=rand_state)
        bal_ecm = bal_ecm.reset_index(drop=True)
        #sampled_df = bal_ecm.sample(frac=0.5)
        sampled_df = data_process.bin_data(bal_ecm)
    else:
        sampled_df = data_process.bin_data(ecm)
    label_df = sampled_df[target]
    features_df = sampled_df.drop([target, 'X', 'Y', 'binx', 'biny', 'bins', 'Track_ID'], axis=1)
    features = features_df.columns

    if y_scramble:
        perm = permutation(len(label_df))
        label_shuffled = label_df[perm]
        le = preprocessing.LabelEncoder()
        sampled_df['encoded_target'] = le.fit_transform(label_shuffled)
    else:
        le = preprocessing.LabelEncoder()
        sampled_df['encoded_target'] = le.fit_transform(sampled_df[target])

    seed = rand_state
    np.random.seed(seed)
    train_split = 0.8
    test_split = 0.5


    training_bins = np.random.choice(sampled_df['bins'].unique(), int(len(sampled_df['bins'].unique())*train_split), replace=False)

    X_train = sampled_df[sampled_df['bins'].isin(training_bins)]
    X_test_val = sampled_df[~sampled_df['bins'].isin(training_bins)]
    X_val, X_test = train_test_split(X_test_val, test_size=test_split, random_state=seed)

    y_train = X_train['encoded_target']
    y_test = X_test['encoded_target']
    y_val = X_val['encoded_target']

    dtrain = xgb.DMatrix(X_train[features], label=y_train)
    dtest = xgb.DMatrix(X_test[features], label=y_test)
    dval = xgb.DMatrix(X_val[features], label=y_val)

    
    
    import logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        
        return rmse, mae, r2


    warnings.filterwarnings("ignore")
    np.random.seed(40)

    

    # Set default values if no alpha is provided
    # if float(in_alpha) is None:
    #     alpha = 0.5
    # else:
    #     alpha = float(in_alpha)

    # Set default values if no l1_ratio is provided
    param = {'max_depth': 3,
         'eta': 0.005,
         'min_child_weight': 0,
         'verbosity': 0,
         'objective': 'multi:softprob',
         'num_class': 4,
         'silent': 'True',
         'gamma': 5,
         'subsample': 0.15,
         'colsample_bytree': 0.8,
         'eval_metric': "mlogloss",
#          # GPU integration will cut time in ~half:
#          'gpu_id' : 0,
#          'tree_method': 'gpu_hist',
#          'predictor': 'gpu_predictor'
         }

    # Useful for multiple runs (only doing one run in this sample notebook)    
    with mlflow.start_run():
        # Execute ElasticNet
        booster, acc, true_label, preds = predxgboost.train(param, dtrain, dtest, dval, evals=[(dtrain, 'train'), (dval, 'eval')], num_round=200, verbose=False)


        # Evaluate Metrics
        # predicted_qualities = booster.predict(dv)
        # (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out metrics
        #print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  Accuracy: %s" % acc)
        

        # Log parameter, metrics, and model to MLflow
        #mlflow.log_param("alpha", alpha)
        mlflow.log_metric("Accuracy", acc)
        

        mlflow.xgboost.log_model(booster, "model")

In [23]:
full_preprocess(ecm, target='dosage')


Ratio before data balance (5uM:NT:1uM:50nM) = 4577:2759:1289:3868
Ratio after balance (5uM:NT:1uM:50nM) = 1289:1289:1289:1289
Accuracy: 0.47947761194029853
  Accuracy: 0.47947761194029853
