Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
2fded7f
Merge https://github.com/nlsschim/diff_predictor into Nels
nlsschim Oct 29, 2020
ed258e8
updated .gitignore
nlsschim Feb 10, 2021
2f87f3d
adding functions for xgboost pipeline and generating predictions on e…
nlsschim Apr 6, 2021
6660166
added testing file and data
nlsschim Apr 6, 2021
8fb5c22
updated .gitignore
nlsschim Apr 6, 2021
991b63c
a whole bunch of small notebook updates
nlsschim Apr 6, 2021
165e393
added function for scaling features to data_process file.
nlsschim Apr 12, 2021
709177e
removed unneeded data files and made minor notebook changes
nlsschim Apr 27, 2021
6d7033f
notebooks and scripts for determining how many trajectories are requi…
nlsschim May 18, 2021
180a854
added gitignore for branch
nlsschim May 18, 2021
8a0d3a6
updating .gitignore
nlsschim May 18, 2021
09d4069
updated gitignore...again
nlsschim May 18, 2021
3570d9b
minor updates to the train function in predxgboost and to functions i…
nlsschim May 21, 2021
949c637
updates to various notebooks, nothing major
nlsschim Jun 16, 2021
fd6037c
Merge branch 'Nels' into nels_azure
nlsschim Jun 16, 2021
86957da
Merge pull request #2 from Nance-Lab/nels_azure
nlsschim Jun 16, 2021
f19dc0b
new notebooks added
nlsschim Jul 8, 2021
9e02da1
notebook for generating train_val_test_csvs
nlsschim Jul 8, 2021
5636f30
fixing merge conflicts
nlsschim Jul 8, 2021
21a2b04
Merge branch 'Nels' of https://github.com/Nance-Lab/diff_predictor in…
nlsschim Jul 8, 2021
0e584b3
updated .gitignore
nlsschim Jul 8, 2021
0ed170f
removed diff_classifier for being an internal notebook within diff_pr…
nlsschim Jul 8, 2021
22ebbd7
merge conflicts
nlsschim Jul 20, 2021
7113c05
testing functions for most important old functions; everything passing
nlsschim Jul 21, 2021
9b08b98
updates to tests, and some notebooks
nlsschim Aug 2, 2021
d3a689e
new notebook for tsne and pca visualizations, and new file for utilit…
nlsschim Aug 11, 2021
9f4d80e
working on trajectory visualizations
nlsschim Aug 17, 2021
e637aec
working on plots for tdiffusion mode of analysis
nlsschim Aug 27, 2021
dac6bac
adding robust_pca functionality
nlsschim Aug 30, 2021
76a319b
updates to functions for plotting MSD comparison figures
nlsschim Aug 31, 2021
9cae1e7
notebook updates, nothing major
nlsschim Sep 14, 2021
704ead5
running notebooks to make figures for paper
nlsschim Sep 28, 2021
6655f3c
updated diff_predictor dependencies
nlsschim Sep 28, 2021
97abb7c
added notebook for y-scrambling technique to establish baseline for r…
nlsschim Sep 29, 2021
e78d3b5
added notebook for performing y-scrambling
nlsschim Sep 30, 2021
1a74635
running notebooks to generate figures for paper
nlsschim Oct 1, 2021
a20edf0
final visualizations for alpha fitting error plots
nlsschim Oct 7, 2021
387c027
running notebooks for final figures
nlsschim Oct 19, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 26 additions & 21 deletions diff_predictor/data_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,36 +28,42 @@ def generate_fullstats(dataset_path, filelist, targets, target_col_name='Target'
video_num = 0
for filename in filelist:
fstats = pd.read_csv(dataset_path + filename, encoding = "ISO-8859-1", index_col='Unnamed: 0')
print('{} size: {}'.format(filename, fstats.shape))
#print('{} size: {}'.format(filename, fstats.shape))

for i in range(0, len(targets)):
if targets[i] in filename:
print('Adding file {} size: {}'.format(filename, fstats.shape))
fstats[target_col_name] = pd.Series(fstats.shape[0]*[targets[i]], index=fstats.index)
break
fstats['Filename'] = pd.Series(fstats.shape[0]*[filename], index=fstats.index)
fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
if fstats_tot is None:
fstats_tot = fstats
else:
fstats_tot = fstats_tot.append(fstats, ignore_index=True)
video_num += 1
#break

fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
if fstats_tot is None:
fstats_tot = fstats
else:
fstats_tot = fstats_tot.append(fstats, ignore_index=True)
video_num += 1

return fstats_tot

def balance_data(df, target, **kwargs):
"""
Balances the dataset so there are equal number of rows for each class
Parameters:
Balance spatial data using undersampling. Assumes input will
be a dataframe and data will be used for categorical classification
Parameters
----------
df: pandas.DataFrame
dataframe to be balanced
target: string
name of dataframe column that represents that class the row is from

Returns:
--------
bal_df: pandas.DataFrame
dataframe with equal number of rows per unique class
df : pandas.DataFrame
pandas dataframe to be balanced
target : string
the name of the target/tag/y-value column to balance data around

Optional Parameters
-------------------
random_state : int : 1
seed to base random sampling from
Returns
-------
A fully balanced pandas dataframe
"""
if 'random_state' not in kwargs:
random_state = 1
Expand Down Expand Up @@ -140,5 +146,4 @@ def split_data(df, target, train_split, test_val_split=1.0, seed=1234):
y_train = X_train['encoded_target']
y_test = X_test['encoded_target']
result = np.append([(X_train, y_train), (X_test, y_test)], result)
return result, le

return result, le
65 changes: 63 additions & 2 deletions diff_predictor/eval.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,66 @@
import sys
import numpy
import scipy.stats
from seaborn import heatmap

if 'diff_predictor.core' not in sys.modules:
from diff_predictor import core

if 'core' not in sys.modules:
import core

def perf_meas(y_actual, y_pred, cls, verbose=True):
    '''
    Count one-vs-rest confusion-matrix entries for a single class.

    Every sample is placed in exactly one of four bins, judged only by
    whether its actual and predicted labels equal ``cls``:

    * TP : actual is ``cls`` and predicted is ``cls``
    * FP : actual is not ``cls`` but predicted is ``cls``
    * FN : actual is ``cls`` but predicted is not ``cls``
    * TN : neither actual nor predicted is ``cls`` (this includes samples
      where another class was confused with a third class, since that
      error does not involve ``cls``)

    Parameters
    ----------
    y_actual : list
        Actual values of y
    y_pred : list
        Predicted values of y
    cls : int
        class to run performance measure on
    verbose : boolean : True
        print the four counts to stdout

    Returns
    -------
    tuple of four performance values (TP, FP, TN, FN)

    Raises
    ------
    AssertionError
        if y_actual and y_pred differ in length
    '''

    assert len(y_actual) == len(y_pred), 'Must be same number of actual and predicted values'

    TP = 0
    FP = 0
    TN = 0
    FN = 0
    # zip walks both sequences positionally, so inputs do not need to
    # support integer indexing starting at 0.
    for actual, pred in zip(y_actual, y_pred):
        actual_is_cls = actual == cls
        pred_is_cls = pred == cls
        if pred_is_cls and actual_is_cls:
            TP += 1
        elif pred_is_cls:
            FP += 1
        elif actual_is_cls:
            FN += 1
        else:
            TN += 1
    if verbose:
        print(f'(TP, FP, TN, FN) = {(TP, FP, TN, FN)}')
    return (TP, FP, TN, FN)


def corrmat(df, method='pearson', show_plot=True, **kwargs):
    '''
    Compute the pairwise correlation matrix of a dataframe's columns and
    optionally draw it as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe whose columns are correlated via ``df.corr``
    method : string : 'pearson'
        correlation type; one of 'pearson', 'spearman' or 'kendall'
    show_plot : boolean : True
        if True, pass the correlation matrix to ``heatmap`` and return
        whatever ``heatmap`` returns instead of the matrix

    Optional Parameters
    -------------------
    **kwargs
        extra keyword arguments forwarded to ``heatmap``; they override
        the defaults ``annot=True`` and ``fmt="f"``

    Returns
    -------
    The result of ``heatmap`` when ``show_plot`` is True, otherwise the
    correlation matrix returned by ``df.corr``

    Raises
    ------
    ValueError
        if ``method`` is not one of the supported correlation types
    '''
    valid_methods = ('pearson', 'spearman', 'kendall')
    if method not in valid_methods:
        # Fail loudly here rather than passing a placeholder object on to
        # the plotting code or back to the caller.
        raise ValueError("Correlation type not available. Select "
                         "from pearson, spearman, or kendall corr.")

    # Only the requested correlation is computed.
    corr_mat = df.corr(method=method)
    if show_plot:
        plot_options = {'annot': True,
                        'fmt': "f",
                        }
        plot_options.update(kwargs)
        return heatmap(corr_mat, **plot_options)
    return corr_mat
22 changes: 14 additions & 8 deletions diff_predictor/predxgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,14 @@
import json
import numpy as np
import pandas as pd
from os import path
from sklearn.metrics import accuracy_score
import xgboost as xgb
import shap
from matplotlib import colors as plt_colors
import operator

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing

import xgboost as xgb
from xgboost import callback, DMatrix, Booster
Expand Down Expand Up @@ -71,8 +77,8 @@ def mknfold(X_train, y_train, nfold, param, evals=(), features=None):
wt_list : list
list of weights for each fold. This is the size of each fold
'''
if not features:
features = X_train.columns
#if not features:
#features = X_train.columns
out_idset, wt_list = bin_fold(X_train, nfold)
in_idset = [np.concatenate([out_idset[i]
for i in range(nfold) if k != i])
Expand Down Expand Up @@ -469,7 +475,7 @@ def _gs_helper(var1n, var2n, best_model, best_param,
return best_model, best_param, best_eval, best_boost_rounds


def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000):
def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000, verbose=True):
'''
Parameters
----------
Expand Down Expand Up @@ -514,13 +520,13 @@ def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000):
evals = [(dtrain, 'train')]
if dval is not None and (dval, 'eval') not in evals:
evals += [(dval, 'eval')]
model = xgb.train(param, dtrain, num_round, evals, )
model = xgb.train(param, dtrain, num_round, evals, verbose_eval=verbose)
true_label = dtest.get_label()
ypred = model.predict(dtest)
preds = [np.where(x == np.max(x))[0][0] for x in ypred]
acc = accuracy_score(true_label, preds)
print("Accuracy:", acc)
return model, acc
print("Accuracy:",acc)
return model, acc, true_label, preds


def save(model, filename):
Expand Down
Loading