Nance-Lab · nlsschim · Oct 19, 2021 · Oct 29, 2020 · Feb 10, 2021 · Apr 6, 2021
diff --git a/diff_predictor/data_process.py b/diff_predictor/data_process.py
@@ -28,36 +28,42 @@ def generate_fullstats(dataset_path, filelist, targets, target_col_name='Target'
     video_num = 0
     for filename in filelist:
             fstats = pd.read_csv(dataset_path + filename, encoding = "ISO-8859-1", index_col='Unnamed: 0')
-            print('{} size: {}'.format(filename, fstats.shape))
+            #print('{} size: {}'.format(filename, fstats.shape))
 
             for i in range(0, len(targets)):
                 if targets[i] in filename:
+                    print('Adding file {} size: {}'.format(filename, fstats.shape))
                     fstats[target_col_name] = pd.Series(fstats.shape[0]*[targets[i]], index=fstats.index)
-                    break
+                    fstats['Filename'] = pd.Series(fstats.shape[0]*[filename], index=fstats.index)
+                    fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
+                    if fstats_tot is None:
+                        fstats_tot = fstats
+                    else:
+                        fstats_tot = fstats_tot.append(fstats, ignore_index=True)
+                    video_num += 1
+                    #break
 
-            fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
-            if fstats_tot is None:
-                fstats_tot = fstats
-            else:
-                fstats_tot = fstats_tot.append(fstats, ignore_index=True)
-            video_num += 1
 
     return fstats_tot
 
 def balance_data(df, target, **kwargs):
     """
-    Balances the dataset so there are equal number of rows for each class
-    Parameters:
+    Balance spatial data using undersampling. Assumes input will
+    be a dataframe and data will be used for categorical classification
+    Parameters
     ----------
-    df: pandas.DataFrame
-        dataframe to be balanced
-    target: string
-        name of dataframe column that represents that class the row is from
-
-    Returns:
-    --------
-    bal_df: pandas.DataFrame
-        dataframe with equal number of rows per unique class
+    df : pandas.DataFrame
+        pandas dataframe to be balanced
+    target : string
+        the name of the target/tag/y-value column to balance data around
+
+    Optional Parameters
+    -------------------
+    random_state : int : 1
+        seed to base random sampling from
+    Returns
+    -------
+    A fully balanced pandas dataframe
     """
     if 'random_state' not in kwargs:
         random_state = 1
@@ -140,5 +146,4 @@ def split_data(df, target, train_split, test_val_split=1.0, seed=1234):
     y_train = X_train['encoded_target']
     y_test = X_test['encoded_target']
     result = np.append([(X_train, y_train), (X_test, y_test)], result)
-    return result, le
-
+    return result, le
diff --git a/diff_predictor/eval.py b/diff_predictor/eval.py
@@ -1,5 +1,66 @@
 import sys
+import numpy
+import scipy.stats
+from seaborn import heatmap
 
+if 'diff_predictor.core' not in sys.modules:
+    from diff_predictor import core
 
-if 'core' not in sys.modules:
-    import core
+
+def perf_meas(y_actual, y_pred, cls, verbose=True):
+    '''
+    Counts one-vs-rest performance measurements for a single class.
+    Labels equal to cls are positive and all other labels negative, so
+    a mix-up between two other classes is a true negative for cls.
+    Parameters
+    ----------
+    y_actual : list
+        Actual values of y
+    y_pred : list
+        Predicted values of y
+    cls : int
+        class to run performance measure on
+    verbose : boolean : True
+        print the four counts
+    Returns
+    -------
+    tuple of four performance values (TP, FP, TN, FN)
+    '''
+
+    assert len(y_actual) == len(y_pred), 'Must be same number of actual and predicted values'
+
+    TP = FP = TN = FN = 0
+    # zip pairs values by position, so a pandas Series with a
+    # non-default index works the same as a plain list.
+    for actual, pred in zip(y_actual, y_pred):
+        if pred == cls and actual == cls:
+            TP += 1
+        elif pred == cls:
+            FP += 1
+        elif actual == cls:
+            FN += 1
+        else:
+            # neither value is cls, even when actual != pred
+            TN += 1
+    if verbose:
+        print(f'(TP, FP, TN, FN) = {(TP, FP, TN, FN)}')
+    return (TP, FP, TN, FN)
+
+
+def corrmat(df, method='pearson', show_plot=True, **kwargs):
+    '''
+    Correlation matrix of df's columns via df.corr(method=method).
+    method : 'pearson', 'spearman' or 'kendall'; else ValueError is raised.
+    show_plot : if True, return heatmap(corr_mat, ...) instead of corr_mat.
+    **kwargs : heatmap options, overriding the annot=True, fmt="f" defaults.
+    '''
+    if method not in ('pearson', 'spearman', 'kendall'):
+        raise ValueError("Correlation type not available. Select " +
+                         "from pearson, spearman, or kendall corr.")
+    plot_options = {'annot': True, 'fmt': "f"}
+    plot_options.update(kwargs)
+    # Compute only the requested correlation, not all three.
+    corr_mat = df.corr(method=method)
+    if show_plot:
+        return heatmap(corr_mat, **plot_options)
+    return corr_mat
diff --git a/diff_predictor/predxgboost.py b/diff_predictor/predxgboost.py
@@ -3,8 +3,14 @@
 import json
 import numpy as np
 import pandas as pd
-from os import path
-from sklearn.metrics import accuracy_score
+import xgboost as xgb
+import shap
+from matplotlib import colors as plt_colors
+import operator
+
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+from sklearn import preprocessing
 
 import xgboost as xgb
 from xgboost import callback, DMatrix, Booster
@@ -71,8 +77,8 @@ def mknfold(X_train, y_train, nfold, param, evals=(), features=None):
     wt_list : list
         list of weights for each fold. This is the size of each fold
     '''
-    if not features:
-        features = X_train.columns
+    #if not features:
+        #features = X_train.columns
     out_idset, wt_list = bin_fold(X_train, nfold)
     in_idset = [np.concatenate([out_idset[i]
                                 for i in range(nfold) if k != i])
@@ -469,7 +475,7 @@ def _gs_helper(var1n, var2n, best_model, best_param,
     return best_model, best_param, best_eval, best_boost_rounds
 
 
-def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000):
+def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000, verbose=True):
     '''
     Parameters
     ----------
@@ -514,13 +520,13 @@ def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000):
         evals = [(dtrain, 'train')]
     if dval is not None and (dval, 'eval') not in evals:
         evals += [(dval, 'eval')]
-    model = xgb.train(param, dtrain, num_round, evals, )
+    model = xgb.train(param, dtrain, num_round, evals, verbose_eval=verbose)
     true_label = dtest.get_label()
     ypred = model.predict(dtest)
     preds = [np.where(x == np.max(x))[0][0] for x in ypred]
     acc = accuracy_score(true_label, preds)
-    print("Accuracy:", acc)
-    return model, acc
+    print("Accuracy:",acc)
+    return model, acc, true_label, preds
 
 
 def save(model, filename):