From db49b7d0f053de51c22a7859fbe22cbe3e0b8d82 Mon Sep 17 00:00:00 2001
From: GilesStrong <giles.strong@outlook.com>
Date: Wed, 13 Oct 2021 10:56:29 +0200
Subject: [PATCH] Fix bootstrap stats and add median

---
 CHANGES.md                |  2 ++
 lumin/utils/statistics.py | 31 +++++++++++++++++--------------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 687ea61..deb89e4 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -7,6 +7,7 @@
 ## Additions
 
 - `plot_feat` now shows a bar plot for categorical data
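+- `bootstrap_stats` can now compute the median of each bootstrap resample: set `args['median'] = True` and the results are returned under the `{name}_median` key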
 
 ## Removals
 
@@ -19,6 +20,7 @@
     - Fixes to do with the number of batches to expect when running few number of folds than the `FoldYielder` contains.
     - Correctly implements leave-one-out for the training folds
     - renamed `n_folds` to `n_repeats` to more accurately reflect its role.
+- `bootstrap_stats` corrected the computation of the central 68% CI (now reported as the half-width of the interval): was `np.percentile(np.abs(points), 68.2)`, now `(np.percentile(points, 84.135)-np.percentile(points, 15.865))/2`
 
 ## Changes
 
diff --git a/lumin/utils/statistics.py b/lumin/utils/statistics.py
index bc28333..9fb8bac 100644
--- a/lumin/utils/statistics.py
+++ b/lumin/utils/statistics.py
@@ -30,14 +30,15 @@ def bootstrap_stats(args:Dict[str,Any], out_q:Optional[mp.Queue]=None) -> Union[
         Result dictionary if `out_q` is `None` else `None`.
     '''
 
-    out_dict, mean, std, c68, boot = {}, [], [], [], []
+    out_dict, mean, median, std, c68, boot = {}, [], [], [], [], []
     name    = ''   if 'name'    not in args else args['name']
     weights = None if 'weights' not in args else args['weights']
-    if 'n'    not in args: args['n']    = 100
-    if 'kde'  not in args: args['kde']  = False
-    if 'mean' not in args: args['mean'] = False
-    if 'std'  not in args: args['std']  = False  
-    if 'c68'  not in args: args['c68']  = False
+    if 'n'      not in args: args['n']      = 100
+    if 'kde'    not in args: args['kde']    = False
+    if 'mean'   not in args: args['mean']   = False
+    if 'median' not in args: args['median'] = False
+    if 'std'    not in args: args['std']    = False  
+    if 'c68'    not in args: args['c68']    = False
     if args['kde'] and args['data'].dtype != 'float64': data = np.array(args['data'], dtype='float64')
     else:                                               data = args['data']
     len_d = len(data)
@@ -49,14 +50,16 @@ def bootstrap_stats(args:Dict[str,Any], out_q:Optional[mp.Queue]=None) -> Union[
             kde = KDEUnivariate(points)
             kde.fit()
             boot.append([kde.evaluate(x) for x in args['x']])
-        if args['mean']: mean.append(np.mean(points))
-        if args['std']:  std.append(np.std(points, ddof=1))
-        if args['c68']:  c68.append(np.percentile(np.abs(points), 68.2))
-
-    if args['kde']:  out_dict[f'{name}_kde']  = boot
-    if args['mean']: out_dict[f'{name}_mean'] = mean
-    if args['std']:  out_dict[f'{name}_std']  = std
-    if args['c68']:  out_dict[f'{name}_c68']  = c68
+        if args['mean']:   mean.append(np.mean(points))
+        if args['median']: median.append(np.median(points))
+        if args['std']:    std.append(np.std(points, ddof=1))
+        if args['c68']:    c68.append((np.percentile(points, 84.135)-np.percentile(points, 15.865))/2)
+
+    if args['kde']:    out_dict[f'{name}_kde']    = boot
+    if args['mean']:   out_dict[f'{name}_mean']   = mean
+    if args['median']: out_dict[f'{name}_median'] = median
+    if args['std']:    out_dict[f'{name}_std']    = std
+    if args['c68']:    out_dict[f'{name}_c68']    = c68
     if out_q is not None: out_q.put(out_dict)
     else: return out_dict