minor bug fixes and improved mixture model
MatthewReid854 committed Sep 9, 2021
1 parent 74ff19e commit 78c3ca2
Showing 5 changed files with 100 additions and 77 deletions.
4 changes: 3 additions & 1 deletion docs/Changelog.rst
@@ -14,9 +14,9 @@ Changelog

**New features**

- Downsampling for the scatter plot on probability plots. This makes plotting much faster for large datasets (>1000 failures), particularly when using Fit_Everything which has many subplots.
- Added Weibull_Mixture, Weibull_CR, and Weibull_DS to Fit Everything. This also required changes to the plot window sizes.
- Changed the histogram plot from Fit_Everything. Legend is now on the left as it was getting too long with 15 items. New method used to scale the CDF histogram when there is censored data which is more accurate than the previous scaling method.
- Downsampling for the scatter plot on probability plots. This makes plotting much faster for large datasets (>1000 failures), particularly when using Fit_Everything which has many subplots.

**API Changes**

@@ -28,10 +28,12 @@ Changelog
- Fit_Everything had a bug when the best distribution was Weibull_Mixture, Weibull_CR, Weibull_DS due to these distributions not having param_title_long.
- Probability plots with very large datasets (>10000 items) sometimes resulted in the upper ylim being 1. This is equivalent to infinity on a probability plot and caused an error. It is now manually corrected for.
- The least squares method for Fit_Gamma_3P produced a very poor estimate due to a bug. This carried across into the MLE result since LS is used for the MLE initial guess.
- The confidence intervals would sometimes not be displayed on probability plots with very small datasets. This was due to the CI arrays still containing 1's at the extremities. This is now corrected.

**Other**

- Changed the method used by curve_fit within least_squares. Previously was 'dogleg' which was very slow. Changed to 'trf'. This significantly speeds up the location shifted distributions (Weibull_3P, etc.)
- Changed the group splitting algorithm used in Fit_Weibull_Mixture and Fit_Weibull_CR. The new method is more robust and provides a better initial guess of the parameters for MLE.
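
As a minimal sketch of how the ``method`` keyword mentioned in the curve_fit item above is passed to ``scipy.optimize.curve_fit`` (the model function and data below are hypothetical illustrations, not the library's internals):

.. code:: python

    import numpy as np
    from scipy.optimize import curve_fit

    # hypothetical location-shifted Weibull CDF standing in for the fitted model
    def model(x, alpha, beta, gamma):
        return 1 - np.exp(-((np.maximum(x - gamma, 0) / alpha) ** beta))

    rng = np.random.default_rng(0)
    x = np.linspace(1, 100, 50)
    y = model(x, alpha=30, beta=1.5, gamma=5) + rng.normal(0, 0.01, 50)

    # 'trf' (Trust Region Reflective) is the faster method now requested
    popt, pcov = curve_fit(model, x, y, p0=[25, 1, 1], bounds=(0, np.inf), method="trf")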

**Version: 0.6.0 --- Released: 23 July 2021**
'''''''''''''''''''''''''''''''''''''''''''''
2 changes: 1 addition & 1 deletion docs/Mixture models.rst
@@ -176,7 +176,7 @@ In this example, we will compare how well the Weibull Mixture performs vs a sing
plt.show()
'''
Weibull_Mixture BIC: 6431.578404181696
Weibull_Mixture BIC: 6431.578404076784
Weibull_2P BIC: 6511.511759597337
'''
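
Only the tail of the documented example appears in this hunk. As a minimal sketch of how a BIC comparison like the one above is typically produced (the dataset and parameters here are hypothetical, not those used in the documentation):

.. code:: python

    import numpy as np
    from reliability.Distributions import Weibull_Distribution
    from reliability.Fitters import Fit_Weibull_Mixture, Fit_Weibull_2P

    # hypothetical bimodal dataset drawn from two Weibull populations
    group_1 = Weibull_Distribution(alpha=10, beta=3).random_samples(500, seed=2)
    group_2 = Weibull_Distribution(alpha=40, beta=4).random_samples(500, seed=2)
    data = np.hstack([group_1, group_2])

    mixture = Fit_Weibull_Mixture(failures=data, show_probability_plot=False, print_results=False)
    single = Fit_Weibull_2P(failures=data, show_probability_plot=False, print_results=False)

    print("Weibull_Mixture BIC:", mixture.BIC)
    print("Weibull_2P BIC:", single.BIC)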
4 changes: 2 additions & 2 deletions docs/Probability plots.rst
@@ -225,9 +225,9 @@ When matplotlib is asked to plot large datasets (thousands of items), it can bec

Downsampling only affects the scatterplot, not the calculations. It is applied automatically for all probability plots (including when these plots are generated as an output from the Fitters), but can be controlled using the "downsample_scatterplot" keyword.

If "downsample_scatterplot" is True or None, and there are over 1000 points, then the scatterplot will be downsampled by a factor. The default downsample factor will seek to produce between 500 and 1000 points. If a number is specified, it will be used as the downsample factor. FOr example, if 2 is specified then every 2nd point will be displayed, whereas if 3 is specified then every 3rd point will be displayed.
If "downsample_scatterplot" is True or None, and there are over 1000 points, then the scatterplot will be downsampled by a factor. The default downsample factor will seek to produce between 500 and 1000 points. If a number is specified, it will be used as the downsample factor. For example, if 2 is specified then every 2nd point will be displayed, whereas if 3 is specified then every 3rd point will be displayed.

The min and max points will always be retained in the downsampled scatterplot which helps to preserve the plotting range.
The min and max points will always be displayed in the downsampled scatterplot which preserves the plotting range.

See the `API <https://reliability.readthedocs.io/en/latest/API%20reference.html>`_ documentation for more detail on the default in each function.
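
As a minimal sketch of the "downsample_scatterplot" keyword described above (the dataset is hypothetical and the defaults are assumed to behave as documented):

.. code:: python

    import matplotlib.pyplot as plt
    from reliability.Distributions import Weibull_Distribution
    from reliability.Fitters import Fit_Weibull_2P

    # hypothetical large dataset (>1000 failures) where downsampling speeds up plotting
    data = Weibull_Distribution(alpha=50, beta=2).random_samples(5000, seed=1)

    # every 10th point is drawn in the scatterplot; the fit still uses all 5000 points
    Fit_Weibull_2P(failures=data, downsample_scatterplot=10)
    plt.show()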

158 changes: 86 additions & 72 deletions reliability/Fitters.py
@@ -3428,51 +3428,58 @@ def __init__(
failures=failures, right_censored=right_censored
) # this is only used to find AD

# find the division line. This is to assign data to each group
h = np.histogram(failures, bins=50, density=True)
hist_counts = h[0]
hist_bins = h[1]
midbins = []
for i in range(len(hist_bins)):
if i > 0 and i < len(hist_bins):
midbins.append((hist_bins[i] + hist_bins[i - 1]) / 2)
peaks_x = []
peaks_y = []
batch_width = 8
for i, x in enumerate(hist_counts):
if i < batch_width:
batch = hist_counts[0 : i + batch_width]
elif i > batch_width and i > len(hist_counts - batch_width):
batch = hist_counts[i - batch_width : len(hist_counts)]
else:
batch = hist_counts[
i - batch_width : i + batch_width
] # the histogram counts are batched (actual batch size = 2 x batch_width)
if (
max(batch) == x
): # if the current point is higher than the rest of the batch then it is counted as a peak
peaks_x.append(midbins[i])
peaks_y.append(x)
if (
len(peaks_x) > 2
): # if there are more than 2 peaks, the mean is moved based on the height of the peaks. Higher peaks will attract the mean towards them more than smaller peaks.
yfracs = np.array(peaks_y) / sum(peaks_y)
division_line = sum(peaks_x * yfracs)
# this algorithm is used to estimate the dividing line between the two groups
# firstly it fits a gaussian kde to the histogram
# then it draws two straight lines from the highest peak of the kde down to the lower and upper bounds of the failures
# the dividing line is the point where the difference between the kde and the straight lines is greatest
max_failures = max(failures)
min_failures = min(failures)
gkde = ss.gaussian_kde(failures)
delta = max_failures - min_failures
x_kde = np.linspace(min_failures - delta / 5, max_failures + delta / 5, 100)
y_kde = gkde.evaluate(x_kde)
peak_y = max(y_kde)
peak_x = x_kde[np.where(y_kde == peak_y)][0]

left_x = min_failures
left_y = gkde.evaluate(left_x)
left_m = (peak_y - left_y) / (peak_x - left_x)
left_c = -left_m * left_x + left_y
left_line_x = np.linspace(left_x, peak_x, 100)
left_line_y = left_m * left_line_x + left_c # y=mx+c
left_kde = gkde.evaluate(left_line_x)
left_diff = abs(left_line_y - left_kde)
left_diff_max = max(left_diff)
left_div_line = left_line_x[np.where(left_diff == left_diff_max)][0]

right_x = max_failures
right_y = gkde.evaluate(right_x)
right_m = (right_y - peak_y) / (right_x - peak_x)
right_c = -right_m * right_x + right_y
right_line_x = np.linspace(peak_x, right_x, 100)
right_line_y = right_m * right_line_x + right_c # y=mx+c
right_kde = gkde.evaluate(right_line_x)
right_diff = abs(right_line_y - right_kde)
right_diff_max = max(right_diff)
right_div_line = right_line_x[np.where(right_diff == right_diff_max)][0]

if left_diff_max > right_diff_max:
dividing_line = left_div_line
else:
division_line = np.average(peaks_x)
self.division_line = division_line
dividing_line = right_div_line

# this is the point at which data is assigned to one group or another for the purpose of generating the initial guess
GROUP_1_failures = []
GROUP_2_failures = []
GROUP_1_right_cens = []
GROUP_2_right_cens = []
for item in failures:
if item < division_line:
if item < dividing_line:
GROUP_1_failures.append(item)
else:
GROUP_2_failures.append(item)
for item in right_censored:
if item < division_line:
if item < dividing_line:
GROUP_1_right_cens.append(item)
else:
GROUP_2_right_cens.append(item)
@@ -3492,9 +3499,8 @@ def __init__(
print_results=False,
optimizer=optimizer,
)
p_guess = (
len(GROUP_1_failures) + len(GROUP_1_right_cens)
) / n # proportion guess
# proportion guess
p_guess = (len(GROUP_1_failures) + len(GROUP_1_right_cens)) / n
guess = [
group_1_estimates.alpha,
group_1_estimates.beta,
@@ -3906,51 +3912,59 @@ def __init__(
_, y = plotting_positions(
failures=failures, right_censored=right_censored
) # this is only used to find AD
# find the division line. This is to assign data to each group
h = np.histogram(failures, bins=50, density=True)
hist_counts = h[0]
hist_bins = h[1]
midbins = []
for i in range(len(hist_bins)):
if i > 0 and i < len(hist_bins):
midbins.append((hist_bins[i] + hist_bins[i - 1]) / 2)
peaks_x = []
peaks_y = []
batch_width = 8
for i, x in enumerate(hist_counts):
if i < batch_width:
batch = hist_counts[0 : i + batch_width]
elif i > batch_width and i > len(hist_counts - batch_width):
batch = hist_counts[i - batch_width : len(hist_counts)]
else:
batch = hist_counts[
i - batch_width : i + batch_width
] # the histogram counts are batched (actual batch size = 2 x batch_width)
if (
max(batch) == x
): # if the current point is higher than the rest of the batch then it is counted as a peak
peaks_x.append(midbins[i])
peaks_y.append(x)
if (
len(peaks_x) > 2
): # if there are more than 2 peaks, the mean is moved based on the height of the peaks. Higher peaks will attract the mean towards them more than smaller peaks.
yfracs = np.array(peaks_y) / sum(peaks_y)
division_line = sum(peaks_x * yfracs)

# this algorithm is used to estimate the dividing line between the two groups
# firstly it fits a gaussian kde to the histogram
# then it draws two straight lines from the highest peak of the kde down to the lower and upper bounds of the failures
# the dividing line is the point where the difference between the kde and the straight lines is greatest
max_failures = max(failures)
min_failures = min(failures)
gkde = ss.gaussian_kde(failures)
delta = max_failures - min_failures
x_kde = np.linspace(min_failures - delta / 5, max_failures + delta / 5, 100)
y_kde = gkde.evaluate(x_kde)
peak_y = max(y_kde)
peak_x = x_kde[np.where(y_kde == peak_y)][0]

left_x = min_failures
left_y = gkde.evaluate(left_x)
left_m = (peak_y - left_y) / (peak_x - left_x)
left_c = -left_m * left_x + left_y
left_line_x = np.linspace(left_x, peak_x, 100)
left_line_y = left_m * left_line_x + left_c # y=mx+c
left_kde = gkde.evaluate(left_line_x)
left_diff = abs(left_line_y - left_kde)
left_diff_max = max(left_diff)
left_div_line = left_line_x[np.where(left_diff == left_diff_max)][0]

right_x = max_failures
right_y = gkde.evaluate(right_x)
right_m = (right_y - peak_y) / (right_x - peak_x)
right_c = -right_m * right_x + right_y
right_line_x = np.linspace(peak_x, right_x, 100)
right_line_y = right_m * right_line_x + right_c # y=mx+c
right_kde = gkde.evaluate(right_line_x)
right_diff = abs(right_line_y - right_kde)
right_diff_max = max(right_diff)
right_div_line = right_line_x[np.where(right_diff == right_diff_max)][0]

if left_diff_max > right_diff_max:
dividing_line = left_div_line
else:
division_line = np.average(peaks_x)
self.division_line = division_line
dividing_line = right_div_line

# this is the point at which data is assigned to one group or another for the purpose of generating the initial guess
GROUP_1_failures = []
GROUP_2_failures = []
GROUP_1_right_cens = []
GROUP_2_right_cens = []
for item in failures:
if item < division_line:
if item < dividing_line:
GROUP_1_failures.append(item)
else:
GROUP_2_failures.append(item)
for item in right_censored:
if item < division_line:
if item < dividing_line:
GROUP_1_right_cens.append(item)
else:
GROUP_2_right_cens.append(item)
@@ -4023,7 +4037,7 @@ def __init__(
str(
"WARNING: The hessian matrix obtained using the "
+ self.optimizer
+ " optimizer is non-invertable for the Weibull competing risks model.\n"
+ " optimizer is non-invertable for the Weibull Competing Risks model.\n"
"Confidence interval estimates of the parameters could not be obtained.\n"
"You may want to try fitting the model using a different optimizer."
),
9 changes: 8 additions & 1 deletion reliability/Utils.py
@@ -1941,7 +1941,7 @@ def clean_CI_arrays(xlower, xupper, ylower, yupper, plot_type="CDF"):
ylower_out3 = np.append(ylower_out3, ylower_out2[i])
yupper_out3 = np.append(yupper_out3, yupper_out2[i])

# final error check for lengths matching and there still being at least 2 elements remaning
# final error check for lengths matching and there still being at least 2 elements remaining
if (
len(xlower_out3) != len(xupper_out3)
or len(xlower_out3) != len(yupper_out3)
@@ -1964,6 +1964,12 @@ def fill_no_autoscale(xlower, xupper, ylower, yupper, **kwargs):
# generate the polygon
xstack = np.hstack([xlower, xupper[::-1]])
ystack = np.hstack([ylower, yupper[::-1]])

# this corrects illegal yvalues in the probability plot
if plt.gca().get_yscale() == "function":
ystack[np.where(ystack >= 1)] = 0.9999999
ystack[np.where(ystack <= 0)] = 0.0000001

polygon = np.column_stack([xstack, ystack])
# this is equivalent to fill as it makes a polygon
col = PolyCollection([polygon], **kwargs)
Expand Down Expand Up @@ -2133,6 +2139,7 @@ def exponential_CI(
) # still need to specify color otherwise the invisible CI lines will consume default colors
# plt.scatter(t + self.gamma, yy_lower,color='blue',marker='.')
# plt.scatter(t + self.gamma, yy_upper, color='red', marker='.')

elif plot_CI is None and q is not None:
return t_lower, t_upper

