Develop fol #83

Closed
wants to merge 14 commits into from
34 changes: 34 additions & 0 deletions examples/demo_minimum_separation.py
@@ -0,0 +1,34 @@
from datetime import datetime
import ampycloud
from ampycloud.utils import mocker
from ampycloud.plots import diagnostic

# Generate the demo dataset for ampycloud to test the minimum separation parameter.
# Your data should have *exactly* this structure

## EXAMPLE 1
n_ceilos = 4
lookback_time = 1200
rate = 30
mock_data = mocker.mock_layers(n_ceilos, lookback_time, rate,
[{'alt': 500, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20},
{'alt': 700, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20},
{'alt': 1400, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20},
{'alt': 1600, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20}])

# Run the ampycloud algorithm on it, using the default parameters
chunk = ampycloud.run(mock_data, geoloc='Mock data', ref_dt=datetime.now())

# Get the resulting METAR message
print(chunk.metar_msg())

# Display the full information available for the layers found
print(chunk.layers)

# And for the most motivated, plot the diagnostic diagram
diagnostic(chunk, upto='layers', show=True, save_stem='ampycloud_demo_minsep', save_fmts=['png'])
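The demo above runs with the default parameters. To actually exercise the new minimum-separation thresholds, they presumably need to be passed via the prms argument of ampycloud.run(). A minimal sketch continuing the script above: the 'min_sep_kwargs' structure with its 'steps'/'separ' keys mirrors the one used in src/ampycloud/data.py in this PR, but the exact nesting and defaults are assumptions, not verified against the shipped parameter file.

# Hypothetical follow-up: re-run the same mock data with custom minimum separations.
# The parameter names below mirror the 'min_sep_kwargs' structure used in data.py (assumed).
custom_prms = {'LAYERING_PRMS': {'min_sep_kwargs': {'steps': [2000, 5000],
                                                    'separ': [150, 500, 1000]}}}
chunk_custom = ampycloud.run(mock_data, prms=custom_prms,
                             geoloc='Mock data', ref_dt=datetime.now())
print(chunk_custom.metar_msg())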
4 changes: 4 additions & 0 deletions src/ampycloud/core.py
@@ -219,6 +219,10 @@ def run(data: pd.DataFrame, prms: dict = None, geoloc: str = None,
# Or to adjust some other algorithm parameter:
prms = {'GROUPING_PRMS':{'dt_scale_kwargs':{'scale': 300}}}

# Beware not to overwrite parameters by repeating a dict key! Do NOT write:
prms = {'GROUPING_PRMS':{'dt_scale_kwargs':{'scale': 300}}, 'GROUPING_PRMS':{'algo': 'agglomerative'}}
# but rather:
prms = {'GROUPING_PRMS':{'dt_scale_kwargs':{'scale': 300}, 'algo': 'agglomerative'}}

The :py:class:`.data.CeiloChunk` instance returned by this function contains all the information
associated with the ampycloud algorithm, including the raw data and slicing/grouping/layering
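For context on the "Beware not to overwrite parameters" note above: a Python dict literal silently keeps only the last value assigned to a repeated key, so the first 'GROUPING_PRMS' entry would be discarded. A quick standalone check (plain Python, independent of ampycloud):

# Duplicate keys in a dict literal: only the last assignment survives.
prms_bad = {'GROUPING_PRMS': {'dt_scale_kwargs': {'scale': 300}},
            'GROUPING_PRMS': {'algo': 'agglomerative'}}
print(prms_bad)   # {'GROUPING_PRMS': {'algo': 'agglomerative'}} -- the 'scale' setting is lost

prms_good = {'GROUPING_PRMS': {'dt_scale_kwargs': {'scale': 300}, 'algo': 'agglomerative'}}
print(prms_good)  # both settings are kept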
161 changes: 140 additions & 21 deletions src/ampycloud/data.py
@@ -15,9 +15,10 @@
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
import warnings

# Import from this package
from .errors import AmpycloudError
from .errors import AmpycloudError, AmpycloudWarning
from .logger import log_func_call
from . import scaler, cluster, layer
from . import wmo, icao
@@ -258,18 +259,87 @@ def max_hits_per_layer(self) -> int:
return int(np.sum(out))

@log_func_call(logger)
def metarize(self, which: int = 'slices', base_frac: float = 0.1,
def merge_sligrolay(self, which: str = 'groups', min_sep_steps: list = [2000, 5000],
min_sep_vals: list = [150, 500, 1000]) -> None:
""" Merges slices/groups/layers whose base altitudes are closer than the minimum separation.

Args:
which (str, optional): whether to process 'slices', 'groups', or 'layers'.
Defaults to 'groups'.
min_sep_steps (list, optional): base-altitude bin edges, in ft, delimiting the
minimum-separation regimes. Defaults to [2000, 5000].
min_sep_vals (list, optional): minimum base-altitude separation, in ft, for each bin;
must contain exactly one more element than min_sep_steps. Defaults to [150, 500, 1000].
"""

# Get ids of each point within that sli/gro/lay
point_ids = copy.copy(self.data.loc[:, f'{which[:-1]}_id'])

# Get ids of each sli/gro/lay
if which == "slices":
comp_ids = np.argsort(self._slices['original_id'])
sligroly_heights = np.sort(self._slices['alt_base'])
elif which == "groups":
comp_ids = np.argsort(self._groups['original_id'])
sligroly_heights = np.sort(self._groups['alt_base'])
elif which == "layers":
comp_ids = np.argsort(self._layers['original_id'])
sligroly_heights = np.sort(self._layers['alt_base'])

best_ncomp = len(sligroly_heights)
logger.debug('%s base heights: %s', which, sligroly_heights)
if best_ncomp > 1:
for (ind, delta) in enumerate(np.diff(sligroly_heights)):
# Get minimum separation based on group base alt
if which == "slices":
idx_sep = utils.index_between(min_sep_steps, self.slices.at[ind, 'alt_base'])
elif which == "groups":
idx_sep = utils.index_between(min_sep_steps, self.groups.at[ind, 'alt_base'])
elif which == "layers":
idx_sep = utils.index_between(min_sep_steps, self.layers.at[ind, 'alt_base'])
min_sep = min_sep_vals[idx_sep]

# Estimate the resolution of the data (by measuring the minimum separation between two data
# points).
res_orig = np.diff(np.sort(sligroly_heights.reshape(len(sligroly_heights))))
res_orig = np.min(res_orig[res_orig > 0])
logger.debug('res_orig: %.2f', res_orig)
# Is min_sep sufficiently large, given the data resolution ? If not, we may end up with some
# over-layering.
if min_sep < 5*res_orig:
warnings.warn(f'Huh ! min_sep={min_sep} is smaller than 5*res_orig={5*res_orig}. ' +
'This could lead to an over-layering for thin groups !',
AmpycloudWarning)

# If the delta is large enough, move on ...
if delta >= min_sep:
continue

# Else, I have two components that are "too close" to each other. Let's merge them by
# re-assigning the ids accordingly.
point_ids.loc[point_ids == comp_ids[ind+1]] = comp_ids[ind]
comp_ids[ind+1] = comp_ids[ind]

# Decrease the number of valid ids
best_ncomp -= 1

self.data.loc[:, f'{which[:-1]}_id'] = point_ids

@log_func_call(logger)
def metarize(self, which: str = 'slices', base_perc: Union[int, float] = 10,
lookback_perc: Union[int, float] = 100,
lim0: Union[int, float] = 0, lim8: Union[int, float] = 100) -> None:
""" Assembles a :py:class:`pandas.DataFrame` of slice/group/layer METAR properties of
interest.

Args:
which (str, optional): whether to process 'slices', 'groups', or 'layers'.
Defaults to 'slices'.
base_frac (float, optional): amount of the lowest slice/group/layer elements,
expressed as a fraction (0<=base_frac<=1) of the total hit count, to consider when
deriving the slice/group/layer altitude (as a median). Defaults to 0.1, i.e. the
cloud base is the median of the lowest 10% of cloud hits of that slice/group/layer.
base_perc (int|float, optional): percentile of the cloud hit heights (sorted from lowest
to highest) used to derive the slice/group/layer base altitude. Defaults to 10, i.e. the
cloud base is the 10th percentile of the cloud heights of that slice/group/layer.
lookback_perc (int|float, optional): percentile of the cloud hit times used to compute the
cloud height of a given layer. This parameter serves to increase the temporal
representativeness of the cloud height for layers whose height increases during the
reference period.
Defaults to 100, i.e. all cloud hits are considered to compute the height;
50 means that only the 50% most recent hits are used.
lim0 (int|float, optional): upper limit of the sky coverage percentage for the 0 okta
bin, in %. Defaults to 0.
lim8 (int|float, optional): lower limit of the sky coverage percentage for the 8 okta
@@ -384,15 +454,15 @@ def metarize(self, which: int = 'slices', base_frac: float = 0.1,
# Compute the corresponding okta level
pdf.iloc[ind]['okta'] = int(wmo.perc2okta(pdf.iloc[ind]['perc'], lim0=lim0, lim8=lim8))

# What does XX% of the total hits represent, in terms of absolute hit counts ?
n_xxp = int(np.ceil(pdf.iloc[ind]['n_hits'] * base_frac))
# How many hits correspond to the most recent lookback_perc % of that sli/gro/lay ?
n_xxp = int(np.ceil(pdf.iloc[ind]['n_hits'] * lookback_perc/100))

if n_xxp > 0:
# Measure the median altitude of the XX% smallest hits in the cluster,
# and use this as the layer base.
# Compute the cloud base as the base_perc percentile of the most recent hits of the sli/gro/lay
pdf.iloc[ind]['alt_base'] = \
np.median(self.data.loc[in_sligrolay, 'alt'].nsmallest(n_xxp))
# Measure the mean altitude and associated std of the layer
np.percentile(self.data.loc[in_sligrolay].nlargest(n_xxp, 'dt')['alt'], base_perc)

# Measure the mean altitude and associated std of the (full) layer
pdf.iloc[ind]['alt_mean'] = np.nanmean(self.data.loc[in_sligrolay, 'alt'])
pdf.iloc[ind]['alt_std'] = np.nanstd(self.data.loc[in_sligrolay, 'alt'])
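To make the new cloud-base computation concrete, here is a standalone sketch with made-up numbers, assuming the ampycloud data format with 'dt' (time offset) and 'alt' columns: keep the most recent lookback_perc % of the hits (largest 'dt'), then take the base_perc percentile of their altitudes.

import numpy as np
import pandas as pd

# Standalone illustration (made-up values) of the base computation used above.
hits = pd.DataFrame({'dt': [-1200, -900, -600, -300, 0],
                     'alt': [480, 500, 520, 700, 720]})
lookback_perc, base_perc = 50, 10
n_xxp = int(np.ceil(len(hits) * lookback_perc / 100))   # 3 most recent hits
alt_base = np.percentile(hits.nlargest(n_xxp, 'dt')['alt'], base_perc)
print(n_xxp, alt_base)  # only the 3 most recent hits (alt 520, 700, 720 ft) enter the percentile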

@@ -469,7 +539,22 @@ def find_slices(self) -> None:
pass

# Finally, let's metarize these slices !
self.metarize(which='slices', lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])
self.metarize(which='slices', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

# Merge slices that are too close together, and re-metarize
if self.prms['SLICING_PRMS']['min_sep_kwargs']['steps'] is not None and \
self.prms['SLICING_PRMS']['min_sep_kwargs']['separ'] is not None:
# Merge too close slices
self.merge_sligrolay(which='slices',
min_sep_steps=self.prms['SLICING_PRMS']['min_sep_kwargs']['steps'],
min_sep_vals=self.prms['SLICING_PRMS']['min_sep_kwargs']['separ'])

# Re-metarize
self.metarize(which='slices', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

@log_func_call(logger)
def find_groups(self) -> None:
@@ -576,7 +661,23 @@ def find_groups(self) -> None:
self.data.loc[to_fill, 'group_id'] = self.data.loc[to_fill, 'slice_id']

# Finally, let's metarize these !
self.metarize(which='groups', lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])
self.metarize(which='groups', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

# Merge groups that are too close together, and re-metarize
if self.prms['GROUPING_PRMS']['min_sep_kwargs']['steps'] is not None and \
self.prms['GROUPING_PRMS']['min_sep_kwargs']['separ'] is not None:
# Merge too close groups
self.merge_sligrolay(which='groups',
min_sep_steps=self.prms['GROUPING_PRMS']['min_sep_kwargs']['steps'],
min_sep_vals=self.prms['GROUPING_PRMS']['min_sep_kwargs']['separ'])

# Re-metarize
self.metarize(which='groups', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])


@log_func_call(logger)
def find_layers(self) -> None:
@@ -626,11 +727,14 @@ def find_layers(self) -> None:
# Reshape the array in anticipation of the GMM routine ...
gro_alts = gro_alts.reshape(-1, 1)

# Let's also get the overall group base alt
if self.groups.at[ind, 'alt_base'] > 10000:
min_sep = self.prms['LAYERING_PRMS']['min_sep_high']
else:
min_sep = self.prms['LAYERING_PRMS']['min_sep_low']
# Get minimum separation based on group base alt
min_sep_steps = self.prms['LAYERING_PRMS']['min_sep_kwargs']['steps']
min_sep_vals = self.prms['LAYERING_PRMS']['min_sep_kwargs']['separ']
if len(min_sep_steps) != len(min_sep_vals)-1:
raise AmpycloudError(f'Ouch ! len(min_sep_steps)={len(min_sep_steps)} should be equal ' +
f'to len(min_sep_vals)-1={len(min_sep_vals)-1}.')

idx_sep = utils.index_between(min_sep_steps, self.groups.at[ind, 'alt_base'])
min_sep = min_sep_vals[idx_sep]
logger.info('Group base alt: %.1f', self.groups.at[ind, 'alt_base'])
logger.info('min_sep value: %.1f', min_sep)

@@ -641,7 +745,7 @@

# And feed them to a Gaussian Mixture Model to figure out how many components it has ...
ncomp, sub_layers_id, _ = layer.ncomp_from_gmm(
gro_alts, ncomp_max=ncomp_max, min_sep=min_sep,
gro_alts, ncomp_max=ncomp_max, min_sep=0,
**self.prms['LAYERING_PRMS']['gmm_kwargs'])

# Add this info to the log
@@ -662,7 +766,22 @@
self.data.loc[to_fill, 'layer_id'] = self.data.loc[to_fill, 'group_id']

# Finally, let's metarize these !
self.metarize(which='layers', lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])
self.metarize(which='layers', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

# Merge layers that are too close together, and re-metarize
if self.prms['LAYERING_PRMS']['min_sep_kwargs']['steps'] is not None and \
self.prms['LAYERING_PRMS']['min_sep_kwargs']['separ'] is not None:
# Merge too close layers
self.merge_sligrolay(which='layers',
min_sep_steps=self.prms['LAYERING_PRMS']['min_sep_kwargs']['steps'],
min_sep_vals=self.prms['LAYERING_PRMS']['min_sep_kwargs']['separ'])

# Re-metarize
self.metarize(which='layers', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

@property
def n_slices(self) -> Union[None, int]:
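Both merge_sligrolay() and find_layers() rely on utils.index_between() to map a base altitude onto a separation bin; that helper is not part of this diff. The sketch below uses np.searchsorted as a hypothetical stand-in to illustrate the assumed 'steps'/'separ' semantics and why len(separ) must equal len(steps) + 1.

import numpy as np

def index_between_sketch(steps, value):
    # Ascending bin edges: below steps[0] -> 0, between steps[i-1] and steps[i] -> i,
    # at or above the last edge -> len(steps).
    return int(np.searchsorted(steps, value, side='right'))

steps = [2000, 5000]        # altitude bin edges [ft]
separ = [150, 500, 1000]    # minimum separation per bin [ft]; len(separ) == len(steps) + 1
for alt_base in (1500, 3000, 8000):
    print(alt_base, '->', separ[index_between_sketch(steps, alt_base)])
# 1500 -> 150, 3000 -> 500, 8000 -> 1000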
65 changes: 28 additions & 37 deletions src/ampycloud/layer.py
@@ -17,7 +17,7 @@
from sklearn.mixture import GaussianMixture

# Import from this module
from .errors import AmpycloudError, AmpycloudWarning
from .errors import AmpycloudError
from .logger import log_func_call
from .scaler import minmax_scale
from .utils import utils
@@ -191,18 +191,6 @@ def ncomp_from_gmm(vals: np.ndarray,
logger.debug('Skipping the GMM computation: all the values are the same.')
return (1, np.zeros(len(vals_orig)), None)

# Estimate the resolution of the data (by measuring the minimum separation between two data
# points).
res_orig = np.diff(np.sort(vals_orig.reshape(len(vals_orig))))
res_orig = np.min(res_orig[res_orig > 0])
logger.debug('res_orig: %.2f', res_orig)
# Is min_sep sufficiently large, given the data resolution ? If not, we we end up with some
# over-layering.
if min_sep < 5*res_orig:
warnings.warn(f'Huh ! min_sep={min_sep} is smaller than 5*res_orig={5*res_orig}.' +
'This could lead to an over-layering for thin groups !',
AmpycloudWarning)

# Rescale the data if warranted
if rescale_0_to_x is not None:
vals = minmax_scale(vals) * rescale_0_to_x
@@ -235,37 +223,40 @@
logger.debug('best_model_ind (raw): %i', best_model_ind)
logger.debug('best_ncomp (raw): %i', best_ncomp)

# If I found only one component, I can stop here
if best_ncomp == 1:
return best_ncomp, best_ids, abics
# Merge components that are too close to each other, but only if this is not going to be
# done afterwards anyway by merge_sligrolay() in the data.py module (i.e. only if min_sep > 0).
if min_sep > 0:
# If I found only one component, I can stop here
if best_ncomp == 1:
return best_ncomp, best_ids, abics

# If I found more than one component, let's make sure that they are sufficiently far apart.
# First, let's compute the mean component heights
mean_comp_heights = [np.mean(vals_orig[best_ids == i]) for i in range(ncomp[best_model_ind])]
# If I found more than one component, let's make sure that they are sufficiently far apart.
# First, let's compute the mean component heights
mean_comp_heights = [np.mean(vals_orig[best_ids == i]) for i in range(ncomp[best_model_ind])]

# These may not be ordered, so let's keep track of the indices
# First, let's deal with the fact that they are not ordered.
comp_ids = np.argsort(mean_comp_heights)
# These may not be ordered, so let's keep track of the indices
# First, let's deal with the fact that they are not ordered.
comp_ids = np.argsort(mean_comp_heights)

# Now loop throught the different components, check if they are far sufficiently far apart,
# and merge them otherwise.
for (ind, delta) in enumerate(np.diff(np.sort(mean_comp_heights))):
# Now loop through the different components, check if they are sufficiently far apart,
# and merge them otherwise.
for (ind, delta) in enumerate(np.diff(np.sort(mean_comp_heights))):

# If the the delta is large enough, move on ...
if delta >= min_sep:
continue
# If the delta is large enough, move on ...
if delta >= min_sep:
continue

# Else, I have two components that are "too close" from each other. Let's merge them by
# re-assigning the ids accordingly.
best_ids[best_ids == comp_ids[ind+1]] = comp_ids[ind]
comp_ids[ind+1] = comp_ids[ind]
# Else, I have two components that are "too close" to each other. Let's merge them by
# re-assigning the ids accordingly.
best_ids[best_ids == comp_ids[ind+1]] = comp_ids[ind]
comp_ids[ind+1] = comp_ids[ind]

# Decrease the number of valid ids
best_ncomp -= 1
# Decrease the number of valid ids
best_ncomp -= 1

if not len(np.unique(best_ids)) == best_ncomp:
raise AmpycloudError('Ouch ! This error is impossible !')
if not len(np.unique(best_ids)) == best_ncomp:
raise AmpycloudError('Ouch ! This error is impossible !')

logger.debug('best_ncomp (final): %i', best_ncomp)
logger.debug('best_ncomp (final): %i', best_ncomp)

return best_ncomp, best_ids, abics
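For reference, here is how the merging loop above behaves (when min_sep > 0) on a toy set of component heights; a standalone sketch with made-up values, independent of the actual GMM output.

import numpy as np

# Toy illustration of the min_sep merging step: three components with mean heights
# 800, 900 and 2000 ft, and min_sep = 250 ft. The 800 ft and 900 ft components get merged.
best_ids = np.array([0, 0, 1, 1, 2, 2])      # component id assigned to each point
mean_comp_heights = [800., 900., 2000.]
min_sep = 250
comp_ids = np.argsort(mean_comp_heights)
best_ncomp = len(mean_comp_heights)
for ind, delta in enumerate(np.diff(np.sort(mean_comp_heights))):
    if delta >= min_sep:
        continue
    # Components ind and ind+1 (in height order) are closer than min_sep: merge them.
    best_ids[best_ids == comp_ids[ind + 1]] = comp_ids[ind]
    comp_ids[ind + 1] = comp_ids[ind]
    best_ncomp -= 1
print(best_ncomp, best_ids)   # -> 2 [0 0 0 0 2 2]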