Develop fol #83

Closed
wants to merge 14 commits into from
34 changes: 34 additions & 0 deletions examples/demo_minimum_separation.py
@@ -0,0 +1,34 @@
from datetime import datetime
import ampycloud
from ampycloud.utils import mocker
from ampycloud.plots import diagnostic

# Generate the demo dataset for ampycloud to test the minimum separation parameter.
# Your data should have *exactly* this structure

## EXAMPLE 1
n_ceilos = 4
lookback_time = 1200
rate = 30
mock_data = mocker.mock_layers(n_ceilos, lookback_time, rate,
[{'alt': 500, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20},
{'alt': 700, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20},
{'alt': 1400, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20},
{'alt': 1600, 'alt_std': 30, 'sky_cov_frac': 0.5,
'period': 100, 'amplitude': 20}])

# Run the ampycloud algorithm on it, using the default parameters
chunk = ampycloud.run(mock_data, geoloc='Mock data', ref_dt=datetime.now())

# Get the resulting METAR message
print(chunk.metar_msg())

# Display the full information available for the layers found
print(chunk.layers)

# And for the most motivated, plot the diagnostic diagram
diagnostic(chunk, upto='layers', show=True, save_stem='ampycloud_demo_minsep', save_fmts=['png'])
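The demo above runs with the default parameters. To actually exercise the new minimum-separation thresholds, they presumably need to be passed via the prms argument of ampycloud.run(). A minimal sketch continuing the script above: the 'min_sep_kwargs' structure with its 'steps'/'separ' keys mirrors the one used in src/ampycloud/data.py in this PR, but the exact nesting and defaults are assumptions, not verified against the shipped parameter file.

# Hypothetical follow-up: re-run the same mock data with custom minimum separations.
# The parameter names below mirror the 'min_sep_kwargs' structure used in data.py (assumed).
custom_prms = {'LAYERING_PRMS': {'min_sep_kwargs': {'steps': [2000, 5000],
                                                    'separ': [150, 500, 1000]}}}
chunk_custom = ampycloud.run(mock_data, prms=custom_prms,
                             geoloc='Mock data', ref_dt=datetime.now())
print(chunk_custom.metar_msg())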
4 changes: 4 additions & 0 deletions src/ampycloud/core.py
@@ -219,6 +219,10 @@ def run(data: pd.DataFrame, prms: dict = None, geoloc: str = None,
# Or to adjust some other algorithm parameter:
prms = {'GROUPING_PRMS':{'dt_scale_kwargs':{'scale': 300}}}

# Beware not to overwrite parameters by repeating a dict key! Do NOT write:
prms = {'GROUPING_PRMS':{'dt_scale_kwargs':{'scale': 300}}, 'GROUPING_PRMS':{'algo': 'agglomerative'}}
# but rather:
prms = {'GROUPING_PRMS':{'dt_scale_kwargs':{'scale': 300}, 'algo': 'agglomerative'}}

The :py:class:`.data.CeiloChunk` instance returned by this function contains all the information
associated with the ampycloud algorithm, including the raw data and slicing/grouping/layering
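For context on the "Beware not to overwrite parameters" note above: a Python dict literal silently keeps only the last value assigned to a repeated key, so the first 'GROUPING_PRMS' entry would be discarded. A quick standalone check (plain Python, independent of ampycloud):

# Duplicate keys in a dict literal: only the last assignment survives.
prms_bad = {'GROUPING_PRMS': {'dt_scale_kwargs': {'scale': 300}},
            'GROUPING_PRMS': {'algo': 'agglomerative'}}
print(prms_bad)   # {'GROUPING_PRMS': {'algo': 'agglomerative'}} -- the 'scale' setting is lost

prms_good = {'GROUPING_PRMS': {'dt_scale_kwargs': {'scale': 300}, 'algo': 'agglomerative'}}
print(prms_good)  # both settings are kept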
161 changes: 140 additions & 21 deletions src/ampycloud/data.py
@@ -15,9 +15,10 @@
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
import warnings

# Import from this package
from .errors import AmpycloudError
from .errors import AmpycloudError, AmpycloudWarning
from .logger import log_func_call
from . import scaler, cluster, layer
from . import wmo, icao
@@ -258,18 +259,87 @@ def max_hits_per_layer(self) -> int:
return int(np.sum(out))

@log_func_call(logger)
def metarize(self, which: int = 'slices', base_frac: float = 0.1,
def merge_sligrolay(self, which: str = 'groups', min_sep_steps: list = [2000, 5000],
min_sep_vals: list = [150, 500, 1000]) -> None:
""" Merges slices/groups/layers whose base altitudes are closer than the minimum separation.

Args:
which (str, optional): whether to process 'slices', 'groups', or 'layers'.
Defaults to 'groups'.
min_sep_steps (list, optional): base-altitude bin edges, in ft, delimiting the
minimum-separation regimes. Defaults to [2000, 5000].
min_sep_vals (list, optional): minimum base-altitude separation, in ft, for each bin;
must contain exactly one more element than min_sep_steps. Defaults to [150, 500, 1000].
"""

# Get ids of each point within that sli/gro/lay
point_ids = copy.copy(self.data.loc[:, f'{which[:-1]}_id'])

# Get ids of each sli/gro/lay
if which == "slices":
comp_ids = np.argsort(self._slices['original_id'])
sligroly_heights = np.sort(self._slices['alt_base'])
elif which == "groups":
comp_ids = np.argsort(self._groups['original_id'])
sligroly_heights = np.sort(self._groups['alt_base'])
elif which == "layers":
comp_ids = np.argsort(self._layers['original_id'])
sligroly_heights = np.sort(self._layers['alt_base'])

best_ncomp = len(sligroly_heights)
logger.debug('%s base heights: %s', which, sligroly_heights)
if best_ncomp > 1:
for (ind, delta) in enumerate(np.diff(sligroly_heights)):
# Get minimum separation based on group base alt
if which == "slices":
idx_sep = utils.index_between(min_sep_steps, self.slices.at[ind, 'alt_base'])
elif which == "groups":
idx_sep = utils.index_between(min_sep_steps, self.groups.at[ind, 'alt_base'])
elif which == "layers":
idx_sep = utils.index_between(min_sep_steps, self.layers.at[ind, 'alt_base'])
min_sep = min_sep_vals[idx_sep]

# Estimate the resolution of the data (by measuring the minimum separation between two data
# points).
res_orig = np.diff(np.sort(sligroly_heights.reshape(len(sligroly_heights))))
res_orig = np.min(res_orig[res_orig > 0])
logger.debug('res_orig: %.2f', res_orig)
# Is min_sep sufficiently large, given the data resolution ? If not, we may end up with some
# over-layering.
if min_sep < 5*res_orig:
warnings.warn(f'Huh ! min_sep={min_sep} is smaller than 5*res_orig={5*res_orig}. ' +
'This could lead to an over-layering for thin groups !',
AmpycloudWarning)

# If the delta is large enough, move on ...
if delta >= min_sep:
continue

# Else, I have two components that are "too close" to each other. Let's merge them by
# re-assigning the ids accordingly.
point_ids.loc[point_ids == comp_ids[ind+1]] = comp_ids[ind]
comp_ids[ind+1] = comp_ids[ind]

# Decrease the number of valid ids
best_ncomp -= 1

self.data.loc[:, f'{which[:-1]}_id'] = point_ids

@log_func_call(logger)
def metarize(self, which: str = 'slices', base_perc: Union[int, float] = 10,
lookback_perc: Union[int, float] = 100,
lim0: Union[int, float] = 0, lim8: Union[int, float] = 100) -> None:
""" Assembles a :py:class:`pandas.DataFrame` of slice/group/layer METAR properties of
interest.

Args:
which (str, optional): whether to process 'slices', 'groups', or 'layers'.
Defaults to 'slices'.
base_frac (float, optional): amount of the lowest slice/group/layer elements,
expressed as a fraction (0<=base_frac<=1) of the total hit count, to consider when
deriving the slice/group/layer altitude (as a median). Defaults to 0.1, i.e. the
cloud base is the median of the lowest 10% of cloud hits of that slice/group/layer.
base_perc (int|float, optional): percentile of the cloud hit heights (sorted from lowest
to highest) used to derive the slice/group/layer base altitude. Defaults to 10, i.e. the
cloud base is the 10th percentile of the cloud heights of that slice/group/layer.
lookback_perc (int|float, optional): percentile of the cloud hit times used to compute the
cloud height of a given layer. This parameter serves to increase the temporal
representativeness of the cloud height for layers whose height increases during the
reference period.
Defaults to 100, i.e. all cloud hits are considered to compute the height;
50 means that only the 50% most recent hits are used.
lim0 (int|float, optional): upper limit of the sky coverage percentage for the 0 okta
bin, in %. Defaults to 0.
lim8 (int|float, optional): lower limit of the sky coverage percentage for the 8 okta
@@ -384,15 +454,15 @@ def metarize(self, which: int = 'slices', base_frac: float = 0.1,
# Compute the corresponding okta level
pdf.iloc[ind]['okta'] = int(wmo.perc2okta(pdf.iloc[ind]['perc'], lim0=lim0, lim8=lim8))

# What does XX% of the total hits represent, in terms of absolute hit counts ?
n_xxp = int(np.ceil(pdf.iloc[ind]['n_hits'] * base_frac))
# How many hits correspond to the most recent lookback_perc % of that sli/gro/lay ?
n_xxp = int(np.ceil(pdf.iloc[ind]['n_hits'] * lookback_perc/100))

if n_xxp > 0:
# Measure the median altitude of the XX% smallest hits in the cluster,
# and use this as the layer base.
# Compute the cloud base as the base_perc percentile of the most recent hits of the sli/gro/lay
pdf.iloc[ind]['alt_base'] = \
np.median(self.data.loc[in_sligrolay, 'alt'].nsmallest(n_xxp))
# Measure the mean altitude and associated std of the layer
np.percentile(self.data.loc[in_sligrolay].nlargest(n_xxp, 'dt')['alt'], base_perc)

# Measure the mean altitude and associated std of the (full) layer
pdf.iloc[ind]['alt_mean'] = np.nanmean(self.data.loc[in_sligrolay, 'alt'])
pdf.iloc[ind]['alt_std'] = np.nanstd(self.data.loc[in_sligrolay, 'alt'])
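To make the new cloud-base computation concrete, here is a standalone sketch with made-up numbers, assuming the ampycloud data format with 'dt' (time offset) and 'alt' columns: keep the most recent lookback_perc % of the hits (largest 'dt'), then take the base_perc percentile of their altitudes.

import numpy as np
import pandas as pd

# Standalone illustration (made-up values) of the base computation used above.
hits = pd.DataFrame({'dt': [-1200, -900, -600, -300, 0],
                     'alt': [480, 500, 520, 700, 720]})
lookback_perc, base_perc = 50, 10
n_xxp = int(np.ceil(len(hits) * lookback_perc / 100))   # 3 most recent hits
alt_base = np.percentile(hits.nlargest(n_xxp, 'dt')['alt'], base_perc)
print(n_xxp, alt_base)  # only the 3 most recent hits (alt 520, 700, 720 ft) enter the percentile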

@@ -469,7 +539,22 @@ def find_slices(self) -> None:
pass

# Finally, let's metarize these slices !
self.metarize(which='slices', lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])
self.metarize(which='slices', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

# Merge slices that are too close together, and re-metarize
if self.prms['SLICING_PRMS']['min_sep_kwargs']['steps'] is not None and \
self.prms['SLICING_PRMS']['min_sep_kwargs']['separ'] is not None:
# Merge too close slices
self.merge_sligrolay(which='slices',
min_sep_steps=self.prms['SLICING_PRMS']['min_sep_kwargs']['steps'],
min_sep_vals=self.prms['SLICING_PRMS']['min_sep_kwargs']['separ'])

# Re-metarize
self.metarize(which='slices', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

@log_func_call(logger)
def find_groups(self) -> None:
@@ -576,7 +661,23 @@ def find_groups(self) -> None:
self.data.loc[to_fill, 'group_id'] = self.data.loc[to_fill, 'slice_id']

# Finally, let's metarize these !
self.metarize(which='groups', lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])
self.metarize(which='groups', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

# Merge groups that are too close together, and re-metarize
if self.prms['GROUPING_PRMS']['min_sep_kwargs']['steps'] is not None and \
self.prms['GROUPING_PRMS']['min_sep_kwargs']['separ'] is not None:
# Merge too close groups
self.merge_sligrolay(which='groups',
min_sep_steps=self.prms['GROUPING_PRMS']['min_sep_kwargs']['steps'],
min_sep_vals=self.prms['GROUPING_PRMS']['min_sep_kwargs']['separ'])

# Re-metarize
self.metarize(which='groups', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])


@log_func_call(logger)
def find_layers(self) -> None:
@@ -626,11 +727,14 @@ def find_layers(self) -> None:
# Reshape the array in anticipation of the GMM routine ...
gro_alts = gro_alts.reshape(-1, 1)

# Let's also get the overall group base alt
if self.groups.at[ind, 'alt_base'] > 10000:
min_sep = self.prms['LAYERING_PRMS']['min_sep_high']
else:
min_sep = self.prms['LAYERING_PRMS']['min_sep_low']
# Get minimum separation based on group base alt
min_sep_steps = self.prms['LAYERING_PRMS']['min_sep_kwargs']['steps']
min_sep_vals = self.prms['LAYERING_PRMS']['min_sep_kwargs']['separ']
if len(min_sep_steps) != len(min_sep_vals)-1:
raise AmpycloudError(f'Ouch ! len(min_sep_steps)={len(min_sep_steps)} should be equal ' +
f'to len(min_sep_vals)-1={len(min_sep_vals)-1}.')

idx_sep = utils.index_between(min_sep_steps, self.groups.at[ind, 'alt_base'])
min_sep = min_sep_vals[idx_sep]
logger.info('Group base alt: %.1f', self.groups.at[ind, 'alt_base'])
logger.info('min_sep value: %.1f', min_sep)

@@ -641,7 +745,7 @@

# And feed them to a Gaussian Mixture Model to figure out how many components it has ...
ncomp, sub_layers_id, _ = layer.ncomp_from_gmm(
gro_alts, ncomp_max=ncomp_max, min_sep=min_sep,
gro_alts, ncomp_max=ncomp_max, min_sep=0,
**self.prms['LAYERING_PRMS']['gmm_kwargs'])

# Add this info to the log
@@ -662,7 +766,22 @@
self.data.loc[to_fill, 'layer_id'] = self.data.loc[to_fill, 'group_id']

# Finally, let's metarize these !
self.metarize(which='layers', lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])
self.metarize(which='layers', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

# Merge layers that are too close together, and re-metarize
if self.prms['LAYERING_PRMS']['min_sep_kwargs']['steps'] is not None and \
self.prms['LAYERING_PRMS']['min_sep_kwargs']['separ'] is not None:
# Merge too close layers
self.merge_sligrolay(which='layers',
min_sep_steps=self.prms['LAYERING_PRMS']['min_sep_kwargs']['steps'],
min_sep_vals=self.prms['LAYERING_PRMS']['min_sep_kwargs']['separ'])

# Re-metarize
self.metarize(which='layers', base_perc=self.prms['LAYER_BASE_PERC'],
lookback_perc=self.prms['LAYER_LOOKBACK_PERC'],
lim0=self.prms['OKTA_LIM0'], lim8=self.prms['OKTA_LIM8'])

@property
def n_slices(self) -> Union[None, int]:
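Both merge_sligrolay() and find_layers() rely on utils.index_between() to map a base altitude onto a separation bin; that helper is not part of this diff. The sketch below uses np.searchsorted as a hypothetical stand-in to illustrate the assumed 'steps'/'separ' semantics and why len(separ) must equal len(steps) + 1.

import numpy as np

def index_between_sketch(steps, value):
    # Ascending bin edges: below steps[0] -> 0, between steps[i-1] and steps[i] -> i,
    # at or above the last edge -> len(steps).
    return int(np.searchsorted(steps, value, side='right'))

steps = [2000, 5000]        # altitude bin edges [ft]
separ = [150, 500, 1000]    # minimum separation per bin [ft]; len(separ) == len(steps) + 1
for alt_base in (1500, 3000, 8000):
    print(alt_base, '->', separ[index_between_sketch(steps, alt_base)])
# 1500 -> 150, 3000 -> 500, 8000 -> 1000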
65 changes: 28 additions & 37 deletions src/ampycloud/layer.py
@@ -17,7 +17,7 @@
from sklearn.mixture import GaussianMixture

# Import from this module
from .errors import AmpycloudError, AmpycloudWarning
from .errors import AmpycloudError
from .logger import log_func_call
from .scaler import minmax_scale
from .utils import utils
@@ -191,18 +191,6 @@ def ncomp_from_gmm(vals: np.ndarray,
logger.debug('Skipping the GMM computation: all the values are the same.')
return (1, np.zeros(len(vals_orig)), None)

# Estimate the resolution of the data (by measuring the minimum separation between two data
# points).
res_orig = np.diff(np.sort(vals_orig.reshape(len(vals_orig))))
res_orig = np.min(res_orig[res_orig > 0])
logger.debug('res_orig: %.2f', res_orig)
# Is min_sep sufficiently large, given the data resolution ? If not, we we end up with some
# over-layering.
if min_sep < 5*res_orig:
warnings.warn(f'Huh ! min_sep={min_sep} is smaller than 5*res_orig={5*res_orig}.' +
'This could lead to an over-layering for thin groups !',
AmpycloudWarning)

# Rescale the data if warranted
if rescale_0_to_x is not None:
vals = minmax_scale(vals) * rescale_0_to_x
@@ -235,37 +223,40 @@
logger.debug('best_model_ind (raw): %i', best_model_ind)
logger.debug('best_ncomp (raw): %i', best_ncomp)

# If I found only one component, I can stop here
if best_ncomp == 1:
return best_ncomp, best_ids, abics
# Merge components that are too close to each other, but only if this is not going to be
# done afterwards anyway by merge_sligrolay() in the data.py module (i.e. only if min_sep > 0).
if min_sep > 0:
# If I found only one component, I can stop here
if best_ncomp == 1:
return best_ncomp, best_ids, abics

# If I found more than one component, let's make sure that they are sufficiently far apart.
# First, let's compute the mean component heights
mean_comp_heights = [np.mean(vals_orig[best_ids == i]) for i in range(ncomp[best_model_ind])]
# If I found more than one component, let's make sure that they are sufficiently far apart.
# First, let's compute the mean component heights
mean_comp_heights = [np.mean(vals_orig[best_ids == i]) for i in range(ncomp[best_model_ind])]

# These may not be ordered, so let's keep track of the indices
# First, let's deal with the fact that they are not ordered.
comp_ids = np.argsort(mean_comp_heights)
# These may not be ordered, so let's keep track of the indices
# First, let's deal with the fact that they are not ordered.
comp_ids = np.argsort(mean_comp_heights)

# Now loop throught the different components, check if they are far sufficiently far apart,
# and merge them otherwise.
for (ind, delta) in enumerate(np.diff(np.sort(mean_comp_heights))):
# Now loop through the different components, check if they are sufficiently far apart,
# and merge them otherwise.
for (ind, delta) in enumerate(np.diff(np.sort(mean_comp_heights))):

# If the the delta is large enough, move on ...
if delta >= min_sep:
continue
# If the delta is large enough, move on ...
if delta >= min_sep:
continue

# Else, I have two components that are "too close" from each other. Let's merge them by
# re-assigning the ids accordingly.
best_ids[best_ids == comp_ids[ind+1]] = comp_ids[ind]
comp_ids[ind+1] = comp_ids[ind]
# Else, I have two components that are "too close" to each other. Let's merge them by
# re-assigning the ids accordingly.
best_ids[best_ids == comp_ids[ind+1]] = comp_ids[ind]
comp_ids[ind+1] = comp_ids[ind]

# Decrease the number of valid ids
best_ncomp -= 1
# Decrease the number of valid ids
best_ncomp -= 1

if not len(np.unique(best_ids)) == best_ncomp:
raise AmpycloudError('Ouch ! This error is impossible !')
if not len(np.unique(best_ids)) == best_ncomp:
raise AmpycloudError('Ouch ! This error is impossible !')

logger.debug('best_ncomp (final): %i', best_ncomp)
logger.debug('best_ncomp (final): %i', best_ncomp)

return best_ncomp, best_ids, abics
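For reference, here is how the merging loop above behaves (when min_sep > 0) on a toy set of component heights; a standalone sketch with made-up values, independent of the actual GMM output.

import numpy as np

# Toy illustration of the min_sep merging step: three components with mean heights
# 800, 900 and 2000 ft, and min_sep = 250 ft. The 800 ft and 900 ft components get merged.
best_ids = np.array([0, 0, 1, 1, 2, 2])      # component id assigned to each point
mean_comp_heights = [800., 900., 2000.]
min_sep = 250
comp_ids = np.argsort(mean_comp_heights)
best_ncomp = len(mean_comp_heights)
for ind, delta in enumerate(np.diff(np.sort(mean_comp_heights))):
    if delta >= min_sep:
        continue
    # Components ind and ind+1 (in height order) are closer than min_sep: merge them.
    best_ids[best_ids == comp_ids[ind + 1]] = comp_ids[ind]
    comp_ids[ind + 1] = comp_ids[ind]
    best_ncomp -= 1
print(best_ncomp, best_ids)   # -> 2 [0 0 0 0 2 2]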