diff --git a/CHANGELOG.md b/CHANGELOG.md
index 55cd7dd..c6d98c4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 1.0.4 - January 2020
+
+* Added PERT loss from FAIR methodology
+
 # 1.0 - January 2020
 
 * Open source release.
diff --git a/README.md b/README.md
index 291e8c1..7a33e02 100644
--- a/README.md
+++ b/README.md
@@ -8,14 +8,64 @@ python3 setup.py
 
 ## Using riskquant as a library
 
+### simpleloss
+
+The simpleloss class uses a single value for frequency, and two values for a magnitude range that are mapped to a [lognormal distribution](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html).
+
+The inputs to simpleloss are as follows:
+
+ * Identifier: An identifying label, which need not be unique (see "Uniqueness of identifiers" below), but is
+ intended to provide a way of identifying the scenario.
+ * Name: The name of the scenario, which should be more descriptive than the identifier.
+ * Frequency: The number of times that the loss will occur over some interval of time (e.g. 0.1 means 1 occurrence per 10 years on average).
+ * Low loss magnitude: A dollar value of the best-case scenario, given that the loss did occur. All our detection systems worked, so we
+ found out about the event and remediated quickly.
+ * High loss magnitude: A dollar value of the worst-case scenario. Our detection systems didn't work, and the problem persisted for a long
+ time until it became impossible to ignore and had to be stopped.
+
 ```python
 >> from riskquant import simpleloss
->> s = simpleloss.SimpleLoss("ALICE", "Alice steals the data", 0.10, 1E6, 10E6)
+>> s = simpleloss.SimpleLoss("T_PLANKTON", "Plankton steals the Krabby Patty recipe", 0.10, 100000, 1000000)
 >> s.annualized_loss()
 40400.128269457266
 ```
 
+### pertloss
+
+The pertloss class uses two values for a magnitude range that are mapped to a lognormal distribution, and
+four values for frequency that are used to create a [Modified PERT distribution](https://www.tensorflow.org/probability/api_docs/python/tfp/experimental/substrates/numpy/distributions/PERT).
+
+The inputs to pertloss are as follows:
+
+ * Low loss magnitude: A dollar value of the best-case scenario, given that the loss did occur. All our detection systems worked, so we
+ found out about the event and remediated quickly.
+ * High loss magnitude: A dollar value of the worst-case scenario. Our detection systems didn't work, and the problem persisted for a long
+ time until it became impossible to ignore and had to be stopped.
+ * Minimum frequency: The lowest number of times a loss will occur over some interval of time.
+ * Maximum frequency: The highest number of times a loss will occur over some interval of time.
+ * Most likely frequency: The most likely number of times a loss will occur over some interval of time. Sets the skew of the distribution.
+ * Kurtosis: A number that controls the shape of the PERT distribution, with a default of 4. Higher values cause a sharper peak.
+ In FAIR, this is called the "belief in the most likely" frequency, reflecting the estimator's confidence in the most likely frequency.
+ With higher kurtosis, more samples in the simulation will be closer to the most likely frequency, as the sketch below illustrates.
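+
+As a rough sketch of the kurtosis parameter's effect (using only the classes added in this change; exact output varies from run to run), a higher value should concentrate the sampled frequencies, and therefore the simulated losses, around the most likely frequency:
+
+```python
+from riskquant import pertloss
+
+# Identical magnitude and frequency inputs; only the "belief in the most likely" differs.
+flat = pertloss.PERTLoss(10, 100, .1, .7, .3, kurtosis=1)
+peaked = pertloss.PERTLoss(10, 100, .1, .7, .3, kurtosis=100)
+
+# With kurtosis=100, sampled frequencies cluster near 0.3, so the loss summary
+# should show less spread between its percentiles than with kurtosis=1.
+print(flat.summarize_loss(flat.simulate_years(10000)))
+print(peaked.summarize_loss(peaked.simulate_years(10000)))
+```
+
+A basic end-to-end example: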
+
+```python
+>> from riskquant import pertloss
+>> p = pertloss.PERTLoss(10, 100, .1, .7, .3, kurtosis=1)
+>> simulate_100 = p.simulate_years(100)
+>> p.summarize_loss(simulate_100)
+
+{'minimum': 0,
+ 'tenth_percentile': 0,
+ 'mode': 0,
+ 'median': 1,
+ 'ninetieth_percentile': 2,
+ 'maximum': 6}
+```
+
 ## Using riskquant as a utility
 
 ```bash
diff --git a/riskquant/__init__.py b/riskquant/__init__.py
index 4b84747..f243cac 100644
--- a/riskquant/__init__.py
+++ b/riskquant/__init__.py
@@ -24,7 +24,7 @@
 __appname__ = 'riskquant'
 
-__version__ = '1.0.3'
+__version__ = '1.0.4'
 
 NAME_VERSION = '%s %s' % (__appname__, __version__)
diff --git a/riskquant/pertloss.py b/riskquant/pertloss.py
new file mode 100644
index 0000000..d54538d
--- /dev/null
+++ b/riskquant/pertloss.py
@@ -0,0 +1,114 @@
+"""A loss model based on a single loss scenario with
+
+* low_loss = Low loss amount
+* high_loss = High loss amount
+* min_freq = The lowest number of times a loss will occur over some interval of time
+* max_freq = The highest number of times a loss will occur over some interval of time
+* most_likely_freq = The most likely number of times a loss will occur over some interval of time
+* kurtosis = Defaults to 4. Controls the shape of the distribution; higher values cause a sharper peak.
+
+The range low_loss -> high_loss should represent the 90% confidence interval
+that the loss will fall in that range.
+
+These values are then fit to a lognormal
+distribution so that they fall at the 5% and 95% cumulative probability points.
+
+The range min_freq -> max_freq should represent the 90% confidence interval
+that the frequency will fall in that range.
+
+The most_likely_freq will be used to skew the PERT distribution so that more of these values occur in the simulation.
+
+The kurtosis will be used to control the shape of the distribution; even more of the most_likely_freq values will
+occur in the simulation with higher kurtosis.
+
+These values are then used to create a Modified PERT distribution.
+"""
+import math
+
+import numpy as np
+from scipy.stats import lognorm, mode, norm
+import tensorflow_probability as tfp
+
+tfp = tfp.experimental.substrates.numpy
+tfd = tfp.distributions
+factor = -0.5 / norm.ppf(0.05)
+
+
+class PERTLoss:
+    def __init__(self, low_loss, high_loss, min_freq, max_freq, most_likely_freq, kurtosis=4):
+        if min_freq >= max_freq:
+            # Max frequency must exceed min frequency
+            raise AssertionError
+        if not min_freq <= most_likely_freq <= max_freq:
+            # Most likely frequency must be between min and max frequencies
+            raise AssertionError
+        if low_loss >= high_loss:
+            # High loss must exceed low loss
+            raise AssertionError
+
+        # Set up the lognormal distribution
+        mu = (math.log(low_loss) + math.log(high_loss)) / 2.  # Average of the logs of low/high
+        shape = factor * (math.log(high_loss) - math.log(low_loss))  # Standard deviation
+        self.magnitude_distribution = lognorm(shape, scale=math.exp(mu))
+
+        # Set up the PERT distribution
+        # From FAIR: the most likely frequency sets the skew/peak, and the
+        # "belief in the most likely" sets the kurtosis/temperature of the distribution.
+        self.frequency_distribution = tfd.PERT(low=min_freq, peak=most_likely_freq, high=max_freq, temperature=kurtosis)
+
+    def annualized_loss(self):
+        """Expected mean loss per year, scaled by the most likely frequency.
+
+        :returns: Scalar of expected mean loss on an annualized basis."""
+
+        return self.frequency_distribution.mode().flat[0] * self.magnitude_distribution.mean()
+
+    def single_loss(self):
+        """Draw a single loss amount.
+        Not scaled by probability of occurrence.
+
+        :returns: Scalar value of a randomly generated single loss amount."""
+
+        return self.magnitude_distribution.rvs()
+
+    def simulate_losses_one_year(self):
+        """Generate a random frequency and a random magnitude from the distributions.
+
+        :returns: Scalar value of one sample loss exposure."""
+        sample_frequency = self.frequency_distribution.sample(1)[0]
+        sample_magnitude = self.single_loss()
+        loss = sample_frequency * sample_magnitude
+        return loss
+
+    def simulate_years(self, n):
+        """Draw randomly to simulate n years of possible losses.
+
+        :arg: n = Number of years to simulate
+        :returns: Numpy array of shape (n,) with loss amounts per year."""
+        # Create an array of sampled frequencies, one per year.
+        frequency_array = self.frequency_distribution.sample(n)
+        # The loss amounts for all the losses across all the years, generated all at once.
+        # This is much higher performance than generating one year at a time.
+        magnitude_array = self.magnitude_distribution.rvs(size=n)
+        # Multiply frequency times magnitude from each array.
+        loss_array = frequency_array * magnitude_array
+        return loss_array
+
+    @staticmethod
+    def summarize_loss(loss_array):
+        """Get statistics about a numpy array of losses.
+        Risk is a range of possibilities, not just one outcome.
+
+        :arg: loss_array = Numpy array of simulated losses
+        :returns: Dictionary of statistics about the losses
+        """
+        percentiles = np.percentile(loss_array, [10, 50, 90]).astype(int)
+        loss_summary = {'minimum': loss_array.min().astype(int),
+                        'tenth_percentile': percentiles[0],
+                        'mode': mode(loss_array)[0][0].astype(int),
+                        'median': percentiles[1],
+                        'ninetieth_percentile': percentiles[2],
+                        'maximum': loss_array.max().astype(int)}
+        return loss_summary
diff --git a/tests/test_pertloss.py b/tests/test_pertloss.py
new file mode 100644
index 0000000..0700df4
--- /dev/null
+++ b/tests/test_pertloss.py
@@ -0,0 +1,76 @@
+import unittest
+
+from riskquant import pertloss
+
+
+class Test(unittest.TestCase):
+    def setUp(self):
+        min_freq = 0.1
+        max_freq = .7
+        most_likely = .3
+        kurtosis = 1
+        low_loss = 1
+        high_loss = 10
+        self.s = pertloss.PERTLoss(low_loss, high_loss, min_freq, max_freq, most_likely, kurtosis=kurtosis)
+
+    def testAnnualized(self):
+        # Returns the mean of the magnitude distribution scaled by the mode of the frequency distribution.
+        self.assertAlmostEqual(self.s.annualized_loss(), 1.2120038962444237)
+
+    def testDistribution(self):
+        # We defined cdf(low) ~ 0.05 and cdf(high) ~ 0.95 so that (low, high)
+        # would be the 90% confidence interval. Check that it's true.
+        self.assertTrue(0.049 < self.s.magnitude_distribution.cdf(1) < 0.051)
+        self.assertTrue(0.949 < self.s.magnitude_distribution.cdf(10) < 0.951)
+
+    def testSingleLoss(self):
+        # The mean of many single losses should be close to the mean of the
+        # magnitude distribution. Frequency plays no part here.
+        iterations = 10000
+        mean_loss = sum([self.s.single_loss() for _ in range(iterations)]) / iterations
+        self.assertGreater(mean_loss, 3.9)
+        self.assertLess(mean_loss, 4.2)
+
+    def testSimulateLossesOneYear(self):
+        # Each simulated year is frequency * magnitude, so the sum of 100
+        # simulated years should fall within generous bounds.
+        losses = []
+        for _ in range(100):
+            losses.append(self.s.simulate_losses_one_year())
+        self.assertGreater(sum(losses), 0.05)
+        self.assertLess(sum(losses), 10000)
+
+    def testSimulateYears(self):
+        # Should return an array of length == years
+        # whose mean is close to the annualized loss.
+        years = 10000
+        losses = self.s.simulate_years(years)
+        self.assertEqual(len(losses), years)
+        mean_loss = sum(losses) / years
+        self.assertGreater(mean_loss, 0.5)
+        self.assertLess(mean_loss, 1.5)
+
+    def testMinMaxFrequency(self):
+        # Min frequency must be less than max frequency.
+        with self.assertRaises(AssertionError):
+            pertloss.PERTLoss(10, 100, .7, .1, .3)  # min > max
+
+    def testMostLikelyFrequency(self):
+        # Most likely frequency must be between min and max.
+        with self.assertRaises(AssertionError):
+            pertloss.PERTLoss(10, 100, .1, .7, .8)  # most_likely > max
+
+    def testLowHighLoss(self):
+        # High loss must exceed low loss.
+        with self.assertRaises(AssertionError):
+            pertloss.PERTLoss(100, 10, .1, .7, .3)  # low > high
+
+    def testSummary(self):
+        loss_array = self.s.simulate_years(1000)
+        summary = self.s.summarize_loss(loss_array)
+        self.assertEqual(summary['minimum'], 0)
+        self.assertEqual(summary['tenth_percentile'], 0)
+        self.assertEqual(summary['mode'], 0)
+        self.assertGreaterEqual(summary['median'], 1)
+        self.assertGreaterEqual(summary['ninetieth_percentile'], 2)
+        self.assertGreater(summary['maximum'], 5)
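+
+    def testKurtosisSharpensPeak(self):
+        # Illustrative sketch of the documented kurtosis behavior: with
+        # identical inputs, higher kurtosis should concentrate sampled
+        # frequencies around the most likely value (0.3), shrinking their
+        # sample standard deviation relative to the kurtosis=1 fixture.
+        peaked = pertloss.PERTLoss(1, 10, .1, .7, .3, kurtosis=100)
+        flat_freqs = self.s.frequency_distribution.sample(10000)
+        peaked_freqs = peaked.frequency_distribution.sample(10000)
+        self.assertLess(peaked_freqs.std(), flat_freqs.std())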