adding pertloss class and tests
Shannon Morrison committed Jan 31, 2020
1 parent d921d5f commit fd36325
Showing 5 changed files with 246 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
# 1.0.4 - January 2020

* Added PERT loss from FAIR methodology

# 1.0 - January 2020

* Open source release.
52 changes: 51 additions & 1 deletion README.md
@@ -8,14 +8,64 @@ python3 setup.py

## Using riskquant as a library

### simpleloss

The simpleloss class uses a single value for frequency, and two values for a magnitude range that are mapped to a [lognormal distribution](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html).


The inputs to simpleloss are as follows:

* Identifier: An identifying label, which need not be unique (see "Uniqueness of identifiers" below), but is
  intended to provide a way of identifying the scenario.
* Name: The name of the scenario, which should be more descriptive than the identifier.
* Frequency: The number of times that the loss will occur over some interval of time (e.g. 0.1 means 1 occurrence per 10 years on average).
* Low loss magnitude: A dollar value of the best-case scenario, given that the loss did occur. All our detection systems worked, so we
  found out about the event and remediated quickly.
* High loss magnitude: A dollar value of the worst-case scenario. Our detection systems didn't work, and the problem persisted for a long
  time before it was finally noticed and stopped.

```python
>>> from riskquant import simpleloss
>>> s = simpleloss.SimpleLoss("T_PLANKTON", "Plankton steals the Krabby Patty recipe", 0.10, 100000, 1000000)
>>> s.annualized_loss()
40400.128269457266
```
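
For intuition about the fit, here is a minimal sketch (mirroring the math in `riskquant/pertloss.py` below; not part of the library API) showing that the low and high magnitudes land at the 5th and 95th percentiles of the lognormal, and that the annualized loss is the frequency times the distribution's mean:

```python
import math
from scipy.stats import lognorm, norm

low, high = 100000, 1000000
factor = -0.5 / norm.ppf(0.05)                     # ~0.304, since norm.ppf(0.05) ~ -1.645
mu = (math.log(low) + math.log(high)) / 2          # midpoint of the range in log space
shape = factor * (math.log(high) - math.log(low))  # standard deviation in log space
dist = lognorm(shape, scale=math.exp(mu))

print(dist.cdf(low), dist.cdf(high))  # ~0.05 and ~0.95: the 90% confidence interval
print(0.10 * dist.mean())             # ~40400.13, matching annualized_loss() above
```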

### pertloss

The pertloss class uses two values for a magnitude range that are mapped to a lognormal distribution, and
four values for frequency that are used to create a [Modified PERT distribution](https://www.tensorflow.org/probability/api_docs/python/tfp/experimental/substrates/numpy/distributions/PERT).


The inputs to pertloss are as follows:

* Low loss magnitude: A dollar value of the best-case scenario, given that the loss did occur. All our detection systems worked, so we
  found out about the event and remediated quickly.
* High loss magnitude: A dollar value of the worst-case scenario. Our detection systems didn't work, and the problem persisted for a long
  time before it was finally noticed and stopped.
* Minimum frequency: The lowest number of times a loss will occur over some interval of time.
* Maximum frequency: The highest number of times a loss will occur over some interval of time.
* Most likely frequency: The most likely number of times a loss will occur over some interval of time. Sets the skew of the distribution.
* Kurtosis: A number that controls the shape of the PERT distribution, with a default of 4. Higher values cause a sharper peak.
  In FAIR, this is called the "belief in the most likely" frequency, reflecting the estimator's confidence in the most likely frequency.
  With higher kurtosis, more samples in the simulation will fall close to the most likely frequency, as the sketch below illustrates.
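
To see what kurtosis does, here is a small sketch (not from the repo) that compares the spread of frequency samples at two kurtosis values, using the same tfp PERT construction that `pertloss.py` uses internally:

```python
import numpy as np
import tensorflow_probability as tfp

tfd = tfp.experimental.substrates.numpy.distributions

# Same frequency range as the example below; only the kurtosis/temperature differs.
low_k = tfd.PERT(low=0.1, peak=0.3, high=0.7, temperature=1)
high_k = tfd.PERT(low=0.1, peak=0.3, high=0.7, temperature=10)

# Higher kurtosis concentrates samples around the most likely frequency (0.3),
# so the sample standard deviation shrinks.
print(np.std(low_k.sample(10000)) > np.std(high_k.sample(10000)))  # True
```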

```python
>>> from riskquant import pertloss
>>> p = pertloss.PERTLoss(1, 10, .1, .7, .3, kurtosis=1)
>>> simulate_100 = p.simulate_years(100)
>>> p.summarize_loss(simulate_100)
{'minimum': 0,
 'tenth_percentile': 0,
 'mode': 0,
 'median': 1,
 'ninetieth_percentile': 2,
 'maximum': 6}
```
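
`annualized_loss()` multiplies the mode of the frequency distribution (the peak of the PERT, i.e. the most likely frequency of 0.3) by the mean of the magnitude distribution (about 4.04 for the (1, 10) range), which matches the value asserted in `tests/test_pertloss.py` below:

```python
>>> round(p.annualized_loss(), 2)
1.21
```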


## Using riskquant as a utility

2 changes: 1 addition & 1 deletion riskquant/__init__.py
@@ -24,7 +24,7 @@


__appname__ = 'riskquant'
__version__ = '1.0.3'
__version__ = '1.0.4'

NAME_VERSION = '%s %s' % (__appname__, __version__)

114 changes: 114 additions & 0 deletions riskquant/pertloss.py
@@ -0,0 +1,114 @@
"""A loss model based on a single loss scenario with
* low_loss = Low loss amount
* high_loss = High loss amount
* min_freq: The lowest number of times a loss will occur
* max_freq: The highest number of times a loss will occur
* most_likely_freq: The most likely number of times a loss will occur over some interval of time.
* kurtosis: Defaults to 4. Controls the shape of the distribution. Higher values cause a sharper peak.
The range low_loss -> high_loss should represent the 90% confidence interval
that the loss will fall in that range.
These values are then fit to a lognormal
distribution so that they fall at the 5% and 95% cumulative probability points.
The range min_freq -> max_freq should represent the 90% confidence interval
that the frequency will fall in that range.
The most_likely_freq will be used to skew the PERT distribution so that more of these values occur in the simulation.
The kurtosis will be used to control the shape of the distribution; even more of the most_likely_freq values will
occur in the simulation with higher kurtosis.
These values are then used to create Modified PERT distribution.
"""
import math

import numpy as np
from scipy.stats import lognorm, mode, norm
import tensorflow_probability as tfp


tfp = tfp.experimental.substrates.numpy
tfd = tfp.distributions
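# Scales the log-space half-range (log(high) - log(low)) / 2 into the standard deviation
# that puts low_loss and high_loss at the 5th/95th percentiles (norm.ppf(0.05) ~ -1.645).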
factor = -0.5 / norm.ppf(0.05)


class PERTLoss:
    def __init__(self, low_loss, high_loss, min_freq, max_freq, most_likely_freq, kurtosis=4):
        if min_freq >= max_freq:
            # Max frequency must exceed min frequency
            raise AssertionError
        if not min_freq <= most_likely_freq <= max_freq:
            # Most likely frequency should be between min and max frequencies.
            raise AssertionError
        if low_loss >= high_loss:
            # High loss must exceed low loss
            raise AssertionError

        # Set up the lognormal distribution
        mu = (math.log(low_loss) + math.log(high_loss)) / 2.  # Average of the logs of low/high
        shape = factor * (math.log(high_loss) - math.log(low_loss))  # Standard deviation
        self.magnitude_distribution = lognorm(shape, scale=math.exp(mu))

        # Set up the PERT distribution
        # From FAIR: the most likely frequency sets the skew/peak, and
        # the "confidence" in the most likely frequency sets the kurtosis/temperature of the distribution.
        self.frequency_distribution = tfd.PERT(low=min_freq, peak=most_likely_freq, high=max_freq, temperature=kurtosis)

    def annualized_loss(self):
        """Expected mean loss per year, scaled by the most likely frequency.
        :returns: Scalar of expected mean loss on an annualized basis."""
        # The mode of a PERT distribution is its peak, i.e. most_likely_freq.
        return self.frequency_distribution.mode().flat[0] * self.magnitude_distribution.mean()

    def single_loss(self):
        """Draw a single loss amount. Not scaled by probability of occurrence.
        :returns: Scalar value of a randomly generated single loss amount."""
        return self.magnitude_distribution.rvs()

    def simulate_losses_one_year(self):
        """Generate a random frequency and random magnitude from the distributions.
        :returns: Scalar value of one sample loss exposure."""
        sample_frequency = self.frequency_distribution.sample(1)[0]
        sample_magnitude = self.single_loss()
        loss = sample_frequency * sample_magnitude
        return loss

    def simulate_years(self, n):
        """Draw randomly to simulate n years of possible losses.
        :arg: n = Number of years to simulate
        :returns: Numpy array of shape (n,) with loss amounts per year."""
        # Create an array of sampled frequencies, one per year.
        frequency_array = self.frequency_distribution.sample(n)
        # The loss amounts for all the losses across all the years, generated all at once.
        # This is much higher performance than generating one year at a time.
        magnitude_array = self.magnitude_distribution.rvs(size=n)
        # Multiply frequency times magnitude from each array, element-wise.
        loss_array = frequency_array * magnitude_array
        return loss_array

    @staticmethod
    def summarize_loss(loss_array):
        """Get statistics about a numpy array.
        Risk is a range of possibilities, not just one outcome.
        :arg: loss_array = Numpy array of simulated losses
        :returns: Dictionary of statistics about the loss
        """
        percentiles = np.percentile(loss_array, [10, 50, 90]).astype(int)
        loss_summary = {'minimum': loss_array.min().astype(int),
                        'tenth_percentile': percentiles[0],
                        'mode': mode(loss_array)[0][0].astype(int),
                        'median': percentiles[1],
                        'ninetieth_percentile': percentiles[2],
                        'maximum': loss_array.max().astype(int)}
        return loss_summary
76 changes: 76 additions & 0 deletions tests/test_pertloss.py
@@ -0,0 +1,76 @@
import unittest

from riskquant import pertloss


class Test(unittest.TestCase):
    def setUp(self):
        min_freq = 0.1
        max_freq = .7
        most_likely = .3
        kurtosis = 1
        low_loss = 1
        high_loss = 10
        self.s = pertloss.PERTLoss(low_loss, high_loss, min_freq, max_freq, most_likely, kurtosis=kurtosis)

    def testAnnualized(self):
        # Returns the mean of the configured magnitude distribution
        # scaled by the mode of the frequency distribution.
        self.assertAlmostEqual(self.s.annualized_loss(), 1.2120038962444237)

    def testDistribution(self):
        # We defined cdf(low) ~ 0.05 and cdf(high) ~ 0.95 so that
        # (low, high) would be the 90% confidence interval. Check that it's true.
        self.assertTrue(0.049 < self.s.magnitude_distribution.cdf(1) < 0.051)
        self.assertTrue(0.949 < self.s.magnitude_distribution.cdf(10) < 0.951)

    def testSingleLoss(self):
        # The mean of many single losses should be close to the
        # mean of the distribution. We are not using probability p here.
        iterations = 10000
        mean_loss = sum([self.s.single_loss() for _ in range(iterations)]) / iterations
        self.assertGreater(mean_loss, 3.9)
        self.assertLess(mean_loss, 4.2)

    def testSimulateLossesOneYear(self):
        # Each sample is a frequency-scaled loss drawn from the configured
        # range (1, 10); the sum of 100 samples should fall in a generous range.
        losses = []
        for _ in range(100):
            losses.append(self.s.simulate_losses_one_year())
        self.assertGreater(sum(losses), 0.05)
        self.assertLess(sum(losses), 10000)

    def testSimulateYears(self):
        # Should return an array of length == years
        # whose mean is close to the annualized loss.
        years = 10000
        losses = self.s.simulate_years(years)
        self.assertEqual(len(losses), years)
        mean_loss = sum(losses) / years
        self.assertGreater(mean_loss, 0.5)
        self.assertLess(mean_loss, 1.5)

    def testMinMaxFrequency(self):
        # Min frequency must be less than max frequency.
        with self.assertRaises(AssertionError):
            pertloss.PERTLoss(10, 100, .7, .1, .3)  # min > max

    def testMostLikelyFrequency(self):
        # Most likely frequency must be between min and max.
        with self.assertRaises(AssertionError):
            pertloss.PERTLoss(10, 100, .1, .7, .8)  # most_likely > max

    def testLowHighLoss(self):
        # High loss must exceed low loss.
        with self.assertRaises(AssertionError):
            pertloss.PERTLoss(100, 10, .1, .7, .3)  # low > high

    def testSummary(self):
        loss_array = self.s.simulate_years(1000)
        summary = self.s.summarize_loss(loss_array)
        self.assertEqual(summary['minimum'], 0)
        self.assertEqual(summary['tenth_percentile'], 0)
        self.assertEqual(summary['mode'], 0)
        self.assertGreaterEqual(summary['median'], 1)
        self.assertGreaterEqual(summary['ninetieth_percentile'], 2)
        self.assertGreater(summary['maximum'], 5)
