Remove caches due to excessive memory use (Trusted-AI#317)

* Remove caches due to excessive memory use
* Validate pandas objects passed into scan
Illia-Kryvoviaz · Jul 5, 2022 · 007b403
1 parent db843a1
commit 007b403
Show file tree

Hide file tree

Showing 7 changed files with 17 additions and 124 deletions.
diff --git a/aif360/detectors/mdss/MDSS.py b/aif360/detectors/mdss/MDSS.py
@@ -220,6 +220,14 @@ def scan(self, coordinates: pd.DataFrame, expectations: pd.Series, outcomes: pd.
         """
         np.random.seed(seed)
 
+        # Reset indexes
+        coordinates = coordinates.reset_index(drop = True)
+        expectations = expectations.reset_index(drop = True)
+        outcomes = outcomes.reset_index(drop = True)
+
+        assert len(coordinates) == len(expectations) == len(outcomes), \
+            f'Lengths of coordinates, expectations, and outcomes should be equal.'
+
         # Check that the appropriate scoring function is used
 
         if isinstance(self.scoring_function, BerkJones):

diff --git a/aif360/detectors/mdss/ScoringFunctions/BerkJones.py b/aif360/detectors/mdss/ScoringFunctions/BerkJones.py
@@ -41,12 +41,6 @@ def score(self, observed_sum: float, expectations: np.array, penalty: float, q:
         """
         alpha = self.alpha
 
-        key = tuple([observed_sum, len(expectations), penalty, q, alpha])
-        ans = self.score_cache.get(key)
-        if ans is not None:
-            self.cache_counter['score'] += 1
-            return ans
-
         if q < alpha:
             q = alpha
 
@@ -57,7 +51,6 @@ def score(self, observed_sum: float, expectations: np.array, penalty: float, q:
         )
         if q == 1:
             ans = observed_sum * np.log(q / alpha) - penalty
-            self.score_cache[key] = ans
             return ans
 
         a = observed_sum * np.log(q / alpha)
@@ -68,7 +61,6 @@ def score(self, observed_sum: float, expectations: np.array, penalty: float, q:
             - penalty
         )
 
-        self.score_cache[key] = ans
         return ans
 
     def qmle(self, observed_sum: float, expectations: np.array):
@@ -81,24 +73,15 @@ def qmle(self, observed_sum: float, expectations: np.array):
         :return: q MLE
         """
         alpha = self.alpha
-
-        key = tuple([observed_sum, len(expectations), alpha])
-        ans = self.qmle_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qmle'] += 1
-            return ans
 
         if len(expectations) == 0:
-            self.qmle_cache[key] = 0
             return 0
         else:
             q = observed_sum / len(expectations)
 
         if (q < alpha):
-            self.qmle_cache[key] = alpha
             return alpha
 
-        self.qmle_cache[key] = q
         return q
 
     def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float):
@@ -110,13 +93,6 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
         :param penalty: penalty coefficient
         """
         alpha = self.alpha
-
-        key = tuple([observed_sum, len(expectations), penalty, alpha])
-        ans = self.compute_qs_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qs'] += 1
-            return ans
-
         q_mle = self.qmle(observed_sum, expectations)
 
         if self.score(observed_sum, expectations, penalty, q_mle) > 0:
@@ -134,5 +110,4 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
             q_max = 0
 
         ans = [exist, q_mle, q_min, q_max]
-        self.compute_qs_cache[key] = ans
         return ans
diff --git a/aif360/detectors/mdss/ScoringFunctions/Bernoulli.py b/aif360/detectors/mdss/ScoringFunctions/Bernoulli.py
@@ -33,14 +33,7 @@ def score(self, observed_sum: float, expectations: np.array, penalty: float, q:
             % (observed_sum, len(expectations), penalty, q)
         )
 
-        key = tuple([observed_sum, expectations.tostring(), penalty, q])
-        ans = self.score_cache.get(key)
-        if ans is not None:
-            self.cache_counter['score'] += 1
-            return ans
-
         ans = observed_sum * np.log(q) - np.log(1 - expectations + q * expectations).sum() - penalty
-        self.score_cache[key] = ans
         return ans
 
     def qmle(self, observed_sum: float, expectations: np.array):
@@ -50,16 +43,8 @@ def qmle(self, observed_sum: float, expectations: np.array):
         :param observed_sum: sum of observed binary outcomes for all i
         :param expectations: predicted outcomes for each data element i
         """
-        direction = self.direction
-
-        key = tuple([observed_sum, expectations.tostring()])
-        ans = self.qmle_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qmle'] += 1
-            return ans
-
+        direction = self.direction        
         ans = optim.bisection_q_mle(self, observed_sum, expectations, direction=direction)
-        self.qmle_cache[key] = ans
         return ans
 
     def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float):
@@ -71,13 +56,6 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
         :param penalty: penalty coefficient
         """
         direction = self.direction
-
-        key = tuple([observed_sum, expectations.tostring(), penalty])
-        ans = self.compute_qs_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qs'] += 1
-            return ans
-
         q_mle = self.qmle(observed_sum, expectations)
 
         if self.score(observed_sum, expectations, penalty, q_mle) > 0:
@@ -95,7 +73,6 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
             exist, q_min, q_max = optim.direction_assertions(direction, q_min, q_max)
 
         ans = [exist, q_mle, q_min, q_max]
-        self.compute_qs_cache[key] = ans
         return ans
 
     def q_dscore(self, observed_sum:float, expectations:np.array, q:float):
@@ -110,12 +87,5 @@ def q_dscore(self, observed_sum:float, expectations:np.array, q:float):
         :param q: current value of q
         :return: q dscore/dq
         """
-        key = tuple([observed_sum, expectations.tostring(), q])
-        ans = self.qdscore_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qdscore'] += 1
-            return ans
-
         ans = observed_sum - (q * expectations / (1 - expectations + q * expectations)).sum()
-        self.qdscore_cache[key] = ans
         return ans
diff --git a/aif360/detectors/mdss/ScoringFunctions/Gaussian.py b/aif360/detectors/mdss/ScoringFunctions/Gaussian.py
@@ -30,12 +30,6 @@ def score(
         :return: bias score for the current value of q
         """
 
-        key = tuple([observed_sum, expectations.sum(), penalty, q])
-        ans = self.score_cache.get(key)
-        if ans is not None:
-            self.cache_counter["score"] += 1
-            return ans
-
         assumed_var =  self.var
         expected_sum = expectations.sum()
         penalty /= self.var
@@ -56,20 +50,13 @@ def score(
             ans = 0
 
         ans -= penalty
-        self.score_cache[key] = ans
 
         return ans
 
     def qmle(self, observed_sum: float, expectations: np.array):
         """
         Computes the q which maximizes score (q_mle).
         """
-        key = tuple([observed_sum, expectations.sum()])
-        ans = self.qmle_cache.get(key)
-        if ans is not None:
-            self.cache_counter["qmle"] += 1
-            return ans
-
         expected_sum = expectations.sum()
 
         # Deals with case where observed_sum = expected_sum = 0
@@ -78,8 +65,7 @@ def qmle(self, observed_sum: float, expectations: np.array):
         else:
             ans = observed_sum / expected_sum
 
-        assert np.isnan(ans) == False, f'{expected_sum}, {observed_sum}, {ans}'
-        self.qmle_cache[key] = ans
+        assert np.isnan(ans) == False, f'{expected_sum}, {observed_sum}, {ans}' 
         return ans
 
     def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float):
@@ -94,13 +80,6 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
         direction = self.direction
 
         q_mle = self.qmle(observed_sum, expectations)
-
-        key = tuple([observed_sum, expectations.sum(), penalty])
-        ans = self.compute_qs_cache.get(key)
-        if ans is not None:
-            self.cache_counter["qs"] += 1
-            return ans
-
         q_mle_score = self.score(observed_sum, expectations, penalty, q_mle)
 
         if q_mle_score > 0:
@@ -118,5 +97,4 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
             exist, q_min, q_max = optim.direction_assertions(direction, q_min, q_max)
 
         ans = [exist, q_mle, q_min, q_max]
-        self.compute_qs_cache[key] = ans
         return ans
diff --git a/aif360/detectors/mdss/ScoringFunctions/Poisson.py b/aif360/detectors/mdss/ScoringFunctions/Poisson.py
@@ -32,30 +32,16 @@ def score(self, observed_sum: float, expectations: np.array, penalty: float, q:
             "observed_sum=%.2f, expectations of length=%d, penalty=%.2f, q=%.2f"
             % (observed_sum, len(expectations), penalty, q)
         )
-        key = tuple([observed_sum, expectations.sum(), penalty, q])
-        ans = self.score_cache.get(key)
-        if ans is not None:
-            self.cache_counter['score'] += 1
-            return ans
 
         ans = observed_sum * np.log(q) + (expectations - q * expectations).sum() - penalty
-        self.score_cache[key] = ans
         return ans
 
     def qmle(self, observed_sum: float, expectations: np.array):
         """
         Computes the q which maximizes score (q_mle).
         """
         direction = self.direction
-
-        key = tuple([observed_sum, expectations.sum()])
-        ans = self.qmle_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qmle'] += 1
-            return ans
-
         ans = optim.bisection_q_mle(self, observed_sum, expectations, direction=direction)
-        self.qmle_cache[key] = ans
         return ans
 
     def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float):
@@ -68,15 +54,8 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
         """
 
         direction = self.direction
-
         q_mle = self.qmle(observed_sum, expectations)
 
-        key = tuple([observed_sum, expectations.tostring(), penalty])
-        ans = self.compute_qs_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qs'] += 1
-            return ans
-
         if self.score(observed_sum, expectations, penalty, q_mle) > 0:
             exist = 1
             q_min = optim.bisection_q_min(self, observed_sum, expectations, penalty, q_mle)
@@ -92,7 +71,6 @@ def compute_qs(self, observed_sum: float, expectations: np.array, penalty: float
             exist, q_min, q_max = optim.direction_assertions(direction, q_min, q_max)
 
         ans = [exist, q_mle, q_min, q_max]
-        self.compute_qs_cache[key] = ans
         return ans
 
     def q_dscore(self, observed_sum, expectations, q):
@@ -107,12 +85,5 @@ def q_dscore(self, observed_sum, expectations, q):
         :param q: current value of q
         :return: q dscore/dq
         """
-        key = tuple([observed_sum, expectations.sum(), q])
-        ans = self.qdscore_cache.get(key)
-        if ans is not None:
-            self.cache_counter['qdscore'] += 1
-            return ans
-
         ans = observed_sum - (q * expectations).sum()
-        self.qdscore_cache[key] = ans
         return ans
diff --git a/aif360/detectors/mdss/ScoringFunctions/ScoringFunction.py b/aif360/detectors/mdss/ScoringFunctions/ScoringFunction.py
@@ -15,20 +15,11 @@ def __init__(self, **kwargs):
         Journal of Computational and Graphical Statistics, 25(2), 382-404.
         """
         self.kwargs = kwargs
-        self._reset()
         self.direction = kwargs.get('direction')
 
         directions = ['positive', 'negative']
         assert self.direction in directions, f"Expected one of {directions}, got {self.direction}"
 
-    def _reset(self):
-        self.score_cache = {}
-        self.dscore_cache = {}
-        self.qdscore_cache = {}
-        self.qmle_cache = {}
-        self.compute_qs_cache = {}
-        self.cache_counter = {"score": 0, "dscore": 0, "qdscore": 0, "qmle": 0, "qs": 0}
-
     def score(
         self, observed_sum: float, expectations: np.array, penalty: float, q: float
     ):

diff --git a/examples/demo_mdss_detector.ipynb b/examples/demo_mdss_detector.ipynb
@@ -348,7 +348,7 @@
     {
      "data": {
       "text/plain": [
-       "'Our detected priviledged group has a size of 147, we observe 0.5374149659863946 as the average risk of recidivism, but our model predicts 0.3827815971689547'"
+       "'Our detected priviledged group has a size of 147, we observe 0.5374149659863946 as the average risk of recidivism, but our model predicts 0.38278159716895366'"
       ]
      },
      "execution_count": 12,
@@ -379,7 +379,7 @@
     {
      "data": {
       "text/plain": [
-       "'Our detected priviledged group has a size of 732, we observe 0.3770491803278688 as the average risk of recidivism, but our model predicts 0.44470388217799317'"
+       "'Our detected priviledged group has a size of 732, we observe 0.3770491803278688 as the average risk of recidivism, but our model predicts 0.4447038821779929'"
       ]
      },
      "execution_count": 14,
@@ -784,7 +784,7 @@
     {
      "data": {
       "text/plain": [
-       "'Our detected privileged group has a size of 321, we observe 7844.840295856697 as the mean insurance costs, but our model predicts 5420.49326277455'"
+       "'Our detected privileged group has a size of 321, we observe 7844.8402958566985 as the mean insurance costs, but our model predicts 5420.493262774548'"
       ]
      },
      "execution_count": 28,
@@ -809,7 +809,7 @@
     {
      "data": {
       "text/plain": [
-       "'Our detected privileged group has a size of 115, we observe 21148.37389617392 as the mean insurance costs, but our model predicts 29694.035319112852'"
+       "'Our detected privileged group has a size of 115, we observe 21148.373896173915 as the mean insurance costs, but our model predicts 29694.035319112845'"
       ]
      },
      "execution_count": 29,
@@ -1152,7 +1152,7 @@
     {
      "data": {
       "text/plain": [
-       "'Our detected privileged group has a size of 31607, we observe 5.155584909121915 as the mean temperature, but our model predicts 11.932678437519867'"
+       "'Our detected privileged group has a size of 31607, we observe 5.155584909121934 as the mean temperature, but our model predicts 11.93267843751985'"
       ]
      },
      "execution_count": 42,
@@ -1176,7 +1176,7 @@
     {
      "data": {
       "text/plain": [
-       "'Our detected unprivileged group has a size of 55642, we observe 16.773802762911167 as the mean temperature, but our model predicts 11.932678437519867'"
+       "'Our detected unprivileged group has a size of 55642, we observe 16.773802762911078 as the mean temperature, but our model predicts 11.93267843751985'"
       ]
      },
      "execution_count": 43,
@@ -1533,7 +1533,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.8.12"
   }
  },
  "nbformat": 4,