From 34bfe5e6b4a3496a5db71d4b0ccc98f8893e1c41 Mon Sep 17 00:00:00 2001
From: Matthew Bernhardt <mjbernha@mit.edu>
Date: Wed, 30 Jul 2025 16:49:28 -0400
Subject: [PATCH 1/2] tco-190 Filter searches before citation detector

** Why are these changes being introduced:

There is a transaction cost to calling the citation detector, both in
terms of time and processing time. If we can develop a way to flag
searches which have zero chance of being a citation, we can cut out this
expense by skipping the citation detector.

** Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/tco-190

** How does this address that need:

This moves the extract_features method in Detector::MlCitation into the
initialize method, allowing us to quickly check for whether a given term
has enough non-zero features to make it worth calling the detector.

From our analysis in the TACOS notebooks, we believe that phrases which
result in only two non-zero values among all their features will never
end up being a citation - and that this threshold will allow us to skip
the citation detector in 90% of searches.

The filtering is performed in a convenience method named
enough_nonzero_values? (naming things is hard).

** Document any side effects to this change:

The @detections instance variable is defined as false at the top of
the initialize method, before the first guard clause, so that we get a
consistent Boolean value in all conditions. This required one test to
change that previously expected a nil.
---
 Gemfile                                  |  1 +
 Gemfile.lock                             |  4 ++++
 app/models/detector/ml_citation.rb       | 24 ++++++++++++++++++------
 test/models/detector/ml_citation_test.rb | 22 +++++++++++++++++++++-
 test/test_helper.rb                      |  3 ++-
 5 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/Gemfile b/Gemfile
index d9b315e..36cf6f3 100644
--- a/Gemfile
+++ b/Gemfile
@@ -128,6 +128,7 @@ group :test do
   # Use system testing [https://guides.rubyonrails.org/testing.html#system-testing]
   gem 'capybara'
   gem 'climate_control'
+  gem 'mocha'
   gem 'selenium-webdriver'
   gem 'simplecov'
   gem 'simplecov-lcov'
diff --git a/Gemfile.lock b/Gemfile.lock
index 4e9d09b..70f2576 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -237,6 +237,8 @@ GEM
     matrix (0.4.2)
     mini_mime (1.1.5)
     minitest (5.25.5)
+    mocha (2.7.1)
+      ruby2_keywords (>= 0.0.5)
     msgpack (1.8.0)
     multi_json (1.15.0)
     net-http (0.6.0)
@@ -398,6 +400,7 @@ GEM
       rubocop (>= 1.75.0, < 2.0)
       rubocop-ast (>= 1.44.0, < 2.0)
     ruby-progressbar (1.13.0)
+    ruby2_keywords (0.0.5)
     rubyzip (2.4.1)
     sassc (2.4.0)
       ffi (~> 1.9)
@@ -518,6 +521,7 @@ DEPENDENCIES
   importmap-rails
   jbuilder
   mitlibraries-theme!
+  mocha
   omniauth
   omniauth-rails_csrf_protection
   omniauth_openid_connect
diff --git a/app/models/detector/ml_citation.rb b/app/models/detector/ml_citation.rb
index c105f5f..2942d88 100644
--- a/app/models/detector/ml_citation.rb
+++ b/app/models/detector/ml_citation.rb
@@ -7,11 +7,15 @@ class MlCitation
     # For now the initialize method just needs to consult the external lambda.
     #
     #   @param phrase String. Often a `Term.phrase`.
-    #   @return Nothing intentional. Data is written to Hash `@detections` during processing.
+    #   @return Nothing intentional. Data is written to Boolean `@detections` during processing.
     def initialize(phrase)
+      @detections = false
       return unless self.class.expected_env?
 
-      response = fetch(phrase)
+      features = extract_features(phrase)
+      return unless enough_nonzero_values?(features)
+
+      response = fetch(features)
       @detections = response unless response == 'Error'
     end
 
@@ -111,10 +115,10 @@ def define_lambda
     # define_payload defines the Hash that will be sent to the lambda.
     #
     # @return Hash
-    def define_payload(phrase)
+    def define_payload(features)
       {
         action: 'predict',
-        features: extract_features(phrase),
+        features: features,
         challenge_secret: self.class.lambda_secret
       }
     end
@@ -135,9 +139,9 @@ def extract_features(phrase)
     # error handling with the response.
     #
     # @return Boolean or 'Error'
-    def fetch(phrase)
+    def fetch(features)
       lambda = define_lambda
-      payload = define_payload(phrase)
+      payload = define_payload(features)
 
       response = lambda.post(self.class.lambda_path, payload.to_json)
 
@@ -151,5 +155,13 @@ def fetch(phrase)
         'Error'
       end
     end
+
+    # Enough_nonzero_values? checks that a provided hash contains at least three values which are not zero.
+    #
+    # @param hash Hash
+    # @return Integer
+    def enough_nonzero_values?(hash)
+      hash.values.count { |v| v != 0 } >= 3
+    end
   end
 end
diff --git a/test/models/detector/ml_citation_test.rb b/test/models/detector/ml_citation_test.rb
index 185949b..2b1a8c4 100644
--- a/test/models/detector/ml_citation_test.rb
+++ b/test/models/detector/ml_citation_test.rb
@@ -77,12 +77,32 @@ class MlCitationTest < ActiveSupport::TestCase
 
             assert_instance_of Detector::MlCitation, prediction
 
-            assert_nil(prediction.detections)
+            assert_equal(false, prediction.detections)
           end
         end
       end
     end
 
+    test 'lookup skips fetching a prediction for search phrases with less than three features' do
+      Detector::MlCitation.any_instance.expects(:fetch).never
+
+      with_enabled_mlcitation do
+        # This search phrase is expected to have only two non-zero feature values, which based on
+        # our analyses will never result in a predicted citation.
+        Detector::MlCitation.new('foobar (2025)')
+      end
+    end
+
+    test 'lookup does not skip fetching a prediction for search phrases with three or more features' do
+      Detector::MlCitation.any_instance.expects(:fetch).once
+
+      with_enabled_mlcitation do
+        # This search phrase is expected to have three non-zero feature values, which is the minimum
+        # number we expect to have any hope of a citation.
+        Detector::MlCitation.new('foobar (2025) 1234-76')
+      end
+    end
+
     # Record method
     test 'record does relevant work' do
       with_enabled_mlcitation do
diff --git a/test/test_helper.rb b/test/test_helper.rb
index d82bd7f..8fc69a8 100644
--- a/test/test_helper.rb
+++ b/test/test_helper.rb
@@ -13,6 +13,7 @@
 ENV['RAILS_ENV'] ||= 'test'
 require_relative '../config/environment'
 require 'rails/test_help'
+require 'mocha/minitest'
 
 VCR.configure do |config|
   config.ignore_localhost = false
@@ -124,4 +125,4 @@ def with_enabled_mlcitation
   }
 ensure
   ENV.replace(old_env)
-end
\ No newline at end of file
+end

From fb47d974158f7e5f1d988e4b05bd1547b619bd11 Mon Sep 17 00:00:00 2001
From: Matthew Bernhardt <mjbernha@mit.edu>
Date: Thu, 31 Jul 2025 16:39:25 -0400
Subject: [PATCH 2/2] Respond to code review feedback

This makes two slight changes after discussing code review feedback:

1. We move the assignment of the @detections instance variable down
   below the first guard clause in the Detector::MlCitation initializer.
   This better accounts for the fact that the Term model implements its
   own guard clause, resulting in the rest of the application seeing a
   detector result of null if the detector is not enabled via env vars.

2. Adds a note to the enough_nonzero_values? method comment explaining
   how we came to decide that three non-zero feature values is
   sufficient. This includes links to the notebooks where we conducted
   that analysis.
---
 app/models/detector/ml_citation.rb | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/app/models/detector/ml_citation.rb b/app/models/detector/ml_citation.rb
index 2942d88..f00f8d3 100644
--- a/app/models/detector/ml_citation.rb
+++ b/app/models/detector/ml_citation.rb
@@ -9,9 +9,10 @@ class MlCitation
     #   @param phrase String. Often a `Term.phrase`.
     #   @return Nothing intentional. Data is written to Boolean `@detections` during processing.
     def initialize(phrase)
-      @detections = false
       return unless self.class.expected_env?
 
+      @detections = false
+
       features = extract_features(phrase)
       return unless enough_nonzero_values?(features)
 
@@ -158,6 +159,11 @@ def fetch(features)
 
     # Enough_nonzero_values? checks that a provided hash contains at least three values which are not zero.
     #
+    # @note We chose 3 as our value here after analyzing the behavior of the citation detector across nearly a year of
+    #   search traffic. For searches which had only one or two features that are not zero, we found no actual citations.
+    #   To see the analyses, look at the "Filtering results" and "Surprising predictions" notebooks at
+    #   https://github.com/MITLibraries/tacos-notebooks/tree/main/notebooks/explorations
+    #
     # @param hash Hash
     # @return Integer
     def enough_nonzero_values?(hash)