From 34bfe5e6b4a3496a5db71d4b0ccc98f8893e1c41 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Wed, 30 Jul 2025 16:49:28 -0400 Subject: [PATCH 1/2] tco-190 Filter searches before citation detector ** Why are these changes being introduced: There is a transaction cost to calling the citation detector, both in terms of time and processing time. If we can develop a way to flag searches which have zero chance of being a citation, we can cut out this expense by skipping the citation detector. ** Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/tco-190 ** How does this address that need: This moves the extract_features method in Detector::MlCitation into the initialize method, allowing us to quickly check for whether a given term has enough non-zero features to make it worth calling the detector. From our analysis in the TACOS notebooks, we believe that phrases which result in only two non-zero values among all their features will never end up being a citation - and that this threshold will allow us to skip the citation detector in 90% of searches. The filtering is performed in a convenience method named enough_nonzero_values? (naming things is hard). ** Document any side effects to this change: The @detections instance variable is defined as false at the top of the initialize method, before the first guard clause, so that we get a consistent Boolean value in all conditions. This required one test to change that previously expected a nil. --- Gemfile | 1 + Gemfile.lock | 4 ++++ app/models/detector/ml_citation.rb | 24 ++++++++++++++++++------ test/models/detector/ml_citation_test.rb | 22 +++++++++++++++++++++- test/test_helper.rb | 3 ++- 5 files changed, 46 insertions(+), 8 deletions(-) diff --git a/Gemfile b/Gemfile index d9b315e..36cf6f3 100644 --- a/Gemfile +++ b/Gemfile @@ -128,6 +128,7 @@ group :test do # Use system testing [https://guides.rubyonrails.org/testing.html#system-testing] gem 'capybara' gem 'climate_control' + gem 'mocha' gem 'selenium-webdriver' gem 'simplecov' gem 'simplecov-lcov' diff --git a/Gemfile.lock b/Gemfile.lock index 4e9d09b..70f2576 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -237,6 +237,8 @@ GEM matrix (0.4.2) mini_mime (1.1.5) minitest (5.25.5) + mocha (2.7.1) + ruby2_keywords (>= 0.0.5) msgpack (1.8.0) multi_json (1.15.0) net-http (0.6.0) @@ -398,6 +400,7 @@ GEM rubocop (>= 1.75.0, < 2.0) rubocop-ast (>= 1.44.0, < 2.0) ruby-progressbar (1.13.0) + ruby2_keywords (0.0.5) rubyzip (2.4.1) sassc (2.4.0) ffi (~> 1.9) @@ -518,6 +521,7 @@ DEPENDENCIES importmap-rails jbuilder mitlibraries-theme! + mocha omniauth omniauth-rails_csrf_protection omniauth_openid_connect diff --git a/app/models/detector/ml_citation.rb b/app/models/detector/ml_citation.rb index c105f5f..2942d88 100644 --- a/app/models/detector/ml_citation.rb +++ b/app/models/detector/ml_citation.rb @@ -7,11 +7,15 @@ class MlCitation # For now the initialize method just needs to consult the external lambda. # # @param phrase String. Often a `Term.phrase`. - # @return Nothing intentional. Data is written to Hash `@detections` during processing. + # @return Nothing intentional. Data is written to Boolean `@detections` during processing. def initialize(phrase) + @detections = false return unless self.class.expected_env? - response = fetch(phrase) + features = extract_features(phrase) + return unless enough_nonzero_values?(features) + + response = fetch(features) @detections = response unless response == 'Error' end @@ -111,10 +115,10 @@ def define_lambda # define_payload defines the Hash that will be sent to the lambda. # # @return Hash - def define_payload(phrase) + def define_payload(features) { action: 'predict', - features: extract_features(phrase), + features: features, challenge_secret: self.class.lambda_secret } end @@ -135,9 +139,9 @@ def extract_features(phrase) # error handling with the response. # # @return Boolean or 'Error' - def fetch(phrase) + def fetch(features) lambda = define_lambda - payload = define_payload(phrase) + payload = define_payload(features) response = lambda.post(self.class.lambda_path, payload.to_json) @@ -151,5 +155,13 @@ def fetch(phrase) 'Error' end end + + # Enough_nonzero_values? checks that a provided hash contains at least three values which are not zero. + # + # @param hash Hash + # @return Integer + def enough_nonzero_values?(hash) + hash.values.count { |v| v != 0 } >= 3 + end end end diff --git a/test/models/detector/ml_citation_test.rb b/test/models/detector/ml_citation_test.rb index 185949b..2b1a8c4 100644 --- a/test/models/detector/ml_citation_test.rb +++ b/test/models/detector/ml_citation_test.rb @@ -77,12 +77,32 @@ class MlCitationTest < ActiveSupport::TestCase assert_instance_of Detector::MlCitation, prediction - assert_nil(prediction.detections) + assert_equal(false, prediction.detections) end end end end + test 'lookup skips fetching a prediction for search phrases with less than three features' do + Detector::MlCitation.any_instance.expects(:fetch).never + + with_enabled_mlcitation do + # This search phrase is expected to have only two non-zero feature values, which based on + # our analyses will never result in a predicted citation. + Detector::MlCitation.new('foobar (2025)') + end + end + + test 'lookup does not skip fetching a prediction for search phrases with three or more features' do + Detector::MlCitation.any_instance.expects(:fetch).once + + with_enabled_mlcitation do + # This search phrase is expected to have three non-zero feature values, which is the minimum + # number we expect to have any hope of a citation. + Detector::MlCitation.new('foobar (2025) 1234-76') + end + end + # Record method test 'record does relevant work' do with_enabled_mlcitation do diff --git a/test/test_helper.rb b/test/test_helper.rb index d82bd7f..8fc69a8 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -13,6 +13,7 @@ ENV['RAILS_ENV'] ||= 'test' require_relative '../config/environment' require 'rails/test_help' +require 'mocha/minitest' VCR.configure do |config| config.ignore_localhost = false @@ -124,4 +125,4 @@ def with_enabled_mlcitation } ensure ENV.replace(old_env) -end \ No newline at end of file +end From fb47d974158f7e5f1d988e4b05bd1547b619bd11 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Thu, 31 Jul 2025 16:39:25 -0400 Subject: [PATCH 2/2] Respond to code review feedback This makes two slight changes after discussing code review feedback: 1. We move the assignment of the @detections instance variable down below the first guard clause in the Detector::MlCitation initializer. This better accounts for the fact that the Term model implements its own guard clause, resulting in the rest of the application seeing a detector result of null if the detector is not enabled via env vars. 2. Adds a note to the enough_nonzero_values? method comment explaining how we came to decide that three non-zero feature values is sufficient. This includes links to the notebooks where we conducted that analysis. --- app/models/detector/ml_citation.rb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/app/models/detector/ml_citation.rb b/app/models/detector/ml_citation.rb index 2942d88..f00f8d3 100644 --- a/app/models/detector/ml_citation.rb +++ b/app/models/detector/ml_citation.rb @@ -9,9 +9,10 @@ class MlCitation # @param phrase String. Often a `Term.phrase`. # @return Nothing intentional. Data is written to Boolean `@detections` during processing. def initialize(phrase) - @detections = false return unless self.class.expected_env? + @detections = false + features = extract_features(phrase) return unless enough_nonzero_values?(features) @@ -158,6 +159,11 @@ def fetch(features) # Enough_nonzero_values? checks that a provided hash contains at least three values which are not zero. # + # @note We chose 3 as our value here after analyzing the behavior of the citation detector across nearly a year of + # search traffic. For searches which had only one or two features that are not zero, we found no actual citations. + # To see the analyses, look at the "Filtering results" and "Surprising predictions" notebooks at + # https://github.com/MITLibraries/tacos-notebooks/tree/main/notebooks/explorations + # # @param hash Hash # @return Integer def enough_nonzero_values?(hash)