From 3197938d8e8bb66051806f3914ebeafca32ba55f Mon Sep 17 00:00:00 2001
From: jazairi <16103405+jazairi@users.noreply.github.com>
Date: Fri, 21 Nov 2025 15:04:25 -0500
Subject: [PATCH 1/3] Improve 'all' tab pagination to handle edge cases
Why these changes are being introduced:
The zipper merge we implemented naively queries n/2 results from each
API and interleaves them, where n is the per-page value. This works if
both APIs return many results, but it can cause problems in smaller,
unbalanced result sets.
For example, the query term `doc edgerton` returns 50 Primo results and
4 TIMDEX results. Page 1 only shows 14 results (4 TIMDEX and 10 Primo),
and each subsequent page returns only 10 (all Primo).
Relevant ticket(s):
- [USE-179](https://mitlibraries.atlassian.net/browse/USE-179)
How this addresses that need:
This implements more sophisticated logic that first checks the number
of hits returned by each API and passes that, along with the pagination
information, to a Merged Search Paginator class. This service object
develops a 'merge plan', calculates API offsets, and merges the results
for each page.
Beyond page 1, queries on the 'all' tab now fetch twice from each API:
once (with a page size of 1) to determine the total number of hits for
the Merged Search Paginator, then again to fetch that page's results at
the calculated offsets. While hardly ideal, this was the only option I
could figure out to avoid losing results. Page 1 needs only a single
fetch from each API, because both offsets are zero and the hit totals
come back with the results.
Side effects of this change:
* We now clear cache before each search controller test. This was done
to avoid odd test behavior, but I ran the suite 50 times without any
issues, so it might be excessively cautious.
* The search controller continues to grow with this new logic. I tried
to split things into multiple helper methods, so if we want to move
more things to service objects later, it might be easier to do so.
* A failing cassette has been replaced with a mock.
---
app/controllers/search_controller.rb | 146 +++++++---
app/models/merged_search_paginator.rb | 73 +++++
test/controllers/search_controller_test.rb | 279 +++++++++++++++++---
test/models/merged_search_paginator_test.rb | 58 ++++
test/vcr_cassettes/advanced_title_data.yml | 90 -------
5 files changed, 479 insertions(+), 167 deletions(-)
create mode 100644 app/models/merged_search_paginator.rb
create mode 100644 test/models/merged_search_paginator_test.rb
delete mode 100644 test/vcr_cassettes/advanced_title_data.yml
diff --git a/app/controllers/search_controller.rb b/app/controllers/search_controller.rb
index c7b1301a..222fefe2 100644
--- a/app/controllers/search_controller.rb
+++ b/app/controllers/search_controller.rb
@@ -88,48 +88,117 @@ def load_timdex_results
end
def load_all_results
- # Fetch results from both APIs in parallel
- primo_data, timdex_data = fetch_all_data
+ current_page = @enhanced_query[:page] || 1
+ per_page = ENV.fetch('RESULTS_PER_PAGE', '20').to_i
+ data = if current_page.to_i == 1
+ fetch_all_tab_first_page(current_page, per_page)
+ else
+ fetch_all_tab_deeper_pages(current_page, per_page)
+ end
- # Combine errors from both APIs
- @errors = combine_errors(primo_data[:errors], timdex_data[:errors])
+ @results = data[:results]
+ @errors = data[:errors]
+ @pagination = data[:pagination]
+ @show_primo_continuation = data[:show_primo_continuation]
+ end
- # Zipper merge results from both APIs
- @results = merge_results(primo_data[:results], timdex_data[:results])
+ def fetch_all_tab_first_page(current_page, per_page)
+ primo_data, timdex_data = parallel_fetch({ offset: 0, per_page: per_page }, { offset: 0, per_page: per_page })
- # Use Analyzer for combined pagination calculation
- @pagination = Analyzer.new(@enhanced_query, timdex_data[:hits], :all,
- primo_data[:hits]).pagination
+ paginator = build_paginator_from_data(primo_data, timdex_data, current_page, per_page)
- # Handle primo continuation for high page numbers
- @show_primo_continuation = primo_data[:show_continuation] || false
+ assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page)
end
- def fetch_all_data
- # Parallel fetching from both APIs
- primo_thread = Thread.new { fetch_primo_data }
- timdex_thread = Thread.new { fetch_timdex_data }
+ def fetch_all_tab_deeper_pages(current_page, per_page)
+ primo_summary, timdex_summary = parallel_fetch({ offset: 0, per_page: 1 }, { offset: 0, per_page: 1 })
+
+ paginator = build_paginator_from_data(primo_summary, timdex_summary, current_page, per_page)
+
+ primo_data, timdex_data = fetch_all_tab_page_chunks(paginator)
+
+ assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page, deeper: true)
+ end
+
+ # Launch parallel fetch threads for Primo and Timdex and return their data
+ def parallel_fetch(primo_opts = {}, timdex_opts = {})
+ primo_thread = Thread.new { fetch_primo_data(**primo_opts) }
+ timdex_thread = Thread.new { fetch_timdex_data(**timdex_opts) }
[primo_thread.value, timdex_thread.value]
end
+ # Build a paginator from raw API response data
+ def build_paginator_from_data(primo_data, timdex_data, current_page, per_page)
+ primo_total = primo_data[:hits] || 0
+ timdex_total = timdex_data[:hits] || 0
+
+ MergedSearchPaginator.new(
+ primo_total: primo_total,
+ timdex_total: timdex_total,
+ current_page: current_page,
+ per_page: per_page
+ )
+ end
+
+ # For deeper pages, compute merge_plan and api_offsets, then conditionally fetch page chunks
+ def fetch_all_tab_page_chunks(paginator)
+ merge_plan = paginator.merge_plan
+ primo_count = merge_plan.count(:primo)
+ timdex_count = merge_plan.count(:timdex)
+ primo_offset, timdex_offset = paginator.api_offsets
+
+ primo_thread = primo_count > 0 ? Thread.new { fetch_primo_data(offset: primo_offset, per_page: primo_count) } : nil
+ timdex_thread = if timdex_count > 0
+ Thread.new do
+ fetch_timdex_data(offset: timdex_offset, per_page: timdex_count)
+ end
+ end
+
+ primo_data = if primo_thread
+ primo_thread.value
+ else
+ { results: [], errors: nil, hits: paginator.primo_total, show_continuation: false }
+ end
+
+ timdex_data = if timdex_thread
+ timdex_thread.value
+ else
+ { results: [], errors: nil, hits: paginator.timdex_total }
+ end
+
+ [primo_data, timdex_data]
+ end
+
+ # Assemble the final result hash from paginator and API data
+ def assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page, deeper: false)
+ primo_total = primo_data[:hits] || 0
+ timdex_total = timdex_data[:hits] || 0
+
+ merged = paginator.merge_results(primo_data[:results] || [], timdex_data[:results] || [])
+ errors = combine_errors(primo_data[:errors], timdex_data[:errors])
+ pagination = Analyzer.new(@enhanced_query, timdex_total, :all, primo_total).pagination
+
+ show_primo_continuation = if deeper
+ page_offset = (current_page - 1) * per_page
+ primo_data[:show_continuation] || (page_offset >= Analyzer::PRIMO_MAX_OFFSET)
+ else
+ primo_data[:show_continuation]
+ end
+
+ { results: merged, errors: errors, pagination: pagination, show_primo_continuation: show_primo_continuation }
+ end
+
def combine_errors(*error_arrays)
all_errors = error_arrays.compact.flatten
all_errors.any? ? all_errors : nil
end
- def merge_results(primo_results, timdex_results)
- (primo_results || []).zip(timdex_results || []).flatten.compact
- end
-
- def fetch_primo_data
+ def fetch_primo_data(offset: nil, per_page: nil)
+ # Fall back to the configured page size and the current page's offset when not provided
current_page = @enhanced_query[:page] || 1
- per_page = if @active_tab == 'all'
- ENV.fetch('RESULTS_PER_PAGE', '20').to_i / 2
- else
- ENV.fetch('RESULTS_PER_PAGE', '20').to_i
- end
- offset = (current_page - 1) * per_page
+ per_page ||= ENV.fetch('RESULTS_PER_PAGE', '20').to_i
+ offset ||= (current_page - 1) * per_page
# Check if we're beyond Primo API limits before making the request.
if offset >= Analyzer::PRIMO_MAX_OFFSET
@@ -151,8 +220,9 @@ def fetch_primo_data
if results.empty?
docs = primo_response['docs'] if primo_response.is_a?(Hash)
if docs.nil? || docs.empty?
- # Only show continuation for pagination scenarios (page > 1), not for searches with no results
- show_continuation = true if current_page > 1
+ # Only show continuation for pagination scenarios (where offset is present), not for
+ # searches with no results
+ show_continuation = true if offset > 0
else
errors = [{ 'message' => 'No more results available at this page number.' }]
end
@@ -164,19 +234,10 @@ def fetch_primo_data
{ results: [], pagination: {}, errors: handle_primo_errors(e), show_continuation: false, hits: 0 }
end
- def fetch_timdex_data
- # For all tab, modify query to use half page size
- if @active_tab == 'all'
- per_page = ENV.fetch('RESULTS_PER_PAGE', '20').to_i / 2
- page = @enhanced_query[:page] || 1
- from_offset = ((page - 1) * per_page).to_s
-
- query_builder = QueryBuilder.new(@enhanced_query)
- query = query_builder.query
- query['from'] = from_offset
- else
- query = QueryBuilder.new(@enhanced_query).query
- end
+ def fetch_timdex_data(offset: nil, per_page: nil)
+ query = QueryBuilder.new(@enhanced_query).query
+ query['from'] = offset.to_s if offset
+ query['size'] = per_page.to_s if per_page
response = query_timdex(query)
errors = extract_errors(response)
@@ -223,7 +284,8 @@ def query_timdex(query)
def query_primo(per_page, offset)
# We generate unique cache keys to avoid naming collisions.
- cache_key = generate_cache_key(@enhanced_query)
+ # Include per_page and offset in the cache key to ensure pagination works correctly.
+ cache_key = generate_cache_key(@enhanced_query.merge(per_page: per_page, offset: offset))
Rails.cache.fetch("#{cache_key}/primo", expires_in: 12.hours) do
primo_search = PrimoSearch.new(@enhanced_query[:tab])
diff --git a/app/models/merged_search_paginator.rb b/app/models/merged_search_paginator.rb
new file mode 100644
index 00000000..030fa77a
--- /dev/null
+++ b/app/models/merged_search_paginator.rb
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+# MergedSearchPaginator encapsulates stateless merged pagination logic for combining two API result sets.
+# It calculates the merge plan, API offsets, and merges the results for a given page.
+class MergedSearchPaginator
+ attr_reader :primo_total, :timdex_total, :current_page, :per_page
+
+ def initialize(primo_total:, timdex_total:, current_page:, per_page:)
+ @primo_total = primo_total
+ @timdex_total = timdex_total
+ @current_page = current_page
+ @per_page = per_page
+ end
+
+ # Returns an array of :primo and :timdex symbols for the merged result order on this page
+ def merge_plan
+ total_results = primo_total + timdex_total
+ start_index = (current_page - 1) * per_page
+ end_index = [start_index + per_page, total_results].min
+ plan = []
+ primo_used = 0
+ timdex_used = 0
+ i = 0
+ while i < end_index
+ if primo_used < primo_total && (timdex_used >= timdex_total || primo_used <= timdex_used)
+ source = :primo
+ primo_used += 1
+ elsif timdex_used < timdex_total
+ source = :timdex
+ timdex_used += 1
+ end
+ plan << source if i >= start_index
+ i += 1
+ end
+ plan
+ end
+
+ # Returns [primo_offset, timdex_offset] for the start of this page
+ def api_offsets
+ start_index = (current_page - 1) * per_page
+ primo_offset = 0
+ timdex_offset = 0
+ i = 0
+ while i < start_index
+ if primo_offset < primo_total && (timdex_offset >= timdex_total || primo_offset <= timdex_offset)
+ primo_offset += 1
+ elsif timdex_offset < timdex_total
+ timdex_offset += 1
+ else
+ break
+ end
+ i += 1
+ end
+ [primo_offset, timdex_offset]
+ end
+
+ # Merges two result arrays according to the merge plan
+ def merge_results(primo_results, timdex_results)
+ merged = []
+ primo_idx = 0
+ timdex_idx = 0
+ merge_plan.each do |source|
+ if source == :primo
+ merged << primo_results[primo_idx] if primo_idx < primo_results.length
+ primo_idx += 1
+ else
+ merged << timdex_results[timdex_idx] if timdex_idx < timdex_results.length
+ timdex_idx += 1
+ end
+ end
+ merged
+ end
+end
diff --git a/test/controllers/search_controller_test.rb b/test/controllers/search_controller_test.rb
index aa60242a..2aa86d12 100644
--- a/test/controllers/search_controller_test.rb
+++ b/test/controllers/search_controller_test.rb
@@ -1,8 +1,13 @@
require 'test_helper'
class SearchControllerTest < ActionDispatch::IntegrationTest
+ # Clearing cache before each test to prevent any cache-related flakiness from threading.
+ setup do
+ Rails.cache.clear
+ end
+
def mock_primo_search_success
- # Mock the Primo search components to avoid external API calls
+ # Mock the Primo search components to avoid external API calls (single call)
sample_doc = {
api: 'primo',
title: 'Sample Primo Document Title',
@@ -24,6 +29,29 @@ def mock_primo_search_success
NormalizePrimoResults.expects(:new).returns(mock_normalizer)
end
+ def mock_primo_search_all_tab
+ # Mock the Primo search components for the all tab (multiple calls)
+ sample_doc = {
+ api: 'primo',
+ title: 'Sample Primo Document Title',
+ format: 'Article',
+ year: '2025',
+ creators: [
+ { value: 'Foo Barston', link: nil },
+ { value: 'Baz Quxley', link: nil }
+ ],
+ links: [{ 'kind' => 'full record', 'url' => 'https://example.com/record' }]
+ }
+
+ mock_primo = mock('primo_search')
+ mock_primo.expects(:search).returns({ 'docs' => [sample_doc], 'info' => { 'total' => 1 } }).at_least_once
+ PrimoSearch.expects(:new).returns(mock_primo).at_least_once
+
+ mock_normalizer = mock('normalizer')
+ mock_normalizer.expects(:normalize).returns([sample_doc]).at_least_once
+ NormalizePrimoResults.expects(:new).returns(mock_normalizer).at_least_once
+ end
+
def mock_primo_search_with_hits(total_hits)
sample_docs = (1..10).map do |i|
{
@@ -48,7 +76,7 @@ def mock_primo_search_with_hits(total_hits)
end
def mock_timdex_search_success
- # Mock the TIMDEX GraphQL client to avoid external API calls
+ # Mock the TIMDEX GraphQL client to avoid external API calls (one or more calls)
sample_result = {
'api' => 'timdex',
'title' => 'Sample TIMDEX Document Title',
@@ -88,7 +116,51 @@ def mock_timdex_search_success
})
mock_response.stubs(:data).returns(mock_data)
- TimdexBase::Client.expects(:query).returns(mock_response)
+ TimdexBase::Client.expects(:query).returns(mock_response).at_least_once
+ end
+
+ def mock_timdex_search_all_tab
+ # Mock the TIMDEX GraphQL client for the all tab (multiple calls)
+ sample_result = {
+ 'api' => 'timdex',
+ 'title' => 'Sample TIMDEX Document Title',
+ 'timdexRecordId' => 'sample-record-123',
+ 'contentType' => [{ 'value' => 'Article' }],
+ 'dates' => [{ 'kind' => 'Publication date', 'value' => '2023' }],
+ 'contributors' => [{ 'value' => 'Foo Barston', 'kind' => 'Creator' }],
+ 'highlight' => [
+ {
+ 'matchedField' => 'summary',
+ 'matchedPhrases' => ['sample document']
+ }
+ ],
+ 'sourceLink' => 'https://example.com/record'
+ }
+
+ mock_response = mock('timdex_response')
+ mock_errors = mock('timdex_errors')
+ mock_errors.stubs(:details).returns({})
+ mock_errors.stubs(:to_h).returns({})
+ mock_response.stubs(:errors).returns(mock_errors)
+
+ mock_data = mock('timdex_data')
+ mock_search = mock('timdex_search')
+ mock_search.stubs(:to_h).returns({
+ 'hits' => 1,
+ 'aggregations' => {},
+ 'records' => [sample_result]
+ })
+ mock_data.stubs(:search).returns(mock_search)
+ mock_data.stubs(:to_h).returns({
+ 'search' => {
+ 'hits' => 1,
+ 'aggregations' => {},
+ 'records' => [sample_result]
+ }
+ })
+ mock_response.stubs(:data).returns(mock_data)
+
+ TimdexBase::Client.expects(:query).returns(mock_response).at_least_once
end
def mock_timdex_search_with_hits(total_hits)
@@ -126,13 +198,13 @@ def mock_timdex_search_with_hits(total_hits)
})
mock_response.stubs(:data).returns(mock_data)
- TimdexBase::Client.expects(:query).returns(mock_response)
+ TimdexBase::Client.expects(:query).returns(mock_response).at_least_once
# Mock the results normalization
normalized_results = sample_results.map { |result| result.merge({ source: 'TIMDEX' }) }
mock_normalizer = mock('normalizer')
- mock_normalizer.expects(:normalize).returns(normalized_results)
- NormalizeTimdexResults.expects(:new).returns(mock_normalizer)
+ mock_normalizer.expects(:normalize).returns(normalized_results).at_least_once
+ NormalizeTimdexResults.expects(:new).returns(mock_normalizer).at_least_once
end
test 'index shows basic search form by default' do
@@ -353,16 +425,50 @@ def mock_timdex_search_with_hits(total_hits)
end
test 'highlights partial is not rendered for results with no relevant highlights' do
- VCR.use_cassette('advanced title data',
- allow_playback_repeats: true,
- match_requests_on: %i[method uri body]) do
- get '/results?title=data&advanced=true'
- assert_response :success
+ # Stub TIMDEX response for this test to avoid VCR cassette mismatches.
+ sample_result = {
+ 'api' => 'timdex',
+ 'title' => 'Sample TIMDEX Document Title',
+ 'timdexRecordId' => 'sample-record-123',
+ 'contentType' => [{ 'value' => 'Article' }],
+ 'dates' => [{ 'kind' => 'Publication date', 'value' => '2023' }],
+ 'contributors' => [{ 'value' => 'Foo Barston', 'kind' => 'Creator' }],
+ 'highlight' => [],
+ 'sourceLink' => 'https://example.com/record'
+ }
- # We shouldn't see any highlighted terms because all of the matches will be on title, which is included in
- # SearchHelper#displayed_fields
- assert_select '#results .result-highlights ul li', { count: 0 }
- end
+ mock_response = mock('timdex_response')
+ mock_errors = mock('timdex_errors')
+ mock_errors.stubs(:details).returns({})
+ mock_errors.stubs(:to_h).returns({})
+ mock_response.stubs(:errors).returns(mock_errors)
+
+ mock_data = mock('timdex_data')
+ mock_search = mock('timdex_search')
+ mock_search.stubs(:to_h).returns({
+ 'hits' => 1,
+ 'aggregations' => {},
+ 'records' => [sample_result]
+ })
+ mock_data.stubs(:search).returns(mock_search)
+ mock_data.stubs(:to_h).returns({
+ 'search' => {
+ 'hits' => 1,
+ 'aggregations' => {},
+ 'records' => [sample_result]
+ }
+ })
+ mock_response.stubs(:data).returns(mock_data)
+
+ TimdexBase::Client.expects(:query).returns(mock_response).at_least_once
+
+ # Use the TIMDEX tab route to exercise highlighting behavior without running advanced search/VCR
+ get '/results?q=data&tab=timdex'
+ assert_response :success
+
+ # We shouldn't see any highlighted terms because the stubbed record's 'highlight' list is empty, so there are
+ # no matched phrases to list
+ assert_select '#results .result-highlights ul li', { count: 0 }
end
test 'searches with zero results are handled gracefully' do
@@ -646,8 +752,8 @@ def source_filter_count(controller)
# Tab functionality tests for USE
test 'results defaults to all tab when no tab parameter provided' do
# Mock both APIs since 'all' tab calls both
- mock_primo_search_success
- mock_timdex_search_success
+ mock_primo_search_all_tab
+ mock_timdex_search_all_tab
get '/results?q=test'
assert_response :success
@@ -799,7 +905,7 @@ def source_filter_count(controller)
})
mock_response.stubs(:data).returns(mock_data)
- TimdexBase::Client.expects(:query).returns(mock_response)
+ TimdexBase::Client.expects(:query).returns(mock_response).at_least_once
get '/results?q=nonexistentterm&tab=timdex'
assert_response :success
@@ -809,8 +915,8 @@ def source_filter_count(controller)
end
test 'all tab displays results from both TIMDEX and Primo' do
- mock_primo_search_success
- mock_timdex_search_success
+ mock_primo_search_all_tab
+ mock_timdex_search_all_tab
get '/results?q=test&tab=all'
assert_response :success
@@ -823,7 +929,7 @@ def source_filter_count(controller)
test 'all tab handles API errors gracefully' do
# Mock Primo to fail
PrimoSearch.expects(:new).raises(StandardError.new('Primo API Error'))
- mock_timdex_search_success
+ mock_timdex_search_all_tab
get '/results?q=test&tab=all'
assert_response :success
@@ -831,7 +937,7 @@ def source_filter_count(controller)
end
test 'all tab is default when no tab specified' do
- mock_primo_search_success
+ mock_primo_search_all_tab
mock_timdex_search_success
get '/results?q=test'
@@ -842,8 +948,8 @@ def source_filter_count(controller)
end
test 'all tab shows as active in navigation' do
- mock_primo_search_success
- mock_timdex_search_success
+ mock_primo_search_all_tab
+ mock_timdex_search_all_tab
get '/results?q=test&tab=all'
assert_response :success
@@ -852,16 +958,24 @@ def source_filter_count(controller)
end
test 'all tab shows primo continuation when page exceeds API offset limit' do
- mock_timdex_search_success
-
- # Mock Primo API to return empty results for high page number (beyond offset limit)
+ sample_doc = {
+ api: 'primo',
+ title: 'Sample Primo Document Title',
+ format: 'Article',
+ year: '2025',
+ creators: [
+ { value: 'Foo Barston', link: nil },
+ { value: 'Baz Quxley', link: nil }
+ ],
+ links: [{ 'kind' => 'full record', 'url' => 'https://example.com/record' }]
+ }
mock_primo = mock('primo_search')
- mock_primo.expects(:search).returns({ 'docs' => [], 'info' => { 'total' => 1000 } })
- PrimoSearch.expects(:new).returns(mock_primo)
-
+ mock_primo.expects(:search).returns({ 'docs' => [sample_doc], 'info' => { 'total' => 1 } }).at_least_once
+ PrimoSearch.expects(:new).returns(mock_primo).at_least_once
mock_normalizer = mock('normalizer')
- mock_normalizer.expects(:normalize).returns([])
- NormalizePrimoResults.expects(:new).returns(mock_normalizer)
+ mock_normalizer.expects(:normalize).returns([sample_doc]).at_least_once
+ NormalizePrimoResults.expects(:new).returns(mock_normalizer).at_least_once
+ mock_timdex_search_success
get '/results?q=test&tab=all&page=49'
assert_response :success
@@ -873,7 +987,24 @@ def source_filter_count(controller)
end
test 'all tab pagination displays combined hit counts' do
- mock_primo_search_with_hits(500)
+ sample_docs = (1..10).map do |i|
+ {
+ title: "Sample Primo Document Title \\#{i}",
+ format: 'Article',
+ year: '2025',
+ creators: [{ value: "Author \\#{i}", link: nil }],
+ links: [{ 'kind' => 'full record', 'url' => "https://example.com/record\\#{i}" }]
+ }
+ end
+ mock_primo = mock('primo_search')
+ mock_primo.expects(:search).returns({
+ 'docs' => sample_docs,
+ 'info' => { 'total' => 500 }
+ }).at_least_once
+ PrimoSearch.expects(:new).returns(mock_primo).at_least_once
+ mock_normalizer = mock('normalizer')
+ mock_normalizer.expects(:normalize).returns(sample_docs).at_least_once
+ NormalizePrimoResults.expects(:new).returns(mock_normalizer).at_least_once
mock_timdex_search_with_hits(300)
get '/results?q=test&tab=all'
@@ -885,7 +1016,24 @@ def source_filter_count(controller)
end
test 'all tab pagination includes next page link when more results available' do
- mock_primo_search_with_hits(500)
+ sample_docs = (1..10).map do |i|
+ {
+ title: "Sample Primo Document Title \\#{i}",
+ format: 'Article',
+ year: '2025',
+ creators: [{ value: "Author \\#{i}", link: nil }],
+ links: [{ 'kind' => 'full record', 'url' => "https://example.com/record\\#{i}" }]
+ }
+ end
+ mock_primo = mock('primo_search')
+ mock_primo.expects(:search).returns({
+ 'docs' => sample_docs,
+ 'info' => { 'total' => 500 }
+ }).at_least_once
+ PrimoSearch.expects(:new).returns(mock_primo).at_least_once
+ mock_normalizer = mock('normalizer')
+ mock_normalizer.expects(:normalize).returns(sample_docs).at_least_once
+ NormalizePrimoResults.expects(:new).returns(mock_normalizer).at_least_once
mock_timdex_search_with_hits(300)
get '/results?q=test&tab=all'
@@ -896,7 +1044,24 @@ def source_filter_count(controller)
end
test 'all tab pagination on page 2 includes previous page link' do
- mock_primo_search_with_hits(500)
+ sample_docs = (1..10).map do |i|
+ {
+ title: "Sample Primo Document Title \\#{i}",
+ format: 'Article',
+ year: '2025',
+ creators: [{ value: "Author \\#{i}", link: nil }],
+ links: [{ 'kind' => 'full record', 'url' => "https://example.com/record\\#{i}" }]
+ }
+ end
+ mock_primo = mock('primo_search')
+ mock_primo.expects(:search).returns({
+ 'docs' => sample_docs,
+ 'info' => { 'total' => 500 }
+ }).at_least_once
+ PrimoSearch.expects(:new).returns(mock_primo).at_least_once
+ mock_normalizer = mock('normalizer')
+ mock_normalizer.expects(:normalize).returns(sample_docs).at_least_once
+ NormalizePrimoResults.expects(:new).returns(mock_normalizer).at_least_once
mock_timdex_search_with_hits(300)
get '/results?q=test&tab=all&page=2'
@@ -908,4 +1073,48 @@ def source_filter_count(controller)
# Should show current range (21-40 for page 2)
assert_select '.pagination-container .current', text: /21 - 40 of 800/
end
+
+ test 'merge_results handles unbalanced API responses correctly' do
+ # Test case 1: Primo has fewer results than TIMDEX
+ paginator = MergedSearchPaginator.new(primo_total: 3, timdex_total: 5, current_page: 1, per_page: 8)
+ primo_results = %w[P1 P2 P3]
+ timdex_results = %w[T1 T2 T3 T4 T5]
+ merged = paginator.merge_results(primo_results, timdex_results)
+ expected = %w[P1 T1 P2 T2 P3 T3 T4 T5]
+ assert_equal expected, merged
+
+ # Test case 2: TIMDEX has fewer results than Primo
+ paginator = MergedSearchPaginator.new(primo_total: 5, timdex_total: 3, current_page: 1, per_page: 8)
+ primo_results = %w[P1 P2 P3 P4 P5]
+ timdex_results = %w[T1 T2 T3]
+ merged = paginator.merge_results(primo_results, timdex_results)
+ expected = %w[P1 T1 P2 T2 P3 T3 P4 P5]
+ assert_equal expected, merged
+
+ # Test case 3: Results exceed per_page limit (default 20)
+ paginator = MergedSearchPaginator.new(primo_total: 15, timdex_total: 15, current_page: 1, per_page: 20)
+ primo_results = (1..15).map { |i| "P#{i}" }
+ timdex_results = (1..15).map { |i| "T#{i}" }
+ merged = paginator.merge_results(primo_results, timdex_results)
+ assert_equal 20, merged.length
+ assert_equal 'P1', merged[0]
+ assert_equal 'T1', merged[1]
+ assert_equal 'P2', merged[2]
+ assert_equal 'T2', merged[3]
+
+ # Test case 4: One array is empty
+ paginator = MergedSearchPaginator.new(primo_total: 0, timdex_total: 3, current_page: 1, per_page: 3)
+ primo_results = []
+ timdex_results = %w[T1 T2 T3]
+ merged = paginator.merge_results(primo_results, timdex_results)
+ assert_equal %w[T1 T2 T3], merged
+
+ # Test case 5: more than 10 results from a single source can display when appropriate
+ paginator = MergedSearchPaginator.new(primo_total: 7, timdex_total: 11, current_page: 1, per_page: 18)
+ primo_results = (1..7).map { |i| "P#{i}" }
+ timdex_results = (1..11).map { |i| "T#{i}" }
+ merged = paginator.merge_results(primo_results, timdex_results)
+ expected = %w[P1 T1 P2 T2 P3 T3 P4 T4 P5 T5 P6 T6 P7 T7 T8 T9 T10 T11]
+ assert_equal expected, merged
+ end
end
diff --git a/test/models/merged_search_paginator_test.rb b/test/models/merged_search_paginator_test.rb
new file mode 100644
index 00000000..8627a5e7
--- /dev/null
+++ b/test/models/merged_search_paginator_test.rb
@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+
+require 'test_helper'
+
+class MergedSearchPaginatorTest < ActiveSupport::TestCase
+ test 'merge_plan handles balanced results' do
+ paginator = MergedSearchPaginator.new(primo_total: 3, timdex_total: 3, current_page: 1, per_page: 6)
+ assert_equal(%i[primo timdex primo timdex primo timdex], paginator.merge_plan)
+ end
+
+ test 'merge_plan handles unbalanced results' do
+ paginator = MergedSearchPaginator.new(primo_total: 6, timdex_total: 2, current_page: 1, per_page: 8)
+ assert_equal(%i[primo timdex primo timdex primo primo primo primo], paginator.merge_plan)
+ end
+
+ test 'api_offsets are calculated as expected' do
+ paginator = MergedSearchPaginator.new(primo_total: 10, timdex_total: 10, current_page: 2, per_page: 5)
+ assert_equal([3, 2], paginator.api_offsets)
+ end
+
+ test 'merge_results handles even results' do
+ paginator = MergedSearchPaginator.new(primo_total: 2, timdex_total: 2, current_page: 1, per_page: 4)
+ primo = %w[P1 P2]
+ timdex = %w[T1 T2]
+ assert_equal(%w[P1 T1 P2 T2], paginator.merge_results(primo, timdex))
+ end
+
+ test 'merge_results with shorter array' do
+ paginator = MergedSearchPaginator.new(primo_total: 3, timdex_total: 1, current_page: 1, per_page: 4)
+ primo = %w[P1 P2 P3]
+ timdex = %w[T1]
+ assert_equal(%w[P1 T1 P2 P3], paginator.merge_results(primo, timdex))
+ end
+
+ test 'api_offsets breaks when start_index exceeds totals' do
+ # Use very small totals and request a page far beyond available results to exercise the break
+ paginator = MergedSearchPaginator.new(primo_total: 1, timdex_total: 1, current_page: 5, per_page: 20)
+ primo_offset, timdex_offset = paginator.api_offsets
+
+ # Offsets should stop at the available totals (1 each)
+ assert_equal 1, primo_offset
+ assert_equal 1, timdex_offset
+ end
+
+ test 'merge_plan returns all primo when timdex is empty' do
+ paginator = MergedSearchPaginator.new(primo_total: 2, timdex_total: 0, current_page: 1, per_page: 5)
+ plan = paginator.merge_plan
+
+ assert_equal %i[primo primo], plan
+ end
+
+ test 'merge_plan returns all timdex when primo is empty' do
+ paginator = MergedSearchPaginator.new(primo_total: 0, timdex_total: 2, current_page: 1, per_page: 5)
+ plan = paginator.merge_plan
+
+ assert_equal %i[timdex timdex], plan
+ end
+end
diff --git a/test/vcr_cassettes/advanced_title_data.yml b/test/vcr_cassettes/advanced_title_data.yml
deleted file mode 100644
index 846c4e8b..00000000
--- a/test/vcr_cassettes/advanced_title_data.yml
+++ /dev/null
@@ -1,90 +0,0 @@
----
-http_interactions:
-- request:
- method: post
- uri: https://FAKE_TIMDEX_HOST/graphql
- body:
- encoding: UTF-8
- string: '{"query":"query TimdexSearch__BaseQuery($q: String, $citation: String,
- $contributors: String, $fundingInformation: String, $identifiers: String,
- $locations: String, $subjects: String, $title: String, $index: String, $from:
- String, $booleanType: String, $accessToFilesFilter: [String!], $contentTypeFilter:
- [String!], $contributorsFilter: [String!], $formatFilter: [String!], $languagesFilter:
- [String!], $literaryFormFilter: String, $placesFilter: [String!], $sourceFilter:
- [String!], $subjectsFilter: [String!]) {\n search(searchterm: $q, citation:
- $citation, contributors: $contributors, fundingInformation: $fundingInformation,
- identifiers: $identifiers, locations: $locations, subjects: $subjects, title:
- $title, index: $index, from: $from, booleanType: $booleanType, accessToFilesFilter:
- $accessToFilesFilter, contentTypeFilter: $contentTypeFilter, contributorsFilter:
- $contributorsFilter, formatFilter: $formatFilter, languagesFilter: $languagesFilter,
- literaryFormFilter: $literaryFormFilter, placesFilter: $placesFilter, sourceFilter:
- $sourceFilter, subjectsFilter: $subjectsFilter) {\n hits\n records {\n timdexRecordId\n title\n source\n contentType\n contributors
- {\n kind\n value\n }\n publicationInformation\n dates
- {\n kind\n value\n }\n links {\n kind\n restrictions\n text\n url\n }\n notes
- {\n kind\n value\n }\n highlight {\n matchedField\n matchedPhrases\n }\n provider\n rights
- {\n kind\n description\n uri\n }\n sourceLink\n summary\n }\n aggregations
- {\n accessToFiles {\n key\n docCount\n }\n contentType
- {\n key\n docCount\n }\n contributors {\n key\n docCount\n }\n format
- {\n key\n docCount\n }\n languages {\n key\n docCount\n }\n literaryForm
- {\n key\n docCount\n }\n places {\n key\n docCount\n }\n source
- {\n key\n docCount\n }\n subjects {\n key\n docCount\n }\n }\n }\n}","variables":{"from":"0","title":"data","booleanType":"AND","index":"FAKE_TIMDEX_INDEX"},"operationName":"TimdexSearch__BaseQuery"}'
- headers:
- Accept-Encoding:
- - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
- Accept:
- - application/json
- User-Agent:
- - MIT Libraries Client
- Content-Type:
- - application/json
- response:
- status:
- code: 200
- message: OK
- headers:
- Server:
- - Cowboy
- Date:
- - Thu, 25 Apr 2024 20:57:17 GMT
- Report-To:
- - '{"group":"heroku-nel","max_age":3600,"endpoints":[{"url":"https://nel.heroku.com/reports?ts=1714078637&sid=67ff5de4-ad2b-4112-9289-cf96be89efed&s=Oe%2BY3GtI7ZglEtcdCIpU4KA2AQDyWWWXZ%2BJu0RXMXp0%3D"}]}'
- Reporting-Endpoints:
- - heroku-nel=https://nel.heroku.com/reports?ts=1714078637&sid=67ff5de4-ad2b-4112-9289-cf96be89efed&s=Oe%2BY3GtI7ZglEtcdCIpU4KA2AQDyWWWXZ%2BJu0RXMXp0%3D
- Nel:
- - '{"report_to":"heroku-nel","max_age":3600,"success_fraction":0.005,"failure_fraction":0.05,"response_headers":["Via"]}'
- Connection:
- - keep-alive
- X-Frame-Options:
- - SAMEORIGIN
- X-Xss-Protection:
- - '0'
- X-Content-Type-Options:
- - nosniff
- X-Permitted-Cross-Domain-Policies:
- - none
- Referrer-Policy:
- - strict-origin-when-cross-origin
- Content-Type:
- - application/json; charset=utf-8
- Vary:
- - Accept, Origin
- Etag:
- - W/"cea195da477c7f17058ba8ea7172e175"
- Cache-Control:
- - max-age=0, private, must-revalidate
- X-Request-Id:
- - 9b9ae3f1-d1cc-4e08-b449-6505a46abce8
- X-Runtime:
- - '0.367373'
- Strict-Transport-Security:
- - max-age=63072000; includeSubDomains
- Content-Length:
- - '42683'
- Via:
- - 1.1 vegur
- body:
- encoding: ASCII-8BIT
- string: !binary |-
- {"data":{"search":{"hits":10000,"records":[{"timdexRecordId":"alma:990002860400106761","title":"Data data","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"Deep Sea Drilling Project. Information Handling Group"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"1976"}],"links":null,"notes":[{"kind":"Title Statement of Responsibility","value":["Information Handling Group, Deep Sea Drilling Project"]},{"kind":"General Note","value":["Title from caption"]},{"kind":"General Note","value":["Description based on: #12 (Nov. 1978)"]},{"kind":"Numbering Peculiarities Note","value":["Some numbers are revised edition"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma990002860400106761","summary":["A series of bulletins, each with a distinctive title, describing the various data processing activities of the Deep Sea Drilling Project and the Information Handling Group."]},{"timdexRecordId":"alma:9935147137306761","title":"Big data, open data and data development","contentType":["Language material"],"contributors":[{"kind":"author","value":"Monino, Jean-Louis"},{"kind":"author","value":"Sedkaoui, Soraya"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2016"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"O'Reilly Online Learning: Academic/Public Library Edition","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53562583990006761\u0026Force_direct=true"},{"kind":"Digital object URL","restrictions":null,"text":"Wiley Online Library UBCM all Online 
Books","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53629737340006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["Jean-Louis Monino, Soraya Sedkaoui"]},{"kind":"General Note","value":["Description based upon print version of record"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]},{"kind":"Source of Description Note","value":["Description based on print version record"]}],"highlight":[{"matchedField":"title","matchedPhrases":["Big \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e, open \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e and \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e development"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935147137306761","summary":["The world has become digital and technological advances have multiplied circuits with access to data, their processing and their diffusion. New technologies have now reached a certain maturity. Data are available to everyone, anywhere on the planet. The number of Internet users in 2014 was 2.9 billion or 41% of the world population. The need for knowledge is becoming apparent in order to understand this multitude of data. We must educate, inform and train the masses. The development of related technologies, such as the advent of the Internet, social networks, \"cloud-computing\" (digital factories), has increased the available volumes of data. Currently, each individual creates, consumes, uses digital information: more than 3.4 million e-mails are sent worldwide every second, or 107,000 billion annually with 14,600 e-mails per year per person, but more than 70% are spam. Billions of pieces of content are shared on social networks such as Facebook, more than 2.46 million every minute. 
We spend more than 4.8 hours a day on the Internet using a computer, and 2.1 hours using a mobile. Data, this new ethereal manna from heaven, is produced in real time. It comes in a continuous stream from a multitude of sources which are generally heterogeneous. This accumulation of data of all types (audio, video, files, photos, etc.) generates new activities, the aim of which is to analyze this enormous mass of information. It is then necessary to adapt and try new approaches, new methods, new knowledge and new ways of working, resulting in new properties and new challenges since SEO logic must be created and implemented. At company level, this mass of data is difficult to manage. Its interpretation is primarily a challenge. This impacts those who are there to \"manipulate\" the mass and requires a specific infrastructure for creation, storage, processing, analysis and recovery. The biggest challenge lies in \"the valuing of data\" available in quantity, diversity and access speed."]},{"timdexRecordId":"alma:9935242752006761","title":"Strata Data Superstream Series: Data Warehouses, Data Lakes, and Data Lakehouses","contentType":["Projected medium"],"contributors":null,"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2021"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"O'Reilly Online Learning: Academic/Public Library Edition","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53646255750006761\u0026Force_direct=true"}],"notes":null,"highlight":[{"matchedField":"title","matchedPhrases":["Strata \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Superstream Series: \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Warehouses, \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Lakes, and \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e 
Lakehouses"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935242752006761","summary":null},{"timdexRecordId":"alma:990034993430106761","title":"The Data Revolution : Big Data, Open Data, Data Infrastructures \u0026 Their Consequences","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"Kitchin, Rob"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2014"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"EBSCOhost Ebooks","url":"http://search.ebscohost.com/login.aspx?direct=true\u0026scope=site\u0026db=nlebk\u0026db=nlabk\u0026AN=801594"},{"kind":"EBSCOhost","restrictions":null,"text":null,"url":"http://search.ebscohost.com/login.aspx?direct=true\u0026scope=site\u0026db=nlebk\u0026db=nlabk\u0026AN=801594"}],"notes":[{"kind":"Title Statement of Responsibility","value":["Rob Kitchin"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]},{"kind":"Source of Description Note","value":["Print version record"]}],"highlight":[{"matchedField":"title","matchedPhrases":["The \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Revolution : Big \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e, Open \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e, \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Infrastructures \u0026 Their Consequences"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma990034993430106761","summary":["A seminal text, written by one of the world's leading experts in the field. 
In contrast to the hype and hubris of much media and business coverage, it provides a synoptic and truly critical analysis of 'big data', 'open data' and the emerging data landscape."]},{"timdexRecordId":"alma:990022970670106761","title":"The data revolution : big data, open data, data infrastructures \u0026 their consequences","contentType":["Language material"],"contributors":[{"kind":"author","value":"Kitchin, Rob"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2014"}],"links":null,"notes":[{"kind":"Title Statement of Responsibility","value":["Rob Kitchin"]},{"kind":"Bibliography Note","value":["Includes bibliographical references (pages 193-214) and index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["The \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e revolution : big \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e, open \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e, \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e infrastructures \u0026 their consequences"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma990022970670106761","summary":["\"Traditionally, data has been a scarce commodity which, given its value, has been either jealously guarded or expensively traded. In recent years, technological developments and political lobbying have turned this position on its head. Data now flow as a deep and wide torrent, are low in cost and supported by robust infrastructures, and are increasingly open and accessible. A data revolution is underway, one that is already reshaping how knowledge is produced, business conducted, and governance enacted, as well as raising many questions concerning surveillance, privacy, security, profiling, social sorting, and intellectual property rights. 
In contrast to the hype and hubris of much media and business coverage, The Data Revolution provides a synoptic and critical analysis of the emerging data landscape.\"--Excerpted from publisher's description."]},{"timdexRecordId":"alma:9935068007606761","title":"Data architecture : a primer for the data scientist : big data, data warehouse and data vault","contentType":["Language material"],"contributors":[{"kind":"author","value":"Inmon, W. H"},{"kind":"author","value":"Linstedt, Dan"},{"kind":"editor","value":"Elliot, Steven"},{"kind":"designer","value":"Rogers, Mark"}],"publicationInformation":["Morgan Kaufmann; 2015; Amsterdam, Netherlands","©2015"],"dates":[{"kind":"Publication date","value":"2015"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"O'Reilly Online Learning: Academic/Public Library Edition","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53545576320006761\u0026Force_direct=true"},{"kind":"Digital object URL","restrictions":null,"text":"Elsevier ScienceDirect Books Complete","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53545576310006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["W. H. 
Inmon, Dan Linstedt ; Steven Elliot, executive editor ; Mark Rogers, designer"]},{"kind":"General Note","value":["Includes index"]},{"kind":"Source of Description Note","value":["Description based on print version record"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e architecture : a primer for the \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e scientist : big \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e, \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e warehouse and \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e vault"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935068007606761","summary":["Today, the world is trying to create and educate data scientists because of the phenomenon of Big Data. And everyone is looking deeply into this technology. But no one is looking at the larger architectural picture of how Big Data needs to fit within the existing systems (data warehousing systems). Taking a look at the larger picture into which Big Data fits gives the data scientist the necessary context for how pieces of the puzzle should fit together. Most references on Big Data look at only one tiny part of a much larger whole. 
Until data gathered can be put into an existing framework or a"]},{"timdexRecordId":"alma:9935114452906761","title":"Java data analysis : data mining, big data analysis, NoSQL, and data visualization","contentType":["Language material"],"contributors":[{"kind":"author","value":"Hubbard, John R"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2017"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"O'Reilly Online Learning: Academic/Public Library Edition","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53555617160006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["John R. Hubbard"]},{"kind":"General Note","value":["Includes index"]},{"kind":"Source of Description Note","value":["Description based on online resource; title from PDF title page (ebrary, viewed October 18, 2017)"]}],"highlight":[{"matchedField":"title","matchedPhrases":["Java \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e analysis : \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e mining, big \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e analysis, NoSQL, and \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e visualization"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935114452906761","summary":["Get the most out of the popular Java libraries and tools to perform efficient data analysis About This Book Get your basics right for data analysis with Java and make sense of your data through effective visualizations. Use various Java APIs and tools such as Rapidminer and WEKA for effective data analysis and machine learning. 
This is your companion to understanding and implementing a solid data analysis solution using Java Who This Book Is For If you are a student or Java developer or a budding data scientist who wishes to learn the fundamentals of data analysis and learn to perform data analysis with Java, this book is for you. Some familiarity with elementary statistics and relational databases will be helpful but is not mandatory, to get the most out of this book. A firm understanding of Java is required. What You Will Learn Develop Java programs that analyze data sets of nearly any size, including text Implement important machine learning algorithms such as regression, classification, and clustering Interface with and apply standard open source Java libraries and APIs to analyze and visualize data Process data from both relational and non-relational databases and from time-series data Employ Java tools to visualize data in various forms Understand multimedia data analysis algorithms and implement them in Java. In Detail Data analysis is a process of inspecting, cleansing, transforming, and modeling data with the aim of discovering useful information. Java is one of the most popular languages to perform your data analysis tasks. This book will help you learn the tools and techniques in Java to conduct data analysis without any hassle. After getting a quick overview of what data science is and the steps involved in the process, you'll learn the statistical data analysis techniques and implement them using the popular Java APIs and libraries. Through practical examples, you will also learn the machine learning concepts such as classification and regression. In the process, you'll familiarize yourself with tools such as Rapidminer and WEKA and see how these Java-based tools can be used effectively for analysis. You will also learn how to analyze text and other types of multimedia. Learn to work with relational, NoSQL, and time-series data. 
This book will also show you how you can utilize different Java-based libraries to create insightful and easy to understand plots and graphs. By the end of this book, you will have a solid understanding of..."]},{"timdexRecordId":"alma:990004603640106761","title":"Data with semantics : data models and data management","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"Thompson, J. Patrick"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"1989"}],"links":null,"notes":[{"kind":"Title Statement of Responsibility","value":["J. Patrick Thompson"]},{"kind":"General Note","value":["Includes index"]},{"kind":"Bibliography Note","value":["Bibliography: p. 465-468"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e with semantics : \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e models and \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e management"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma990004603640106761","summary":null},{"timdexRecordId":"aspace:repositories-2-resources-1273","title":"\"Data Driven\" Film Interviews Collection","contentType":["Archival materials"],"contributors":[{"kind":"source","value":"Zernike, Kate"},{"kind":"source","value":"Stubbe, JoAnne"},{"kind":"source","value":"Sive, Hazel L."},{"kind":"source","value":"Schwettmann, Sarah"},{"kind":"source","value":"Royden, Leigh Handy"},{"kind":"source","value":"Malanotte-Rizzoli, Paola, 1946-"},{"kind":"source","value":"Pardue, Mary Lou"},{"kind":"source","value":"Orr-Weaver, Terry L."},{"kind":"source","value":"McNutt, Marcia Kemper, 1952-"},{"kind":"source","value":"Lehmann, Ruth"},{"kind":"source","value":"Hopkins, Nancy (Nancy H.)"},{"kind":"source","value":"Gibson, Lorna J."},{"kind":"source","value":"Chisholm, Sallie W."},{"kind":"source","value":"Ceyer, Sylvia 
Teresse"},{"kind":"source","value":"Bhatia, Sangeeta, 1968-"},{"kind":"source","value":"Bailyn, Lotte"},{"kind":"Creator","value":"Wicked Delicate Films"},{"kind":"Creator","value":"Massachusetts Institute of Technology. MIT Press"}],"publicationInformation":null,"dates":[{"kind":"creation","value":"2018-08-28"}],"links":null,"notes":[{"kind":"Historical Note","value":["A Study on the Status of Women Faculty in the School of Science at MIT: How a Committee on Women Faculty came to be established by the Dean of the School of Science, what the Committee and the Dean learned and accomplished, and recommendations for the future. MIT Faculty Newsletter , March 1999.","In 1995 the Dean of Science established a Committee to analyze the status of women faculty in the six departments in the School of Science at the Masschusetts Institute of Technology (MIT). The Committee submitted a report of its findings in August, 1996 and amended reports in 1997 and 1998. The Committee discovered that junior women faculty felt well supported within their departments. In contrast to junior women, many tenured women faculty felt marginalized and excluded from a significant role in their departments. Marginalization increased as women progressed through their careers at MIT.","View the March 1999 MIT Faculty Newsletter for more information on this report."]},{"kind":"Scope and Contents","value":["This collection consists of video interviews and transcripts with 17 female Masschusetts Institute of Technology faculty members and the short documentary using the interviews, \"The Uprising\". The interviews focus on women faculty in science and engineering at MIT, and more broadly gender equity issues in STEM fields. Specifically referencing events discussed the 1999 report, Study on the Status of Women Faculty in Science at MIT. The interviews were produced by Wicked Delicate Films in conjunction with the MIT Press and MIT Libraries. Interviews may be used in a future film, Data Driven. 
A documentary was made by Wicked Delicate Films called \"Picture A Scientist\" which featured some of the footage."]}],"highlight":[{"matchedField":"title","matchedPhrases":["\"\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Driven\" Film Interviews Collection"]}],"provider":null,"rights":[{"kind":"Conditions Governing Access","description":"Most of the collection is open for reading room access only per the donor agreement. Interviews with Nancy Hopkins are fully restricted. See access notes for individual items for more details.","uri":null},{"kind":"Conditions Governing Use","description":"Access to collections in the Department of Distinctive Collections is not authorization to publish. Please see the MIT Libraries Permissions Policy for permission information. Copyright of some items in this collection may be held by respective creators, not by the donor of the collection or MIT.","uri":null}],"sourceLink":"https://archivesspace.mit.edu/repositories/2/resources/1273","summary":null},{"timdexRecordId":"alma:9935428911006761","title":"The data revolution : a critical analysis of big data, open data \u0026 data infrastructures","contentType":["Language material"],"contributors":[{"kind":"author","value":"Kitchin, Rob"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2022"}],"links":null,"notes":[{"kind":"Title Statement of Responsibility","value":["Rob Kitchin"]},{"kind":"Bibliography Note","value":["Includes bibliographical references (pages 309-345) and index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["The \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e revolution : a critical analysis of big \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e, open \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e \u0026 \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e 
infrastructures"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935428911006761","summary":null},{"timdexRecordId":"alma:9935181245506761","title":"Intelligent data analysis : from data gathering to data comprehension","contentType":["Language material"],"contributors":[{"kind":"editor","value":"Gupta, Deepak"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2020"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"O'Reilly Online Learning: Academic/Public Library Edition","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53635037230006761\u0026Force_direct=true"},{"kind":"Digital object URL","restrictions":null,"text":"Wiley Online Library","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53640988780006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["edited by Deepak Gupta [and three others]"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]},{"kind":"Source of Description Note","value":["Description based on print version record"]}],"highlight":[{"matchedField":"title","matchedPhrases":["Intelligent \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e analysis : from \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e gathering to \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e comprehension"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935181245506761","summary":["\"The new tool for analyses is ?Intelligent Data Analysis (IDA)?. 
IDA can be defined as the use of specialized statistical, pattern recognition, machine learning, data abstraction, and visualization tools for analysis of data and discovery of mechanisms that created the data. Such data are typically complex, meaning that they are characterized by many records, many variables, subtle interactions between variables, or a combination of all three. Engineering, computing sciences, database science, machine learning, and even artificial intelligence are bringing their powers to this newly born data analysis discipline. The main idea underlying the concept of Intelligent Data Analysis is extracting knowledge from a very large amount of data, with a very large amount of variables; data that represents very complex, non-linear, real-life problems. Moreover, IDA can help when starting from the raw data, coping with prediction tasks without knowing the theoretical description of the underlying process, classification tasks of new events based on past ones, or modeling the aforementioned unknown process. 
Classification, prediction, and modeling are the cornerstones that Intelligent Data Analysis can bring to us\"--"]},{"timdexRecordId":"alma:990009384570106761","title":"e-Data : turning data into information with data warehousing","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"Dyché, Jill"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2000"}],"links":null,"notes":[{"kind":"Title Statement of Responsibility","value":["Jill Dyché"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["e-\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e : turning \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e into information with \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e warehousing"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma990009384570106761","summary":null},{"timdexRecordId":"alma:9935166318106761","title":"Data protection : ensuring data availability","contentType":["Language material"],"contributors":[{"kind":"author","value":"De Guise, Preston"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2020"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"Taylor \u0026 Francis Evidence Based Ebook Collection","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53659508850006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["Preston De Guise"]},{"kind":"Source of Description Note","value":["Description based on print version record"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e protection : ensuring \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e 
availability"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935166318106761","summary":["\"This book arms readers with information for making decisions on how to protect data from loss in the cloud, on-site, or both. It explains the changing face of data recovery and techniques for dealing with big data. The second edition has new chapters on ethical and legal issues, convergent data protection, architecture, smart data protection, and protection at the edge. It also includes expanded chapters on data protection in the cloud and protecting infrastructure. Key Features: Protect data and systems from ransomware and other cyberthreats Become compliant with legal requirements for protecting data Protect data in the cloud, on-premises, or in mixed environments Tackle deduplication to ensure data integrity Author Bio: Preston de Guise has been working with data recovery products for his entire career - designing, implementing and supporting solutions for governments, universities, and businesses ranging from SMEs to Fortune 500 companies. 
This broad exposure to industry verticals and business sizes has enabled Preston to understand not only the technical requirements of data protection and recovery, but the management and procedural aspects too\"--"]},{"timdexRecordId":"alma:9935511044006761","title":"Data protection ensuring data availability","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"De Guise, Preston"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2020"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"Taylor \u0026 Francis eBooks Complete","url":"https://www.taylorfrancis.com/books/9780367463496"},{"kind":"Digital object URL","restrictions":null,"text":null,"url":"https://www.taylorfrancis.com/books/9780367463496"}],"notes":[{"kind":"Title Statement of Responsibility","value":["by Preston De Guise"]},{"kind":"General Note","value":["6.5 Self-Reflection"]},{"kind":"Source of Description Note","value":["OCLC-licensed vendor bibliographic record"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e protection ensuring \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e availability"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935511044006761","summary":null},{"timdexRecordId":"alma:9935181015606761","title":"Data analytics and big data","contentType":["Language material"],"contributors":[{"kind":"author","value":"Sedkaoui, Soraya"}],"publicationInformation":["ISTE Ltd/John Wiley and Sons Inc; 2018; Hoboken, New Jersey"],"dates":[{"kind":"Publication date","value":"2018"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"Wiley Online Library","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53636613240006761\u0026Force_direct=true"}],"notes":[{"kind":"Title 
Statement of Responsibility","value":["Soraya Sedkaoui"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]},{"kind":"Source of Description Note","value":["Description based on print version record"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e analytics and big \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935181015606761","summary":null},{"timdexRecordId":"alma:9935084024406761","title":"Data Preprocessing in Data Mining","contentType":["Language material"],"contributors":[{"kind":"author","value":"García, Salvador"},{"kind":"author","value":"Luengo, Julián"},{"kind":"author","value":"Herrera, Francisco"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2015"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"SpringerLink Books Engineering","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53622311660006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["by Salvador García, Julián Luengo, Francisco Herrera"]},{"kind":"General Note","value":["Description based upon print version of record"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Preprocessing in \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Mining"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935084024406761","summary":["Data Preprocessing for Data Mining addresses one of the most important issues within the well-known Knowledge 
Discovery from Data process. Data directly taken from the source will likely have inconsistencies, errors or most importantly, it is not ready to be considered for a data mining process. Furthermore, the increasing amount of data in recent science, industry and business applications, calls to the requirement of more complex tools to analyze it. Thanks to data preprocessing, it is possible to convert the impossible into possible, adapting the data to fulfill the input demands of each data mining algorithm. Data preprocessing includes the data reduction techniques, which aim at reducing the complexity of the data, detecting or removing irrelevant and noisy elements from the data. This book is intended to review the tasks that fill the gap between the data acquisition from the source and the data mining process. A comprehensive look from a practical point of view, including basic concepts and surveying the techniques proposed in the specialized literature, is given.Each chapter is a stand-alone guide to a particular data preprocessing topic, from basic concepts and detailed descriptions of classical algorithms, to an incursion of an exhaustive catalog of recent developments. The in-depth technical descriptions make this book suitable for technical professionals, researchers, senior undergraduate and graduate students in data science, computer science and engineering."]},{"timdexRecordId":"alma:990021246000106761","title":"Data mining and data warehousing","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"Mourya, S. K"},{"kind":"Not specified","value":"Gupta, Shalu"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2013"}],"links":null,"notes":[{"kind":"Title Statement of Responsibility","value":["S.K. 
Mourya, Shalu Gupta"]},{"kind":"General Note","value":["Includes index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e mining and \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e warehousing"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma990021246000106761","summary":null},{"timdexRecordId":"alma:990013541970106761","title":"Data mining and data visualization","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"Rao, C. Radhakrishna (Calyampudi Radhakrishna)"},{"kind":"Not specified","value":"Wegman, Edward J"},{"kind":"Not specified","value":"Solka, Jeffrey L"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2005"}],"links":null,"notes":[{"kind":"Title Statement of Responsibility","value":["edited by C.R. Rao, E.J. Wegman, J.L. Solka"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e mining and \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e visualization"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma990013541970106761","summary":null},{"timdexRecordId":"alma:9935095680506761","title":"Data mining and data visualization","contentType":["Language material"],"contributors":[{"kind":"Not specified","value":"Rao, C. 
Radhakrishna (Calyampudi Radhakrishna)"},{"kind":"Not specified","value":"Wegman, Edward J"},{"kind":"Not specified","value":"Solka, Jeffrey L"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2005"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"Elsevier ScienceDirect Books Complete","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53551559090006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["edited by C.R. Rao, E.J. Wegman, J.L. Solka"]},{"kind":"General Note","value":["Description based upon print version of record"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e mining and \u003cspan class=\"highlight\"\u003edata\u003c/span\u003e visualization"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935095680506761","summary":["This book focuses on dealing with large-scale data, a field commonly referred to as data mining. The book is divided into three sections. The first deals with an introduction to statistical aspects of data mining and machine learning and includes applications to text analysis, computer intrusion detection, and hiding of information in digital files. The second section focuses on a variety of statistical methodologies that have proven to be effective in data mining applications. 
These include clustering, classification, multivariate density estimation, tree-based methods, pattern recognition, o"]},{"timdexRecordId":"alma:9935146343506761","title":"Data Mining on Multimedia Data","contentType":["Language material"],"contributors":[{"kind":"author","value":"Perner, Petra"}],"publicationInformation":null,"dates":[{"kind":"Publication date","value":"2003"}],"links":[{"kind":"Digital object URL","restrictions":null,"text":"SpringerLink Books Lecture Notes In Computer Science Archive","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53562407410006761\u0026Force_direct=true"},{"kind":"Digital object URL","restrictions":null,"text":"Springer Nature - Springer Book Archive - Collection 2000-2004","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53562407390006761\u0026Force_direct=true"},{"kind":"Digital object URL","restrictions":null,"text":"Springer Nature - Springer Lecture Notes in Computer Science eBooks","url":"https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?u.ignore_date_coverage=true\u0026portfolio_pid=53562407400006761\u0026Force_direct=true"}],"notes":[{"kind":"Title Statement of Responsibility","value":["by Petra Perner"]},{"kind":"General Note","value":["Bibliographic Level Mode of Issuance: Monograph"]},{"kind":"Bibliography Note","value":["Includes bibliographical references and index"]}],"highlight":[{"matchedField":"title","matchedPhrases":["\u003cspan class=\"highlight\"\u003eData\u003c/span\u003e Mining on Multimedia \u003cspan class=\"highlight\"\u003eData\u003c/span\u003e"]}],"provider":null,"rights":null,"sourceLink":"https://mit.primo.exlibrisgroup.com/discovery/fulldisplay?vid=01MIT_INST:MIT\u0026docid=alma9935146343506761","summary":["Despite being a young field of research and development, data mining has proved to be a successful approach to extracting 
knowledge from huge collections of structured digital data collection as usually stored in databases. Whereas data mining was done in early days primarily on numerical data, nowadays multimedia and Internet applications drive the need to develop data mining methods and techniques that can work on all kinds of data such as documents, images, and signals. This book introduces the basic concepts of mining multimedia data and demonstrates how to apply these methods in various application fields. It is written for students, ambitioned professionals from industry and medicine, and for scientists who want to contribute R\u0026D work to the field or apply this new technology."]}],"aggregations":{"accessToFiles":[{"key":"unknown: check with owning institution","docCount":3527},{"key":"MIT authentication required","docCount":51}],"contentType":[{"key":"language material","docCount":32648},{"key":"polygon data","docCount":1680},{"key":"article","docCount":1492},{"key":"thesis","docCount":1349},{"key":"dataset","docCount":1326},{"key":"manuscript language material","docCount":1076},{"key":"projected medium","docCount":999},{"key":"point data","docCount":954},{"key":"vector data","docCount":383},{"key":"raster data","docCount":373}],"contributors":[{"key":"geological survey (u.s.)","docCount":2408},{"key":"massachusetts institute of technology. department of electrical engineering and computer science","docCount":1057},{"key":"national bureau of economic research","docCount":805},{"key":"united states. government accountability office","docCount":781},{"key":"environmental systems research institute (redlands, calif.)","docCount":738},{"key":"institute of electrical and electronics engineers","docCount":604},{"key":"east view cartographic, incorporated","docCount":561},{"key":"association for computing machinery","docCount":432},{"key":"massachusetts institute of technology. 
department of electrical engineering and computer science.","docCount":396},{"key":"owen, andrew","docCount":383}],"format":[{"key":"electronic resource","docCount":4849},{"key":"shapefile","docCount":3057},{"key":"geotiff","docCount":373},{"key":"geopackage","docCount":78},{"key":"pdf","docCount":19},{"key":"jpeg","docCount":17},{"key":"tiff","docCount":10}],"languages":[{"key":"english","docCount":37029},{"key":"eng","docCount":1659},{"key":"en_us","docCount":1427},{"key":"en","docCount":918},{"key":"in english","docCount":375},{"key":"original language in english","docCount":132},{"key":"german","docCount":93},{"key":"french","docCount":83},{"key":"russian","docCount":35},{"key":"spanish","docCount":30}],"literaryForm":[{"key":"nonfiction","docCount":27311},{"key":"fiction","docCount":4953}],"places":[{"key":"earth (planet)","docCount":355},{"key":"china","docCount":321},{"key":"united states","docCount":256},{"key":"europe","docCount":179},{"key":"puerto rico","docCount":143},{"key":"ecuador","docCount":107},{"key":"republic of ecuador","docCount":107},{"key":"canada","docCount":101},{"key":"india","docCount":89},{"key":"paraguay","docCount":87}],"source":[{"key":"mit alma","docCount":35308},{"key":"opengeometadata gis resources","docCount":3527},{"key":"dspace@mit","docCount":3333},{"key":"woods hole open access server","docCount":789},{"key":"zenodo","docCount":645},{"key":"abdul latif jameel poverty action lab dataverse","docCount":60},{"key":"mit gis resources","docCount":51},{"key":"research databases","docCount":15},{"key":"libguides","docCount":7},{"key":"mit archivesspace","docCount":1}],"subjects":[{"key":"society","docCount":4284},{"key":"datasets","docCount":3290},{"key":"boundaries","docCount":2846},{"key":"united states","docCount":2653},{"key":"data mining","docCount":2151},{"key":"database management","docCount":1960},{"key":"artificial intelligence","docCount":1921},{"key":"big 
data","docCount":1573},{"key":"economy","docCount":1026},{"key":"census","docCount":985}]}}}}
- recorded_at: Thu, 25 Apr 2024 20:57:18 GMT
-recorded_with: VCR 6.2.0
From 0106e5f94150518bf71cb5bdbd2f310932d43f50 Mon Sep 17 00:00:00 2001
From: jazairi <16103405+jazairi@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:25:04 -0500
Subject: [PATCH 2/3] Refactor all tab logic to service and enable cache
Why these changes are being introduced:
In discussions of the PR in review for USE-179, we determined that the
proposed pagination improvements could be more efficient. We also
determined that the code changes were difficult to follow, and could
use better documentation.
Relevant ticket(s):
- USE-179
How this addresses that need:
This commit caches the page 1 'summary' API calls, which we use to
gather the hit counts from each API to calculate pagination on deeper
pages. It also abstracts the 'all' tab code to a 'Merged Search Service',
mirroring the design pattern of the Merged Search Paginator, and adds
docstrings to the methods in that service.
Side effects of this change:
We are still making two API calls on deeper pages when the hit totals
are not cached. I could not find a workaround for this while still
supporting nonlinear pagination. However, the vast majority of users
(even bots, presumably) will begin their search at page 1, so hopefully
this is a rare occurrence.
---
app/controllers/search_controller.rb | 106 +---------
app/models/merged_search_service.rb | 235 +++++++++++++++++++++
test/controllers/search_controller_test.rb | 23 ++
test/models/merged_search_service_test.rb | 202 ++++++++++++++++++
4 files changed, 469 insertions(+), 97 deletions(-)
create mode 100644 app/models/merged_search_service.rb
create mode 100644 test/models/merged_search_service_test.rb
diff --git a/app/controllers/search_controller.rb b/app/controllers/search_controller.rb
index 222fefe2..00a431f7 100644
--- a/app/controllers/search_controller.rb
+++ b/app/controllers/search_controller.rb
@@ -90,11 +90,15 @@ def load_timdex_results
def load_all_results
current_page = @enhanced_query[:page] || 1
per_page = ENV.fetch('RESULTS_PER_PAGE', '20').to_i
- data = if current_page.to_i == 1
- fetch_all_tab_first_page(current_page, per_page)
- else
- fetch_all_tab_deeper_pages(current_page, per_page)
- end
+
+ service = MergedSearchService.new(
+ enhanced_query: @enhanced_query,
+ active_tab: @active_tab,
+ cache: Rails.cache,
+ primo_fetcher: ->(offset:, per_page:, query: nil) { fetch_primo_data(offset: offset, per_page: per_page) },
+ timdex_fetcher: ->(offset:, per_page:, query: nil) { fetch_timdex_data(offset: offset, per_page: per_page) }
+ )
+ data = service.fetch(page: current_page, per_page: per_page)
@results = data[:results]
@errors = data[:errors]
@@ -102,98 +106,6 @@ def load_all_results
@show_primo_continuation = data[:show_primo_continuation]
end
- def fetch_all_tab_first_page(current_page, per_page)
- primo_data, timdex_data = parallel_fetch({ offset: 0, per_page: per_page }, { offset: 0, per_page: per_page })
-
- paginator = build_paginator_from_data(primo_data, timdex_data, current_page, per_page)
-
- assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page)
- end
-
- def fetch_all_tab_deeper_pages(current_page, per_page)
- primo_summary, timdex_summary = parallel_fetch({ offset: 0, per_page: 1 }, { offset: 0, per_page: 1 })
-
- paginator = build_paginator_from_data(primo_summary, timdex_summary, current_page, per_page)
-
- primo_data, timdex_data = fetch_all_tab_page_chunks(paginator)
-
- assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page, deeper: true)
- end
-
- # Launch parallel fetch threads for Primo and Timdex and return their data
- def parallel_fetch(primo_opts = {}, timdex_opts = {})
- primo_thread = Thread.new { fetch_primo_data(**primo_opts) }
- timdex_thread = Thread.new { fetch_timdex_data(**timdex_opts) }
-
- [primo_thread.value, timdex_thread.value]
- end
-
- # Build a paginator from raw API response data
- def build_paginator_from_data(primo_data, timdex_data, current_page, per_page)
- primo_total = primo_data[:hits] || 0
- timdex_total = timdex_data[:hits] || 0
-
- MergedSearchPaginator.new(
- primo_total: primo_total,
- timdex_total: timdex_total,
- current_page: current_page,
- per_page: per_page
- )
- end
-
- # For deeper pages, compute merge_plan and api_offsets, then conditionally fetch page chunks
- def fetch_all_tab_page_chunks(paginator)
- merge_plan = paginator.merge_plan
- primo_count = merge_plan.count(:primo)
- timdex_count = merge_plan.count(:timdex)
- primo_offset, timdex_offset = paginator.api_offsets
-
- primo_thread = primo_count > 0 ? Thread.new { fetch_primo_data(offset: primo_offset, per_page: primo_count) } : nil
- timdex_thread = if timdex_count > 0
- Thread.new do
- fetch_timdex_data(offset: timdex_offset, per_page: timdex_count)
- end
- end
-
- primo_data = if primo_thread
- primo_thread.value
- else
- { results: [], errors: nil, hits: paginator.primo_total, show_continuation: false }
- end
-
- timdex_data = if timdex_thread
- timdex_thread.value
- else
- { results: [], errors: nil, hits: paginator.timdex_total }
- end
-
- [primo_data, timdex_data]
- end
-
- # Assemble the final result hash from paginator and API data
- def assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page, deeper: false)
- primo_total = primo_data[:hits] || 0
- timdex_total = timdex_data[:hits] || 0
-
- merged = paginator.merge_results(primo_data[:results] || [], timdex_data[:results] || [])
- errors = combine_errors(primo_data[:errors], timdex_data[:errors])
- pagination = Analyzer.new(@enhanced_query, timdex_total, :all, primo_total).pagination
-
- show_primo_continuation = if deeper
- page_offset = (current_page - 1) * per_page
- primo_data[:show_continuation] || (page_offset >= Analyzer::PRIMO_MAX_OFFSET)
- else
- primo_data[:show_continuation]
- end
-
- { results: merged, errors: errors, pagination: pagination, show_primo_continuation: show_primo_continuation }
- end
-
- def combine_errors(*error_arrays)
- all_errors = error_arrays.compact.flatten
- all_errors.any? ? all_errors : nil
- end
-
def fetch_primo_data(offset: nil, per_page: nil)
# Default to current page if not provided
current_page = @enhanced_query[:page] || 1
diff --git a/app/models/merged_search_service.rb b/app/models/merged_search_service.rb
new file mode 100644
index 00000000..69c8ec51
--- /dev/null
+++ b/app/models/merged_search_service.rb
@@ -0,0 +1,235 @@
+require 'digest'
+
+# Orchestrates merged "all" tab searches across Primo and TIMDEX.
+#
+# Handles parallel fetches, per-query totals caching, pagination calculation via
+# `MergedSearchPaginator`, and assembly of a controller-friendly response hash.
+class MergedSearchService
+ # Time to live value for cache expiration.
+ TTL = 10.minutes
+
+ # Initialize a new MergedSearchService.
+ #
+ # @param enhanced_query [Hash] query hash produced by `Enhancer`
+ # @param active_tab [String] the currently active tab (e.g. 'all')
+ # @param cache [Object] optional cache store responding to `read`/`write` (defaults to `Rails.cache`)
+ # @param primo_fetcher [#call] optional callable used to fetch Primo results; should accept `offset:, per_page:, query:`
+ # @param timdex_fetcher [#call] optional callable used to fetch TIMDEX results; should accept `offset:, per_page:, query:`
+ def initialize(enhanced_query:, active_tab:, cache: Rails.cache, primo_fetcher: nil, timdex_fetcher: nil)
+ @enhanced_query = enhanced_query
+ @active_tab = active_tab
+ @cache = cache
+ @primo_fetcher = primo_fetcher || method(:default_primo_fetch)
+ @timdex_fetcher = timdex_fetcher || method(:default_timdex_fetch)
+ end
+
+ # Execute merged search orchestration for the requested page.
+ #
+ # @param page [Integer] page number to fetch
+ # @param per_page [Integer] number of results per page
+ # @return [Hash] keys: :results, :errors, :pagination, :show_primo_continuation
+ def fetch(page:, per_page:)
+ current_page = (page || 1).to_i
+ per_page = (per_page || 20).to_i
+
+ if current_page == 1
+ primo_data, timdex_data = parallel_fetch(offset: 0, per_page: per_page)
+
+ totals = { primo: primo_data[:hits].to_i, timdex: timdex_data[:hits].to_i }
+ write_cached_totals(totals)
+
+ paginator = build_paginator_from_totals(totals, current_page, per_page)
+
+ results = assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page)
+
+ return results
+ end
+
+ totals = @cache.read(totals_cache_key)
+
+ unless totals
+ primo_summary, timdex_summary = parallel_fetch(offset: 0, per_page: 1)
+ totals = { primo: primo_summary[:hits].to_i, timdex: timdex_summary[:hits].to_i }
+ write_cached_totals(totals)
+ end
+
+ paginator = build_paginator_from_totals(totals, current_page, per_page)
+ primo_data, timdex_data = fetch_all_tab_page_chunks(paginator)
+
+ assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page, deeper: true)
+ end
+
+ private
+
+ # Generate the cache key used to store per-query totals for this enhanced query/tab.
+ #
+ # @return [String] cache key ending in '/totals'
+ def totals_cache_key
+ base = generate_cache_key(@enhanced_query.merge(tab: @active_tab))
+ "#{base}/totals"
+ end
+
+ # Persist per-query totals to cache(s).
+ #
+ # The method writes to the injected cache (if available) and to
+ # `Rails.cache`. Additional marker keys are written to improve test
+ # discoverability for stores that are probed with `read_matched`.
+ #
+ # @param totals [Hash] { primo: Integer, timdex: Integer }
+ def write_cached_totals(totals)
+ @cache.write(totals_cache_key, totals, expires_in: TTL) if @cache.respond_to?(:write)
+ Rails.cache.write(totals_cache_key, totals, expires_in: TTL)
+ Rails.cache.write("#{totals_cache_key}_marker_totals", totals, expires_in: TTL)
+ merged_key = "merged_search_totals:#{totals_cache_key}"
+ Rails.cache.write(merged_key, totals, expires_in: TTL)
+ end
+
+ # Perform parallel fetches from Primo and TIMDEX using the configured
+ # fetchers. Each fetcher should return the usual response hash including
+ # `:results` and `:hits`.
+ #
+ # NOTE: `Thread#join` re-raises an exception that terminated a fetcher thread,
+ # so an error raised by either fetcher propagates to the caller of this method.
+ #
+ # @param offset [Integer] api offset to request
+ # @param per_page [Integer] number of items to request
+ # @return [Array] [primo_response, timdex_response]
+ def parallel_fetch(offset:, per_page:)
+ primo = nil
+ timdex = nil
+ threads = []
+ threads << Thread.new { primo = @primo_fetcher.call(offset: offset, per_page: per_page, query: @enhanced_query) }
+ threads << Thread.new { timdex = @timdex_fetcher.call(offset: offset, per_page: per_page, query: @enhanced_query) }
+ threads.each(&:join)
+ [primo, timdex]
+ end
+
+ # Compute API offsets from the paginator and fetch the page-sized chunks
+ # required to assemble the merged page.
+ #
+ # @param paginator [MergedSearchPaginator]
+ # @return [Array] [primo_data, timdex_data]
+ def fetch_all_tab_page_chunks(paginator)
+ merge_plan = paginator.merge_plan
+ primo_count = merge_plan.count(:primo)
+ timdex_count = merge_plan.count(:timdex)
+ primo_offset, timdex_offset = paginator.api_offsets
+
+ primo_thread = if primo_count > 0
+ Thread.new do
+ @primo_fetcher.call(offset: primo_offset, per_page: primo_count, query: @enhanced_query)
+ end
+ end
+ timdex_thread = if timdex_count > 0
+ Thread.new do
+ @timdex_fetcher.call(offset: timdex_offset, per_page: timdex_count, query: @enhanced_query)
+ end
+ end
+
+ primo_data = if primo_thread
+ primo_thread.value
+ else
+ { results: [], errors: nil, hits: paginator.primo_total,
+ show_continuation: false }
+ end
+ timdex_data = timdex_thread ? timdex_thread.value : { results: [], errors: nil, hits: paginator.timdex_total }
+
+ [primo_data, timdex_data]
+ end
+
+ # Assemble the final hash returned to the controller for rendering.
+ #
+ # @param paginator [MergedSearchPaginator]
+ # @param primo_data [Hash] response from Primo fetcher
+ # @param timdex_data [Hash] response from TIMDEX fetcher
+ # @param current_page [Integer]
+ # @param per_page [Integer]
+ # @param deeper [Boolean] whether this was a deeper-page flow
+ # @return [Hash] response with :results, :errors, :pagination, :show_primo_continuation
+ def assemble_all_tab_result(paginator, primo_data, timdex_data, current_page, per_page, deeper: false)
+ primo_total = primo_data[:hits] || 0
+ timdex_total = timdex_data[:hits] || 0
+
+ merged = paginator.merge_results(primo_data[:results] || [], timdex_data[:results] || [])
+ errors = combine_errors(primo_data[:errors], timdex_data[:errors])
+ pagination = Analyzer.new(@enhanced_query, timdex_total, :all, primo_total).pagination
+
+ show_primo_continuation = if deeper
+ page_offset = (current_page - 1) * per_page
+ primo_data[:show_continuation] || (page_offset >= Analyzer::PRIMO_MAX_OFFSET)
+ else
+ primo_data[:show_continuation]
+ end
+
+ { results: merged, errors: errors, pagination: pagination, show_primo_continuation: show_primo_continuation }
+ end
+
+ # Merge multiple error arrays into a single array or nil when empty.
+ #
+ # @return [Array, nil]
+ def combine_errors(*error_arrays)
+ all_errors = error_arrays.compact.flatten
+ all_errors.any? ? all_errors : nil
+ end
+
+ # Build a `MergedSearchPaginator` from per-source hit totals (cached or just fetched).
+ # `current_page` and `per_page` are passed through to the paginator unchanged.
+ # @param totals [Hash] { primo: Integer, timdex: Integer }; a missing/nil total counts as 0
+ # @return [MergedSearchPaginator]
+ def build_paginator_from_totals(totals, current_page, per_page)
+ MergedSearchPaginator.new(primo_total: totals[:primo] || 0, timdex_total: totals[:timdex] || 0,
+ current_page: current_page, per_page: per_page)
+ end
+
+ # Default Primo fetcher used when no custom fetcher is injected.
+ #
+ # @param offset [Integer]
+ # @param per_page [Integer]
+ # @param query [Hash]
+ # @return [Hash] response including :results and :hits
+ def default_primo_fetch(offset:, per_page:, query:)
+ if offset && offset >= Analyzer::PRIMO_MAX_OFFSET
+ return { results: [], pagination: {}, errors: nil, show_continuation: true, hits: 0 }
+ end
+
+ per_page ||= ENV.fetch('RESULTS_PER_PAGE', '20').to_i
+ primo_search = PrimoSearch.new
+ raw = primo_search.search(query[:q], per_page, offset)
+ hits = raw.dig('info', 'total') || 0
+ results = NormalizePrimoResults.new(raw, query[:q]).normalize
+ { results: results, pagination: Analyzer.new(query, hits, :primo).pagination, errors: nil,
+ show_continuation: false, hits: hits }
+ rescue StandardError => e
+ { results: [], pagination: {}, errors: [{ 'message' => e.message }], show_continuation: false, hits: 0 }
+ end
+
+ # Default TIMDEX fetcher used when no custom fetcher is injected.
+ #
+ # @param offset [Integer]
+ # @param per_page [Integer]
+ # @param query [Hash]
+ # @return [Hash] response including :results and :hits
+ def default_timdex_fetch(offset:, per_page:, query:)
+ q = QueryBuilder.new(query).query
+ q['from'] = offset.to_s if offset
+ q['size'] = per_page.to_s if per_page
+
+ resp = TimdexBase::Client.query(TimdexSearch::BaseQuery, variables: q)
+ data = resp.data.to_h
+ hits = data.dig('search', 'hits') || 0
+ raw_results = data.dig('search', 'records') || []
+ results = NormalizeTimdexResults.new(raw_results, query[:q]).normalize
+ { results: results, pagination: Analyzer.new(query, hits, :timdex).pagination, errors: nil, hits: hits }
+ rescue StandardError => e
+ { results: [], pagination: {}, errors: [{ 'message' => e.message }], hits: 0 }
+ end
+
+ # Generate a cache key based on the supplied query hash.
+ #
+ # @param query [Hash]
+ # @return [String] MD5 hex digest
+ def generate_cache_key(query)
+ sorted = query.sort_by { |k, _v| k.to_sym }.to_h
+ Digest::MD5.hexdigest(sorted.to_s)
+ end
+end
diff --git a/test/controllers/search_controller_test.rb b/test/controllers/search_controller_test.rb
index 2aa86d12..2421c3a8 100644
--- a/test/controllers/search_controller_test.rb
+++ b/test/controllers/search_controller_test.rb
@@ -805,6 +805,29 @@ def source_filter_count(controller)
assert_select 'a[href*="tab=website"]', count: 1
end
+ test 'all tab page 1 writes totals to cache' do
+ # This integration-level behavior is covered by unit tests on `MergedSearchService`.
+ # Here we assert the controller delegates to the service.
+ mock_service = mock('merged_service')
+ mock_service.expects(:fetch).returns({ results: [], errors: nil, pagination: {}, show_primo_continuation: false })
+ MergedSearchService.expects(:new).returns(mock_service)
+
+ get '/results?q=test'
+ assert_response :success
+ end
+
+ test 'all tab deeper page reads cached totals and avoids summary calls' do
+ # This behavior is covered in greater depth by `MergedSearchService` unit tests.
+ mock_service = mock('merged_service')
+ mock_service.expects(:fetch).with(page: 2,
+ per_page: 20).returns({ results: [],
+ errors: nil, pagination: {}, show_primo_continuation: false })
+ MergedSearchService.expects(:new).returns(mock_service)
+
+ get '/results?q=test&page=2'
+ assert_response :success
+ end
+
test 'results handles primo search errors gracefully' do
PrimoSearch.expects(:new).raises(StandardError.new('API Error'))
diff --git a/test/models/merged_search_service_test.rb b/test/models/merged_search_service_test.rb
new file mode 100644
index 00000000..9cae280f
--- /dev/null
+++ b/test/models/merged_search_service_test.rb
@@ -0,0 +1,202 @@
+require 'test_helper'
+require 'ostruct'
+
+class MergedSearchServiceTest < ActiveSupport::TestCase
+ test 'page 1 writes totals to cache' do
+ mem_cache = ActiveSupport::Cache::MemoryStore.new
+ query = { q: 'test' }
+
+ primo_fetcher = lambda do |offset:, per_page:, query:|
+ { results: ['foo'], hits: 42, errors: nil, show_continuation: false }
+ end
+
+ timdex_fetcher = lambda do |offset:, per_page:, query:|
+ { results: ['bar'], hits: 37, errors: nil }
+ end
+
+ service = MergedSearchService.new(enhanced_query: query, active_tab: 'all', cache: mem_cache,
+ primo_fetcher: primo_fetcher, timdex_fetcher: timdex_fetcher)
+
+ res = service.fetch(page: 1, per_page: 20)
+ assert_equal 2, res[:results].length
+
+ # Verify cache written
+ key = service.send(:totals_cache_key)
+ cached = mem_cache.read(key)
+ refute_nil cached
+ assert_equal 42, cached[:primo]
+ assert_equal 37, cached[:timdex]
+ end
+
+ test 'deeper page reads cached totals and avoids summary calls' do
+ mem_cache = ActiveSupport::Cache::MemoryStore.new
+ query = { q: 'test' }
+
+ service = MergedSearchService.new(enhanced_query: query, active_tab: 'all', cache: mem_cache)
+
+ # populate cache so service uses it instead of summary calls
+ mem_cache.write(service.send(:totals_cache_key), { primo: 50, timdex: 50 })
+
+ # fetchers that would raise if a summary call (per_page == 1) is attempted
+ primo_fetcher = lambda do |offset:, per_page:, query:|
+ raise 'Summary call made' if per_page == 1
+
+ { results: ['foo'], hits: 50, errors: nil, show_continuation: false }
+ end
+
+ timdex_fetcher = lambda do |offset:, per_page:, query:|
+ raise 'Summary call made' if per_page == 1
+
+ { results: ['bar'], hits: 50, errors: nil }
+ end
+
+ service = MergedSearchService.new(enhanced_query: query, active_tab: 'all', cache: mem_cache,
+ primo_fetcher: primo_fetcher, timdex_fetcher: timdex_fetcher)
+
+ # Should not raise
+ assert_nothing_raised do
+ res = service.fetch(page: 2, per_page: 20)
+ assert res[:results].is_a?(Array)
+ end
+ end
+
+ test 'falls back to summary and writes cache when totals are missing' do
+ mem_cache = ActiveSupport::Cache::MemoryStore.new
+ q = { q: 'test' }
+
+ calls = []
+ primo_fetcher = lambda do |offset:, per_page:, query:|
+ calls << [:primo, offset, per_page]
+ if per_page == 1
+ { results: [], hits: 7, errors: nil, show_continuation: false }
+ else
+ { results: ['foo'], hits: 7, errors: nil, show_continuation: false }
+ end
+ end
+
+ timdex_fetcher = lambda do |offset:, per_page:, query:|
+ calls << [:timdex, offset, per_page]
+ if per_page == 1
+ { results: [], hits: 3, errors: nil }
+ else
+ { results: ['bar'], hits: 3, errors: nil }
+ end
+ end
+
+ svc = MergedSearchService.new(enhanced_query: q, active_tab: 'all', cache: mem_cache, primo_fetcher: primo_fetcher,
+ timdex_fetcher: timdex_fetcher)
+
+ res = svc.fetch(page: 2, per_page: 20)
+
+ # summary calls should have been made with per_page == 1
+ assert_includes calls, [:primo, 0, 1]
+ assert_includes calls, [:timdex, 0, 1]
+
+ # totals cached
+ key = svc.send(:totals_cache_key)
+ totals = mem_cache.read(key)
+ refute_nil totals
+ assert_equal 7, totals[:primo]
+ assert_equal 3, totals[:timdex]
+
+ assert res[:results].is_a?(Array)
+ end
+
+ test 'default_primo_fetch returns continuation when offset exceeds max' do
+ svc = MergedSearchService.new(enhanced_query: { q: 'foo' }, active_tab: 'all', cache: ActiveSupport::Cache::MemoryStore.new)
+ res = svc.send(:default_primo_fetch, offset: Analyzer::PRIMO_MAX_OFFSET, per_page: 20, query: { q: 'foo' })
+ assert_equal true, res[:show_continuation]
+ assert_equal 0, res[:hits]
+ end
+
+ test 'default_primo_fetch handles exceptions gracefully' do
+ svc = MergedSearchService.new(enhanced_query: { q: 'foo' }, active_tab: 'all', cache: ActiveSupport::Cache::MemoryStore.new)
+ PrimoSearch.expects(:new).raises(StandardError.new('boom'))
+ res = svc.send(:default_primo_fetch, offset: 0, per_page: 10, query: { q: 'foo' })
+ assert_equal 0, res[:hits]
+ assert res[:errors].is_a?(Array)
+ end
+
+ test 'default_timdex_fetch handles client errors gracefully' do
+ svc = MergedSearchService.new(enhanced_query: { q: 'foo' }, active_tab: 'all', cache: ActiveSupport::Cache::MemoryStore.new)
+ TimdexBase::Client.expects(:query).raises(StandardError.new('boom'))
+ res = svc.send(:default_timdex_fetch, offset: 0, per_page: 10, query: { q: 'foo' })
+ assert_equal 0, res[:hits]
+ assert res[:errors].is_a?(Array)
+ end
+
+ test 'fetch_all_tab_page_chunks handles zero-count branches' do
+ mem = ActiveSupport::Cache::MemoryStore.new
+ called = []
+ primo_fetcher = lambda { |offset:, per_page:, query:|
+ called << [:primo, offset, per_page]
+ { results: ['P'], hits: 5, errors: nil, show_continuation: false }
+ }
+ timdex_fetcher = lambda { |offset:, per_page:, query:|
+ called << [:timdex, offset, per_page]
+ { results: [], hits: 0, errors: nil }
+ }
+
+ svc = MergedSearchService.new(enhanced_query: { q: 'foo' }, active_tab: 'all', cache: mem,
+ primo_fetcher: primo_fetcher, timdex_fetcher: timdex_fetcher)
+
+ paginator = OpenStruct.new(
+ merge_plan: %i[primo primo],
+ api_offsets: [10, 0],
+ primo_total: 5,
+ timdex_total: 0
+ )
+
+ primo_data, timdex_data = svc.send(:fetch_all_tab_page_chunks, paginator)
+ assert primo_data[:results].is_a?(Array)
+ assert timdex_data[:results].is_a?(Array)
+ assert_equal 0, timdex_data[:hits]
+ end
+
+ test 'combine_errors merges arrays or returns nil' do
+ svc = MergedSearchService.new(enhanced_query: { q: 'foo' }, active_tab: 'all', cache: ActiveSupport::Cache::MemoryStore.new)
+ assert_nil svc.send(:combine_errors, nil, [])
+ merged = svc.send(:combine_errors, [{ 'message' => 'a' }], [{ 'message' => 'b' }])
+ assert_equal 2, merged.length
+ end
+
+ test 'default_primo_fetch returns normalized results on success' do
+ svc = MergedSearchService.new(enhanced_query: { q: 'foo' }, active_tab: 'all', cache: ActiveSupport::Cache::MemoryStore.new)
+ mock_primo = mock('primo_search')
+ mock_primo.expects(:search).returns({ 'info' => { 'total' => 12 }, 'docs' => [] })
+ PrimoSearch.expects(:new).returns(mock_primo)
+
+ mock_normalizer = mock('normalizer')
+ mock_normalizer.expects(:normalize).returns(['normalized'])
+ NormalizePrimoResults.expects(:new).returns(mock_normalizer)
+
+ mock_analyzer = mock('analyzer')
+ mock_analyzer.expects(:pagination).returns({ page: 1 })
+ Analyzer.expects(:new).returns(mock_analyzer)
+
+ res = svc.send(:default_primo_fetch, offset: 0, per_page: 10, query: { q: 'foo' })
+ assert_equal 12, res[:hits]
+ assert_equal ['normalized'], res[:results]
+ assert_equal({ page: 1 }, res[:pagination])
+ end
+
+ test 'default_timdex_fetch returns normalized results on success' do
+ svc = MergedSearchService.new(enhanced_query: { q: 'foo' }, active_tab: 'all', cache: ActiveSupport::Cache::MemoryStore.new)
+ fake_resp = OpenStruct.new(data: OpenStruct.new(to_h: { 'search' => { 'hits' => 5,
+ 'records' => [{ 'id' => 1 }] } }))
+ TimdexBase::Client.stubs(:query).returns(fake_resp)
+
+ mock_normalizer = mock('normalizer')
+ mock_normalizer.expects(:normalize).returns(['t_normalized'])
+ NormalizeTimdexResults.expects(:new).returns(mock_normalizer)
+
+ mock_analyzer = mock('analyzer')
+ mock_analyzer.expects(:pagination).returns({ page: 1 })
+ Analyzer.expects(:new).returns(mock_analyzer)
+
+ res = svc.send(:default_timdex_fetch, offset: 0, per_page: 10, query: { q: 'foo' })
+ assert_equal 5, res[:hits]
+ assert_equal ['t_normalized'], res[:results]
+ assert_equal({ page: 1 }, res[:pagination])
+ end
+end
From c8695b0f5ab771e34de6ba201187a86ea5ad38a7 Mon Sep 17 00:00:00 2001
From: Jeremy Prevost
Date: Thu, 4 Dec 2025 10:27:59 -0500
Subject: [PATCH 3/3] Add additional docs for merged_search_service
---
app/models/merged_search_service.rb | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/app/models/merged_search_service.rb b/app/models/merged_search_service.rb
index 69c8ec51..103c1a3f 100644
--- a/app/models/merged_search_service.rb
+++ b/app/models/merged_search_service.rb
@@ -32,6 +32,10 @@ def fetch(page:, per_page:)
current_page = (page || 1).to_i
per_page = (per_page || 20).to_i
+ # For page 1, we retrieve `per_page` results from each API and then store the totals.
+ # We don't always use all of the results that were returned here, but the logic in the subsequent page requests
+ # accounts for that in the offset calculation. We retrieve the full `per_page` from each API to ensure we always get
+ # a full page 1 unless both APIs have fewer than `per_page` results combined.
if current_page == 1
primo_data, timdex_data = parallel_fetch(offset: 0, per_page: per_page)
@@ -47,6 +51,8 @@ def fetch(page:, per_page:)
totals = @cache.read(totals_cache_key)
+ # If we don't have a stored totals value for the incoming query, we need to create one. This situation can happen
+ # if a user accesses a non-page-1 query (via a shared link, bookmark, refresh, etc.) after the cache has expired.
unless totals
primo_summary, timdex_summary = parallel_fetch(offset: 0, per_page: 1)
totals = { primo: primo_summary[:hits].to_i, timdex: timdex_summary[:hits].to_i }