# Elasticsearch Publications Explorer
Step-by-step exploration of Chalmers research publications database

## 1. Setup and Connection

In [8]:
# Import required libraries
import os
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
import json
from datetime import datetime, timedelta
from pprint import pprint

# Load environment variables
load_dotenv()

print("Libraries loaded successfully!")

Libraries loaded successfully!


In [9]:
# Initialize Elasticsearch client for version 6.x
es = Elasticsearch(
    hosts=[os.getenv('ES_HOST')],
    http_auth=(os.getenv('ES_USER'), os.getenv('ES_PASS')),
    verify_certs=True
)

# Test connection
info = es.info()
print(f"Connected to Elasticsearch version: {info['version']['number']}")
print(f"Cluster name: {info['cluster_name']}")

Connected to Elasticsearch version: 6.8.23
Cluster name: chalmers-elk-test


## 2. Explore Index Structure

In [10]:
# Get basic info about publications index
index_name = "research-publications-static"
index_stats = es.indices.stats(index=index_name)

doc_count = index_stats['indices'][index_name]['total']['docs']['count']
size_in_bytes = index_stats['indices'][index_name]['total']['store']['size_in_bytes']
size_in_mb = size_in_bytes / (1024 * 1024)

print(f"Index: {index_name}")
print(f"Documents: {doc_count:,}")
print(f"Size: {size_in_mb:.1f} MB")

Index: research-publications-static
Documents: 713,270
Size: 7992.7 MB


In [11]:
# Get a sample document to understand structure
sample_query = {
    "query": {"match_all": {}},
    "size": 1
}

result = es.search(index=index_name, body=sample_query)

if result['hits']['hits']:
    sample_doc = result['hits']['hits'][0]['_source']
    print("Top-level fields in a publication:")
    for key in sorted(sample_doc.keys()):
        value_type = type(sample_doc[key]).__name__
        print(f"  {key}: {value_type}")

Top-level fields in a publication:
  Abstract: str
  AffiliatedIdsChalmers: list
  Categories: list
  CreatedBy: str
  CreatedDate: str
  DataObjects: list
  Datasets: list
  DetailsUrlEng: str
  DetailsUrlSwe: str
  HasImportErrors: bool
  HasImportMatchOnScopusDoi: bool
  HasImportMatchOnScopusId: bool
  HasOrganizations: bool
  HasPersons: bool
  Id: str
  IdentifierCplPubid: list
  IdentifierDoi: list
  IdentifierIsbn: list
  IdentifierPubmedId: list
  IdentifierScopusId: list
  Identifiers: list
  IncludedManuscripts: list
  IncludedPapers: list
  IncludedPapersLegacy: list
  IsDeleted: bool
  IsDraft: bool
  IsImported: bool
  Keywords: list
  Language: dict
  LatestEventDate: str
  NeedsAttention: bool
  Organizations: list
  Persons: list
  PossibleDuplicates: list
  Project: list
  PublicationType: dict
  Replacing: list
  Series: list
  Source: dict
  Title: str
  UpdatedBy: str
  UpdatedDate: str
  ValidatedBy: str
  ValidatedDate: str
  Year: int


In [12]:
# Look at a specific publication's structure
print("Sample publication:")
print(f"Title: {sample_doc.get('Title', 'No title')}")
print(f"Year: {sample_doc.get('Year', 'No year')}")
print(f"Has Abstract: {'Abstract' in sample_doc and bool(sample_doc['Abstract'])}")
print(f"Number of authors: {len(sample_doc.get('Persons', []))}")
print(f"Number of organizations: {len(sample_doc.get('Organizations', []))}")

Sample publication:
Title: RF Front-End Circuits and Architectures for IoT/LTE-A/5G Connectivity
Year: 2018
Has Abstract: False
Number of authors: 5
Number of organizations: 0


## 3. Simple Text Search

In [13]:
def simple_text_search(query_text, size=5):
    """
    Basic text search across all fields
    """
    body = {
        "query": {
            "query_string": {
                "query": query_text,
                "default_field": "*"
            }
        },
        "size": size,
        "_source": ["Title", "Year", "Abstract"]
    }
    
    response = es.search(index=index_name, body=body)
    return response

In [14]:
# Test simple search
test_query = "Optimized Gene-Based"
results = simple_text_search(test_query)
print(f"Found {results['hits']['total']} publications about {test_query}\n")

for hit in results['hits']['hits']:
    doc = hit['_source']
    print(f"- {doc.get('Title', 'No title')} ({doc.get('Year', 'N/A')})")

Found 29329 publications about Optimized Gene-Based

- Generating Optimized Trajectories for Robotic Spray Painting (2022)
- Optimized elliptic curve cryptography and efficient elliptic curve parameter generation (2002)
- Comments on "A module generator for optimized CMOS buffers" (1993)
- Real-time generation of fully optimized holograms for optical trapping applications (2011)
- Generalized Langevin dynamics in multiphase direct numerical simulations using hydrodynamically optimized memory kernels (2025)


## 4. Field-Specific Search

In [15]:
def search_by_title(title_text, size=5):
    """
    Search specifically in title field
    """
    body = {
        "query": {
            "match": {
                "Title": {
                    "query": title_text,
                    "operator": "and"  # All words must match
                }
            }
        },
        "size": size,
        "_source": ["Title", "Year", "Abstract"]
    }
    
    return es.search(index=index_name, body=body)

In [16]:
# Test title search
test_query = "Optimizing Gene-Based"
results = search_by_title(test_query)
print(f"Found {results['hits']['total']} publications with {test_query} in title\n")

for hit in results['hits']['hits'][:3]:
    doc = hit['_source']
    print(f"- {doc.get('Title', 'No title')}")
    print(f"  Year: {doc.get('Year', 'N/A')}")
    print()

Found 1 publications with Optimizing Gene-Based in title

- Optimizing Gene-Based Testing for Antibiotic Resistance Prediction
  Year: 2025



## 5. Understanding Author Search

In [17]:
# First, let's see how authors are structured
if 'Persons' in sample_doc and sample_doc['Persons']:
    print("Author structure example:")
    first_person = sample_doc['Persons'][0]
    print(json.dumps(first_person, indent=2)[:500] + "...")

Author structure example:
{
  "PersonData": {
    "Id": "1b807d61-badc-457d-a7e2-f71b45042e73",
    "FirstName": "Yan",
    "LastName": "Li",
    "DisplayName": "Yan Li",
    "BirthYear": 0,
    "IsDeleted": false,
    "HasPublications": true,
    "HasProjects": false,
    "IdentifierCid": [],
    "IdentifierCplPersonId": [],
    "IdentifierOrcid": [],
    "Identifiers": [],
    "OrganizationHome": [],
    "PdbCategories": []
  },
  "Organizations": [
    {
      "OrganizationData": {
        "Id": "a400fdb2-1614-41ad-92...


In [18]:
# Check if Persons is a nested field
mapping = es.indices.get_mapping(index=index_name)
persons_mapping = mapping[index_name]['mappings']['publication']['properties'].get('Persons', {})
is_nested = persons_mapping.get('type') == 'nested'
print(f"Persons field is nested: {is_nested}")

Persons field is nested: False


In [19]:
def search_by_author(author_name, size=5):
    """
    Search by author name - handles both nested and non-nested cases
    """
    # For Elasticsearch 6.x, let's use a simple approach
    body = {
        "query": {
            "match": {
                "Persons.PersonData.DisplayName": author_name
            }
        },
        "size": size,
        "_source": ["Title", "Year", "Persons.PersonData.DisplayName"]
    }
    
    return es.search(index=index_name, body=body)

In [20]:
# Test author search
author_results = search_by_author("Erik Anna")
print(f"Found {author_results['hits']['total']} publications\n")

for hit in author_results['hits']['hits'][:3]:
    doc = hit['_source']
    print(f"- {doc.get('Title', 'No title')} ({doc.get('Year', 'N/A')})")
    
    # Extract author names
    if 'Persons' in doc:
        authors = [p.get('PersonData', {}).get('DisplayName', 'Unknown') 
                  for p in doc['Persons'] if 'PersonData' in p]
        print(f"  Authors: {', '.join(authors[:3])}...")
    print()

Found 7041 publications

- The Subjective Judgement of Weld Quality and its Effect on Production Cost (2013)
  Authors: Anna Öberg, Erik Åstrand...

- Improved productivity by reduced variation in gas metal arc welding (GMAW) (2017)
  Authors: Anna Öberg, Erik Åstrand...

- The Construction Industry as a Loosely Coupled System - Implications for productivity and innovativity (2001)
  Authors: Anna Dubois, Lars-Erik Gadde...



## 6. Filtered Search

In [21]:
def search_with_filters(text_query=None, year=None, year_range=None, size=10):
    """
    Search with various filters
    """
    must_clauses = []
    
    # Add text query if provided
    if text_query:
        must_clauses.append({
            "multi_match": {
                "query": text_query,
                "fields": ["Title^2", "Abstract", "Keywords.Value"]
            }
        })
    
    # Add year filter
    if year:
        must_clauses.append({"term": {"Year": year}})
    elif year_range:
        must_clauses.append({"range": {"Year": year_range}})
    
    # Build query
    if must_clauses:
        query = {"bool": {"must": must_clauses}}
    else:
        query = {"match_all": {}}
    
    body = {
        "query": query,
        "size": size,
        "sort": [{"Year": {"order": "desc"}}, "_score"],
        "_source": ["Title", "Year", "Abstract"]
    }
    
    return es.search(index=index_name, body=body)

In [22]:
# Test filtered search - recent AI papers
recent_ai = search_with_filters(
    text_query="artificial intelligence",
    year_range={"gte": 2025}
)

print(f"Found {recent_ai['hits']['total']} AI publications from 2023 onwards\n")

for hit in recent_ai['hits']['hits'][:5]:
    doc = hit['_source']
    print(f"- {doc.get('Year')}: {doc.get('Title', 'No title')}")

Found 62 AI publications from 2023 onwards

- 2025: Preface: The 6th International Workshop on Requirements Engineering for Artificial Intelligence (RE4AI’25)
- 2025: O-RAN Intelligence Orchestration Framework for Quality-Driven Xapp Deployment and Sharing
- 2025: One test to predict them all: Rheological characterization of complex fluids via artificial neural network
- 2025: Nearly quantum-limited microwave amplification via interfering degenerate stimulated emission in a single artificial atom
- 2025: Design and in vitro anticancer assessment of a click chemistry-derived dinuclear copper artificial metallo-nuclease


## 7. Aggregations

In [23]:
def get_publication_stats():
    """
    Get aggregated statistics
    """
    body = {
        "size": 0,  # Don't return documents
        "aggs": {
            "by_year": {
                "terms": {
                    "field": "Year",
                    "size": 10,
                    "order": {"_key": "desc"}
                }
            },
            "by_type": {
                "terms": {
                    "field": "PublicationType.NameEng.keyword",
                    "size": 10
                }
            },
            "has_abstract": {
                "filter": {
                    "exists": {"field": "Abstract"}
                }
            }
        }
    }
    
    return es.search(index=index_name, body=body)

In [24]:
# Get and display statistics
stats = get_publication_stats()

print("Publications by year (recent):")
for bucket in stats['aggregations']['by_year']['buckets'][:5]:
    print(f"  {bucket['key']}: {bucket['doc_count']:,} publications")

print("\nTop publication types:")
for bucket in stats['aggregations']['by_type']['buckets'][:5]:
    print(f"  {bucket['key']}: {bucket['doc_count']:,}")

total_docs = stats['hits']['total']
with_abstract = stats['aggregations']['has_abstract']['doc_count']
print(f"\nPublications with abstracts: {with_abstract:,} / {total_docs:,} ({with_abstract/total_docs*100:.1f}%)")

Publications by year (recent):
  2025: 1,625 publications
  2024: 3,962 publications
  2023: 3,937 publications
  2022: 3,852 publications
  2021: 4,063 publications

Top publication types:
  Journal article: 42,519
  Paper in proceeding: 23,473
  Doctoral thesis: 5,219
  Other conference contribution: 4,913
  Licentiate thesis: 4,647

Publications with abstracts: 73,607 / 95,697 (76.9%)


## 8. Combined Search Function

In [25]:
def search_publications(query=None, filters=None, size=10, from_=0, fields=None):
    """
    Main search function combining all features
    
    Args:
        query: Free text search
        filters: Dict with filters like {"year": 2023, "author": "Name"}
        size: Number of results
        from_: Offset for pagination
        fields: List of fields to return
    """
    must_clauses = []
    
    # Text search
    if query:
        must_clauses.append({
            "multi_match": {
                "query": query,
                "fields": ["Title^3", "Abstract^2", "Keywords.Value"],
                "type": "best_fields"
            }
        })
    
    # Process filters
    if filters:
        for key, value in filters.items():
            if key == "year":
                must_clauses.append({"term": {"Year": value}})
            elif key == "year_range":
                must_clauses.append({"range": {"Year": value}})
            elif key == "author":
                must_clauses.append({
                    "match": {"Persons.PersonData.DisplayName": value}
                })
            elif key == "type":
                must_clauses.append({
                    "term": {"PublicationType.Id": value}
                })
    
    # Build query
    if must_clauses:
        es_query = {"bool": {"must": must_clauses}}
    else:
        es_query = {"match_all": {}}
    
    # Build request body
    body = {
        "query": es_query,
        "size": size,
        "from": from_,
        "sort": [{"Year": {"order": "desc"}}, "_score"]
    }
    
    # Add field selection
    if fields:
        body["_source"] = fields
    
    response = es.search(index=index_name, body=body)
    
    # Return formatted results
    return {
        "total": response["hits"]["total"],
        "hits": response["hits"]["hits"],
        "query_used": body  # For debugging
    }

In [26]:
# Test the combined function
results = search_publications(
    query="microwave",
    filters={
        "year_range": {"gte": 2020, "lte": 2024},
    },
    size=5,
    fields=["Title", "Year", "Abstract", "Persons.PersonData.DisplayName"],
    from_=20
)

print(f"Found {results['total']} publications\n")
print("Query used:")
print(json.dumps(results['query_used'], indent=2))
print("\nResults:")

for hit in results['hits']:
    doc = hit['_source']
    print(f"\n- {doc.get('Title', 'No title')}")
    print(f"  Year: {doc.get('Year', 'N/A')}")
    if 'Abstract' in doc:
        print(f"  Abstract: {doc['Abstract'][:100]}...")

Found 272 publications

Query used:
{
  "query": {
    "bool": {
      "must": [
        {
          "multi_match": {
            "query": "microwave",
            "fields": [
              "Title^3",
              "Abstract^2",
              "Keywords.Value"
            ],
            "type": "best_fields"
          }
        },
        {
          "range": {
            "Year": {
              "gte": 2020,
              "lte": 2024
            }
          }
        }
      ]
    }
  },
  "size": 5,
  "from": 20,
  "sort": [
    {
      "Year": {
        "order": "desc"
      }
    },
    "_score"
  ],
  "_source": [
    "Title",
    "Year",
    "Abstract",
    "Persons.PersonData.DisplayName"
  ]
}

Results:

- Analysis of a plasma reactor performance for direct nitrogen fixation by use of three-dimensional simulations and experiments
  Year: 2024
  Abstract: This study utilizes state-of-the-art in-situ measurements and advanced three-dimensional simulations...

- Intermodulation spe

## 9. Testing and Debugging

In [27]:
# Test various query patterns
test_queries = [
    {"description": "Simple text search", "params": {"query": "Antibiotics"}},
    {"description": "Year filter only", "params": {"filters": {"year": 2024}}},
    {"description": "Author search", "params": {"filters": {"author": "Erik"}}},
    {"description": "Combined search", "params": {
        "query": "quantum",
        "filters": {"year_range": {"gte": 2023}}
    }}
]

for test in test_queries:
    print(f"\nTest: {test['description']}")
    try:
        results = search_publications(**test['params'], size=2)
        print(f"  Found: {results['total']} results")
        if results['hits']:
            print(f"  First result: {results['hits'][0]['_source'].get('Title', 'No title')}")
    except Exception as e:
        print(f"  Error: {str(e)}")


Test: Simple text search
  Found: 156 results
  First result: Adaptation of Escherichia coli to ciprofloxacin and enrofloxacin: Differential proteomics of the SOS response and RecA-independent mechanisms

Test: Year filter only
  Found: 3962 results
  First result: Digitalisation and technological Innovations Towards Energy-Efficient Quarries

Test: Author search
  Found: 4689 results
  First result: Geometric Numerical Methods: From Random Fields to Shape Matching

Test: Combined search
  Found: 371 results
  First result: Flip-chip Integrated Superconducting Quantum Processors


## 10. Next Steps

Now that we have working search functions, we can:
1. Add more sophisticated filters
2. Implement scroll API for large result sets
3. Create specialized functions for common queries
4. Build an agent that uses these tools

What would you like to explore next?

In [28]:
def search_by_phrase(phrase, field="Title", size=5):
    """
    Search for exact phrase within a specific field
    The phrase must appear exactly as given, but can be part of a larger text
    
    Args:
        phrase: Exact phrase to search for
        field: Field to search in (default: Title)
        size: Number of results to return
    """
    body = {
        "query": {
            "match_phrase": {
                field: phrase
            }
        },
        "size": size,
        "_source": ["Title", "Year", "Abstract"],
        "highlight": {
            "fields": {
                field: {}  # This will show where the phrase was found
            }
        }
    }
    
    return es.search(index=index_name, body=body)

In [29]:
# Test all our search functions with the same query to compare results
test_phrase = "Optimizing Gene-Based"

print("=== COMPARING SEARCH FUNCTIONS ===")
print(f"Test phrase: '{test_phrase}'\n")

# 1. Simple text search (from cell 3)
print("1. simple_text_search (query_string across all fields):")
try:
    results = simple_text_search(test_phrase, size=3)
    print(f"   Found: {results['hits']['total']} results")
    for hit in results['hits']['hits']:
        print(f"   - {hit['_source'].get('Title', 'No title')[:80]}...")
except Exception as e:
    print(f"   Error: {str(e)}")

print("\n2. search_by_title (match with AND operator):")
try:
    results = search_by_title(test_phrase, size=3)
    print(f"   Found: {results['hits']['total']} results")
    for hit in results['hits']['hits']:
        print(f"   - {hit['_source'].get('Title', 'No title')[:80]}...")
except Exception as e:
    print(f"   Error: {str(e)}")

print("\n3. search_by_phrase (NEW - match_phrase):")
try:
    results = search_by_phrase(test_phrase, field="Title", size=3)
    print(f"   Found: {results['hits']['total']} results")
    for hit in results['hits']['hits']:
        print(f"   - {hit['_source'].get('Title', 'No title')[:80]}...")
        if 'highlight' in hit:
            print(f"     Highlighted: {hit['highlight']['Title'][0]}")
except Exception as e:
    print(f"   Error: {str(e)}")

print("\n4. search_with_filters (multi_match with field boosting):")
try:
    results = search_with_filters(text_query=test_phrase, size=3)
    print(f"   Found: {results['hits']['total']} results")
    for hit in results['hits']['hits']:
        print(f"   - {hit['_source'].get('Title', 'No title')[:80]}...")
except Exception as e:
    print(f"   Error: {str(e)}")

print("\n5. search_publications (final combined function):")
try:
    results = search_publications(query=test_phrase, size=3)
    print(f"   Found: {results['total']} results")
    for hit in results['hits']:
        print(f"   - {hit['_source'].get('Title', 'No title')[:80]}...")
except Exception as e:
    print(f"   Error: {str(e)}")

# Bonus: Let's also test author search
print("\n\n=== AUTHOR SEARCH COMPARISON ===")
test_author = "Fager"
print(f"Test author: '{test_author}'\n")

print("1. search_by_author function:")
try:
    results = search_by_author(test_author, size=3)
    print(f"   Found: {results['hits']['total']} results")
except Exception as e:
    print(f"   Error: {str(e)}")

print("\n2. search_publications with author filter:")
try:
    results = search_publications(filters={"author": test_author}, size=3)
    print(f"   Found: {results['total']} results")
except Exception as e:
    print(f"   Error: {str(e)}")

=== COMPARING SEARCH FUNCTIONS ===
Test phrase: 'Optimizing Gene-Based'

1. simple_text_search (query_string across all fields):
   Found: 28938 results
   - Optimizing Gene-Based Testing for Antibiotic Resistance Prediction...
   - Dampening variations in wind power generation-the effect of optimizing geographi...
   - Optimizing robot trajectories for automatic robot code generation...

2. search_by_title (match with AND operator):
   Found: 1 results
   - Optimizing Gene-Based Testing for Antibiotic Resistance Prediction...

3. search_by_phrase (NEW - match_phrase):
   Found: 1 results
   - Optimizing Gene-Based Testing for Antibiotic Resistance Prediction...
     Highlighted: <em>Optimizing</em> <em>Gene</em>-<em>Based</em> Testing for Antibiotic Resistance Prediction

4. search_with_filters (multi_match with field boosting):
   Found: 28309 results
   - Optimizing Gene-Based Testing for Antibiotic Resistance Prediction...
   - Optimizing hydration and performance of phosphogypsum 