In [2]:
from datetime import datetime
from elasticsearch import Elasticsearch, helpers
from datasets import load_dataset
import numpy as np


In [3]:
# Client for the Elasticsearch node reachable on localhost, port 9200.
client = Elasticsearch(hosts="http://localhost:9200/")


In [4]:
# Ask the cluster for its basic info and print the response.
cluster_info = client.info()
print(cluster_info)

{'name': 'node-1', 'cluster_name': 'my-application-cluster', 'cluster_uuid': 'CKjkeoDKR3G-PRs-eQpY9w', 'version': {'number': '8.15.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98adf7bf6bb69b66ab95b761c9e5aadb0bb059a3', 'build_date': '2024-09-19T10:06:03.564235954Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [5]:
# Index definition for Polish text: two custom analyzers (one of which also
# applies a month-name synonym filter) and one text field per analyzer, so the
# same document text can be searched with and without synonym expansion.

# Each rule lists equivalent spellings of one month: full name, abbreviation
# (where one is given) and Roman numeral.
month_synonym_rules = [
    "styczeń, sty, I",
    "luty, lut, II",
    "marzec, mar, III",
    "kwiecień, kwi, IV",
    "maj, V",
    "czerwiec, cze, VI",
    "lipiec, lip, VII",
    "sierpień, sie, VIII",
    "wrzesień, wrz, IX",
    "październik, paź, X",
    "listopad, lis, XI",
    "grudzień, gru, XII",
]


def _custom_analyzer(*token_filters):
    """Return a custom analyzer definition using the standard tokenizer.

    The given token filter names are applied in the order they are passed.
    """
    return {
        "type": "custom",
        "tokenizer": "standard",
        "filter": list(token_filters),
    }


def _text_field(analyzer_name):
    """Return a text field mapping that is analyzed with `analyzer_name`."""
    return {
        "type": "text",
        "analyzer": analyzer_name,
    }


index_body = {
    "settings": {
        "analysis": {
            "analyzer": {
                # "synonym" refers to the custom filter declared under "filter" below.
                "polish_with_synonyms": _custom_analyzer(
                    "lowercase", "morfologik_stem", "synonym", "lowercase"
                ),
                "polish_without_synonyms": _custom_analyzer(
                    "lowercase", "morfologik_stem"
                ),
            },
            "filter": {
                "synonym": {
                    "type": "synonym",
                    "synonyms": month_synonym_rules,
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "text_with_synonyms": _text_field("polish_with_synonyms"),
            "text_without_synonyms": _text_field("polish_without_synonyms"),
        },
    },
}

index_name = "fiqa_pl_index"

# Create the index only on the first run; later runs just report that it exists.
index_already_present = client.indices.exists(index=index_name)
if index_already_present:
    print(f"Index '{index_name}' already exists.")
else:
    client.indices.create(index=index_name, body=index_body)
    print(f"Index '{index_name}' created successfully.")

Index 'fiqa_pl_index' already exists.


In [6]:
# Load the "corpus" configuration of the clarin-knext/fiqa-pl dataset.
ds = load_dataset(path="clarin-knext/fiqa-pl", name="corpus")

In [7]:
# Pick the "corpus" entry of the loaded dataset; its records are iterated
# below to build the bulk-indexing payload.
corpus_ds = ds["corpus"]

In [8]:
def _as_bulk_action(record):
    """Wrap one corpus record as a bulk-index action for `index_name`.

    The record's text is stored twice, once under each analyzed field, so the
    two analyzers can be compared on identical content.
    """
    return {
        "_index": index_name,
        "_id": record["_id"],
        "_source": {
            "title": record["title"],
            "text_with_synonyms": record["text"],
            "text_without_synonyms": record["text"],
        },
    }


# One action per corpus record, in corpus order.
bulk_data = list(map(_as_bulk_action, corpus_ds))

In [9]:
# Send every prepared action to the index in bulk; the call's return value is
# left as the cell's last expression so it is displayed.
helpers.bulk(client=client, actions=bulk_data)

(57638, [])

In [12]:
# Term to look up in both fields; "kwiecień" is one of the month names
# covered by the synonym rules in the index definition.
search_term = "kwiecień"

In [13]:
def _match_request(field_name):
    """Build the search request body for a `match` query on one field.

    Args:
        field_name: Name of the indexed text field to match `search_term` against.

    Returns:
        A request body dict. `track_total_hits` is enabled so that
        `hits.total.value` is the exact number of matching documents; by
        default Elasticsearch only counts accurately up to 10,000 hits and
        reports a lower bound beyond that, which would silently distort the
        comparison for frequent terms.
    """
    return {
        "query": {
            "match": {
                field_name: search_term
            }
        },
        "track_total_hits": True,
    }


# Run the same query against both fields so the only difference between the
# two requests is the analyzer attached to the field.
response_with_synonyms = client.search(
    index=index_name,
    body=_match_request("text_with_synonyms"),
)
response_without_synonyms = client.search(
    index=index_name,
    body=_match_request("text_without_synonyms"),
)

In [14]:
# Pull the reported total hit count out of each search response.
num_docs_with_synonyms, num_docs_without_synonyms = (
    response["hits"]["total"]["value"]
    for response in (response_with_synonyms, response_without_synonyms)
)


In [16]:
# Report both hit counts side by side for the searched term.
summary = f"Number of documents for term {search_term} \n with synonyms: {num_docs_with_synonyms} \n without synonyms: {num_docs_without_synonyms}"
print(summary)

Number of documents for term kwiecień 
 with synonyms: 307 
 without synonyms: 258


### What are the strengths and weaknesses of regular expressions versus full text search regarding processing of text?

#### Strengths of Regular Expressions:
1. **Precision:** Allows for exact pattern matching.
2. **Flexibility:** Can be tailored for various text formats.
3. **Performance:** Faster for small datasets without indexing overhead.

#### Weaknesses of Regular Expressions:
1. **Complexity:** Difficult to write and maintain for complex patterns.
2. **Limited Context Awareness:** Lacks understanding of text semantics.
3. **Scalability Issues:** Can become inefficient with large datasets, because every search has to scan the full text again instead of consulting an index.

#### Strengths of Full Text Search:
1. **Contextual Understanding:** Incorporates natural language processing for better relevance.
2. **Scalability:** Efficiently handles large datasets with indexing.
3. **Advanced Features:** Supports stemming, lemmatization, and synonyms.

#### Weaknesses of Full Text Search:
1. **Lower Precision:** May return broader, less relevant results.
2. **Overhead:** More complex to set up and maintain.
3. **Dependency on Indexing:** Requires re-indexing for frequently changing data.

### Can an LLM be applied in the context of searching for documents? Justify your answer, excluding the obvious observation that an LLM can be used to formulate the answer.

Yes, LLMs can enhance document searching by:

1. **Semantic Search:** Understanding meaning beyond keywords for relevant results.
2. **Query Expansion:** Generating synonyms and related terms to improve search.
3. **Contextual Relevance:** Ranking results based on context rather than just keywords.
4. **Natural Language Interaction:** Allowing users to ask questions in everyday language.
5. **Summarization and Extraction:** Providing concise insights from documents.
