In [1]:
import os
from collections import defaultdict, OrderedDict

import numpy as np
from datasets import load_dataset
from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence InsecureRequestWarning for the whole session
# (presumably needed because the Elasticsearch client is created with verify_certs=False).
warnings.simplefilter('ignore', InsecureRequestWarning)

In [3]:
# Connection settings are read from the environment so that credentials do not
# have to be committed with the notebook. Each variable falls back to the value
# that used to be hardcoded here, so an unconfigured environment behaves as before.
client = Elasticsearch(
    os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200"),
    basic_auth=(
        os.environ.get("ELASTICSEARCH_USER", "elastic"),
        os.environ.get("ELASTICSEARCH_PASSWORD", "<hidden>"),
    ),
    # NOTE(review): certificate verification only matters for https:// endpoints;
    # with the default plain-http URL this flag should be a no-op - confirm before
    # pointing the notebook at a TLS-protected cluster.
    verify_certs=False
)

In [4]:
# Index definition used for the experiment: the same text is stored in four
# fields, each analysed by one of the four combinations of
# {synonyms, no synonyms} x {morfologik_stem, no morfologik_stem}.
index_definition = {
    "settings": {
        "analysis": {
            "filter": {
                # Synonym groups for Polish month names:
                # full name, abbreviation, Roman numeral.
                "synonym_filter": {
                    "type": "synonym",
                    "synonyms": [
                        "styczeń, sty, I",
                        "luty, lut, II",
                        "marzec, mar, III",
                        "kwiecień, kwi, IV",
                        # NOTE(review): "maj" is listed twice because the full name and
                        # the abbreviation are the same word; the second entry is redundant.
                        "maj, maj, V",
                        "czerwiec, czer, VI",
                        "lipiec, lip, VII",
                        "sierpień, sie, VIII",
                        "wrzesień, wrz, IX",
                        "październik, paź, X",
                        "listopad, lis, XI",
                        "grudzień, gru, XII"
                    ]
                }
            },
            "analyzer": {
                # Synonyms + morfologik_stem.
                # NOTE(review): "morfologik_stem" is not defined in this file; presumably it is
                # provided by the Morfologik (Polish) analysis plugin installed on the cluster.
                "analyzer_with_synonyms_with_lemmatizer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "synonym_filter",
                        "morfologik_stem",
                        "lowercase"
                    ]
                },
                # morfologik_stem only, no synonym expansion.
                "analyzer_without_synonyms_with_lemmatizer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "morfologik_stem",
                        "lowercase"
                    ]
                },
                # Baseline: tokenisation and lowercasing only.
                "analyzer_without_synonyms_without_lemmatizer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase"
                    ]
                },
                # Synonym expansion only, no morfologik_stem.
                "analyzer_with_synonyms_without_lemmatizer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "synonym_filter"
                    ]
                }
            }
        }
    },
    "mappings": {
        # One text field per analyzer; the field name is the analyzer name
        # without the "analyzer_" prefix.
        "properties": {
            "with_synonyms_with_lemmatizer": {
                "type": "text",
                "analyzer": "analyzer_with_synonyms_with_lemmatizer"
            },
            "without_synonyms_with_lemmatizer": {
                "type": "text",
                "analyzer": "analyzer_without_synonyms_with_lemmatizer"
            },
            "with_synonyms_without_lemmatizer": {
                "type": "text",
                "analyzer": "analyzer_with_synonyms_without_lemmatizer"
            },
            "without_synonyms_without_lemmatizer": {
                "type": "text",
                "analyzer": "analyzer_without_synonyms_without_lemmatizer"
            }
        }
    }
}

# Index creation is left commented out; uncomment to create 'fiqa_pl_index_v3'
# from the definition above.
# client.indices.create(index='fiqa_pl_index_v3', body=index_definition)

Ładowanie danych do indeksu - zakomentowane bo już są załadowane

In [5]:
# Load the "corpus" configuration of the FiQA-PL dataset.
ds = load_dataset("clarin-knext/fiqa-pl", "corpus")

# One-time indexing loop, kept commented out because the documents are already
# in the index. It stores the same text under all four analysed fields.
# for item in ds['corpus']:
#     document = {
#         "with_synonyms_with_lemmatizer": item['text'],
#         "without_synonyms_with_lemmatizer": item['text'],
#         "with_synonyms_without_lemmatizer": item['text'],
#         "without_synonyms_without_lemmatizer": item['text']
#     }

#     client.index(index="fiqa_pl_index_v3", id=item['_id'], body=document)
# LOADING V3

In [6]:
def _kwiecien_match_body(field_name):
    """Return a `match` query body looking for "kwiecień" in `field_name`."""
    return {"query": {"match": {field_name: "kwiecień"}}}


# Same word, same index, two fields: one analysed with the synonym filter, one without.
response_with_synonyms = client.search(
    index="fiqa_pl_index_v3",
    body=_kwiecien_match_body("with_synonyms_with_lemmatizer"),
)
response_without_synonyms = client.search(
    index="fiqa_pl_index_v3",
    body=_kwiecien_match_body("without_synonyms_with_lemmatizer"),
)

# Hit totals reported by Elasticsearch, plus the overall index size for reference.
num_documents_with_synonyms = response_with_synonyms['hits']['total']['value']
num_documents_without_synonyms = response_without_synonyms['hits']['total']['value']
total_documents = client.count(index="fiqa_pl_index_v3")['count']

# Results
print(f"Dokumenty zawierające 'kwiecień' (z synonimami): {num_documents_with_synonyms}")
print(f"Dokumenty zawierające 'kwiecień' (bez synonimów): {num_documents_without_synonyms}")
print(f"Liczba wszystkich dokumentów w indeksie: {total_documents}")

Dokumenty zawierające 'kwiecień' (z synonimami): 306
Dokumenty zawierające 'kwiecień' (bez synonimów): 257
Liczba wszystkich dokumentów w indeksie: 57638


In [7]:
ds_qa = load_dataset("clarin-knext/fiqa-pl-qrels")['test'] # test split of query/answer relevance pairs - NDCG is to be computed on this split
ds_queries = load_dataset("clarin-knext/fiqa-pl","queries") # the queries

Tworzymy posortowanego orderedDicta tylko do zapytań, które znajdują się w zbiorze testowym

In [8]:
# Ids of the queries that have relevance judgements in the test split.
test_query_ids = set(ds_qa['query-id'])

# Keep only those queries, as (numeric id, text) pairs ...
test_query_pairs = []
for query_row in ds_queries['queries']:
    numeric_id = int(query_row['_id'])
    if numeric_id in test_query_ids:
        test_query_pairs.append((numeric_id, query_row['text']))

# ... and store them ordered by ascending query id.
test_query_pairs.sort()
filtered_queries = OrderedDict(test_query_pairs)

Funkcja przeszukująca teksty dla podanego zestawu zapytań

In [9]:
def search_text(index_name: str, queries: dict, k: int, es_client=None):
    """Run every query against each of the four analysed fields.

    Parameters
    ----------
    index_name : str
        Name of the Elasticsearch index to search.
    queries : dict
        Mapping of query id -> query text; iterated in the mapping's own order.
    k : int
        Maximum number of hits requested per query (the request's ``size``).
    es_client : optional
        Object exposing ``search(index=..., body=...)``. When omitted, the
        notebook-level ``client`` is used, as before.

    Returns
    -------
    list[list[dict]]
        Four lists, one per analysed field, in this order:
        0 - with synonyms, with lemmatizer; 1 - without synonyms, with lemmatizer;
        2 - with synonyms, without lemmatizer; 3 - without synonyms, without lemmatizer.
        Each entry is ``{'query-id', 'corpus-id', 'analyzer'}``; entries for one
        query are adjacent and kept in the order Elasticsearch returned them.
    """
    es = client if es_client is None else es_client

    # The position in this tuple is the 'analyzer' number stored in each result.
    analyzed_fields = (
        "with_synonyms_with_lemmatizer",
        "without_synonyms_with_lemmatizer",
        "with_synonyms_without_lemmatizer",
        "without_synonyms_without_lemmatizer",
    )
    model_results = [[] for _ in analyzed_fields]

    for query_id, query_text in queries.items():
        for analyzer_idx, field_name in enumerate(analyzed_fields):
            search_body = {
                "query": {
                    "multi_match": {
                        "query": query_text,
                        "fields": [field_name]
                    }
                },
                "size": k
            }
            response = es.search(index=index_name, body=search_body)

            # A query may return fewer than k hits, so callers must not assume
            # fixed-size chunks per query; use 'query-id' to group results.
            model_results[analyzer_idx].extend(
                {
                    'query-id': query_id,
                    'corpus-id': int(hit['_id']),
                    'analyzer': analyzer_idx
                }
                for hit in response['hits']['hits']
            )

    return model_results

In [10]:
search_results = search_text("fiqa_pl_index_v3", filtered_queries, 5) # k=5: up to 5 hits per query, used for NDCG@5

In [11]:
# search_results

In [12]:
def get_DCG(relevance_scores, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    The score at 0-based rank ``r`` contributes ``score / log2(r + 2)``, so the
    top result is not discounted. Returns 0 for an empty list.
    """
    return sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance_scores[:k]))


def get_NDCG(model_results, relevant_documents, k):
    """Normalised DCG@k of one query's ranked results.

    Parameters
    ----------
    model_results : list[dict]
        Ranked hits for a single query; each must have a ``'corpus-id'`` key.
    relevant_documents : iterable or dict
        Either an iterable of relevant document ids (binary relevance, every
        listed id counts as 1; repeated ids are counted once), or a dict of
        ``{document id: relevance score}`` for graded relevance.
    k : int
        Rank cut-off.

    Returns
    -------
    float or int
        ``DCG@k / IDCG@k``, or 0 when there is no relevant document
        (ideal DCG is zero).
    """
    if isinstance(relevant_documents, dict):
        gains = dict(relevant_documents)
    else:
        # Building a dict deduplicates ids, so a repeated id cannot inflate the
        # ideal ranking, and gives constant-time membership checks below.
        gains = {doc_id: 1 for doc_id in relevant_documents}

    relevance_scores = [gains.get(doc['corpus-id'], 0) for doc in model_results]
    dcg_k = get_DCG(relevance_scores, k)

    # Ideal ranking: all known gains, best first (get_DCG truncates to k).
    ideal_relevance_scores = sorted(gains.values(), reverse=True)
    idcg_k = get_DCG(ideal_relevance_scores, k)

    ndcg_k = dcg_k / idcg_k if idcg_k > 0 else 0
    return ndcg_k

Tworzymy listę prawidłowych dopasowań tekstów do zapytań

In [13]:
# query id -> list of corpus ids judged relevant for that query,
# in the order the pairs appear in the test split.
correct_results = defaultdict(list)
for qrel in ds_qa:
    correct_results[qrel['query-id']].append(qrel['corpus-id'])

# Same information as a list of records, one per query.
correct_list = []
for query_id, corpuses in correct_results.items():
    correct_list.append({'query-id': query_id, 'corpuses-id': corpuses})

In [14]:
# NDCG@k for every (analyzer, test query) pair.
# ndcgs[j] is a list of (query id, NDCG) tuples for analyzer j.
ndcgs = []
k=5  # must match the k that was passed to search_text

for analyzer_hits in search_results:
    # Group the hits by the query that produced them. Slicing the flat list into
    # fixed chunks of k would pair hits with the wrong query as soon as one query
    # returns fewer than k hits, or if correct_list is not in the same order as
    # the queries were searched in.
    hits_by_query = defaultdict(list)
    for hit in analyzer_hits:
        hits_by_query[hit['query-id']].append(hit)

    results = []
    for entry in correct_list:
        query_id = entry['query-id']
        # A query with no hits at all is scored against an empty result list.
        ndcg = get_NDCG(hits_by_query.get(query_id, []), entry['corpuses-id'], k)
        results.append((query_id, float(ndcg)))

    ndcgs.append(results)

0 - with synonyms with lemmatizer

1 - without synonyms with lemmatizer

2 - with synonyms without lemmatizer

3 - without synonyms without lemmatizer


In [15]:
# Mean NDCG per analyzer, printed as a percentage with two decimals.
for i, scored_queries in enumerate(ndcgs):
    values = [value for _, value in scored_queries]
    avg = sum(values) / len(values)
    print(f"{i}: {round(avg*100,2)} %")

0: 18.58 %
1: 18.51 %
2: 13.84 %
3: 13.85 %


In [18]:
## NOTE: THE CODE ASSUMES THAT ALL RELEVANCE SCORES = 1 <- this could be changed

1. What are the strengths and weaknesses of regular expressions versus full text search regarding processing of text?
   
Because a regular expression has no index to consult, it must scan the entire document from start to finish on every search, which makes it practical mainly for smaller datasets or for narrowly specified text patterns. In return, regex provides greater control over the matching process, allowing for the detection of unusual or custom patterns. It works well in scenarios such as log file analysis, validating input data formats, or identifying highly specific text patterns like email addresses or dates.

Full-text search, however, is better suited to handling large datasets. Although building the necessary indexes takes time, the subsequent search process is very fast. FTS is also the better choice when the meaning and context of words are important, such as in searches where relevance, semantics, or synonyms matter. It is particularly effective for queries written in natural language (the way humans typically communicate), because its analysis steps can account for variations of terms, such as inflected forms and synonyms, which improves search accuracy and flexibility.

2. Can an LLM be applied in the context of searching for documents? Justify your answer, excluding the obvious observation that an LLM can be used to formulate the answer.

Yes. LLMs excel at grasping context and managing ambiguity, and they can be adapted to new data through further training, leading to more accurate and relevant search results. This makes LLMs a powerful enhancement to document search, going beyond the capabilities of traditional FTS. In practice, a model can encode queries and documents as dense vectors that are compared by similarity, or it can re-rank or expand the results of a keyword search. By incorporating LLMs into a search system, the experience becomes more intelligent, allowing for intent-based analysis of user queries. As a result, the system can retrieve documents that don’t necessarily contain the exact keywords but are still highly relevant because they convey similar ideas or concepts.