In [85]:
from elasticsearch import Elasticsearch
from datasets import load_dataset
import itertools
from tqdm.notebook import tqdm
import pandas as pd

es = Elasticsearch(['http://localhost:9200/'])

es.info().body

{'name': '65db7b99f9bf',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'oMd2u2xuSBugp0oySEszgw',
 'version': {'number': '8.10.4',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': 'b4a62ac808e886ff032700c391f45f1408b2538c',
  'build_date': '2023-10-11T22:04:35.506990650Z',
  'build_snapshot': False,
  'lucene_version': '9.7.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

## 3,4 Define an ES analyzer for Polish texts. Define another analyzer for Polish, without the synonym filter.

In [87]:
settings= {
  "analysis": {
    "analyzer": {
      "analyzer_with_synonym": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": [
          "synonyms_filter",
          "lowercase",
          "morfologik_stem",
          "lowercase"
        ]
      },
      "analyzer_without_synonym": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": [
          "lowercase",
          "morfologik_stem",
          "lowercase"
        ]
      },
      "analyzer_without_lematization": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": [
          "synonyms_filter",
          "lowercase"
        ]
      },
      "analyzer_without_synonym_and_lematization": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": [
          "lowercase"
        ]
      }
    },
    "filter": {
     "synonyms_filter": {
                "type": "synonym",
                "synonyms": [
            "styczeń, sty, I",
            "luty, lut, II",
            "marzec, mar, III",
            "kwiecień, kwi, IV",
            "maj, V",
            "czerwiec, cze, VI",
            "lipiec, lip, VII",
            "sierpień, sie, VIII",
            "wrzesień, wrz, IX",
            "październik, paź, X",
            "listopad, lis, XI",
            "grudzień, gru, XII"
          ]
        }
    }
  }
}

## 5 Define an ES index for storing the contents of the corpus from lab 1 using both analyzers. Use different names for the fields analyzed with a different pipeline.

In [89]:
INDEX_NAME1 = "analyzer_with_synonym"
INDEX_NAME2 = "analyzer_without_synonym"

In [90]:
es.indices.delete(index=INDEX_NAME1)
es.indices.delete(index=INDEX_NAME2)

ObjectApiResponse({'acknowledged': True})

In [91]:
es.indices.create(index=INDEX_NAME1, settings=settings)
es.indices.create(index=INDEX_NAME2, settings=settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'analyzer_without_synonym'})

## 6 Load the data to the ES index.

In [93]:
dataset = load_dataset("clarin-knext/fiqa-pl", "corpus")
dataset

Found cached dataset fiqa-pl (C:/Users/Macie/.cache/huggingface/datasets/clarin-knext___fiqa-pl/corpus/0.0.0/bada00640881ee3fd04c3b88df9edd435616d17e0a46faf05e63063858742140)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 57638
    })
})

In [94]:
df=pd.DataFrame(dataset['corpus'])
df

Unnamed: 0,_id,title,text
0,3,,"Nie mówię, że nie podoba mi się też pomysł szk..."
1,31,,Tak więc nic nie zapobiega fałszywym ocenom po...
2,56,,Nigdy nie możesz korzystać z FSA dla indywidua...
3,59,,Samsung stworzył LCD i inne technologie płaski...
4,63,,Oto wymagania SEC: Federalne przepisy dotycząc...
...,...,...,...
57633,599946,,">Cóż, po pierwsze, drogi to coś więcej niż hob..."
57634,599953,,"Tak, robią. Na dotacje dla firm farmaceutyczny..."
57635,599966,,">To bardzo smutne, że nie rozumiesz ludzkiej n..."
57636,599975,,„Czy Twój CTO pozwolił dużej grupie użyć „„adm...


In [95]:
body = list(itertools.chain(*df.apply(lambda x: ({'index': {'_id': x._id}}, {'title': x.title, 'text': x.text}) , axis=1)))
chunk_size = 200
for chunk in tqdm([body[i:i + chunk_size] for i in range(0, len(body), chunk_size)]):
    es.bulk(
        index=INDEX_NAME1,
        body=chunk
    )
    es.bulk(
        index=INDEX_NAME2,
        body=chunk
    )


  0%|          | 0/577 [00:00<?, ?it/s]

In [99]:
print(es.count(index=INDEX_NAME1)['count'])
print(es.count(index=INDEX_NAME2)['count'])

57638
57638


## 7 Determine the number of documents containing the word styczeń (in any form) including and excluding the synonyms.

In [103]:
es.count(index=INDEX_NAME1,  analyzer='analyzer_with_synonym', q='text:styczeń')['count']

44123

Count of word 'styczeń' with synonymes is not reliable. Possible that elastic search takes wrong synonymes

In [101]:
es.count(index=INDEX_NAME2,  analyzer='analyzer_without_synonym', q='text:styczeń')['count']

29

## 8 Download the QA pairs for the FIQA dataset.

In [105]:
QA_dataset = load_dataset("clarin-knext/fiqa-pl-qrels", 'corpus')

Downloading readme:   0%|          | 0.00/201 [00:00<?, ?B/s]

Downloading and preparing dataset csv/clarin-knext--fiqa-pl-qrels to C:/Users/Macie/.cache/huggingface/datasets/clarin-knext___csv/clarin-knext--fiqa-pl-qrels-87c7ba66b4612e3c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/Macie/.cache/huggingface/datasets/clarin-knext___csv/clarin-knext--fiqa-pl-qrels-87c7ba66b4612e3c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## What are the strengths and weaknesses of regular expressions versus full text search regarding processing of text?


The biggest advantage of regular search is how quickly we get acquainted with it. It is easy to use and very useful for smaller natural language processing tasks. In contrast, using elastic search is much more complicated. From installation problems to very extensive usage options. Without a doubt, it is a better tool when you want to solve very complex natural processing tasks.

## Is full text search applicable to the question answering problem? show at least 3 examples from the corpus to support your claim.

Full search can be applicable to the question answering problem. It should be noted that it searches out of context, so some of the results may not be what u are looking for.
usage examples:
- FAQs
- Legal Documents
- Medical Literature