### Imports

In [1]:
import time
import numpy as np
import pandas as pd
from typing import Callable, Union, List, Dict, Any, Tuple
from tqdm.notebook import tqdm

In [2]:
from index_utils import IndexUtil
from experiment_utils import ExperimentUtil

### Prepare index/mappings/settings

In [3]:
# Index name handed to IndexUtil and used as the default `index` argument of the experiment.
INDEX_NAME = 'index_analyzers'

In [4]:
# Index helper built for INDEX_NAME; the experiment uses it to delete, create and fill the index.
INDEX = IndexUtil(INDEX_NAME)

In [5]:
# Default index settings as returned by IndexUtil.get_default_settings().
# NOTE(review): the name is misspelled ("DEAFULT"); left unchanged so existing references keep working.
DEAFULT_SETTINGS = IndexUtil.get_default_settings()

In [6]:
def get_mappings(analyzer:str = 'stop-english_standard_analyzer'):
    """Build the index mappings used by the analyzer experiment.

    Args:
        analyzer: Name of the analyzer assigned to the ``text`` field.

    Returns:
        A new mappings dict with two properties: ``article_id`` (type
        ``keyword``) and ``text`` (type ``text``, analyzed with ``analyzer``).
    """
    article_id_field = {"type": "keyword"}
    text_field = {"type": "text", "analyzer": analyzer}
    field_definitions = {"article_id": article_id_field, "text": text_field}
    return {"properties": field_definitions}

In [7]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    """Convert a dataset document into the shape that gets indexed.

    Args:
        doc: Source document; its ``uuid`` and ``text`` entries are read.

    Returns:
        A new dict holding ``article_id`` (taken from ``uuid``) and ``text``.

    Raises:
        KeyError: If ``doc`` has no ``uuid`` or no ``text`` entry.
    """
    article_id = doc['uuid']
    body = doc['text']
    return dict(article_id=article_id, text=body)

### Load datasets

In [8]:
# Each ExperimentUtil.load_dataset call is unpacked into a (documents, questions) pair.
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k')
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_ui')

### Experiment

In [20]:
def prepare_analyzers_list():
    """Return the analyzer names evaluated by the experiment.

    Returns:
        A new list of analyzer names: three shingle variants, two
        keyword-repeat/porter2 variants and two ngram variants, in that order.
    """
    shingle_analyzers = [
        'shingle-1-2-3_standard_analyzer',
        'stop-english_stemmer-english_shingle-1-3-3_standard_analyzer',
        'shingle-1-3-3_letter_analyzer',
    ]
    keyword_repeat_analyzers = [
        'stop-english_keyword-repeat_stemmer-porter2_remove_duplicates_standard_analyzer',
        'keyword-repeat_stemmer-porter2_remove_duplicates_standard_analyzer',
    ]
    ngram_analyzers = [
        'ngram_standard_analyzer',
        '2-3-ngram_standard_analyzer',
    ]
    # Disabled extension: also include every analyzer from the default settings
    # whose name contains neither 'shingle' nor 'pattern'.
    #     [analyzer for analyzer in DEAFULT_SETTINGS['analysis']['analyzer'] if 'shingle' not in analyzer and 'pattern' not in analyzer]
    return [*shingle_analyzers, *keyword_repeat_analyzers, *ngram_analyzers]

In [24]:
def test_analyzers_impact(documents, questions, index = INDEX_NAME, query_fuc = INDEX.default_query):
    """Re-index ``documents`` once per analyzer and collect retrieval hit rates.

    For every analyzer name from ``prepare_analyzers_list()`` the global
    ``INDEX`` is deleted, recreated with mappings for that analyzer and filled
    with ``documents``; ``ExperimentUtil.validate`` is then run on ``questions``.

    Args:
        documents: Documents passed to ``INDEX.index_all_docs``.
        questions: Questions passed to ``ExperimentUtil.validate``.
        index: Index name handed to ``ExperimentUtil.validate``. Index
            management itself always goes through the global ``INDEX``.
        query_fuc: Query function handed to ``ExperimentUtil.validate``.

    Returns:
        A DataFrame with one row per analyzer and the columns ``analyzer``,
        ``hits@10``, ``hits@5``, ``hits@3`` and ``hits@1``.
    """
    metric_names = ('hits@10', 'hits@5', 'hits@3', 'hits@1')
    rows = []
    for analyzer_name in tqdm(prepare_analyzers_list()):
        # Rebuild the index from scratch with this analyzer's mappings.
        INDEX.delete_index()
        INDEX.create_index(mappings=get_mappings(analyzer_name))
        INDEX.index_all_docs(documents, document_mapping_func)
        # Pause before querying; presumably gives the index time to refresh (TODO confirm).
        time.sleep(1)
        scores = ExperimentUtil.validate(index, questions, query_fuc)
        row = {'analyzer': analyzer_name}
        for metric in metric_names:
            row[metric] = scores[metric]
        rows.append(row)
    column_names = ('analyzer',) + metric_names
    # Pivot the per-analyzer rows into one list per column before building the frame.
    columns = {name: [row[name] for row in rows] for name in column_names}
    return pd.DataFrame.from_dict(columns)

In [25]:
# analyzers_swift_df = test_analyzers_impact(DOCUMENTS_SWIFT, QUESTIONS_SWIFT)

In [34]:
# analyzers_squad_df = test_analyzers_impact(DOCUMENTS_SQUAD, QUESTIONS_SQUAD)

In [15]:
# analyzers_swift_df.to_csv('results/analyzers_impact_swift.csv')

In [33]:
# analyzers_squad_df.to_csv('results/analyzers_impact_squad.csv')

In [35]:
# Read the experiment results from CSV (first column becomes the index) instead of
# re-running the experiment cells, which are commented out.
analyzers_swift_df = pd.read_csv('results/analyzers_impact_swift.csv', index_col=[0])
analyzers_squad_df = pd.read_csv('results/analyzers_impact_squad.csv', index_col=[0])

### Explore results

#### SWIFT

In [18]:
# SWIFT results ordered by hits@10, highest first.
analyzers_swift_df.sort_values('hits@10', ascending=False)

Unnamed: 0,analyzer,hits@10,hits@5,hits@3,hits@1
41,classic_analyzer,0.940217,0.858696,0.793478,0.592391
12,stop-english_standard_analyzer,0.940217,0.86413,0.804348,0.603261
42,stop-english_classic_analyzer,0.940217,0.853261,0.798913,0.586957
11,standard_analyzer,0.940217,0.86413,0.798913,0.608696
23,stemmer-english_letter_analyzer,0.934783,0.86413,0.804348,0.592391
22,stop-english_letter_analyzer,0.934783,0.869565,0.815217,0.592391
21,letter_analyzer,0.934783,0.875,0.809783,0.603261
20,stop-english_stemmer-porter2_standard_analyzer,0.934783,0.853261,0.793478,0.581522
17,stop-english_stemmer-english_standard_analyzer,0.934783,0.86413,0.798913,0.581522
16,stemmer-porter2_standard_analyzer,0.934783,0.86413,0.793478,0.581522


In [19]:
# SWIFT results ordered by hits@5, highest first.
analyzers_swift_df.sort_values('hits@5', ascending=False)

Unnamed: 0,analyzer,hits@10,hits@5,hits@3,hits@1
28,stop-english_stemmer-lightenglish_letter_analyzer,0.923913,0.88587,0.820652,0.586957
24,stemmer-lightenglish_letter_analyzer,0.929348,0.880435,0.820652,0.581522
25,stemmer-lovins_letter_analyzer,0.918478,0.875,0.788043,0.554348
18,stop-english_stemmer-lightenglish_standard_ana...,0.929348,0.875,0.820652,0.570652
14,stemmer-lightenglish_standard_analyzer,0.923913,0.875,0.798913,0.559783
21,letter_analyzer,0.934783,0.875,0.809783,0.603261
2,shingle-1-3-3_letter_analyzer,0.934783,0.875,0.798913,0.592391
48,stop-english_stemmer-lightenglish_classic_anal...,0.929348,0.869565,0.815217,0.559783
29,stop-english_stemmer-lovins_letter_analyzer,0.918478,0.869565,0.777174,0.565217
49,stop-english_stemmer-lovins_classic_analyzer,0.913043,0.869565,0.788043,0.538043


#### SQUAD

In [37]:
# SQUAD results ordered by hits@10, highest first.
analyzers_squad_df.sort_values('hits@10', ascending=False)

Unnamed: 0,analyzer,hits@10,hits@5,hits@3,hits@1
10,stop-english_stemmer-porter2_standard_analyzer,0.928,0.896,0.872,0.766
3,stop-english_keyword-repeat_stemmer-porter2_re...,0.927,0.894,0.868,0.767
40,stop-english_stemmer-porter2_classic_analyzer,0.926,0.894,0.869,0.765
4,keyword-repeat_stemmer-porter2_remove_duplicat...,0.924,0.901,0.87,0.771
6,stemmer-porter2_standard_analyzer,0.924,0.902,0.867,0.764
36,stemmer-porter2_classic_analyzer,0.922,0.899,0.865,0.765
7,stop-english_stemmer-english_standard_analyzer,0.918,0.886,0.858,0.751
37,stop-english_stemmer-english_classic_analyzer,0.916,0.883,0.854,0.75
4,stemmer-lightenglish_standard_analyzer,0.916,0.89,0.857,0.759
17,stop-english_stemmer-english_letter_analyzer,0.915,0.88,0.852,0.748


In [38]:
# SQUAD results ordered by hits@5, highest first.
analyzers_squad_df.sort_values('hits@5', ascending=False)

Unnamed: 0,analyzer,hits@10,hits@5,hits@3,hits@1
6,stemmer-porter2_standard_analyzer,0.924,0.902,0.867,0.764
4,keyword-repeat_stemmer-porter2_remove_duplicat...,0.924,0.901,0.87,0.771
36,stemmer-porter2_classic_analyzer,0.922,0.899,0.865,0.765
10,stop-english_stemmer-porter2_standard_analyzer,0.928,0.896,0.872,0.766
40,stop-english_stemmer-porter2_classic_analyzer,0.926,0.894,0.869,0.765
3,stop-english_keyword-repeat_stemmer-porter2_re...,0.927,0.894,0.868,0.767
14,stemmer-lightenglish_letter_analyzer,0.914,0.89,0.856,0.753
4,stemmer-lightenglish_standard_analyzer,0.916,0.89,0.857,0.759
8,stop-english_stemmer-lightenglish_standard_ana...,0.914,0.889,0.859,0.754
34,stemmer-lightenglish_classic_analyzer,0.914,0.887,0.855,0.76
