### Imports

In [69]:
import time
import pandas as pd
from typing import Callable, Union, List, Dict, Any, Tuple
from tqdm.notebook import tqdm

In [2]:
from index_utils import IndexUtil
from scoring_utils import ScoringUtil
from experiment_utils import ExperimentUtil

### Prepare index/mappings/settings

In [3]:
# Index name shared by the IndexUtil handle below and by the experiment's default `index` argument.
INDEX_NAME = 'index_similarity'

In [4]:
# IndexUtil handle built from INDEX_NAME; the experiment deletes, recreates and fills the index through it.
INDEX = IndexUtil(INDEX_NAME)

In [36]:
def get_index_mappings(analyzer:str = 'stop-english_standard_analyzer'):
    """Build the index mappings: a keyword id field plus an analyzed text field.

    Args:
        analyzer: Name of the analyzer assigned to the ``text`` field.

    Returns:
        A dict with a single ``properties`` entry describing ``article_id``
        (type ``keyword``) and ``text`` (type ``text`` with ``analyzer``).
    """
    article_id_field = {"type": "keyword"}
    text_field = {"type": "text", "analyzer": analyzer}
    return {"properties": {"article_id": article_id_field, "text": text_field}}

In [68]:
def get_settings(similarity:str = 'bm25'):
    """Assemble index settings configured for the given similarity function.

    Args:
        similarity: Similarity name forwarded to ``ScoringUtil.set_similarity``.

    Returns:
        The object obtained from ``IndexUtil.get_default_settings()`` after it
        has been passed to ``IndexUtil.set_shards_in_settings`` and
        ``ScoringUtil.set_similarity``.
    """
    index_settings = IndexUtil.get_default_settings()
    # Both helpers are called only for their effect on index_settings;
    # their return values are not used.
    IndexUtil.set_shards_in_settings(index_settings)
    ScoringUtil.set_similarity(index_settings, similarity)
    return index_settings

In [38]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    """Translate a dataset document into the fields stored in the index.

    Args:
        doc: Source document; must contain the keys ``uuid`` and ``text``.

    Returns:
        A new dict with ``article_id`` (taken from ``uuid``) and ``text``.

    Raises:
        KeyError: If ``doc`` lacks ``uuid`` or ``text``.
    """
    indexed_doc = {}
    indexed_doc['article_id'] = doc['uuid']
    indexed_doc['text'] = doc['text']
    return indexed_doc

### Load datasets

In [18]:
# Each load_dataset result is unpacked into a (documents, questions) pair, one pair per dataset.
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k')
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_ui')

### Experiment

In [39]:
def test_similarity_functions(documents, questions, index = INDEX_NAME, query_fuc = INDEX.default_query,
                              similarity_range = None, analyzers_range = None):
    """Measure hits@k for every similarity-function / analyzer combination.

    For each combination the global ``INDEX`` is deleted, recreated with the
    matching mappings and settings, filled with ``documents``, and then
    evaluated through ``ExperimentUtil.validate``.

    Args:
        documents: Documents to index; each goes through ``document_mapping_func``.
        questions: Questions handed to ``ExperimentUtil.validate``.
        index: Index name handed to ``ExperimentUtil.validate``.
            NOTE(review): the index itself is always rebuilt through the global
            ``INDEX``, so this presumably has to name the same index - confirm.
        query_fuc: Query function handed to ``ExperimentUtil.validate``.
        similarity_range: Similarity names to evaluate. ``None`` (default)
            selects the seven names that used to be hard-coded here.
        analyzers_range: Analyzer names to evaluate. ``None`` (default)
            selects the two analyzers that used to be hard-coded here.

    Returns:
        A ``pandas.DataFrame`` with one row per combination and the columns
        ``similarity_function``, ``analyzer``, ``hits@10``, ``hits@5``,
        ``hits@3`` and ``hits@1``.
    """
    if similarity_range is None:
        similarity_range = ['bm25','dfr','dfi','ib','lm_d','lm_jm','tfidf']
    if analyzers_range is None:
        analyzers_range = ['stop-english_standard_analyzer','stemmer-porter2_standard_analyzer']
    # The analyzers are iterated once per similarity, so a one-shot iterable
    # must be materialised up front.
    analyzers_range = list(analyzers_range)

    hit_columns = ['hits@10', 'hits@5', 'hits@3', 'hits@1']
    rows = []
    for similarity in tqdm(similarity_range):
        for analyzer in analyzers_range:
            # Rebuild the index from scratch so every combination starts clean.
            INDEX.delete_index()
            INDEX.create_index(get_index_mappings(analyzer), get_settings(similarity))
            INDEX.index_all_docs(documents, document_mapping_func)
            # NOTE(review): presumably a pause so the freshly indexed documents
            # become searchable before validation - confirm.
            time.sleep(1)
            all_hits = ExperimentUtil.validate(index, questions, query_fuc)
            row = {'similarity_function': similarity, 'analyzer': analyzer}
            row.update((column, all_hits[column]) for column in hit_columns)
            rows.append(row)
    # Explicit columns keep the frame's layout stable even when no rows were produced.
    return pd.DataFrame(rows, columns=['similarity_function', 'analyzer', *hit_columns])

In [59]:
# similarity_swift_df = test_similarity_functions(DOCUMENTS_SWIFT, QUESTIONS_SWIFT)

In [61]:
# similarity_squad_df = test_similarity_functions(DOCUMENTS_SQUAD, QUESTIONS_SQUAD)

In [62]:
# similarity_swift_df.to_csv('results/similarity_functions_swift.csv')

In [63]:
# similarity_squad_df.to_csv('results/similarity_functions_squad.csv')

### Explore results

#### SWIFT_UI

In [53]:
# Order rows by analyzer, then hits@10 (both descending), and caption the styled table.
# NOTE: similarity_swift_df is only assigned in a commented-out cell earlier in this notebook.
similarity_swift_df_10 = (
    similarity_swift_df
    .sort_values(['analyzer','hits@10'], ascending=False)
    .style
    .set_caption("SWIFT_UI - impact of similarity function by hits@10")
)
similarity_swift_df_10

Unnamed: 0,similarity_function,analyzer,hits@10,hits@5,hits@3,hits@1
0,bm25,stop-english_standard_analyzer,0.940217,0.86413,0.804348,0.603261
4,dfi,stop-english_standard_analyzer,0.940217,0.86413,0.798913,0.608696
6,ib,stop-english_standard_analyzer,0.940217,0.86413,0.798913,0.608696
8,lm_d,stop-english_standard_analyzer,0.940217,0.880435,0.793478,0.592391
12,tfidf,stop-english_standard_analyzer,0.918478,0.847826,0.788043,0.570652
10,lm_jm,stop-english_standard_analyzer,0.875,0.782609,0.711957,0.51087
2,dfr,stop-english_standard_analyzer,0.869565,0.815217,0.744565,0.538043
13,tfidf,stemmer-porter2_standard_analyzer,0.945652,0.902174,0.798913,0.543478
5,dfi,stemmer-porter2_standard_analyzer,0.940217,0.86413,0.798913,0.608696
7,ib,stemmer-porter2_standard_analyzer,0.940217,0.86413,0.798913,0.608696


In [56]:
# Order rows by analyzer, then hits@5 (both descending), and caption the styled table.
# NOTE: similarity_swift_df is only assigned in a commented-out cell earlier in this notebook.
similarity_swift_df_5 = (
    similarity_swift_df
    .sort_values(['analyzer','hits@5'], ascending=False)
    .style
    .set_caption("SWIFT_UI - impact of similarity function by hits@5")
)
similarity_swift_df_5

Unnamed: 0,similarity_function,analyzer,hits@10,hits@5,hits@3,hits@1
8,lm_d,stop-english_standard_analyzer,0.940217,0.880435,0.793478,0.592391
0,bm25,stop-english_standard_analyzer,0.940217,0.86413,0.804348,0.603261
4,dfi,stop-english_standard_analyzer,0.940217,0.86413,0.798913,0.608696
6,ib,stop-english_standard_analyzer,0.940217,0.86413,0.798913,0.608696
12,tfidf,stop-english_standard_analyzer,0.918478,0.847826,0.788043,0.570652
2,dfr,stop-english_standard_analyzer,0.869565,0.815217,0.744565,0.538043
10,lm_jm,stop-english_standard_analyzer,0.875,0.782609,0.711957,0.51087
13,tfidf,stemmer-porter2_standard_analyzer,0.945652,0.902174,0.798913,0.543478
9,lm_d,stemmer-porter2_standard_analyzer,0.940217,0.875,0.809783,0.586957
1,bm25,stemmer-porter2_standard_analyzer,0.934783,0.86413,0.793478,0.581522


#### SQUAD_10k

In [66]:
# Order rows by analyzer, then hits@10 (both descending), and caption the styled table.
# NOTE: similarity_squad_df is only assigned in a commented-out cell earlier in this notebook.
similarity_squad_df_10 = (
    similarity_squad_df
    .sort_values(['analyzer','hits@10'], ascending=False)
    .style
    .set_caption("SQUAD_10k - impact of similarity function by hits@10")
)
similarity_squad_df_10

Unnamed: 0,similarity_function,analyzer,hits@10,hits@5,hits@3,hits@1
4,dfi,stop-english_standard_analyzer,0.905,0.871,0.83,0.717
0,bm25,stop-english_standard_analyzer,0.904,0.868,0.835,0.716
6,ib,stop-english_standard_analyzer,0.904,0.871,0.83,0.717
2,dfr,stop-english_standard_analyzer,0.896,0.848,0.818,0.698
8,lm_d,stop-english_standard_analyzer,0.887,0.839,0.793,0.656
12,tfidf,stop-english_standard_analyzer,0.882,0.839,0.8,0.65
10,lm_jm,stop-english_standard_analyzer,0.877,0.833,0.787,0.666
1,bm25,stemmer-porter2_standard_analyzer,0.924,0.902,0.867,0.764
13,tfidf,stemmer-porter2_standard_analyzer,0.909,0.873,0.837,0.69
3,dfr,stemmer-porter2_standard_analyzer,0.906,0.881,0.853,0.735


In [67]:
# Order rows by analyzer, then hits@5 (both descending), and caption the styled table.
# NOTE: similarity_squad_df is only assigned in a commented-out cell earlier in this notebook.
similarity_squad_df_5 = (
    similarity_squad_df
    .sort_values(['analyzer','hits@5'], ascending=False)
    .style
    .set_caption("SQUAD_10k - impact of similarity function by hits@5")
)
similarity_squad_df_5

Unnamed: 0,similarity_function,analyzer,hits@10,hits@5,hits@3,hits@1
4,dfi,stop-english_standard_analyzer,0.905,0.871,0.83,0.717
6,ib,stop-english_standard_analyzer,0.904,0.871,0.83,0.717
0,bm25,stop-english_standard_analyzer,0.904,0.868,0.835,0.716
2,dfr,stop-english_standard_analyzer,0.896,0.848,0.818,0.698
8,lm_d,stop-english_standard_analyzer,0.887,0.839,0.793,0.656
12,tfidf,stop-english_standard_analyzer,0.882,0.839,0.8,0.65
10,lm_jm,stop-english_standard_analyzer,0.877,0.833,0.787,0.666
1,bm25,stemmer-porter2_standard_analyzer,0.924,0.902,0.867,0.764
3,dfr,stemmer-porter2_standard_analyzer,0.906,0.881,0.853,0.735
13,tfidf,stemmer-porter2_standard_analyzer,0.909,0.873,0.837,0.69
