### Imports

In [1]:
import time
import pandas as pd
from typing import Callable, Union, List, Dict, Any, Tuple
from tqdm.notebook import tqdm

In [2]:
from index_utils import IndexUtil
from experiment_utils import ExperimentUtil

### Prepare index/mappings/settings

In [3]:
# Name of the index used throughout this notebook: it is handed to IndexUtil and
# is the default search/validation target of the query and experiment functions.
INDEX_NAME = 'index_shards'

In [4]:
# Shared IndexUtil handle built for INDEX_NAME. The experiment functions below use it
# to delete, create and populate the index, and its `default_query` is the default
# query function for the "without DFS" validation runs.
INDEX = IndexUtil(INDEX_NAME)

In [5]:
def get_index_mappings(analyzer:str = 'stop-english_stemmer-porter2_standard_analyzer'):
    """Build the mappings dict that is passed to ``INDEX.create_index``.

    Two fields are declared: ``article_id`` (type ``keyword``) and ``text``
    (type ``text``, analysed with ``analyzer``).

    Args:
        analyzer: Name of the analyzer attached to the ``text`` field.
            NOTE(review): presumably this name must exist in the index
            settings -- confirm against ``IndexUtil.get_default_settings``.

    Returns:
        A new ``{"properties": {...}}`` dict on every call.
    """
    article_id_field = dict(type="keyword")
    text_field = dict(type="text", analyzer=analyzer)
    field_definitions = dict(article_id=article_id_field, text=text_field)
    return {"properties": field_definitions}

In [6]:
def get_settings(shards_number: int):
    """Return the default index settings with the shard count set to ``shards_number``.

    Args:
        shards_number: Value forwarded as ``shards`` to
            ``IndexUtil.set_shards_in_settings``.

    Returns:
        The object obtained from ``IndexUtil.get_default_settings()`` after it
        has been passed through ``IndexUtil.set_shards_in_settings``.
    """
    index_settings = IndexUtil.get_default_settings()
    # The helper's return value is deliberately not used: the object fetched
    # above is what gets returned, so the helper is expected to update it in place.
    IndexUtil.set_shards_in_settings(index_settings, shards=shards_number)
    return index_settings

In [7]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    """Convert a dataset document into the shape that gets indexed.

    Args:
        doc: Source document; its ``'uuid'`` and ``'text'`` entries are read.

    Returns:
        A new dict with ``'article_id'`` (taken from ``doc['uuid']``) and
        ``'text'``; any other entries of ``doc`` are dropped.

    Raises:
        KeyError: If ``doc`` has no ``'uuid'`` or no ``'text'`` entry.
    """
    indexed_doc = {}
    indexed_doc['article_id'] = doc['uuid']
    indexed_doc['text'] = doc['text']
    return indexed_doc

### Load datasets

In [8]:
# Every ExperimentUtil.load_dataset call is unpacked into a (documents, questions) pair.
# 'squad_10k': corpus for the shards/DFS experiment (see the commented-out test_shards_impact call).
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k')
# 'squad_train': pool of extra documents passed to test_shards_impact_vs_index_size to grow the index.
DOCUMENTS_SQUAD_TRAIN, QUESTIONS_SQUAD_TRAIN = ExperimentUtil.load_dataset('squad_train')
# 'squad_1k': default base documents/questions of test_shards_impact_vs_index_size.
DOCUMENTS_SQUAD_BASE, QUESTIONS_SQUAD_BASE = ExperimentUtil.load_dataset('squad_1k')
# 'swift_ui': second corpus for the shards/DFS experiment.
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_ui')

In [9]:
# Number of questions in the base set (the default question set of
# test_shards_impact_vs_index_size).
len(QUESTIONS_SQUAD_BASE)

500

### Query

In [10]:
def dfs_query(query, index=INDEX, index_name=INDEX_NAME, limit = 10):
    """Run a ``multi_match`` search on the ``text`` field using DFS scoring.

    The request is sent with ``search_type='dfs_query_then_fetch'``, which asks
    Elasticsearch to collect term statistics from all shards before scoring
    (see the Elasticsearch ``search_type`` documentation).

    Args:
        query: Query text to match against the ``text`` field.
        index: Object exposing ``elastic_connector.search``; defaults to the
            notebook-wide ``INDEX``. Previously this argument was accepted but
            ignored in favour of the global.
        index_name: Name of the index to search; defaults to ``INDEX_NAME``.
            Previously ignored in favour of the global as well.
        limit: Maximum number of hits requested (sent as ``size``).

    Returns:
        List with the ``_source`` of every returned hit, in response order.

    Note:
        The defaults are bound when this cell is executed; re-run this cell
        after re-creating ``INDEX`` or changing ``INDEX_NAME``.
    """
    response = index.elastic_connector.search(
        index=index_name,
        size=limit,
        query={
            "multi_match": {
                "query": query,
                "fields": ["text"],
            }
        },
        search_type='dfs_query_then_fetch',
    )
    return [hit["_source"] for hit in response['hits']['hits']]

### Experiment

In [11]:
def test_shards_impact(documents, questions, index = INDEX_NAME, query_fuc = INDEX.default_query, dfs_query_fun=dfs_query, shards_range=None):
    """Measure hits@k with and without DFS for a series of shard counts.

    For every shard count the notebook-wide ``INDEX`` is deleted, re-created
    with that many shards and re-populated with ``documents``. ``questions``
    are then validated twice through ``ExperimentUtil.validate``: once with
    ``query_fuc`` and once with ``dfs_query_fun``.

    Args:
        documents: Documents to index; each is converted with
            ``document_mapping_func``.
        questions: Questions handed to ``ExperimentUtil.validate``.
        index: Index identifier handed to ``ExperimentUtil.validate``.
            Re-creation always goes through the global ``INDEX``.
        query_fuc: Query callable for the run without DFS.
        dfs_query_fun: Query callable for the DFS run; it is invoked as
            ``dfs_query_fun(query=...)`` only.
        shards_range: Iterable of shard counts to test. ``None`` (default)
            keeps the original series 1..10, 15, 20, 30.

    Returns:
        pandas.DataFrame with one row per shard count and the columns
        ``shards``, ``hits@10``, ``hits@10_dfs``, ``hits@5``, ``hits@5_dfs``,
        ``hits@3``, ``hits@3_dfs``, ``hits@1``, ``hits@1_dfs``.
    """
    if shards_range is None:
        shards_range = list(range(1, 11)) + [15, 20, 30]

    hit_keys = ('hits@10', 'hits@5', 'hits@3', 'hits@1')
    # Explicit column order: each metric is immediately followed by its DFS twin.
    columns = ['shards']
    for key in hit_keys:
        columns.extend([key, key + '_dfs'])

    def dfs_adapter(query, limit):
        # `limit` is accepted but not forwarded, so dfs_query_fun runs with its
        # own default limit.
        return dfs_query_fun(query=query)

    rows = []
    for shard_n in tqdm(shards_range):
        INDEX.delete_index()
        INDEX.create_index(get_index_mappings(), get_settings(shard_n))
        INDEX.index_all_docs(documents, document_mapping_func)
        # NOTE(review): presumably gives the index time to make the freshly
        # indexed documents searchable -- confirm; an explicit refresh would be
        # more reliable than a fixed pause.
        time.sleep(1)

        row = {'shards': shard_n}
        # validation without DFS
        plain_hits = ExperimentUtil.validate(index, questions, query_fuc)
        row.update({key: plain_hits[key] for key in hit_keys})
        # validation with DFS
        dfs_hits = ExperimentUtil.validate(index, questions, dfs_adapter)
        row.update({key + '_dfs': dfs_hits[key] for key in hit_keys})
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [54]:
def test_shards_impact_vs_index_size(documents, documents_base = DOCUMENTS_SQUAD_BASE, questions_base = QUESTIONS_SQUAD_BASE, index = INDEX_NAME, query_fuc = INDEX.default_query, dfs_query_fun=dfs_query, sizes_range=None, shards_range=None):
    """Compare hits@10 / hits@1 with and without DFS across index sizes and shard counts.

    For every combination of extra-document count and shard count, the
    notebook-wide ``INDEX`` is deleted, re-created with that many shards and
    populated with ``documents_base + documents[:extra]``. ``questions_base``
    are then validated twice through ``ExperimentUtil.validate``: once with
    ``query_fuc`` and once with ``dfs_query_fun``.

    Args:
        documents: Pool of additional documents; the first ``extra`` of them
            are appended to ``documents_base``. Must support slicing and
            concatenation with ``documents_base``.
        documents_base: Documents that are always indexed.
        questions_base: Questions handed to ``ExperimentUtil.validate``.
        index: Index identifier handed to ``ExperimentUtil.validate``.
            Re-creation always goes through the global ``INDEX``.
        query_fuc: Query callable for the run without DFS.
        dfs_query_fun: Query callable for the DFS run; it is invoked as
            ``dfs_query_fun(query=...)`` only.
        sizes_range: Iterable of extra-document counts. ``None`` (default)
            keeps the original ``[0, 4000, 14000]``.
        shards_range: Iterable of shard counts. ``None`` (default) keeps the
            original ``[10, 20, 30]``.

    Returns:
        pandas.DataFrame with one row per (size, shards) combination, sizes
        varying slowest, and the columns ``shards``, ``index_size``, and for
        both ``hits@10`` and ``hits@1``: the plain score, the ``_dfs`` score,
        ``_diff`` (DFS minus plain) and ``_relative_diff[%]`` (diff as a
        percentage of the DFS score; NaN when the DFS score is 0).
    """
    if sizes_range is None:
        sizes_range = [0, 4000, 14000]
    if shards_range is None:
        shards_range = [10, 20, 30]
    # Materialised because the shard counts are iterated once per index size.
    shards_range = list(shards_range)

    hit_keys = ('hits@10', 'hits@1')
    columns = ['shards', 'index_size']
    for key in hit_keys:
        columns.extend([key, key + '_dfs', key + '_diff', key + '_relative_diff[%]'])

    def relative_diff(dfs_value, plain_value):
        # A DFS score of 0 would otherwise divide by zero; report NaN instead.
        if dfs_value == 0:
            return float('nan')
        return (dfs_value - plain_value) / dfs_value * 100

    def dfs_adapter(query, limit):
        # `limit` is accepted but not forwarded, so dfs_query_fun runs with its
        # own default limit.
        return dfs_query_fun(query=query)

    rows = []
    for doc_size in tqdm(sizes_range):
        for shards_number in shards_range:
            docs_to_index = documents_base + documents[:doc_size]
            INDEX.delete_index()
            INDEX.create_index(get_index_mappings(), get_settings(shards_number))
            INDEX.index_all_docs(docs_to_index, document_mapping_func)
            # NOTE(review): presumably gives the index time to make the freshly
            # indexed documents searchable -- confirm; an explicit refresh
            # would be more reliable than a fixed pause.
            time.sleep(1)

            # validation without DFS
            plain_hits = ExperimentUtil.validate(index, questions_base, query_fuc)
            plain_scores = {key: plain_hits[key] for key in hit_keys}
            # validation with DFS
            dfs_hits = ExperimentUtil.validate(index, questions_base, dfs_adapter)

            row = {'shards': shards_number, 'index_size': len(docs_to_index)}
            for key in hit_keys:
                plain_value = plain_scores[key]
                dfs_value = dfs_hits[key]
                row[key] = plain_value
                row[key + '_dfs'] = dfs_value
                row[key + '_diff'] = dfs_value - plain_value
                row[key + '_relative_diff[%]'] = relative_diff(dfs_value, plain_value)
            rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [13]:
# similarity_swift_df = test_shards_impact(DOCUMENTS_SWIFT, QUESTIONS_SWIFT)

In [14]:
# similarity_swift_df.to_csv('results/shards_dfs_swift.csv')

In [15]:
# similarity_squad_df = test_shards_impact(DOCUMENTS_SQUAD, QUESTIONS_SQUAD)

In [16]:
# similarity_squad_df.to_csv('results/shards_dfs_squad.csv')

In [59]:
# index_size_impact_squad_df = test_shards_impact_vs_index_size(DOCUMENTS_SQUAD_TRAIN)

In [58]:
# index_size_impact_squad_df.to_csv('results/shards_impact_vs_index_size.csv')

### Explore results

In [60]:
# NOTE(review): index_size_impact_squad_df is only assigned by the commented-out
# test_shards_impact_vs_index_size(...) cell above, so on a fresh kernel this cell
# raises NameError unless that cell is re-enabled and run first.
# Rows are ordered by shard count, then by index size, before styling.
index_size_impact_squad_df.sort_values(['shards','index_size'], ascending=True).style.set_caption("Impact of shards number for different index sizes")

Unnamed: 0,shards,index_size,hits@10,hits@10_dfs,hits@10_diff,hits@10_relative_diff[%],hits@1,hits@1_dfs,hits@1_diff,hits@1_relative_diff[%]
0,10,1000,0.96,0.972,0.012,1.234568,0.76,0.79,0.03,3.797468
3,10,5000,0.94,0.954,0.014,1.467505,0.714,0.738,0.024,3.252033
6,10,15000,0.92,0.922,0.002,0.21692,0.702,0.72,0.018,2.5
1,20,1000,0.954,0.972,0.018,1.851852,0.718,0.79,0.072,9.113924
4,20,5000,0.946,0.954,0.008,0.838574,0.708,0.738,0.03,4.065041
7,20,15000,0.916,0.922,0.006,0.650759,0.698,0.72,0.022,3.055556
2,30,1000,0.948,0.972,0.024,2.469136,0.732,0.79,0.058,7.341772
5,30,5000,0.946,0.954,0.008,0.838574,0.7,0.738,0.038,5.149051
8,30,15000,0.908,0.922,0.014,1.518438,0.686,0.72,0.034,4.722222


In [45]:
# NOTE(review): similarity_swift_df is only assigned by the commented-out
# test_shards_impact(DOCUMENTS_SWIFT, QUESTIONS_SWIFT) cell above, so on a fresh
# kernel this cell raises NameError unless that cell is re-enabled and run first.
# NOTE(review): the saved table below lists shard counts 1-10 only, while
# test_shards_impact currently also runs 15, 20 and 30 -- the output looks stale.
similarity_swift_df.style.set_caption("SWIFT_UI - impact of shards number and Distributed Frequency Search")

Unnamed: 0,shards,hits@10,hits@10_dfs,hits@5,hits@5_dfs,hits@3,hits@3_dfs,hits@1,hits@1_dfs
0,1,0.934783,0.934783,0.853261,0.853261,0.793478,0.793478,0.581522,0.581522
1,2,0.923913,0.934783,0.847826,0.853261,0.777174,0.793478,0.570652,0.581522
2,3,0.923913,0.934783,0.853261,0.853261,0.793478,0.793478,0.554348,0.581522
3,4,0.913043,0.934783,0.836957,0.853261,0.755435,0.793478,0.570652,0.581522
4,5,0.918478,0.934783,0.853261,0.853261,0.777174,0.793478,0.576087,0.581522
5,6,0.907609,0.934783,0.831522,0.853261,0.73913,0.793478,0.5,0.581522
6,7,0.913043,0.934783,0.842391,0.853261,0.766304,0.793478,0.527174,0.581522
7,8,0.891304,0.934783,0.820652,0.853261,0.728261,0.793478,0.483696,0.581522
8,9,0.929348,0.934783,0.853261,0.853261,0.798913,0.793478,0.505435,0.581522
9,10,0.907609,0.934783,0.831522,0.853261,0.755435,0.793478,0.521739,0.581522


In [46]:
# NOTE(review): similarity_squad_df is only assigned by the commented-out
# test_shards_impact(DOCUMENTS_SQUAD, QUESTIONS_SQUAD) cell above, so on a fresh
# kernel this cell raises NameError unless that cell is re-enabled and run first.
# NOTE(review): the saved table below lists shard counts 1-10 only, while
# test_shards_impact currently also runs 15, 20 and 30 -- the output looks stale.
similarity_squad_df.style.set_caption("SQUAD 10k - impact of shards number and Distributed Frequency Search")

Unnamed: 0,shards,hits@10,hits@10_dfs,hits@5,hits@5_dfs,hits@3,hits@3_dfs,hits@1,hits@1_dfs
0,1,0.928,0.928,0.896,0.896,0.871,0.871,0.766,0.766
1,2,0.926,0.928,0.892,0.896,0.867,0.872,0.764,0.766
2,3,0.923,0.928,0.892,0.896,0.865,0.872,0.759,0.766
3,4,0.924,0.928,0.888,0.896,0.869,0.871,0.761,0.766
4,5,0.918,0.928,0.893,0.896,0.87,0.872,0.75,0.766
5,6,0.924,0.928,0.886,0.896,0.854,0.871,0.759,0.766
6,7,0.922,0.928,0.89,0.896,0.861,0.871,0.754,0.766
7,8,0.921,0.928,0.883,0.896,0.855,0.872,0.751,0.766
8,9,0.918,0.928,0.887,0.896,0.86,0.872,0.744,0.766
9,10,0.919,0.928,0.886,0.896,0.861,0.872,0.748,0.766
