### Imports

In [1]:
import time
import spacy
import pandas as pd
from typing import Callable, Union, List, Dict, Any, Tuple
from tqdm.notebook import tqdm

from index_utils import IndexUtil
from experiment_utils import ExperimentUtil
from analyzer_utils import AnalyzerUtil

### Prepare index/mappings/settings

In [2]:
# Name of the search index used by every cell below (index creation, querying, validation).
INDEX_NAME = 'index_additional_fields'

In [3]:
# Project helper bound to INDEX_NAME; later cells use it to delete/create the index,
# bulk-index documents, and run searches through its `elastic_connector` attribute.
INDEX = IndexUtil(INDEX_NAME)

In [4]:
def get_settings():
    """Return the index settings used when (re)creating the experiment index.

    The settings request a single shard and register one custom analyzer,
    ``stop-english_standard_analyzer`` (standard tokenizer -> lowercase ->
    English stop-word removal).

    Note: ``stemmer-porter2_filter`` is registered as an available filter but
    is not part of the analyzer's filter chain.

    Returns:
        dict: a freshly built settings dictionary on every call.
    """
    token_filters = {
        "stop-english_filter": {"type": "stop", "stopwords": "_english_"},
        "stemmer-porter2_filter": {"type": "stemmer", "language": "porter2"},
    }
    tokenizers = {
        "standard_tokenizer": {"type": "standard"},
    }
    analyzers = {
        "stop-english_standard_analyzer": {
            "type": "custom",
            "tokenizer": "standard_tokenizer",
            "filter": ["lowercase", "stop-english_filter"],
        },
    }
    analysis = {
        "analyzer": analyzers,
        "filter": token_filters,
        "tokenizer": tokenizers,
    }
    return {
        "number_of_shards": 1,
        "index": {"analysis": analysis},
    }

In [5]:
def get_mappings(analyzer="stop-english_standard_analyzer"):
    """Return the index mappings for the experiment documents.

    ``article_id`` is mapped as an exact-match ``keyword``; every other field
    is full-text (``text``) and analyzed with the same analyzer.

    Args:
        analyzer: name of the analyzer applied to all full-text fields.

    Returns:
        dict: ``{"properties": {...}}`` with one entry per document field.
    """
    full_text_fields = ("text", "title", "text_lemma", "keywords", "ne")

    properties = {"article_id": {"type": "keyword"}}
    for field_name in full_text_fields:
        # A fresh dict per field so entries never alias each other.
        properties[field_name] = {"type": "text", "analyzer": analyzer}

    return {"properties": properties}

In [6]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    """Project a dataset document onto the fields stored in the index.

    The dataset's ``uuid`` becomes ``article_id``; ``text``, ``title``,
    ``text_lemma``, ``keywords`` and ``ne`` are copied under the same name.
    Any other keys of ``doc`` are dropped.

    Raises:
        KeyError: if ``doc`` lacks one of the required keys.
    """
    indexed_doc = {'article_id': doc['uuid']}
    for same_named_field in ('text', 'title', 'text_lemma', 'keywords', 'ne'):
        indexed_doc[same_named_field] = doc[same_named_field]
    return indexed_doc

### Load datasets

In [7]:
# Each dataset is loaded as a (documents, questions) pair via the project loader.
# NOTE(review): documents are presumably dicts carrying the keys read by
# document_mapping_func (uuid, text, title, text_lemma, keywords, ne) — confirm against the loader.
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k_additional')
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_additional')

### Query

In [24]:
# spaCy pipeline used by lemmatize_query to lemmatize query text.
nlp = spacy.load("en_core_web_sm")



In [27]:
def lemmatize_query(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    ``lemma_`` of each token is joined with single spaces.

    Args:
        text: raw query string.

    Returns:
        str: space-separated lemmas (empty string when there are no tokens).
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [29]:
def custom_query(query, fields = ["text_lemma","text^2.0", "title"], query_type = 'cross_fields', index=INDEX, index_name=INDEX_NAME, limit = 10, lemmatize=False):
    """Run a ``multi_match`` search and return the matching documents.

    Args:
        query: raw query text.
        fields: fields to search; entries may carry a boost suffix such as ``"text^2.0"``.
        query_type: ``multi_match`` type (e.g. ``'cross_fields'``, ``'most_fields'``, ``'best_fields'``).
        index: index helper whose ``elastic_connector`` executes the search.
        index_name: name of the index to search.
        limit: maximum number of hits to return.
        lemmatize: when true, the query text is lemmatized with
            ``lemmatize_query`` before being sent.

    Returns:
        list: the ``_source`` of every hit, in the order returned by the search.
    """
    query_text = lemmatize_query(query) if lemmatize else query

    # Use the `index` / `index_name` arguments rather than the module globals,
    # so callers can actually target a different index; the defaults keep the
    # previous behaviour.
    res = index.elastic_connector.search(
        index=index_name,
        size=limit,
        query={
            "multi_match" : {
              "query": query_text,
              "fields": fields,
              "type": query_type
            }
        },
        # Requests the `dfs_query_then_fetch` search mode instead of the client default.
        search_type='dfs_query_then_fetch'
    )
    return [hit["_source"] for hit in res['hits']['hits']]

### Experiment

In [31]:
def test_additional_fields_impact(documents, questions, index = INDEX_NAME, query_fuc = custom_query):
    """Measure retrieval quality for several field sets and multi_match types.

    The global ``INDEX`` is dropped, recreated with ``get_mappings()`` /
    ``get_settings()`` and filled with ``documents``. Then, for every
    combination of field set and query type, ``ExperimentUtil.validate`` is
    run over ``questions`` and its ``hits@10/5/3/1`` values are recorded.

    Args:
        documents: documents to index (mapped with ``document_mapping_func``).
        questions: questions passed through to ``ExperimentUtil.validate``.
        index: index name passed to ``ExperimentUtil.validate``.
        query_fuc: query function called with the keyword arguments ``query``,
            ``fields``, ``query_type``, ``limit`` and ``lemmatize``.

    Returns:
        pandas.DataFrame: one row per (fields, query_type) combination with
        the columns ``fields``, ``query_type``, ``hits@10``, ``hits@5``,
        ``hits@3`` and ``hits@1``.
    """
    field_sets = [
        ["text"],
        ["text_lemma"],
        ["ne"],
        ["title"],
        ["keywords"],
        ["text","title"],
        ["text","title","keywords","text_lemma","ne"],
    ]
    match_types = ['cross_fields','most_fields', 'best_fields']
    metric_names = ('hits@10', 'hits@5', 'hits@3', 'hits@1')

    # Column-oriented result table: column name -> list of values, one per run.
    table = {column: [] for column in ('fields', 'query_type') + metric_names}

    # Start every experiment from a freshly built index.
    INDEX.delete_index()
    INDEX.create_index(get_mappings(), get_settings())
    INDEX.index_all_docs(documents, document_mapping_func)

    for fields in tqdm(field_sets):
        for query_type in tqdm(match_types):
            # Queries are lemmatized whenever the lemmatized field is searched.
            lemmatize = 'text_lemma' in fields

            # One-second pause before each validation run.
            # NOTE(review): presumably gives the index time to settle — confirm.
            time.sleep(1)

            def run_query(query, limit):
                return query_fuc(
                    query=query,
                    fields=fields,
                    query_type=query_type,
                    limit=limit,
                    lemmatize=lemmatize,
                )

            all_hits = ExperimentUtil.validate(index, questions, run_query)

            table['fields'].append(str(fields))
            table['query_type'].append(query_type)
            for metric in metric_names:
                table[metric].append(all_hits[metric])

    return pd.DataFrame.from_dict(table)

In [35]:
# additional_fields_swift_df = test_additional_fields_impact(DOCUMENTS_SWIFT, QUESTIONS_SWIFT)

In [36]:
# additional_fields_swift_df.to_csv('results/additional_fields_swift.csv')

In [39]:
# additional_fields_squad_df = test_additional_fields_impact(DOCUMENTS_SQUAD, QUESTIONS_SQUAD)

In [40]:
# additional_fields_squad_df.to_csv('results/additional_fields_squad.csv')

### Explore results

In [33]:
# Show result tables in full: no row/column limit and no line-width wrapping.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# `None` disables cell truncation. The previous value -1 has been deprecated
# since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

  pd.set_option('display.max_colwidth', -1)


In [34]:
# The experiment cell that computes this frame is commented out above, so on a
# fresh kernel the name would be undefined. Load the results persisted by the
# (commented-out) `to_csv` cell instead; `index_col=0` restores the original row index.
additional_fields_swift_df = pd.read_csv('results/additional_fields_swift.csv', index_col=0)
additional_fields_swift_df.sort_values(['hits@10'], ascending=False).style.set_caption("SWIFT_UI - additional fields sorted by hits@10")


Unnamed: 0,fields,query_type,hits@10,hits@5,hits@3,hits@1
0,['text'],cross_fields,0.940217,0.86413,0.804348,0.603261
1,['text'],most_fields,0.940217,0.86413,0.804348,0.603261
2,['text'],best_fields,0.940217,0.86413,0.804348,0.603261
17,"['text', 'title']",best_fields,0.940217,0.86413,0.804348,0.603261
15,"['text', 'title']",cross_fields,0.940217,0.86413,0.798913,0.61413
20,"['text', 'title', 'keywords', 'text_lemma', 'ne']",best_fields,0.934783,0.858696,0.793478,0.576087
16,"['text', 'title']",most_fields,0.929348,0.858696,0.798913,0.608696
18,"['text', 'title', 'keywords', 'text_lemma', 'ne']",cross_fields,0.923913,0.869565,0.793478,0.586957
5,['text_lemma'],best_fields,0.913043,0.86413,0.782609,0.592391
4,['text_lemma'],most_fields,0.913043,0.86413,0.782609,0.592391


In [38]:
# The experiment cell that computes this frame is commented out above, so on a
# fresh kernel the name would be undefined. Load the results persisted by the
# (commented-out) `to_csv` cell instead; `index_col=0` restores the original row index.
additional_fields_squad_df = pd.read_csv('results/additional_fields_squad.csv', index_col=0)
additional_fields_squad_df.sort_values(['hits@10'], ascending=False).style.set_caption("SQUAD 10 k - additional fields sorted by hits@10")


Unnamed: 0,fields,query_type,hits@10,hits@5,hits@3,hits@1
18,"['text', 'title', 'keywords', 'text_lemma', 'ne']",cross_fields,0.927,0.899,0.869,0.766
3,['text_lemma'],cross_fields,0.922,0.896,0.866,0.76
4,['text_lemma'],most_fields,0.922,0.896,0.866,0.76
5,['text_lemma'],best_fields,0.922,0.896,0.866,0.76
20,"['text', 'title', 'keywords', 'text_lemma', 'ne']",best_fields,0.916,0.883,0.852,0.741
15,"['text', 'title']",cross_fields,0.91,0.873,0.84,0.724
1,['text'],most_fields,0.904,0.868,0.835,0.716
17,"['text', 'title']",best_fields,0.904,0.868,0.835,0.716
0,['text'],cross_fields,0.904,0.868,0.835,0.716
2,['text'],best_fields,0.904,0.868,0.835,0.716
