### Imports

In [47]:
import copy
import time
from typing import Callable, Union, List, Dict, Any, Tuple

import pandas as pd
import spacy
from tqdm.notebook import tqdm

In [2]:
from index_utils import IndexUtil
from scoring_utils import ScoringUtil
from experiment_utils import ExperimentUtil
from analyzer_utils import AnalyzerUtil

### Prepare index/mappings/settings

In [3]:
# Name of the search index that every experiment in this notebook creates, fills and queries.
INDEX_NAME = 'index_synonyms_meaningful'

In [4]:
# Project helper bound to INDEX_NAME; used below to delete/create the index and to index documents.
INDEX = IndexUtil(INDEX_NAME)

In [5]:
def get_mappings(analyzer:str = 'synonym_analyzer'):
    """Build the field mappings for the index.

    Args:
        analyzer: Name of the analyzer applied to the ``text`` field.

    Returns:
        A mapping dict with a keyword ``article_id`` field and a ``text``
        field of type ``text`` analyzed with ``analyzer``.
    """
    article_id_field = {"type": "keyword"}
    text_field = {"type": "text", "analyzer": analyzer}
    return {
        "properties": {
            "article_id": article_id_field,
            "text": text_field,
        }
    }

In [59]:
def get_settings(synonyms_list=None):
    """Build index settings with a synonym-aware analyzer.

    The settings define ``synonym_analyzer`` (standard tokenizer, then the
    ``lowercase`` filter, then ``synonyms_named_entities_filter``) and the
    synonym filter itself, fed with ``synonyms_list``.

    Args:
        synonyms_list: Synonym rules, one comma-separated group per string.
            Defaults to ``["Eins, Uno, One", "Cosmos, Universe"]`` when
            omitted. An empty list yields a filter with no rules.

    Returns:
        A settings dict. The ``synonyms`` list inside it is a fresh copy, so
        mutating the result never affects the caller's list or the default.
    """
    if synonyms_list is None:
        # Built per call: a list literal in the signature would be shared
        # between calls and leak through the returned dict.
        synonyms_list = ["Eins, Uno, One", "Cosmos, Universe"]
    return {
        "number_of_shards": 1,
        "analysis": {
            "analyzer": {
                "synonym_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "synonyms_named_entities_filter"]
                }
            },
            "filter": {
                "synonyms_named_entities_filter": {
                    "type": "synonym",
                    "synonyms": list(synonyms_list)
                }
            }
        }
    }

In [60]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    """Project a dataset document onto the fields stored in the index.

    Args:
        doc: Source document; must contain the keys ``uuid`` and ``text``.

    Returns:
        A dict with ``article_id`` (taken from ``uuid``) and ``text``.
    """
    article_id, body = doc['uuid'], doc['text']
    return dict(article_id=article_id, text=body)

### Load dataset/ modify questions

In [8]:
# spaCy pipeline "en_core_web_sm"; used below to read part-of-speech tags of question tokens.
nlp = spacy.load("en_core_web_sm")



In [27]:
# Load the 'swift_ui' dataset; the helper returns a (documents, questions) pair.
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_ui')

In [28]:
# Deep copy: modify_questions rewrites each question entry in place, and a
# shallow .copy() would share those entries with QUESTIONS_SWIFT, silently
# altering the original questions as well.
QUESTIONS_SWIFT_MOD = copy.deepcopy(QUESTIONS_SWIFT)

In [29]:
def get_named_entities_from_questions(questions=QUESTIONS_SWIFT):
    """Collect the distinct lemmas of proper-noun tokens found in the questions.

    Each entry's ``'question'`` text is run through the module-level spaCy
    pipeline ``nlp``; tokens tagged ``PROPN`` contribute their lemma.

    Args:
        questions: Iterable of entries exposing a ``'question'`` text.

    Returns:
        A list of unique lemmas, in no particular order.
    """
    unique_lemmas = {
        token.lemma_
        for entry in questions
        for token in nlp(entry['question'])
        if token.pos_ == 'PROPN'
    }
    return list(unique_lemmas)

In [30]:
# NE save to file
# names_entities_list = get_named_entities_from_questions()
# with open("settings/named_entities.txt", "w") as f:
#     for ne in names_entities_list:
#         f.write(str(ne) +"\n")

In [31]:
# Named entities manually extended with their synonyms, one rule per line.
with open("settings/named_entities_synonyms.txt", "r") as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file):
    # they carry no synonym rule and would end up as empty entries.
    ENTITIES_SYNONYMS = [line.strip() for line in f if line.strip()]

In [32]:
def replace_named_entities(text, synonyms=ENTITIES_SYNONYMS):
    """Replace each known entity in ``text`` with its first listed synonym.

    Every rule is split on ``', '``: the first term is the entity to look
    for and the second term is its replacement. Replacement is plain
    substring substitution, applied rule by rule in list order, so a later
    rule also sees text produced by earlier rules.

    Args:
        text: The text to rewrite.
        synonyms: Rules of the form ``"entity, synonym[, ...]"``.

    Returns:
        The text with all matching entities substituted.
    """
    replacements = {}
    for rule in synonyms:
        terms = rule.split(', ')
        # A rule without a second term has nothing to substitute, and an
        # empty entity would match between every character of the text.
        if len(terms) < 2 or not terms[0]:
            continue
        replacements[terms[0]] = terms[1]
    for entity, synonym in replacements.items():
        text = text.replace(entity, synonym)
    return text

In [33]:
def modify_questions(questions_mod=QUESTIONS_SWIFT_MOD):
    """Rewrite every entry's ``'question'`` text in place via replace_named_entities.

    Args:
        questions_mod: Iterable of entries with a ``'question'`` text; each
            entry is mutated, nothing is returned.
    """
    for entry in questions_mod:
        original_text = entry['question']
        entry['question'] = replace_named_entities(original_text)

In [34]:
# Rewrites QUESTIONS_SWIFT_MOD in place (the default argument of modify_questions).
# NOTE(review): re-running this cell applies the replacements again to the
# already-replaced text — confirm that is harmless for the synonym file in use.
modify_questions()

### Experiment

In [48]:
def test_meaningful_synonyms(documents, questions, index = INDEX_NAME, query_fuc = INDEX.default_query, synonyms_list=ENTITIES_SYNONYMS):
    """Measure retrieval hits with and without the synonym filter.

    Two configurations are run in order (synonyms enabled, then disabled).
    For each one the module-level INDEX is deleted, recreated with the
    matching settings and refilled with ``documents``; the ``questions`` are
    then scored with ExperimentUtil.validate.

    Args:
        documents: Documents passed to INDEX.index_all_docs.
        questions: Questions passed to ExperimentUtil.validate.
        index: Index name handed to ExperimentUtil.validate.
        query_fuc: Query builder handed to ExperimentUtil.validate.
        synonyms_list: Synonym rules used for the "synonyms enabled" run.

    Returns:
        A DataFrame with one row per configuration and the columns
        ``synonyms_used``, ``hits@10``, ``hits@5``, ``hits@3``, ``hits@1``.
    """
    hit_keys = ['hits@10', 'hits@5', 'hits@3', 'hits@1']
    results = {'synonyms_used': []}
    for key in hit_keys:
        results[key] = []

    for use_synonyms in tqdm([True, False]):
        active_synonyms = synonyms_list if use_synonyms else []

        # Rebuild the index from scratch for this configuration.
        INDEX.delete_index()
        INDEX.create_index(get_mappings(), get_settings(active_synonyms))
        INDEX.index_all_docs(documents, document_mapping_func)

        # presumably gives the index time to make the new documents searchable — TODO confirm
        time.sleep(1)
        scores = ExperimentUtil.validate(index, questions, query_fuc)

        results['synonyms_used'].append(use_synonyms)
        for key in hit_keys:
            results[key].append(scores[key])

    return pd.DataFrame.from_dict(results)

In [57]:
# meaningful_synonyms_df = test_meaningful_synonyms(DOCUMENTS_SWIFT, QUESTIONS_SWIFT_MOD)

In [56]:
# meaningful_synonyms_df.to_csv('results/meaningful_synonyms_impact.csv')

### See results and analyzer

In [58]:
# Reload the saved experiment results so this cell works on a fresh kernel:
# the experiment and CSV-export cells above are commented out, and the name
# previously used here was never defined anywhere in the notebook.
meaningful_synonyms_df = pd.read_csv('results/meaningful_synonyms_impact.csv')
meaningful_synonyms_df.style.set_caption("SWIFT_UI - impact of adding synonyms of Named Entities")

Unnamed: 0,synonyms_used,hits@10,hits@5,hits@3,hits@1
0,True,0.923913,0.86413,0.777174,0.597826
1,False,0.777174,0.673913,0.625,0.429348


In [50]:
# Recreate the index with the full synonym list so its analyzer can be inspected below.
INDEX.delete_index()
INDEX.create_index(get_mappings(), get_settings(ENTITIES_SYNONYMS))

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'index_synonyms_meaningful'}

In [51]:
# Project analyzer helper bound to the same index name.
ANALYZER = AnalyzerUtil(INDEX_NAME)

In [53]:
# Run a sample sentence through 'synonym_analyzer' to see which tokens it emits.
ANALYZER.analyze(analyzer='synonym_analyzer', text="I want to uno eins universe, Mac, MVVM, Core")

['i',
 'want',
 'to',
 'uno',
 'eins',
 'universe',
 'mac',
 'macintosh',
 'mvvm',
 'model',
 'core',
 'view',
 'icoreutil',
 'view',
 'model']