### Imports

In [9]:
import time
import pandas as pd
from typing import Callable, Union, List, Dict, Any, Tuple
from tqdm.notebook import tqdm

In [10]:
from index_utils import IndexUtil
from experiment_utils import ExperimentUtil
from analyzer_utils import AnalyzerUtil

### Prepare index/mappings/settings

In [11]:
INDEX_NAME = 'index_synonyms_wordnet'

In [12]:
INDEX = IndexUtil(INDEX_NAME)

In [19]:
ANALYZER = AnalyzerUtil(INDEX_NAME)

In [16]:
def get_mappings(analyzer:str = 'synonym_analyzer'):
    return {
        "properties": {
            "article_id": {
                "type": "keyword"
            },
            "text": {
                "type": "text",
                "analyzer": analyzer
            },
        }
    }

In [27]:
def get_settings():
    return {
        "number_of_shards": 1,
          "analysis": {
            "analyzer": {
              "synonym_analyzer": {
                "tokenizer": "standard",
                "filter": ["lowercase","synonyms_wordnet"]
              },
              "stop-english_standard_analyzer": {
                "type": "custom",
                "tokenizer": "standard_tokenizer",
                "filter": [
                    "lowercase",
                    "stop-english_filter",
                ]
              },
            },
            "filter": {
              "synonyms_wordnet": {
                "type": "synonym_graph",
                "format": "wordnet",
                "synonyms_path": "synonyms/wordnet_synonyms.txt"
              },
              "stop-english_filter": {
                "type": "stop",
                "stopwords": "_english_"
               },
            },
            "tokenizer": {
                "standard_tokenizer": {
                    "type": "standard"
                }
            }
          }
        }

In [28]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    return {
        'article_id': doc['uuid'],
        'text': doc['text'],
    }

### Load dataset

In [29]:
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k')
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_ui')

### Experiment

In [30]:
def test_wordnet_synonyms(documents, questions, index = INDEX_NAME, query_fuc = INDEX.default_query):
    synonyms_ls = []
    hits_top_10_ls = []
    hits_top_5_ls = []
    hits_top_3_ls = []
    hits_top_1_ls = []
    synonyms_range=[True, False]
    for use_synonyms in tqdm(synonyms_range):
        INDEX.delete_index()
        if use_synonyms:
            INDEX.create_index(get_mappings('synonym_analyzer'), get_settings())
        else:
            INDEX.create_index(get_mappings('english_standard_analyzer'), get_settings())
        INDEX.index_all_docs(documents, document_mapping_func)
        
        synonyms_ls.append(use_synonyms)
        time.sleep(1)
        all_hits = ExperimentUtil.validate(index, questions, query_fuc)
        hits_10, hits_5, hit_3, hits_1 = all_hits['hits@10'], all_hits['hits@5'], all_hits['hits@3'], all_hits['hits@1']
        hits_top_10_ls.append(hits_10)
        hits_top_5_ls.append(hits_5)
        hits_top_3_ls.append(hit_3)
        hits_top_1_ls.append(hits_1)
    return pd.DataFrame.from_dict({
        'synonyms_used':synonyms_ls,
        'hits@10':hits_top_10_ls,
        'hits@5':hits_top_5_ls,
        'hits@3':hits_top_3_ls,
        'hits@1':hits_top_1_ls
    })

In [44]:
# squad_wordnet_synonyms_df = test_wordnet_synonyms(DOCUMENTS_SQUAD, QUESTIONS_SQUAD)

In [43]:
# squad_wordnet_synonyms_df.to_csv('results/synonyms_wordnet_squad.csv')

In [38]:
# swift_wordnet_synonyms_df = test_wordnet_synonyms(DOCUMENTS_SWIFT, QUESTIONS_SWIFT)

In [39]:
# swift_wordnet_synonyms_df.to_csv('results/synonyms_wordnet_swift.csv')

In [50]:
INDEX.delete_index()
INDEX.create_index(get_mappings('synonym_analyzer'), get_settings())
INDEX.index_all_docs(DOCUMENTS_SQUAD, document_mapping_func)

  0%|          | 0/10000 [00:00<?, ?it/s]

### See results and analyzer

In [41]:
# 62199 - withouth synonyms, 78284 - with synonyms
squad_wordnet_synonyms_df.style.set_caption("SQUAD - impact of using all WordNet synonyms")

Unnamed: 0,synonyms_used,hits@10,hits@5,hits@3,hits@1
0,True,0.64,0.583,0.538,0.444
1,False,0.904,0.871,0.83,0.717


In [32]:
# 2951 - withouth synonyms, 10054 - with synonyms
swift_wordnet_synonyms_df.style.set_caption("SWIFT UI - impact of using all WordNet synonyms")

Unnamed: 0,synonyms_used,hits@10,hits@5,hits@3,hits@1
0,True,0.711957,0.668478,0.586957,0.423913
1,False,0.940217,0.86413,0.798913,0.608696


In [34]:
INDEX.delete_index()
INDEX.create_index(get_mappings('synonym_analyzer'), get_settings())

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'index_synonyms_wordnet'}

In [36]:
ANALYZER.analyze(analyzer='synonym_analyzer', text="Cats and dogs always fight")

['cats',
 'and',
 'dogs',
 'ever',
 "e'er",
 'constantly',
 'forever',
 'perpetually',
 'incessantly',
 'invariably',
 'always',
 'battle',
 'conflict',
 'engagement',
 'fighting',
 'combat',
 'scrap',
 'competitiveness',
 'contend',
 'struggle',
 'oppose',
 'fight',
 'fight',
 'defend',
 'crusade',
 'press',
 'campaign',
 'push',
 'agitate',
 'fight',
 'back',
 'down']