### Imports

In [52]:
import time
import pandas as pd
from typing import Callable, Union, List, Dict, Any, Tuple
from tqdm.notebook import tqdm

from index_utils import IndexUtil
from experiment_utils import ExperimentUtil
from analyzer_utils import AnalyzerUtil

### Prepare index/mappings/settings

In [2]:
INDEX_NAME = 'index_phonemes'

In [3]:
INDEX = IndexUtil(INDEX_NAME)

In [36]:
def get_settings():
    return {
        "number_of_shards": 1,
        "index": {
          "analysis": {
            "analyzer": {
              "phonetic_analyzer": {
                "tokenizer": "standard",
                "filter": [
                  "lowercase",
                  "metaphone_filter"
                ]
              },
            "phonetic_analyzer_no_replacement": {
                "tokenizer": "standard",
                "filter": [
                  "lowercase",
                  "metaphone_filter_no_replacement"
                ]
              },
            "stop-english_standard_analyzer": {
                "type": "custom",
                "tokenizer": "standard_tokenizer",
                "filter": [
                    "lowercase",
                    "stop-english_filter",
                ]
             },
            },
            "filter": {
              "metaphone_filter": {
                "type": "phonetic",
                "encoder": "metaphone",
                "replace": True
              },
              "metaphone_filter_no_replacement": {
                "type": "phonetic",
                "encoder": "metaphone",
                "replace": False
              },
              "stop-english_filter": {
                    "type": "stop",
                    "stopwords": "_english_"
              },
              "stemmer-porter2_filter": {
                    "type": "stemmer",
                    "language": "porter2"
              },
            },
            "tokenizer": {
                "standard_tokenizer": {
                    "type": "standard"
                }
            }
          }
        }
    }

In [37]:
def get_mappings():
    return {
        "properties": {
            "article_id": {
                "type": "keyword"
            },
            "text": {
                "type": "text",
                "analyzer": "stop-english_standard_analyzer",
                "copy_to": ["text_phonemes","text_phonemes_no_replacement"]
            },
            "title": {
                "type": "text",
                "analyzer": "stop-english_standard_analyzer",
            },
            "text_phonemes": {
                "type": "text",
                "analyzer": "phonetic_analyzer",
            },
            "text_phonemes_no_replacement": {
                "type": "text",
                "analyzer": "phonetic_analyzer_no_replacement",
            },
        }
    }

In [38]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    return {
        'article_id': doc['uuid'],
        'text': doc['text'],
        'title': doc['title'],
    }

### Load datasets

In [39]:
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k')
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_ui')

### Query

In [40]:
def phonemes_query(query, fields = ["text_phonemes","text^2.0", "title"], query_type = 'cross_fields', index=INDEX, index_name=INDEX_NAME, limit = 10):
    res = INDEX.elastic_connector.search(
        index=INDEX_NAME, 
        size=limit, 
        query={
            "multi_match" : {
              "query":    query, 
              "fields": fields,
              "type": query_type
            }
        },
        search_type='dfs_query_then_fetch'
    )
    results =  [(hit["_source"]) for hit in res['hits']['hits']]
    return results

### Experiment

In [41]:
def test_phonemes_impact(documents, questions, index = INDEX_NAME, query_fuc = phonemes_query):
    fields_ls = []
    types_ls = []
    hits_top_10_ls = []
    hits_top_5_ls = []
    hits_top_3_ls = []
    hits_top_1_ls = []
    
    fields_range=[
        ["text"],
        ["text_phonemes"],
        ["text_phonemes_no_replacement"],
        ["title"],
        ["text","text_phonemes"],
        ["text","text_phonemes_no_replacement"],
        ["text_phonemes","text", "title"],
        ["text_phonemes","text^2.0", "title"],
        ["text_phonemes_no_replacement","text", "title"],
        ["text_phonemes_no_replacement","text^2.0", "title"],
    ]
    
    types_range = ['cross_fields','most_fields', 'best_fields']
    
    INDEX.delete_index()
    INDEX.create_index(get_mappings(), get_settings())
    INDEX.index_all_docs(documents, document_mapping_func)
    
    for fields in tqdm(fields_range):
        for query_type in tqdm(types_range):

            fields_ls.append(str(fields))
            types_ls.append(query_type)
            time.sleep(1)
            all_hits = ExperimentUtil.validate(
                index,
                questions,
                lambda query,limit: query_fuc(
                    query=query,
                    fields=fields,
                    query_type=query_type,
                    limit=limit
                )
            )
            hits_10, hits_5, hit_3, hits_1 = all_hits['hits@10'], all_hits['hits@5'], all_hits['hits@3'], all_hits['hits@1']
            hits_top_10_ls.append(hits_10)
            hits_top_5_ls.append(hits_5)
            hits_top_3_ls.append(hit_3)
            hits_top_1_ls.append(hits_1)

    return pd.DataFrame.from_dict({
        'fields':fields_ls,
        'query_type': types_ls,
        'hits@10':hits_top_10_ls,
        'hits@5':hits_top_5_ls,
        'hits@3':hits_top_3_ls,
        'hits@1':hits_top_1_ls
    })

In [46]:
# phonemes_swift_df = test_phonemes_impact(DOCUMENTS_SWIFT, QUESTIONS_SWIFT)

In [57]:
# phonemes_swift_df.to_csv('results/phonemes_swift.csv')

In [58]:
# phonemes_squad_df = test_phonemes_impact(DOCUMENTS_SQUAD, QUESTIONS_SQUAD)

In [59]:
# phonemes_squad_df.to_csv('results/phonemes_squad.csv')

### Explore results

In [47]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [62]:
phonemes_swift_df_10 = phonemes_swift_df.sort_values(['hits@10'], ascending=False)
phonemes_swift_df_10 = phonemes_swift_df_10.style.set_caption("SWIFT_UI - performance of phonetic analyzer sorted by hits@10")
phonemes_swift_df_10

Unnamed: 0,fields,query_type,hits@10,hits@5,hits@3,hits@1
0,['text'],cross_fields,0.940217,0.86413,0.804348,0.603261
1,['text'],most_fields,0.940217,0.86413,0.804348,0.603261
27,"['text_phonemes_no_replacement', 'text^2.0', 'title']",cross_fields,0.940217,0.869565,0.804348,0.608696
23,"['text_phonemes', 'text^2.0', 'title']",best_fields,0.940217,0.86413,0.798913,0.603261
22,"['text_phonemes', 'text^2.0', 'title']",most_fields,0.940217,0.88587,0.798913,0.608696
21,"['text_phonemes', 'text^2.0', 'title']",cross_fields,0.940217,0.86413,0.798913,0.608696
29,"['text_phonemes_no_replacement', 'text^2.0', 'title']",best_fields,0.940217,0.869565,0.804348,0.603261
2,['text'],best_fields,0.940217,0.86413,0.804348,0.603261
19,"['text_phonemes', 'text', 'title']",most_fields,0.934783,0.875,0.798913,0.603261
13,"['text', 'text_phonemes']",most_fields,0.929348,0.875,0.804348,0.61413


In [61]:
phonemes_swift_df_5 = phonemes_swift_df.sort_values(['hits@5'], ascending=False)
phonemes_swift_df_5 = phonemes_swift_df_5.style.set_caption("SWIFT_UI - performance of phonetic analyzer sorted by hits@5")
phonemes_swift_df_5

Unnamed: 0,fields,query_type,hits@10,hits@5,hits@3,hits@1
22,"['text_phonemes', 'text^2.0', 'title']",most_fields,0.940217,0.88587,0.798913,0.608696
19,"['text_phonemes', 'text', 'title']",most_fields,0.934783,0.875,0.798913,0.603261
18,"['text_phonemes', 'text', 'title']",cross_fields,0.923913,0.875,0.809783,0.586957
13,"['text', 'text_phonemes']",most_fields,0.929348,0.875,0.804348,0.61413
29,"['text_phonemes_no_replacement', 'text^2.0', 'title']",best_fields,0.940217,0.869565,0.804348,0.603261
12,"['text', 'text_phonemes']",cross_fields,0.923913,0.869565,0.804348,0.581522
27,"['text_phonemes_no_replacement', 'text^2.0', 'title']",cross_fields,0.940217,0.869565,0.804348,0.608696
20,"['text_phonemes', 'text', 'title']",best_fields,0.923913,0.869565,0.804348,0.581522
14,"['text', 'text_phonemes']",best_fields,0.923913,0.869565,0.804348,0.581522
23,"['text_phonemes', 'text^2.0', 'title']",best_fields,0.940217,0.86413,0.798913,0.603261


In [63]:
phonemes_squad_df_10 = phonemes_squad_df.sort_values(['hits@10'], ascending=False)
phonemes_squad_df_10 = phonemes_squad_df_10.style.set_caption("SUQAD 10k - performance of phonetic analyzer sorted by hits@10")
phonemes_squad_df_10

Unnamed: 0,fields,query_type,hits@10,hits@5,hits@3,hits@1
13,"['text', 'text_phonemes']",most_fields,0.923,0.89,0.862,0.739
28,"['text_phonemes_no_replacement', 'text^2.0', 'title']",most_fields,0.922,0.889,0.858,0.739
25,"['text_phonemes_no_replacement', 'text', 'title']",most_fields,0.922,0.89,0.86,0.738
22,"['text_phonemes', 'text^2.0', 'title']",most_fields,0.922,0.889,0.853,0.732
16,"['text', 'text_phonemes_no_replacement']",most_fields,0.92,0.889,0.857,0.737
19,"['text_phonemes', 'text', 'title']",most_fields,0.919,0.89,0.856,0.733
27,"['text_phonemes_no_replacement', 'text^2.0', 'title']",cross_fields,0.917,0.878,0.843,0.72
29,"['text_phonemes_no_replacement', 'text^2.0', 'title']",best_fields,0.913,0.877,0.843,0.717
21,"['text_phonemes', 'text^2.0', 'title']",cross_fields,0.913,0.872,0.838,0.719
18,"['text_phonemes', 'text', 'title']",cross_fields,0.912,0.881,0.845,0.728


In [64]:
phonemes_squad_df_5 = phonemes_squad_df.sort_values(['hits@5'], ascending=False)
phonemes_squad_df_5 = phonemes_squad_df_5.style.set_caption("SUQAD 10k - performance of phonetic analyzer sorted by hits@5")
phonemes_squad_df_5

Unnamed: 0,fields,query_type,hits@10,hits@5,hits@3,hits@1
25,"['text_phonemes_no_replacement', 'text', 'title']",most_fields,0.922,0.89,0.86,0.738
19,"['text_phonemes', 'text', 'title']",most_fields,0.919,0.89,0.856,0.733
13,"['text', 'text_phonemes']",most_fields,0.923,0.89,0.862,0.739
28,"['text_phonemes_no_replacement', 'text^2.0', 'title']",most_fields,0.922,0.889,0.858,0.739
22,"['text_phonemes', 'text^2.0', 'title']",most_fields,0.922,0.889,0.853,0.732
16,"['text', 'text_phonemes_no_replacement']",most_fields,0.92,0.889,0.857,0.737
18,"['text_phonemes', 'text', 'title']",cross_fields,0.912,0.881,0.845,0.728
27,"['text_phonemes_no_replacement', 'text^2.0', 'title']",cross_fields,0.917,0.878,0.843,0.72
20,"['text_phonemes', 'text', 'title']",best_fields,0.908,0.878,0.843,0.721
12,"['text', 'text_phonemes']",cross_fields,0.908,0.878,0.843,0.721


### See how phonetic analyzer works

In [54]:
ANALYZER = AnalyzerUtil(INDEX_NAME)
SAMPLE_TEXT = 'Tom Hanks is a good actor as he loves playing'

In [55]:
ANALYZER.analyze(analyzer='phonetic_analyzer', text=SAMPLE_TEXT)

['TM', 'HNKS', 'IS', 'A', 'KT', 'AKTR', 'AS', 'H', 'LFS', 'PLYN']

In [56]:
ANALYZER.analyze(analyzer='phonetic_analyzer_no_replacement', text=SAMPLE_TEXT)

['TM',
 'tom',
 'HNKS',
 'hanks',
 'IS',
 'is',
 'A',
 'a',
 'KT',
 'good',
 'AKTR',
 'actor',
 'AS',
 'as',
 'H',
 'he',
 'LFS',
 'loves',
 'PLYN',
 'playing']