### Installation - compatibility with setup

In [38]:
# ! pip install elasticsearch==7.15.1
# ! pip install -U tqdm

### Imports

In [13]:
import os
import json
import pandas as pd
from typing import Dict, Any

from index_utils import IndexUtil
from experiment_utils import ExperimentUtil

### Setup

In [14]:
# Name of the Elasticsearch index this notebook works with; it is passed to
# IndexUtil and to load_inverted_index_to_dataframe further down.
INDEX_NAME = 'index_test_inverted_index'

In [3]:
# Analyzer name used as the default for the `text` field in get_index_mappings().
# NOTE(review): presumably a custom analyzer registered by the index settings - confirm.
TEST_ANALYZER = 'stop-english_standard_analyzer'

In [4]:
# Project helper bound to INDEX_NAME; the notebook later calls delete_index(),
# create_index() and index_all_docs() on it.
INDEX = IndexUtil(INDEX_NAME)

In [5]:
def get_index_mappings(analyzer:str = TEST_ANALYZER):
    """Build the mappings dictionary for the test index.

    The result declares two properties: ``article_id`` with type ``keyword``
    and ``text`` with type ``text`` analyzed by ``analyzer``.

    Args:
        analyzer: Name of the analyzer assigned to the ``text`` field.

    Returns:
        A ``{"properties": {...}}`` dictionary describing both fields.
    """
    article_id_field = {"type": "keyword"}
    text_field = {"type": "text", "analyzer": analyzer}
    return {
        "properties": {
            "article_id": article_id_field,
            "text": text_field,
        }
    }

In [6]:
def get_settings():
    """Return the index settings used when creating the test index.

    Starts from ``IndexUtil.get_default_settings()`` and hands the result to
    ``IndexUtil.set_shards_in_settings`` before returning it.
    """
    index_settings = IndexUtil.get_default_settings()
    # The return value is not used, so the helper presumably updates the
    # settings object in place - TODO confirm against IndexUtil.
    IndexUtil.set_shards_in_settings(index_settings)
    return index_settings

In [7]:
def document_mapping_func(doc: Dict[str, Any])->Dict[str, Any]:
    """Convert a dataset record into the document shape stored in the index.

    Args:
        doc: Record that must contain the keys ``'uuid'`` and ``'text'``.

    Returns:
        A dictionary with ``'article_id'`` (taken from ``doc['uuid']``) and
        ``'text'`` (taken from ``doc['text']``); any other keys are dropped.

    Raises:
        KeyError: If ``doc`` lacks ``'uuid'`` or ``'text'``.
    """
    # (index field, source key) pairs, in the order the index fields are emitted.
    field_sources = (('article_id', 'uuid'), ('text', 'text'))
    return {index_field: doc[source_key] for index_field, source_key in field_sources}

### Prepare data

In [8]:
# Load the 'squad_10k' dataset; the helper's result is unpacked into a
# documents part and a questions part.
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k')

In [9]:
# Rebuild the index from scratch so re-running this cell starts clean:
# delete it, create it again with the mappings/settings from the helpers
# above, then index the SQuAD documents.
# NOTE(review): document_mapping_func is presumably applied to every document
# by index_all_docs - confirm in IndexUtil.
INDEX.delete_index()
INDEX.create_index(get_index_mappings(), get_settings())
INDEX.index_all_docs(DOCUMENTS_SQUAD, document_mapping_func)

  0%|          | 0/10000 [00:00<?, ?it/s]

### Display index

In [20]:
# To display the inverted index (via inelastic), install elasticsearch==7.0.2 first

# ! pip install elasticsearch==7.0.2
# ! pip install inelastic==0.2.4
# ! pip install -U tqdm

In [26]:
def load_inverted_index_to_dataframe(index_name: str, field: str = 'text') -> pd.DataFrame:
    """Load the inverted index of an Elasticsearch index into a DataFrame.

    Runs the ``inelastic`` command-line tool as
    ``inelastic -i <index_name> -o json -f <field>``, parses the JSON it
    prints and builds one row per entry of the ``terms`` list. When an
    ``ids`` column is present, every list of ids is collapsed into a single
    string made of the first three characters of each id joined by ``' | '``,
    which keeps the table readable.

    Args:
        index_name: Name of the index to inspect.
        field: Name of the indexed field whose inverted index is dumped.

    Returns:
        DataFrame built from the ``terms`` entries of the tool's output
        (empty when the tool reports no terms).

    Raises:
        FileNotFoundError: If the ``inelastic`` executable cannot be found.
        subprocess.CalledProcessError: If ``inelastic`` exits with a non-zero status.
        json.JSONDecodeError: If the tool's output is not valid JSON.
        KeyError: If the JSON has no ``'terms'`` entry.
    """
    # Local import: the notebook's import cell does not provide subprocess.
    import subprocess

    # An argument list (no shell) passes index_name / field verbatim, so names
    # with spaces or shell metacharacters can neither break nor inject into the
    # command. check=True surfaces a failing tool instead of a confusing JSON error.
    completed = subprocess.run(
        ['inelastic', '-i', index_name, '-o', 'json', '-f', field],
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    inverted_index = json.loads(completed.stdout)

    df_index = pd.DataFrame(inverted_index['terms'])
    # An empty term list yields a frame without columns; skip the shortening then.
    if 'ids' in df_index.columns:
        df_index['ids'] = df_index['ids'].apply(
            lambda doc_ids: ' | '.join(doc_id[:3] for doc_id in doc_ids)
        )
    return df_index

In [35]:
# Dump the inverted index of INDEX_NAME (default field 'text') into a DataFrame.
df_index = load_inverted_index_to_dataframe(INDEX_NAME)

In [24]:
# Show a 50-row window of the term table (rows 31000-31049).
df_index[31000:31050]

Unnamed: 0,term,doc_count,freq,ids
31000,josephson,2,2,56t | 96t
31001,josephus,1,1,eqt
31002,josh,1,1,z6t
31003,joshua,3,4,Cat | Oqt | TKt
31004,josiah,3,3,2Kt | 9qt | Zqt
31005,josip,6,6,76t | Eat | Qat | Sat | xqt | yat
31006,jost,1,1,0at
31007,josé,20,24,-at | 6Kt | 96t | Jqt | M6t | Pat | Q6t | Sat ...
31008,joséphine,4,7,2Kt | 6Kt | FKt | wKt
31009,jotas,1,1,Dat


In [36]:
# Bare expression so the notebook renders the term table as the cell output.
df_index

Unnamed: 0,term,doc_count,freq,ids
0,0,58,77,-Ky | -Ky | -ay | 2qy | 56y | 5qy | 96y | 9Ky ...
1,01,1,1,Cay
2,075,1,1,aKy
3,0.0,1,1,zay
4,0.000,1,2,mKy
...,...,...,...,...
78279,컴보이,1,1,06y
78280,현대,1,1,06y
78281,ﬂ,1,1,Xqy
78282,𐀞𐀊𐀍𐀚,1,1,YKy
