In [1]:
from mockup_rag.utils import find_similar_docs, search_fuzzy, search_match_phrase, format_search_output
from mockup_rag.config import client
import pandas as pd 
from opensearchpy import OpenSearch # type: ignore
from opensearch_dsl import Search

In [2]:
INDEX_NAME = "test2-cosine" 
query = "Wer ist asylberechtigt?"

In [3]:
query_keywords = [q for q in query.split(" ") if len(q) > 2]
query_keywords

['Wer', 'ist', 'asylberechtigt?']

## Retrieve lexical search results on OpenSearch index

### Run exact match phrase search

In [4]:
lexical_df = pd.DataFrame()
for q in query_keywords:
    out_shard = search_match_phrase(field='text', query=q, index_name=INDEX_NAME)
    df_ = format_search_output(out_shard)
    lexical_df = pd.concat([lexical_df, df_], axis=0)
          

Searching for `Wer` in the field `text`
couldnt format output because of 'title'
Searching for `ist` in the field `text`
couldnt format output because of 'title'
Searching for `asylberechtigt?` in the field `text`
couldnt format output because of 'title'


In [None]:
s = Search(using=client, index=INDEX_NAME) \
    .query("match", text="Asyl")

    #.filter("term", year="2011") \

response = s.execute()
print('\nSearch results:')
for hit in response:
    print(hit.meta.score, hit.paragraph, hit.text)

print(response[1])

In [None]:
lexical_df.head(5)

### Run fuzzy word search 
By specifying the level of `fuzziness` we can tolerate for things like mispellings, typos etc. `fuzziness` is an integer>=0 where when `fuzziness=0` we are saying we don't want any fuzziness and want an exact match. When `fuzziness=1`, we are saying we can tolerate results that are one character off from our search query. 

In [None]:
fuzzy_df = pd.DataFrame()
for q in query_keywords:
    out_shard = search_fuzzy(field='text', query=q, fuzziness=1, index_name=INDEX_NAME)
    df_ = format_search_output(out_shard)
    fuzzy_df = pd.concat([fuzzy_df, df_], axis=0)
          

In [None]:
fuzzy_df.head(5)

### Retrieve semantic search output using OpenSearch knn-vector search and co:here embeddings

In [None]:
semantic_out = find_similar_docs(query=query, k=2, num_results=5, index_name=INDEX_NAME) 
semantic_df = format_search_output(semantic_out)

In [None]:
semantic_df.head()

## Visualize outputs
Let's take the top abstract result from the `lexical_df`, `fuzzy_df` and the top abstract result from the `semantic_df` and see if the results look interesting. They query keywords in all abstract results are highlighted to show that while the semantic results may not retrieve the most keywords, the results are semantically more meaningful than lexical/fuzzy based approaches. 

In [None]:
from utils import colorize

def visualize(top_row, color): 
    print(f'''Top result for this searchmethod is arxiv_id={top_row['arxiv_id']} with score={top_row['score']}\n''')
    print(colorize(top_row.abstract, query_keywords, color=color))

In [None]:
visualize(lexical_df.iloc[0], color="cyan")


In [None]:
visualize(fuzzy_df.iloc[0], color="blue")

In [None]:
visualize(semantic_df.iloc[0], color="green")