# Search Engine

### Import

In [1]:
import os
import re
from urllib.parse import quote

import ipywidgets as widgets

from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup

### Give the username of your computer below:

In [3]:
# Working directory of the kernel; the account name is read from it.
path = os.getcwd()
# The username is always the third path component:
#   Windows: [drive][Users][username][folder]...
#   POSIX:   [''][home or Users][username][folder]...
# Splitting on os.sep (instead of a hard-coded backslash) keeps the index valid on every OS.
username_pc = path.split(os.sep)[2]
# username_pc = 'kenne' # or type manually

# Sanity check: the part of the working directory that follows the username.
path.split(username_pc)[1]

#### Cheatsheet: Elasticsearch curl commands

In [4]:
# Elasticsearch REST calls kept for manual use; uncomment a line to run it.
# GET on the `test` index:
# !curl "http://localhost:9200/test"
# DELETE the `repos` index (destructive):
# !curl -XDELETE "localhost:9200/repos"
# POST to the `_shutdown` endpoint:
# !curl -XPOST "http://localhost:9200/_shutdown"

# List the indices on the local node as a table (the output below shows one row per index).
!curl "http://localhost:9200/_cat/indices?v"

health status index      uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   repos_cell LftGIMs6S9CvBlBXWhjlOw   1   1     116397            0    303.1mb        303.1mb


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   212  100   212    0     0  16307      0 --:--:-- --:--:-- --:--:-- 16307


### Functions

#### Difference between re.match and re.findall
`re.match` only matches the pattern at the start of the string, whereas `re.findall` returns every non-overlapping occurrence of the pattern anywhere in the string.

In [5]:
# Elasticsearch client, pointed at the local node
HOST = 'http://localhost:9200/'
es = Elasticsearch(hosts=[HOST])

# Text inputs: one unlabelled box for the normal search,
# three labelled boxes for the advanced search
normal_text = widgets.Text()
code, markdown, modules = (
    widgets.Text(description=label)
    for label in ("Code", "Markdown", "Modules")
)

# Buttons
search_b_normal, search_b_adv, clear_b = (
    widgets.Button(description=caption)
    for caption in ("Search", "Adv S", "Clear")
)

# Output area that every search handler prints/renders into
output = widgets.Output()


def markdown_ref(text):
    """Replace markdown links ``[label](target)`` in ``text`` by their label.

    Parameters
    ----------
    text : str
        Text that may contain markdown links (to a URL or to an anchor
        within the page).

    Returns
    -------
    str
        ``text`` with every link found replaced by its label; text without
        links is returned unchanged.
    """
    # Raw string: ``\[`` and ``\(`` are regex escapes, not Python string
    # escapes (a non-raw literal triggers invalid-escape warnings).
    refs = re.findall(r'\[(.*?)\]\((.*?)\)', text)
    for label, target in refs:  # small chance of multiple in a single text
        # lorem ipsum [`label`](`target`) dolor sit amet -> lorem ipsum `label` dolor sit amet
        text = text.replace('[' + label + '](' + target + ')', label)
    return text


def create_ref(input_str):
    """Build a URL text fragment (``#:~:text=...``) pointing at ``input_str``.

    Parameters
    ----------
    input_str : str or list of str
        The text to point at. For a list, the longest string is used, since
        this works better with highlighting; an empty list yields an empty
        target.

    Returns
    -------
    str
        ``'#:~:text='`` followed by the percent-encoded, cleaned-up text.

    The text is cleaned so it resembles what is rendered on the page:
    leading markdown heading markers, HTML tags, a leading list dash,
    markdown link syntax and backticks are removed before encoding.
    """
    if isinstance(input_str, list):
        target = max(input_str, key=len, default='')
    else:
        target = input_str
    if target.startswith('#'):
        # Strip only the leading heading markers; a '#' further on (e.g. "C#")
        # is part of the rendered text and must stay for the fragment to match.
        target = target.lstrip('#').lstrip()  # lstrip: only strip in front of the string
    # Remove possible html tags, to avoid contradictory tags in display_results().
    # "html.parser" is used because BeautifulSoup(..., "lxml") causes problems on other pc's.
    target = BeautifulSoup(target, "html.parser").text
    target = target.lstrip('-').lstrip()
    target = markdown_ref(target)
    # Backticks (used like ``variable``) are markdown syntax, not rendered text.
    target = target.replace('`', '')
    # Percent-encode everything that is not an unreserved character. quote() never
    # encodes '-', but inside a text directive '-' is syntax (prefix-/-suffix
    # markers), so it is encoded by hand.
    encoded = quote(target, safe='').replace('-', '%2D')
    return '#:~:text=' + encoded


def multi_match_query(string,highlight):
    """Build the Elasticsearch request body for a ``multi_match`` search.

    Parameters
    ----------
    string : str
        The query text; it is matched against the ``string`` field.
    highlight : object
        Highlighting is requested for every value other than ``None``
        (so ``False`` also enables it); pass ``None`` to leave it out.

    Returns
    -------
    dict
        ``{'query': {'multi_match': {...}}}`` plus, when highlighting is
        requested, a ``'highlight'`` entry that wraps matches in the
        ``string`` field in ``<u><i><b>`` ... ``</b></i></u>``.
    """
    # https://www.elastic.co/guide/en/elasticsearch/reference/6.8/search-request-highlighting.html
    q = {
        'query': {
            'multi_match': {
                'fields': ['string'],
                'query': string,
            },
        },
    }

    if highlight is not None:
        q['highlight'] = {
            "pre_tags": ["<u><i><b>"],
            "post_tags": ["</b></i></u>"],
            "fields": {'string': {}},
        }
    return q


def on_normal_search_clicked(b):
    """Handle the normal search: clear the output area and show up to 10 hits.

    ``b`` is the widget that triggered the callback; it is not used.
    An empty search box only prints a hint.
    """
    with output:
        output.clear_output()
        query_text = normal_text.value
        if query_text != '':
            display_results(multi_match_query(query_text, True), 10)
        else:
            print("Type a query")


def on_adv_search_clicked(b):
    """Handle the advanced search: clear the output area and show up to 20 hits.

    ``b`` is the widget that triggered the callback; it is not used.
    Empty fields are passed on as ``None``; when all three fields are empty
    only a hint is printed. The query is printed before the results.
    """
    # NOTE(review): query_string_query is not defined in the visible part of
    # this notebook - confirm it exists before relying on the advanced search.
    raw_values = (code.value, markdown.value, modules.value)
    with output:
        output.clear_output()
        if all(value == '' for value in raw_values):
            print("Type a query")
            return
        send_code, send_markdown, send_modules = (
            None if value == '' else value for value in raw_values
        )
        print(query_string_query(send_code, send_markdown, send_modules, True))
        display_results(
            query_string_query(send_code, send_markdown, send_modules, True),
            20,
        )


def on_clear_clicked(b):
    """Clear the output area and empty all four search boxes.

    ``b`` is the widget that triggered the callback; it is not used.
    """
    with output:
        output.clear_output()
    for field in (normal_text, markdown, code, modules):
        field.value = ''


def switch(check):
    """Swap the search row when the 'Advanced Search' checkbox changes.

    ``check`` is the change notification; ``check['new']`` is truthy when
    the advanced search is switched on.
    """
    # Remove this `with` block if you want results to stay when
    # you switch between normal and adv search.
    with output:
        output.clear_output()
    # The advanced view uses row_adv2 (ADV was changed to ADV2).
    top_row = row_adv2 if check['new'] else row_normal
    search_display.children = [top_row, row_output]


def display_results(query,max_res):
    """Run ``query`` against Elasticsearch and render every hit as a bordered box.

    Parameters
    ----------
    query : dict
        Request body handed to ``es.search``.
    max_res : int
        Maximum number of hits to request (passed as ``size``).

    Each box shows rank and file name, score, folder, a placeholder cell
    number and a link to the notebook on ``http://localhost:8888``. When the
    hit carries a highlight, the link gets a text fragment built by
    ``create_ref`` and the highlight itself is shown as well; if anything in
    that step fails, the box falls back to the plain link.
    """
    result = es.search(body=query, size=max_res)
    # The offline ES stand-in goes here:
    # result = offline_es

    # Layouts do not depend on the hit, so they are built once.
    res_layout = widgets.Layout(margin='0px')
    whole_layout = widgets.Layout(display='flex',
                                  flex_flow='column',
                                  align_items='stretch',
                                  border='grey solid 1px')

    for rank, res in enumerate(result['hits']['hits'], start=1):  # for all hits
        source = res['_source']

        rank_title = widgets.HTML(value='<b>'+str(rank)+'   '+source['file']+'</b>')
        # Part of the stored location that follows the username.
        localhost_path = source['location'].split(username_pc)[1]
        # The href is written unquoted, so whitespace in the filename must be encoded
        # (example: the ipython-notebooks files of yoavram).
        notebook_url = ('http://localhost:8888/notebooks' + localhost_path).replace(' ', '%20')

        score = widgets.HTML(value=str(res['_score']))
        folder = widgets.HTML(value=source['folder'])
        file_cell = widgets.HTML(value='1')  # placeholder for res['_source']['file_cell']

        try:
            highlight_list = res['highlight']['string']
            highlight = widgets.HTML(value=str(highlight_list), layout=res_layout)
            text = source['string']
            ref_path = (notebook_url + create_ref(text[0])).replace(' ', '%20')
            link = widgets.HTML(value='<a href= '+ref_path+' target="_blank"> '+localhost_path+'</a>')
            rows = [rank_title, score, folder, file_cell, link, highlight]
        except Exception:
            # Best effort: no highlight (or an unusable one) still yields a result box.
            # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit are not swallowed.
            link = widgets.HTML(value='<a href= '+notebook_url+' target="_blank">'+localhost_path+'</a>')
            rows = [rank_title, score, folder, file_cell, link]
        display(widgets.VBox(rows, layout=whole_layout))


In [6]:
# --- Layout ---------------------------------------------------------------
# Filter checkboxes shown above the advanced search row
check_a, check_b, check_c, check_d, check_e = (
    widgets.Checkbox(description=label)
    for label in ('markdown', 'code', 'heading', 'display data', 'e')
)
row_check = widgets.HBox([check_a, check_b, check_c, check_d, check_e])

row_normal = widgets.HBox([normal_text, search_b_normal, clear_b])
row_adv = widgets.HBox([markdown, code, modules, search_b_adv, clear_b])
row_adv2 = widgets.VBox([row_check, row_adv])
row_output = widgets.VBox([output])

top_toggle = widgets.Checkbox(description='Advanced Search')
search_display = widgets.VBox(children=[row_normal, row_output])

# --- Events ---------------------------------------------------------------
# Buttons
search_b_normal.on_click(on_normal_search_clicked)
search_b_adv.on_click(on_adv_search_clicked)
clear_b.on_click(on_clear_clicked)

# Pressing Enter in a text box starts the matching search
normal_text.on_submit(on_normal_search_clicked)
code.on_submit(on_adv_search_clicked)
markdown.on_submit(on_adv_search_clicked)
# modules.on_submit(on_adv_search_clicked)

# Toggling the checkbox swaps the normal and the advanced search row
top_toggle.observe(switch, names='value')

# --- Display --------------------------------------------------------------
display(top_toggle)
display(search_display)

Checkbox(value=False, description='Advanced Search')

VBox(children=(HBox(children=(Text(value=''), Button(description='Search', style=ButtonStyle()), Button(descri…