# Search Engine for Jupyter notebooks
## Creating a dictionary from the notebook files

In [1]:
# import
import json
import os
from collections import defaultdict
import numpy as np

In [3]:
folder = 'PythonDataScienceHandbook-notebooks'
# os.path.join keeps the path portable (a hard-coded '\\' separator only works on
# Windows). Sorting makes the notebook order deterministic, because os.listdir
# returns directory entries in arbitrary order.
files = sorted(
    file for file in os.listdir(os.path.join(os.getcwd(), folder))
    if file.endswith('.ipynb')
)

# test
# files

In [4]:
def read_ipynb(file,folder):
    """Extract the searchable text of one notebook file.

    The notebook is read from ``<current working directory>/<folder>/<file>``
    and parsed as JSON.

    Parameters
    ----------
    file : str
        File name of the notebook inside ``folder``.
    folder : str
        Directory, relative to the current working directory, holding the notebook.

    Returns
    -------
    tuple
        ``(modules, heading, markdown_str, code_str, code_output_str)``:

        * ``modules`` - sorted list of the unique ``import ...`` / ``from ...``
          lines found in code cells,
        * ``heading`` - markdown lines that start with ``#``, in notebook order,
        * ``markdown_str`` - all stripped markdown lines joined with a space,
        * ``code_str`` - all stripped code lines joined with a space,
        * ``code_output_str`` - the text of the first output of each code cell
          (``stream`` and ``execute_result`` outputs only) joined with a space.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If the JSON lacks the ``cells`` / ``source`` / ``cell_type`` keys.
    """

    def _as_lines(value):
        # Multi-line notebook text may be stored as one string instead of a
        # list of lines; iterating a string would yield single characters.
        if isinstance(value, str):
            return value.splitlines(True)
        return list(value)

    # Portable path building; the handle gets its own name so the `file`
    # argument is not shadowed.
    path = os.path.join(os.getcwd(), folder, file)
    with open(path, encoding="utf8") as handle:
        data = json.load(handle)

    code = []
    markdown = []
    modules = []
    heading = []
    code_output = []

    for cell in data['cells']:
        # strip() removes the trailing '\n' (and indentation) of every source line
        clean_cell = [line.strip() for line in _as_lines(cell['source'])]
        cell_type = cell['cell_type']

        if cell_type == 'markdown':
            markdown += clean_cell
            # Headings are only looked for in markdown, so code comments are not
            # mistaken for section titles.
            heading += [line for line in clean_cell if line.startswith('#')]
        elif cell_type == 'code':
            code += clean_cell
            # The trailing space keeps words such as "important" or "fromfile"
            # from being counted as import statements.
            modules += [line for line in clean_cell
                        if line.startswith(('import ', 'from '))]

            outputs = cell.get('outputs') or []
            if outputs:
                # Only the first output of a cell is indexed.
                first_output = outputs[0]
                output_type = first_output.get('output_type')
                # Output lines are collected per cell, so an output type that is
                # not handled here contributes nothing instead of repeating the
                # text captured from an earlier cell.
                if output_type == 'stream':
                    code_output += _as_lines(first_output.get('text', []))
                elif output_type == 'execute_result':
                    code_output += _as_lines(
                        first_output.get('data', {}).get('text/plain', []))

    markdown_str = ' '.join(map(str, markdown))
    code_str = ' '.join(map(str, code))
    code_output_str = ' '.join(map(str, code_output))

    return sorted(set(modules)),heading,markdown_str,code_str,code_output_str

In [5]:
# test

# %%time
# read_ipynb('02.02-The-Basics-Of-NumPy-Arrays.ipynb',folder)

In [6]:
%%time

# Maps every notebook file name to a dict with the fields extracted from it.
ipynb_dict = defaultdict()
for file in files:
    # read_ipynb returns its five results in exactly this order
    nb_modules, nb_heading, nb_markdown, nb_code, nb_output = read_ipynb(file,folder)
    ipynb_dict[file] = {
        'file_name': file,
        'modules': nb_modules,
        'heading': nb_heading,
        'markdown_str': nb_markdown,
        'code_str': nb_code,
        'code_output_str': nb_output,
    }

Wall time: 746 ms


In [7]:
# test
# ipynb_dict['02.02-The-Basics-Of-NumPy-Arrays.ipynb']['modules']

## Analysing the dictionary by creating an index

### Create a dataframe

In [9]:
import pandas as pd

In [10]:
# One row per notebook. The file-name index produced by orient='index' is dropped
# because every row already carries the name in its 'file_name' column.
notebooks_df = pd.DataFrame.from_dict(ipynb_dict,orient='index').reset_index(drop=True)
# Preview the first ten rows
notebooks_df.head(10)

Unnamed: 0,file_name,modules,heading,markdown_str,code_str,code_output_str
0,00.00-Preface.ipynb,[],"[# Preface, ## What Is Data Science?, ## Who I...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,
1,01.00-IPython-Beyond-Normal-Python.ipynb,[],"[# IPython: Beyond Normal Python, ## Shell or ...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,
2,01.01-Help-And-Documentation.ipynb,[],"[# Help and Documentation in IPython, ## Acces...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,
3,01.02-Shell-Keyboard-Shortcuts.ipynb,[],"[# Keyboard Shortcuts in the IPython Shell, ##...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,
4,01.03-Magic-Commands.ipynb,[],"[# IPython Magic Commands, ## Pasting Code Blo...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,
5,01.04-Input-Output-History.ipynb,[import math],"[# Input and Output History, ## IPython's ``In...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,
6,01.05-IPython-And-Shell-Commands.ipynb,[],"[# IPython and Shell Commands, ## Quick Introd...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,
7,01.06-Errors-and-Debugging.ipynb,[],"[# Errors and Debugging, ## Controlling Except...","<!--BOOK_INFORMATION--> <img align=""left"" styl...","def func1(a, b): return a / b def func2(x): a...",Exception reporting mode: Plain\n Exception re...
8,01.07-Timing-and-Profiling.ipynb,"[from mprun_demo import sum_of_lists, import r...","[# Profiling and Timing Code, ## Timing Code S...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",%timeit sum(range(100)) %%timeit total = 0 for...,"100000 loops, best of 3: 1.54 µs per loop\n 1 ..."
9,01.08-More-IPython-Resources.ipynb,[],"[# More IPython Resources, ## Web Resources, #...","<!--BOOK_INFORMATION--> <img align=""left"" styl...",,


## Local Elasticsearch setup

In [11]:
# import
from elasticsearch import Elasticsearch # <== May need to pip install this
import pandas as pd
from tqdm import tqdm_notebook

In [50]:
# Address of the Elasticsearch node used by this notebook, and a client configured for it
HOST = 'http://localhost:9200/'
es = Elasticsearch(hosts=[HOST])

# Index name and document type that rec_to_actions writes into every bulk action line
INDEX="handboek"
TYPE= "record"

def rec_to_actions(df):
    """Yield the lines of a bulk-index request for every row of ``df``.

    For each row two strings are produced, in this order: an ``index`` action
    line naming the module-level ``INDEX`` and ``TYPE``, followed by the row
    itself serialised as JSON. Values the JSON encoder cannot handle are passed
    through ``int``.
    """
    rows = df.to_dict(orient="records")
    for row in rows:
        action_line = '{ "index" : { "_index" : "%s", "_type" : "%s" }}'% (INDEX, TYPE)
        yield action_line
        document_line = json.dumps(row, default=int)
        yield document_line

### Put the dataframe into the Elasticsearch index

In [51]:
# to dataframe
# df = pd.read_dict(wiki_dict,orient='index')


def index_marks(nrows, chunk_size):
    """Return the row positions at which ``nrows`` rows are cut into chunks.

    Parameters
    ----------
    nrows : int
        Total number of rows.
    chunk_size : int
        Maximum number of rows per chunk; must be at least 1.

    Returns
    -------
    range
        The cut positions ``chunk_size, 2 * chunk_size, ...``, all strictly
        below ``nrows``. No mark is emitted at ``nrows`` itself, so a row count
        that is an exact multiple of ``chunk_size`` does not produce an empty
        trailing chunk.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    return range(chunk_size, nrows, chunk_size)


def split(dfm, chunk_size):
    """Cut ``dfm`` into consecutive row chunks of at most ``chunk_size`` rows.

    Parameters
    ----------
    dfm : pandas.DataFrame
        Table to cut up; only ``.shape`` and ``.iloc`` are used.
    chunk_size : int
        Maximum number of rows per chunk.

    Returns
    -------
    list
        The row slices of ``dfm`` in order, delimited by the positions that
        ``index_marks`` returns.

    Notes
    -----
    The chunks are taken with positional ``iloc`` slices rather than
    ``numpy.split``, which handles a DataFrame through ``DataFrame.swapaxes``
    (deprecated in recent pandas releases).
    """
    nrows = dfm.shape[0]
    marks = list(index_marks(nrows, chunk_size))
    # Pair every chunk start with the following mark; the last chunk runs to the end.
    starts = [0] + marks
    stops = marks + [nrows]
    return [dfm.iloc[start:stop] for start, stop in zip(starts, stops)]

# split up the dataframe
chunks = split(notebooks_df, 9000)

# Now bulk index all the chunks
for chunk in tqdm_notebook(chunks):
    if chunk.shape[0]>0:
        # body is passed by keyword, matching the es.search calls in this notebook
        r = es.bulk(body=rec_to_actions(chunk)) # return a dict
        # Check the response's 'errors' flag so documents that could not be
        # indexed do not go unnoticed.
        if r['errors']:
            print('Bulk request reported errors for at least one document')
print('Done')

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Done


In [52]:
# !curl "http://localhost:9200/handboek"
# !curl -XDELETE "localhost:9200/handboek"

!curl "http://localhost:9200/_cat/indices?v"


# !curl -XPOST "http://localhost:9200/_shutdown"

health status index    uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   handboek 8SoUSayXT_OGv2TXxjdXhQ   1   1         68            0     16.2kb         16.2kb


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   208  100   208    0     0  10947      0 --:--:-- --:--:-- --:--:-- 10947


In [350]:
def multi_match_query(code,markdown,modules,highlight):
    """Build the body of an Elasticsearch ``multi_match`` search request.

    Parameters
    ----------
    code, markdown, modules : str or None
        Search text for the ``code_str``, ``markdown_str`` and ``modules``
        fields. A field whose argument is ``None`` is left out of the search.
    highlight : bool or None
        When truthy, a ``highlight`` section for ``markdown_str`` (wrapped in
        ``<b>`` tags) is added. ``False`` and ``None`` both disable it, which is
        the same convention ``query_string_query`` follows.

    Returns
    -------
    dict
        ``{'query': {'multi_match': {'fields': [...], 'query': '...'}}}`` plus
        an optional ``'highlight'`` entry.
    """
    # (index field, search text) for every criterion the caller supplied
    criteria = [
        (field, text)
        for field, text in (('code_str', code),
                            ('markdown_str', markdown),
                            ('modules', modules))
        if text is not None
    ]

    q = {
        'query': {
            'multi_match': {
                'fields': [field for field, _ in criteria],
                # Each term is prefixed with a space, so the string starts with
                # a blank exactly as this function has always produced it.
                'query': ''.join(' ' + text for _, text in criteria),
            }
        }
    }

    if highlight:
        q['highlight'] = {"pre_tags":["<b>"],
                          "post_tags":["</b>"],
                          "fields":{'markdown_str':{}}}
    return q

def query_string_query(code,markdown,modules,highlight):
    """Build the body of an Elasticsearch ``query_string`` search request.

    Every argument that is not ``None`` becomes a ``(<field>:<text>)`` clause,
    and the clauses are combined with ``AND``.

    Parameters
    ----------
    code, markdown, modules : str or None
        Search text for the ``code_str``, ``markdown_str`` and ``modules``
        fields. The text is inserted into the query string as-is.
    highlight : bool or None
        When truthy, a ``highlight`` section for ``markdown_str`` and
        ``code_str`` (wrapped in ``<b>`` tags, ordered by score) is added.

    Returns
    -------
    dict
        ``{'query': {'query_string': {'query': '...'}}}`` plus an optional
        ``'highlight'`` entry. With all three texts ``None`` the query string
        is empty.
    """
    clauses = [
        '(' + field + ':' + text + ')'
        for field, text in (('code_str', code),
                            ('markdown_str', markdown),
                            ('modules', modules))
        if text is not None
    ]

    q = {'query': {'query_string': {'query': ' AND '.join(clauses)}}}

    if highlight:
        q['highlight'] = {"pre_tags":["<b>"],
                          "post_tags":["</b>"],
                          "order":"score",
                          "fields":{'markdown_str':{},'code_str':{}}}
    return q

# "order":"score",
# "fields":{'_all':{}}} DOES NOT WORK....


# https://www.elastic.co/guide/en/elasticsearch/reference/6.8/search-request-highlighting.html

In [None]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html

# {
#   "query": {
#     "multi_match" : {
#       "query":    "this is a test", 
#       "fields": [ "subject", "message" ] 
#     }
#   }
# }

In [None]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html

# {
#     "query": {
#         "query_string": {
#             "query": "(content:this OR name:this) AND (content:that OR name:that)"
#         }
#     }
# }

In [276]:
# Example: search the code and modules fields (markdown is skipped) and request highlighting
q_test_multimatch = multi_match_query('linear',None,'numpy',True)
q_test_multimatch

{'query': {'multi_match': {'fields': ['code_str', 'modules'],
   'query': ' linear numpy'}},
 'highlight': {'pre_tags': ['<b>'],
  'post_tags': ['</b>'],
  'fields': {'markdown_str': {}}}}

In [343]:
# Example: a markdown clause and a modules clause combined with AND, highlighting requested.
# NOTE(review): the recorded output below lists '_all' as the highlight field, which is
# not what query_string_query as defined above builds -- the output looks stale.
q_test_querystring = query_string_query(None,'drop','pandas',True)
q_test_querystring

{'query': {'query_string': {'query': '(markdown_str:drop) AND (modules:pandas)'}},
 'highlight': {'pre_tags': ['<b>'],
  'post_tags': ['</b>'],
  'order': 'score',
  'fields': {'_all': {}}}}

In [181]:
# def search_results(query,max_res):
#     result = es.search(body=query, size=max_res)
#     transcripts = ""
#     for i in range(len(result['hits']['hits'])): # for all hits
#         res = result['hits']['hits'][i]
#         modules = res['_source']['modules']
#         #len(res['_source']['modules'])
#         print(res['_score'],'https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/'+res['_source']['file_name'])
#         print(res['highlight']['markdown_str'])

In [285]:
# q_test = multi_match_query(None,'linear regression',None,False)
# print(q_test)
# # print()

# search_results(q_test,20)

In [109]:
# Inspect the first hit of a search response.
# NOTE(review): no visible cell assigns a module-level `result` (the name only appears
# as a local inside display_results and in commented-out code), so this cell presumably
# relied on leftover kernel state -- confirm, and assign it from es.search(...) first.
result['hits']['hits'][0]

{'_index': 'handboek',
 '_type': 'record',
 '_id': 'LTR2YnABgBRbd-7Oab31',
 '_score': 4.1406717,
 '_source': {'file_name': '05.07-Support-Vector-Machines.ipynb',
  'modules': ['from ipywidgets import interact, fixed',
   'from mpl_toolkits import mplot3d',
   'from scipy import stats',
   'from sklearn.cross_validation import train_test_split',
   'from sklearn.datasets import fetch_lfw_people',
   'from sklearn.datasets.samples_generator import make_blobs',
   'from sklearn.datasets.samples_generator import make_circles',
   'from sklearn.decomposition import RandomizedPCA',
   'from sklearn.grid_search import GridSearchCV',
   'from sklearn.metrics import classification_report',
   'from sklearn.metrics import confusion_matrix',
   'from sklearn.pipeline import make_pipeline',
   'from sklearn.svm import SVC',
   'from sklearn.svm import SVC # "Support vector classifier"',
   'import matplotlib.pyplot as plt',
   'import numpy as np',
   'import seaborn as sns; sns.set()'],
  'headin

# Search Engine widget

In [134]:
import ipywidgets as widgets

In [367]:
# Text inputs: one free-text box for the simple search, plus one box per
# searchable field for the advanced search
normal_text = widgets.Text() #description="Search", continuous_update=True)
code = widgets.Text(description="Code")
markdown = widgets.Text(description="Markdown")
modules = widgets.Text(description="Modules")

# Buttons: one search button per search mode, and a clear button
search_b_normal = widgets.Button(description="Search")
search_b_adv = widgets.Button(description="Search")
clear_b = widgets.Button(description="Clear")

# Output area that the handlers below clear and write their results into
output = widgets.Output()





def on_normal_search_clicked(b):
    """Run the simple search with the text currently in ``normal_text``.

    The output area is cleared first. An empty box prints a hint; otherwise the
    same text is searched in code, markdown and modules with highlighting on,
    and up to 20 results are displayed. ``b`` (the triggering widget) is unused.
    """
    with output:
        output.clear_output()
        if normal_text.value != '':
            term = normal_text.value
            display_results(multi_match_query(term,term,term,True),20)
        else:
            print("Type a query")

def on_adv_search_clicked(b):
    """Run the advanced search with the per-field text boxes.

    The output area is cleared first. If all three boxes are empty a hint is
    printed; otherwise empty boxes are passed on as ``None``, the generated
    query is printed, and up to 20 results are displayed. ``b`` (the triggering
    widget) is unused.
    """
    # An empty box means "do not search this field", which the query builder
    # expects as None.
    send_code = code.value or None
    send_markdown = markdown.value or None
    send_modules = modules.value or None
    with output:
        output.clear_output()
        nothing_entered = send_code is None and send_markdown is None and send_modules is None
        if nothing_entered:
            print("Type a query")
            return
        adv_query = query_string_query(send_code,send_markdown,send_modules,True)
        print(adv_query)
        display_results(adv_query,20)

        
def on_clear_clicked(b):
    """Clear the output area and empty all four search boxes.

    ``b`` (the triggering widget) is unused.
    """
    with output:
        output.clear_output()
    for text_box in (normal_text, markdown, code, modules):
        text_box.value = ''
    
        
def switch(check):
    """Show the advanced or the simple search row depending on the checkbox.

    ``check`` is the change notification of the checkbox; its ``'new'`` entry
    holds the new checkbox state.
    """
    # Drop these two lines if the results should stay visible when switching
    # between the normal and the advanced search.
    with output:
        output.clear_output()
    active_row = row_adv if check['new'] else row_normal
    search_display.children=[active_row,row_output]

        
def display_results(query,max_res):
    """Run ``query`` against Elasticsearch and display one bordered box per hit.

    Each box shows the notebook file name in bold, the hit's score, a link to
    the notebook on GitHub and, when the response contains highlights for the
    hit, the highlighted fragments.

    Parameters
    ----------
    query : dict
        Search request body, e.g. as built by ``multi_match_query`` or
        ``query_string_query``.
    max_res : int
        Maximum number of hits to request.
    """
    import html  # local import: only needed to escape values placed into markup

    result = es.search(body=query, size=max_res)
    for res in result['hits']['hits']:
        file_name = res['_source']['file_name']

        res_layout = widgets.Layout(margin='0px')
        whole_layout = widgets.Layout(display='flex',
                        flex_flow='column',
                        align_items='stretch',
                        border='grey solid 1px')

        # The file name comes from the index, so it is escaped before being
        # placed into markup; the href is quoted for the same reason.
        url_str = html.escape(
            'https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/'+file_name,
            quote=True)
        title = widgets.HTML(value='<b>'+html.escape(file_name)+'</b>')
        url = widgets.HTML(value='<a href="'+url_str+'" target="_blank">'+url_str+'</a>')
        score = widgets.HTML(value=str(res['_score']))

        parts = [title,score,url]

        # 'highlight' maps each field to a list of fragments and is absent when
        # nothing was highlighted. The fragments carry the <b> tags requested in
        # the query, so they are inserted as markup rather than shown as the
        # repr of a dict view.
        fragments = [fragment
                     for field_fragments in res.get('highlight', {}).values()
                     for fragment in field_fragments]
        if fragments:
            parts.append(widgets.HTML(value=' &hellip; '.join(fragments),layout=res_layout))

        display(widgets.VBox(parts,layout=whole_layout))


In [368]:
# Connect each button to its click handler
search_b_normal.on_click(on_normal_search_clicked)
search_b_adv.on_click(on_adv_search_clicked)
clear_b.on_click(on_clear_clicked)

# Rows of the UI: simple search, advanced search, and the shared results area
row_normal = widgets.HBox([normal_text,search_b_normal,clear_b])
row_adv = widgets.HBox([markdown,code,modules,search_b_adv,clear_b])
row_output = widgets.VBox([output])

# Checkbox that toggles between the two search rows (handled by switch)
top_toggle = widgets.Checkbox(description='Advanced Search')
top_toggle.observe(switch, names='value')
display(top_toggle)

# Submitting a text box triggers the same handler as the matching search button
normal_text.on_submit(on_normal_search_clicked)
code.on_submit(on_adv_search_clicked)
markdown.on_submit(on_adv_search_clicked)
modules.on_submit(on_adv_search_clicked)


# Container whose children switch() replaces; it starts with the simple search row
search_display = widgets.VBox(children = [row_normal,row_output])
display(search_display)

Checkbox(value=False, description='Advanced Search')

VBox(children=(HBox(children=(Text(value=''), Button(description='Search', style=ButtonStyle()), Button(descri…

In [None]:
# result = es.search(body=query, size=max_res)
#     transcripts = ""
#     for i in range(len(result['hits']['hits'])): # for all hits
#         res = result['hits']['hits'][i]
#         modules = res['_source']['modules']
#         #len(res['_source']['modules'])
#         print(res['_score'],'https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/'+res['_source']['file_name'])
#         print(res['highlight']['markdown_str'])

In [253]:
# test_query = {'query': {'multi_match': {'fields': ['modules'], 'query': ' numpy'}}}
# search_results(test_query,20)

Using the Enter key to trigger a search:
- link: https://stackoverflow.com/questions/47137370/getting-text-from-jupyter-text-widget
- link: https://pythonprogs.blogspot.com/2017/01/widgets-for-jupyter-notebook-text-input.html

Advanced search toggle (show/hide the advanced search):
- link: https://stackoverflow.com/questions/54093955/ipywidgets-use-checkbox-to-display-or-hide-other-widgets