# Chaining search



## Sphinx documentatie: https://pythonhosted.org/an_example_pypi_project/sphinx.html
## in voorbeelden handige python functies opnemen
## zoals ; .sort_values(ascending=False,by=['raw_freq']));  list enz


## Library functions: Search
 

In [1]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import json
import urllib
import wx   # for interaction popups          TODO -> omzetten naar JS of zo
import itertools # for frequency list function
import numpy     # idem
from IPython.display import FileLink, FileLinks
AVAILABLE_CORPORA = ['chn', 'opensonar', 'zeebrieven', 'gysseling', 'nederlab']
RECORDS_PER_PAGE = 1000

# Get rid of ellipsis in display (otherwise relevant data might not be shown)
pd.set_option('display.max_colwidth',1000)

# Search methods

def search_corpus_allwords(corpus, pos):
    query = r'[word=".*"]'
    if pos is not None:
        query = r'[word=".*" & pos="'+pos+r'"]'
    return search_corpus(query, corpus)

def search_corpus_alllemmata(corpus, pos):
    query = r'[lemma=".*"]'
    if pos is not None:
        query = r'[lemma=".*" & pos="'+pos+r'"]'
    return search_corpus(query, corpus) 

def search_corpus(query, corpus, start_position=1):
    # show wait indicator
    app = wx.App()
    msg_to_user = wx.BusyInfo('Searching '+corpus+' corpus')
    if corpus not in AVAILABLE_CORPORA:
        raise ValueError("Unknown corpus: " + corpus)
    try:
        # Do request to federated content search corpora, so we get same output format for every corpus
        url = "http://portal.clarin.inl.nl/fcscorpora/clariah-fcs-endpoints/sru?operation=searchRetrieve&queryType=fcs&maximumRecords=1000&x-fcs-context=" + corpus + "&query=" + urllib.parse.quote(query)
        #print(url)
        response = requests.get(url)
        response_text = response.text    
        df, next_page = _parse_xml(response_text)
        # If there are next pages, call search_corpus recursively
        if next_page > 0:
            df_more = search_corpus(query,corpus, start_position=next_page)
            df = df.append(df_more, ignore_index=True)
        # show message out of xml, if some error has occured (prevents empty output)
        _show_error_if_any(response_text)
        return df
    except Exception as e:
        raise ValueError("An error occured when searching corpus " + corpus + ": "+ str(e))
    finally:
        # remove wait indicator, and return dataframe
        del msg_to_user        

def search_corpus_multiple(queries, corpus):
    result_dict = {}
    for query in queries:
        result_dict[query] = search_corpus(query,corpus)
    return result_dict
   

def search_lexicon_alllemmata(lexicon, pos):
    query = lexicon_query_alllemmata(lexicon, pos)
    return search_lexicon(query, lexicon)

def search_lexicon(query, lexicon):
     # show wait indicator, so the user knows what's happening
    app = wx.App()
    msg_to_user = wx.BusyInfo('Searching '+lexicon+' lexicon')
    # default endpoint, except when diamant is invoked
    endpoint = "http://172.16.4.56:8890/sparql"
    if (lexicon=="diamant"):
        endpoint = "http://svprre02:8080/fuseki/tdb/sparql"
    
    try:
        # Accept header is needed for virtuoso, it isn't otherwise!
        response = requests.post(endpoint, data={"query":query}, headers = {"Accept":"application/sparql-results+json"})
        
        response_json = json.loads(response.text)
        records_json = response_json["results"]["bindings"]
        records_string = json.dumps(records_json)    
        df = pd.read_json(records_string, orient="records")
    
        # make sure cells containing NULL are added too, otherwise we'll end up with ill-formed data
        # TODO: maybe this can be replaced by:
        # df = df.fillna('')
        df = df.applymap(lambda x: '' if pd.isnull(x) else x["value"])         
        return df
    except Exception as e:
        raise ValueError("An error occured when searching lexicon " + lexicon + ": "+ str(e))
    finally:
        # remove wait indicator, 
        del msg_to_user
        

# Processing methods

def column_difference(df_column1, df_column2):
    set_df1 = set(df_column1)
    set_df2 = set(df_column2)
    diff_left = set_df1.difference(set_df2)
    diff_right = set_df2.difference(set_df1)
    intersec = set_df1.intersection(set_df2)
    return diff_left, diff_right, intersec

def diamant_get_synonyms(df):
    # Depending on the result type, we return the lemma or the definition text
    lemmas = set(df[df["inputMode"]=="defText"]["n_ontolex_writtenRep"])
    defTexts = set(df[df["inputMode"]=="lemma"]["n_syndef_definitionText"])
    return lemmas|defTexts

def _parse_xml(text):
    # TODO: should we secure against untrusted XML?
    root = ET.fromstring(text)
    records = []
    n_words_in_hit = 0
    computed_nwih = False
    layers_processed = 0
    
    for entry in root.iter("{http://clarin.eu/fcs/resource}ResourceFragment"):    
        
        for dataView in entry.findall("{http://clarin.eu/fcs/resource}DataView"):            
            
            # We only take into account hits, ignore metadata and segmenting dataViews
            if (dataView.get("type")=="application/x-clarin-fcs-hits+xml"):
                layers_processed = layers_processed + 1
                result = dataView.find("{http://clarin.eu/fcs/dataview/hits}Result")
                left_context = result.text if result.text is not None else ''
                hits = list(result)
                if len(hits)==0:
                    print([w for w in result.itertext()])
                    print("no hit in kwic, skip")
                    continue
                last_hit = hits[-1]
                right_context = last_hit.tail if last_hit.tail is not None else ''
                hit_words = [hit.text for hit in hits]
                
                if not computed_nwih:
                    n_words_in_hit = len(hits)
                    computed_nwih=True
                
                
            # Get lemma of each hit
            if (dataView.get("type")=="application/x-clarin-fcs-adv+xml"):
                layers_processed = layers_processed + 1
                #layers = dataView.find("adv:Advanced")                
                for layer in dataView.findall(".//{http://clarin.eu/fcs/dataview/advanced}Layer"):                    
                    if (layer.get("id")=="http://www.ivdnt.org/annotation-layers/lemma"):
                        hit_lemmata = []
                        for one_span in layer.findall(".//{http://clarin.eu/fcs/dataview/advanced}Span[@highlight='h1']"):
                            span_text = one_span.text            
                            hit_lemmata.append(span_text)
                            
            # Get pos of each hit
            if (dataView.get("type")=="application/x-clarin-fcs-adv+xml"):
                layers_processed = layers_processed + 1
                #layers = dataView.find("adv:Advanced")                
                for layer in dataView.findall(".//{http://clarin.eu/fcs/dataview/advanced}Layer"):                    
                    if (layer.get("id")=="http://www.ivdnt.org/annotation-layers/universal_dependency"):
                        hit_pos = []
                        for one_span in layer.findall(".//{http://clarin.eu/fcs/dataview/advanced}Span[@highlight='h1']"):
                            span_text = one_span.text            
                            hit_pos.append(span_text)
                            
            if layers_processed == 3:
                kwic = [left_context] + hit_lemmata + hit_pos + hit_words + [right_context]
                records.append(kwic)
                layers_processed = 0
                    
    columns = ["left context"] + ["lemma " + str(n) for n in range(n_words_in_hit)] + ["pos " + str(n) for n in range(n_words_in_hit)] + ["word " + str(n) for n in range(n_words_in_hit)] + ["right context"]
    
    next_pos = 0
    next_record_position = root.find("{http://docs.oasis-open.org/ns/search-ws/sruResponse}nextRecordPosition")
    if (next_record_position is not None):
        next_pos = int(next_record_position.text)
        
    return pd.DataFrame(records, columns = columns), next_pos

def _show_error_if_any(text):
    # get error message out of xml and print it on screen
    root = ET.fromstring(text)
    msgs = []
    for diagnostic in root.iter("{http://docs.oasis-open.org/ns/search-ws/diagnostic}diagnostic"):
        for msg in diagnostic.findall("{http://docs.oasis-open.org/ns/search-ws/diagnostic}message"):
            msg_text = msg.text if msg.text is not None else ''
            msgs.append(msg_text)
    if len(msgs) > 0:
        print("; ".join(msgs))

# View methods

# results: dict of df's
# labels: list of label corresponding to the df's in results
def view_multiple_results(results, labels):
    assert len(labels)==len(results)
    for n,query in enumerate(results):
        df = results[query]
        if not df.empty:
            display(HTML('Resultaten voor <b>' + labels[n] + "</b>:"))
            display(df)
            
            
            
def get_frequency_list(lexicon, pos, corpus):
    
    # LEXICON: get a lemmata list to work with
    df_lexicon = search_lexicon_alllemmata(lexicon, pos)
    lexicon_lemmata_set = sorted( set([w.lower() for w in df_lexicon["writtenForm"]]) )
    lexicon_lemmata_arr= numpy.array(lexicon_lemmata_set)

    # instantiate a dataframe for storing lemmata and frequencies
    df_frequency_list = pd.DataFrame(index=lexicon_lemmata_arr, columns=['raw_freq'])
    df_frequency_list.index.name = 'lemmata'

    # CORPUS: loop through lemmata list, query the corpus with that lemma, and count the results

    # It's a good idea to work with more than one lemma at once!
    nr_of_lemmata_to_query_atonce = 100
    
    # loop over lemmata list 
    for i in range(0, len(lexicon_lemmata_set), nr_of_lemmata_to_query_atonce):
        # slice to small sets of lemmata to query at once
        small_lemmata_set = set( lexicon_lemmata_arr[i : i+nr_of_lemmata_to_query_atonce] )    

        # join set of lemmata to send them in a query all at once
        # beware: single quotes need escaping
        lemmata_list = "|".join(small_lemmata_set).replace("'", "\\\\'")
        df_corpus = search_corpus(r'[lemma="' + lemmata_list + r'"]', corpus)

        # store frequencies
        if (len(df_corpus)>0):
            for one_lemma in small_lemmata_set: 
                raw_freq = len(df_corpus[df_corpus['lemma 0'] == one_lemma])
                df_frequency_list.at[one_lemma, 'raw_freq'] = raw_freq 
                
    # final step: compute rank
    # this is needed to be able to compare different frequency lists 
    # with each other (which we could achieve by computing a rank diff)
    df_frequency_list['rank'] = df_frequency_list['raw_freq'].rank(ascending = False).astype(int)
    
    return df_frequency_list;


def get_rank_diff(df1, df2):
    
    # we assure df1 and df2 share the same lemma list
    lemmata_list = df1.index.tolist()
    
    # instantiate a dataframe for storing lemmata and rank diffs
    df_rankdiffs = pd.DataFrame(index=lemmata_list, columns=['rank_1', 'rank_2', 'rank_diff'])
    df_rankdiffs.index.name = 'lemmata'
    
    df_rankdiffs['rank_1'] = df1['rank']
    df_rankdiffs['rank_2'] = df2['rank']
    df_rankdiffs['rank_diff'] = pd.DataFrame.abs( df_rankdiffs['rank_1'] - df_rankdiffs['rank_2'] )
    
    return df_rankdiffs

## Library functions: UI

In [2]:

import ipywidgets as widgets
from IPython.display import display
import tkinter as tk
from tkinter import filedialog
from pathlib import Path
from IPython.display import Javascript
DEFAULT_QUERY = r'[lemma="boek" & pos="verb"]' #r'[lemma="boeken" pos="verb"]'
DEFAULT_CORPUS = "chn"



def create_corpus_ui():
    # Create UI elements
    corpusQueryField = widgets.Text(description="<b>CQL query:</b>", value=DEFAULT_QUERY)
    corpusField = widgets.Dropdown(
        options=AVAILABLE_CORPORA,
        value=DEFAULT_CORPUS,
        description='<b>Corpus:</b>',
    )
    '''corpusSearchButton = widgets.Button(
        description='Search',
        button_style='info', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Search',
    )
    # Handle events
    corpusSearchButton.on_click(corpus_search)'''
    
    # Stack UI elements in vertical box and display
    corpusUiBox = widgets.VBox([corpusQueryField,corpusField])
    display(corpusUiBox)
    
    # Return fields, so their contents are accessible from the global namespace of the Notebook
    return corpusQueryField, corpusField

def create_lexicon_ui():
    DEFAULT_SEARCHWORD = 'boek'
    DEFAULT_LEXICON = "diamant"

    # Create UI elements
    searchWordField = widgets.Text(description="<b>Word:</b>", value=DEFAULT_SEARCHWORD)
    lexiconField = widgets.Dropdown(
        options=['anw', 'celex', 'diamant', 'duelme', 'molex'],
        value=DEFAULT_LEXICON,
        description='<b>Lexicon:</b>',
    )
    '''lexSearchButton = widgets.Button(
        description='Search',
        button_style='info', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Search',
    )
    # Handle events
    lexSearchButton.on_click(lexicon_search)'''
    # Stack UI elements in vertical box and display
    lexUiBox = widgets.VBox([searchWordField,lexiconField])
    display(lexUiBox)
    return searchWordField, lexiconField


def create_save_dataframe_ui(df):
    # build ui for saving results
    DEFAULT_FILENAME = 'mijn_resultaten.csv'
    saveResultsCaption = widgets.Label(value='Sla uw resultaten op:')
    fileNameField = widgets.Text(value=DEFAULT_FILENAME)
    savebutton = widgets.Button(
        description='Bestand opslaan',
        disabled=False,
        button_style='warning', 
        tooltip=DEFAULT_FILENAME,  # trick to pass filename to button widget
        icon=''
    )
    # inject dataframe into button object
    savebutton.df = df
    # when the user types a new filename, it will be passed to the button tooltip property straight away
    fileNameLink = widgets.jslink((fileNameField, 'value'), (savebutton, 'tooltip'))
    # click event with callback
    savebutton.on_click( _save_dataframe )    
    saveResultsBox = widgets.HBox([saveResultsCaption, fileNameField, savebutton])
    display(saveResultsBox)    
    
def _save_dataframe(button):
    fileName = button.tooltip
    # The result files can be saved locally or on the server:
    # If result files are to be offered as downloads, set to True; otherwise set to False    
    fileDownloadable = False
    # specify paths here, if needed:
    filePath_onServer = ''  # could be /path/to
    filePath_default = ''
    # compute full path given chosen mode
    fullFileName = (filePath_onServer if fileDownloadable else filePath_default ) + fileName
        
    try:
        button.df.to_csv( fullFileName, index=False)
        # confirm it all went well
        print(fileName + " saved")    
        button.button_style = 'success'
        button.icon = 'check'
        # trick: https://stackoverflow.com/questions/31893930/download-csv-from-an-ipython-notebook
        if (fileDownloadable):
            downloadableFiles = FileLinks(filePath_onServer)
            display(downloadableFiles)
    except Exception as e:
        button.button_style = 'danger'
        raise ValueError("An error occured when saving " + fileName + ": "+ str(e))    

    
    
def load_dataframe(filepath):
    try:
        df = pd.read_csv(filepath)
        print(filepath + " loaded successfully")            
    except Exception as e:
        raise ValueError("An error occured when loading " + filepath + ": "+ str(e))
    finally:
        return df

## Library functions: Queries

In [3]:
import re

def containsRegex(word):
    return ( word.find('^')>-1 or
            word.find('$')>-1 or 
            re.match("\(.+?\)", word) or
            re.match("\[.+?\]", word) or
            re.match("[\+*]", word) )
                     
def lexicon_query(word, pos, lexicon):
    if (lexicon=="anw"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?definition, \""""+word+"""\") ) . """
        if (exactsearch == True):
              subpart =  """
                { { ?lemId rdfs:label ?lemma .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                UNION
                { ?definitionId lemon:value ?definition .
                values ?definition { \""""+word+"""\"@nl \""""+word+"""\" } } } .
                """               
        query = """PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                  PREFIX anw: <http://rdf.ivdnt.org/lexica/anw>
                  PREFIX anwsch: <http://rdf.ivdnt.org/schema/anw/>
                  PREFIX lemon: <http://lemon-model.net/lemon#>
                  
                  SELECT ?lemId ?lemma ?writtenForm ?definition concat('', ?definitionComplement) as ?definitionComplement
                  FROM <http://rdf.ivdnt.org/lexica/anw>
                  WHERE {
                      ?lemId rdfs:label ?lemma .
                      ?lemId ontolex:sense ?senseId .
                      ?senseId lemon:definition ?definitionId .
                      ?definitionId lemon:value ?definition .
                      OPTIONAL { ?definitionId anwsch:definitionComplement ?definitionComplement .}
                      OPTIONAL { ?lemId ontolex:canonicalForm ?lemCFId . 
                          ?lemCFId ontolex:writtenRepresentation ?writtenForm . }
                      """+subpart+"""
                      }"""
    elif (lexicon=="diamant"):
        exactsearch = (not containsRegex(word))
        subpart1 = """?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
            FILTER regex(?n_ontolex_writtenRep, \""""+word+"""\") . """
        subpart2 = """?n_syndef diamant:definitionText ?n_syndef_definitionText .  
            FILTER regex(?n_ontolex_writtenRep, \""""+word+"""\") . """
        if (exactsearch == True):
            subpart1 =  """
                { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
                values ?n_ontolex_writtenRep { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """                
            subpart2 = """
                { ?n_syndef diamant:definitionText ?n_syndef_definitionText . 
                values ?n_syndef_definitionText { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """
        query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select ?n_entry ?n_form ?n_ontolex_writtenRep ?n_syndef ?n_sensedef ?n_sensedef_definitionText ?n_syndef_definitionText ?n_sense ?inputMode ?wy_f_show ?wy_t_show
        where
        {
        graph ?g
        {
        {
            """ + subpart1 + """
            { ?n_entry a ontolex:LexicalEntry} .
            { ?n_form a ontolex:Form} .
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_syndef diamant:definitionText ?n_syndef_definitionText } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
              ?n_sense diamant:attestation ?n_attest_show .
              ?n_sense diamant:attestation ?n_attest_filter .
              ?n_attest_show diamant:text ?n_q_show .
              ?n_attest_filter diamant:text ?n_q_filter .
              ?n_attest_show a diamant:Attestation .
              ?n_attest_filter a diamant:Attestation .
              ?n_q_filter a diamant:Quotation .
              ?n_q_show a diamant:Quotation .
              ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
              ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
              ?n_q_show diamant:witnessYearFrom ?wy_f_show .
              ?n_q_show diamant:witnessYearTo ?wy_t_show .
              FILTER (xsd:integer(?wy_f_show) >= 1200)
              FILTER (xsd:integer(?wy_t_show) >= 1200)
              FILTER (xsd:integer(?wy_f_show) <= 2018)
              FILTER (xsd:integer(?wy_t_show) <= 2018)
            { bind("lemma" as ?inputMode) } .
            } UNION
          {
            """ + subpart2 + """
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_form a ontolex:Form} .
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep } .  { ?n_entry a ontolex:LexicalEntry} .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            ?n_sense diamant:attestation ?n_attest_show .
            ?n_sense diamant:attestation ?n_attest_filter .
            ?n_attest_filter diamant:text ?n_q_filter .
            ?n_attest_show diamant:text ?n_q_show .
            ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
            ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
            ?n_q_show diamant:witnessYearFrom ?wy_f_show .
            ?n_q_show diamant:witnessYearTo ?wy_t_show .
            ?n_attest_show a diamant:Attestation .
            ?n_attest_filter a diamant:Attestation .
            ?n_q_filter a diamant:Quotation .
            ?n_q_show a diamant:Quotation .
            FILTER (xsd:integer(?wy_f_show) >= 1200)
            FILTER (xsd:integer(?wy_t_show) >= 1200)
            FILTER (xsd:integer(?wy_f_show) <= 2018)
            FILTER (xsd:integer(?wy_t_show) <= 2018)
          { bind("defText" as ?inputMode) } .
            }
        }
        }"""
    elif (lexicon=="molex"):
        exactsearch = (not containsRegex(word))
        subpart1 = """"""
        subpart2 = """"""
        subpartPos = """"""
        if (word != ''):
            if (exactsearch == True):
                subpart1 =  """
                    { ?lemCFId ontolex:writtenRep ?lemma . 
                    values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } } 
                    UNION
                    { ?wordformId ontolex:writtenRep ?wordform . 
                    values ?wordform { \""""+word+"""\"@nl \""""+word+"""\" } } .
                    """        
            else:
                subpart2 = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) . """
        if (pos is not None and pos != ''):
            subpartPos = """FILTER ( regex(?lemPos, \""""+pos+"""$\") ) ."""
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://universaldependencies.org/u/>
            PREFIX diamant: <http://rdf.ivdnt.org/schema/diamant#>
            
            SELECT ?lemEntryId ?lemma ?lemPos ?wordformId ?wordform ?hyphenation ?wordformPos ?Gender ?Number
            FROM <http://rdf.ivdnt.org/lexica/molex>
            WHERE
            {
            ?lemEntryId ontolex:canonicalForm ?lemCFId .
            ?lemCFId ontolex:writtenRep ?lemma .
            """+subpart1+"""
            OPTIONAL {?lemEntryId UD:Gender ?Gender .}
            OPTIONAL {?lemEntryId UD:VerbForm ?verbform .}
            ?lemEntryId UD:pos ?lemPos .
            """+subpartPos+"""
            ?lemEntryId ontolex:lexicalForm ?wordformId .
            ?wordformId UD:pos ?wordformPos .
            OPTIONAL {?wordformId UD:Number ?Number .}
            OPTIONAL {?wordformId ontolex:writtenRep ?wordform .}
            OPTIONAL {?wordformId diamant:hyphenation ?hyphenation .}
            """+subpart2+"""
            }
        """
    elif (lexicon=="duelme"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) ."""
        if (exactsearch == True):
            subpart =  """
                { ?y lmf:hasLemma ?dl .  
                values ?dl { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX duelme: <http://rdf.ivdnt.org/lexica/duelme>
            PREFIX intskos: <http://ivdnt.org/schema/lexica#>
            PREFIX lmf: <http://www.lexinfo.net/lmf>
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            
            SELECT ?exampleSentence ?lemma ?gender ?number
            WHERE  {
                  ?d intskos:ExampleSentence ?exampleSentence .
                  ?d lmf:ListOfComponents [lmf:Component ?y] .
                  ?y lmf:hasLemma ?lemma . 
                  OPTIONAL {?y UD:Gender ?gender}
                  OPTIONAL {?y UD:Number ?number}
            """+subpart+"""
            }
        """
    elif (lexicon=="celex"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") ) . """
        if (exactsearch == True):
            subpart =  """
                { ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX celex: <http://rdf.ivdnt.org/lexica/celex>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            PREFIX decomp: <http://www.w3.org/ns/lemon/decomp#>
            PREFIX gold: <http://purl.org/linguistics/gold#>
            
            SELECT DISTINCT ?lemmaId ?lemma ?wordformId ?wordform ?number ?gender concat('',?subLemmata) AS ?subLemmata
            WHERE  {
                ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .
                """+subpart+"""
                BIND( ?lemmaId AS ?lemmaIdIRI ).
                ?lemmaId ontolex:lexicalForm ?wordformId .
                ?wordformId ontolex:writtenRep ?wordform .
                OPTIONAL {?wordformId UD:Number ?number} .
                OPTIONAL {
                    ?lemmaId UD:Gender ?g . 
                        bind( 
                            if(?g = UD:Fem_Gender, 
                            UD:Com_Gender, 
                                if(?g = UD:Masc_Gender,
                                    UD:Com_Gender,
                                    UD:Neut_Gender
                                )
                            )
                            AS ?gender
                        )
                }
                OPTIONAL {
                    SELECT ?lemmaIdIRI (group_concat(DISTINCT concat(?partNr,":",?subLemma);separator=" + ") as ?subLemmata)
                    WHERE {
                        SELECT ?lemmaIdIRI ?celexComp ?aWordformId ?subLemma ?partNr
                        WHERE {
                                {
                                ?lemmaIdIRI ontolex:lexicalForm ?aWordformId . 
                                ?lemmaIdIRI decomp:constituent ?celexComp .
                                OPTIONAL { ?celexComp gold:stem [ontolex:writtenRep ?subLemma] . }
                                OPTIONAL { ?celexComp decomp:correspondsTo [ ontolex:canonicalForm [ontolex:writtenRep ?subLemma]] . }
                                }
                                {
                                    {
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_1> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_2> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_3> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_4> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_5> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_6> ?celexComp .}                                        
                                    }
                                ?lemmaIdIRI ?rdfsynt ?celexComp .
                                BIND(IF(STRSTARTS(str(?rdfsynt), "http://www.w3.org/1999/02/22-rdf-syntax-ns#"), replace(STRAFTER(str(?rdfsynt), "#"), "_", ""), "999") AS ?partNr) .
                                MINUS {
                                    ?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#0> ?celexComp .
                                    }
                                }
                            FILTER (?partNr != "999") .
                            }
                            ORDER BY ?partNr
                            }
                        GROUP BY ?aWordformId ?lemmaIdIRI
                    }
            }
        """
        
    return query

def corpus_query_lemma(word):
    return r'[lemma="'+ word + r'"]'

def lexicon_query_alllemmata(lexicon, pos):
    if (lexicon=="anw"):
        query = """PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                  PREFIX anw: <http://rdf.ivdnt.org/lexica/anw>                  
                  SELECT DISTINCT ?writtenForm
                  FROM <http://rdf.ivdnt.org/lexica/anw>
                  WHERE {
                      ?lemId rdfs:label ?lemma .
                      ?lemId ontolex:canonicalForm ?lemCFId . 
                      ?lemCFId ontolex:writtenRepresentation ?writtenForm .
                      }
                      ORDER BY ?writtenForm"""
    elif (lexicon=="celex"):
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            
            SELECT DISTINCT ?lemma AS ?writtenForm
            WHERE  {
                ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .                
                }
            ORDER BY ?lemma"""
    elif (lexicon=="diamant"):
        query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select DISTINCT ?n_ontolex_writtenRep AS ?writtenForm
        where
        {
        graph ?g
        {
        {
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep} .
            { ?n_form a ontolex:Form} .
        }
        }
        }
        ORDER BY ?n_ontolex_writtenRep
        LIMIT 10000
        """
    elif (lexicon=="duelme"):
        query = """
            PREFIX lmf: <http://www.lexinfo.net/lmf>            
            SELECT DISTINCT ?lemma AS ?writtenForm
            WHERE  {
                  ?y lmf:hasLemma ?lemma . 
            }
            ORDER BY ?lemma"""
    elif (lexicon=="molex"):
        pos_condition = """"""
        if pos is not None:
            pos_condition = """
            {?lemEntryId UD:pos ?lemPos .
            FILTER regex(?lemPos, '"""+pos+"""') } .
            """
        query = """
                PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                PREFIX UD: <http://universaldependencies.org/u/>
                SELECT DISTINCT ?lemma AS ?writtenForm
                FROM <http://rdf.ivdnt.org/lexica/molex>
                WHERE
                {
                ?lemEntryId ontolex:canonicalForm ?lemCFId .
                ?lemCFId ontolex:writtenRep ?lemma .  
                """+pos_condition+"""
                }
                 ORDER BY ?lemma"""
    else:
        raise ValueError("Lexicon " + lexicon + " not supported for querying all words.")
        
    #print(query)
    return query

## Corpus search

* Run the cell below to show the UI, and fill in your search query

In [4]:
#from chaininglib import ui

# Create corpus UI, creates references to field contents
corpusQueryField, corpusField = create_corpus_ui()

VBox(children=(Text(value='[lemma="boek" & pos="verb"]', description='<b>CQL query:</b>'), Dropdown(descriptio…

 * Click the cell below and press Run to perform the given query

In [5]:
#from chaininglib import search
query= corpusQueryField.value
corpus = corpusField.value
df_corpus = search_corpus(query, corpus)
#df_corpus = load_dataframe('mijn_resultaten.csv')
display(df_corpus)
create_save_dataframe_ui(df_corpus)



Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,of Imam Zij staat te,boek,VERB,boek,als het eerste zwarte topmodel
1,Rihanna 4 Ze staat te,boek,VERB,boek,als het eerste zwarte model
2,te pakken Gilaard staat te,boek,VERB,boek,als een goed militair maar
3,verklaard Scotto d'Abusco staat te,boek,VERB,boek,als een beloftevolle Italiaan van
4,als La Gioconda staat te,boek,VERB,boek,als het portret van de
5,dat niet Beatrix staat te,boek,VERB,boek,als een eigengereide koppige koningin
6,realpolitische belegging staat vandaag te,boek,VERB,boek,als een van hun grootste
7,en biograaf Bastet staat te,boek,VERB,boek,als een groot kenner van
8,Il Magnifico 1449-1492 staat te,boek,VERB,boek,als een van de grootste
9,zou zijn Danneels staat te,boek,VERB,boek,als een van de intelligentere


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

## Lexicon search

* Run the cell below to show the UI, and fill in your search query in the UI

In [6]:
#from chaininglib import ui
searchWordField, lexiconField = create_lexicon_ui()

VBox(children=(Text(value='boek', description='<b>Word:</b>'), Dropdown(description='<b>Lexicon:</b>', index=2…

 * Click the cell below and press Run to perform the given query

In [7]:
#from chaininglib import queries, search

search_word = searchWordField.value
lexicon = lexiconField.value
# USER: can replace this by own custom query
query = lexicon_query(word=search_word, pos= '', lexicon=lexicon)

df_lexicon = search_lexicon(query, lexicon)
display(df_lexicon)
#df_columns_list = list(df_lexicon.columns.values)
#df_lexicon_in_columns = df_lexicon[df_columns_list]
#display(df_lexicon_in_columns)

Unnamed: 0,inputMode,n_entry,n_form,n_ontolex_writtenRep,n_sense,n_sensedef,n_sensedef_definitionText,n_syndef,n_syndef_definitionText,wy_f_show,wy_t_show
0,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12745,acte,1228,1349
1,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12745,acte,1228,1349
2,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12745,acte,1456,1456
3,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12745,acte,1456,1456
4,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12747,oorkonde,1228,1349
5,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12747,oorkonde,1228,1349
6,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12747,oorkonde,1456,1456
7,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.2,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.2,"Ook in den zin van officieel stuk, acte, oorkonde; vgl. got. mv. bôkôs.",http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12747,oorkonde,1456,1456
8,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.1,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.1,Boek.,http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12744,Boek,1460,1480
9,lemma,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828,http://rdf.ivdnt.org/lexica/diamant/lemma/mnw/04828f,boek,http://rdf.ivdnt.org/lexica/diamant/sense/04828.sense.1,http://rdf.ivdnt.org/lexica/diamant/definition/04828.sense.1,Boek.,http://rdf.ivdnt.org/lexica/diamant/synonymdefinition//12744,Boek,1460,1480


## Case study 1 (parallel): Frequency of *puur*+verb and *zuiver*+verb compared
* Below cell searches for *puur*+verb and for *zuiver*+verb in the CHN corpus
* Compare frequencies

In [11]:
#from chaininglib import search
from IPython.core.display import display, HTML

# Word 1: puur
word1= "puur"
df_corpus1 = search_corpus('[word="' + word1 + r'"][pos="verb"]',corpus="chn")
display(HTML('<b>' + word1 + '</b>'))
display(df_corpus1)

# Word 2: zuiver
word2 = "zuiver"
df_corpus2 = search_corpus(r'[word="' + word2 + r'"][pos="verb"]',"chn")
display(HTML('<b>' + word2 + '</b>'))
display(df_corpus2)

# Compute difference
diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
# Elements of 1 that are not in 2
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_left)))
# Elements of 2 that are not in 1
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_right)))
# Elements both in 1 and 2
display(HTML('Werkwoorden zowel voor <b>' + word1 + '</b> als voor <b>' + word2 + '</b>: ' + ", ".join(intersec)))

Unnamed: 0,left context,lemma 0,lemma 1,pos 0,pos 1,word 0,word 1,right context
0,stad omdat de cultuur daar,puur,zijn,ADJ,VERB,puur,is,
1,succes dan wanneer de coalitie,puur,stoelen,ADJ,VERB,puur,gestoeld,is op een parlementaire meerderheid
2,de andere jongeren die gewoon,puur,willen,ADJ,VERB,puur,willen,werken Maar financieel zijn zij
3,gratis zijn De deelname is,puur,baseren,ADJ,VERB,puur,gebaseerd,op interesse In de middaguren
4,de natuur De natuur is,puur,vertellen,ADJ,VERB,puur,vertelt,de kunstenaar Een tijger jaagt
5,van een licentieovereenkomst maar het,puur,gaan,NOUN,VERB,puur,gaat,om de oorspronkelijke eis van
6,gevecht voor ons voorouderlijk land,puur,bedoelen,ADJ,VERB,puur,bedoeld,is om het menselijk leven
7,het SZF maar waar het,puur,gaan,PRON,VERB,puur,gaat,om winsten te maken zegt
8,redactrice Inge SchelstraeteHET zou poëzie,puur,worden,ADJ,VERB,puur,worden,waarschuwde Piet Piryns ons Maar
9,bepaalde zender te adverteren is,puur,baseren,ADJ,VERB,puur,gebaseerd,op marketing én op het


Unnamed: 0,left context,lemma 0,lemma 1,pos 0,pos 1,word 0,word 1,right context
0,baby Ik wil het contact,zuiver,beperken,ADJ,VERB,zuiver,beperken,tot de baby en het
1,zo zul je als mens,zuiver,moeten,ADJ,VERB,zuiver,moeten,zijn om in aanmerking te
2,adviezen van mensen Ik heb,zuiver,vastleggen,ADJ,VERB,zuiver,vastgelegd,wat ik zelf spraakmakend vond
3,karaoke je ding Daarvoor is,zuiver,zingen,ADJ,VERB,zuiver,zingen,niet nodig Dat is juist
4,aan als het gevoel erachter,zuiver,zijn,ADJ,VERB,zuiver,is,Met deze woorden besloot Loes
5,Maar als je je gevoelens,zuiver,houden,ADJ,VERB,zuiver,houdt,en goede wensen creëert dan
6,Cronie maar die was te,zuiver,nemen,ADJ,VERB,zuiver,genomen,Bij The Scorpions benutten Misiedjan
7,Woudman op Cairo Deze werd,zuiver,nemen,ADJ,VERB,zuiver,genomen,door Dwight Tempico 2-1 Een
8,de leerlingen herkenbaar en is,zuiver,beweren,ADJ,VERB,zuiver,beweert,de directeur Ook enkele leerkrachten
9,bakzeil De strafschop was te,zuiver,nemen,ADJ,VERB,zuiver,genomen,


## Case study 2 (sequential): Retrieve synonyms from DiaMaNT, look up in Gysseling
* Below cell searches for term "boek" in DiaMaNT, and looks up all variants in Gysseling

In [9]:
search_word = "boek"
lexicon = "diamant"
corpus= "gysseling"

# First, lookup synonyms in DiaMaNT
query = lexicon_query(word=search_word, pos= '', lexicon=lexicon)
df_lexicon = search_lexicon(query, lexicon)
syns = diamant_get_synonyms(df_lexicon) 
syns.add(search_word) # Also add search word itself
display(HTML('Synoniemen voor <b>' + search_word + '</b>: ' + ", ".join(syns)))

# Search for all synonyms in corpus
## Create queries: search by lemma
syns_queries = [corpus_query_lemma(syn) for syn in syns]
## Search for all synonyms in corpus
result_dict = search_corpus_multiple(syns_queries, corpus)
view_multiple_results(result_dict, labels=list(syns))



Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,ende met .iiij. draden roder,LIJST,NOUN,lijsten,in .ij. euelten die hier ieghen
1,lib. het ne ware die,LIJST,NOUN,lijste,vanden lakene. Vort eist ghecuert
2,ᨮᨬ. het ne ware die,LIJST,NOUN,lijste,vanden lakene [3] vort es
3,langher ende dat met ere,LIJST,NOUN,lijste,Ende in so wat cammen dat
4,van wits te moreideine die,LIJST,NOUN,lijste,ne laghe binnen diere ieghen dade
5,.xxx. pond ende met .ij.,LIJST,NOUN,lijsten,.ix. drade vp elken egh
6,drade vp elken egh blawer,LIJST,NOUN,lijsten,ende der in ghesceert diere ieghen dade
7,.xxviij. pond met .i. roder,LIJST,NOUN,lijste,Ende dese vorseide saye salmen
8,elc warpin say ene blaeuwe,LIJST,NOUN,"lijste,",ende tweueline ne ghene. Ende so wie
9,ne ghene. Ende so wie die blaeuwe,LIJST,NOUN,lijste,scerde an weuelin say ouer


Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,Ende roofde den tempel ende,TOESLUITEN,VERB,sloten toe.,Dar na saen so starf hi
1,want haer mont was sekerleke,TOESLUITEN,VERB,toe ghesloten,so starkeleke dat menne met


Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,uan haueleker scult ende negene,OORKONDE,NOUN,orconden,ne heuet; die beclaghet es
1,tue eruahtege man heuet in,OORKONDE,NOUN,orconden,hi sal winnen sinen houestoel.
2,yemene dinghet ende hem uermet,OORKONDE,NOUN,orconden,die hi iegenwordech heuet ende
3,cateil. Ende dinghet hi sonder,OORKONDE,NOUN,orconden;,deghene dar hi up dinghet
4,hebben tue eruahteghe man in,OORKONDE,NOUN,orconden.,ende si sullen hem helpen
5,hebbe tue eruahteghe man te,OORKONDE,NOUN,orconden,dat hem uergolden si. Neware
6,uan doder hant ende negene,OORKONDE,NOUN,orconden,ne heuet hi biede sinen
7,hebben tue eruahteghe man te,OORKONDE,NOUN,orconden.,Doed en uremde man enen
8,Tiemen hem meer dar neghene,OORKONDE,NOUN,orconden,ne sien; met enen eruahtegen
9,dan enen eruahteghen man in,OORKONDE,NOUN,orconden;,die orconde moet sueren binnen


Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,didscher talen ende ic vten,TEKST,NOUN,texte,van den vire ewangelisten makde
1,in vele staden es de,TEKST,NOUN,text,van der ewangelien also donker
2,vele meerre sijn dan de,TEKST,NOUN,text,van der ewangelien alte male. Ende
3,didsche alse si in den,TEKST,NOUN,texte,"gescreuen sijn, so bleue dit"


Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,.iiii. ghecorne gulde broeders die de,BOEK,NOUN,boeke,oudenden sin. si moghen elc
1,viere ghecorne guldebroeders die de,BOEK,NOUN,boeke,houden si moghen elc haren
2,.iiii. ghecorne guldebroeders die de,BOEK,NOUN,boke,ouden si moghen elc haren
3,secundi willelmus de lapide willelmus,BOEK,NOUN,boec,Jn elst. arnulphus de keelne
4,"Heren M CC LXXX, due wart det",BOEK,NOUN,buec,begonnen. Desen csens es mer
5,"van poschen, due wart det",BOEK,NOUN,buec,begonnen. Desen pagt es mer
6,"van poschen, due wart det",BOEK,NOUN,buec,begonnen. Desen pagt es mer
7,"van poschen, due wart dit",BOEK,NOUN,buc,begonnen. Dese pegte es mer
8,"van poschen, due wart dit",BOEK,NOUN,buc,begonnen. Dit blift den bruderen
9,"van poschen, due wart dit",BOEK,NOUN,buc,begonnen. Dit sin degene die


Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,.iiii. ghecorne gulde broeders die de,BOEK,NOUN,boeke,oudenden sin. si moghen elc
1,viere ghecorne guldebroeders die de,BOEK,NOUN,boeke,houden si moghen elc haren
2,.iiii. ghecorne guldebroeders die de,BOEK,NOUN,boke,ouden si moghen elc haren
3,secundi willelmus de lapide willelmus,BOEK,NOUN,boec,Jn elst. arnulphus de keelne
4,"Heren M CC LXXX, due wart det",BOEK,NOUN,buec,begonnen. Desen csens es mer
5,"van poschen, due wart det",BOEK,NOUN,buec,begonnen. Desen pagt es mer
6,"van poschen, due wart det",BOEK,NOUN,buec,begonnen. Desen pagt es mer
7,"van poschen, due wart dit",BOEK,NOUN,buc,begonnen. Dese pegte es mer
8,"van poschen, due wart dit",BOEK,NOUN,buc,begonnen. Dit blift den bruderen
9,"van poschen, due wart dit",BOEK,NOUN,buc,begonnen. Dit sin degene die


Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,Voert; dat een weuera die,WERK,NOUN,werc,"ghenoch heft ter volre weken,"
1,met den weueren die en ghen,WERK,NOUN,werc,en hebben te weuene; hine
2,platse hout inde weke ende,WERK,NOUN,werc,heft; hi es sculdech en
3,van vresen van vinders. Dat,WERK,NOUN,werc,es verbord; Ende hi moet
4,dade weuen; hi verborde dat,WERK,NOUN,werc,ende .iij. lb. So wat ambochts man;
5,dade weuen hie verbord dat,WERK,NOUN,werc,ende .iij. lb. Dit mach
6,es .v. ᨣ. [2] Dat,WERK,NOUN,werc,es sculdich te stane an
7,es .v. ᨣ. [2] Dat,WERK,NOUN,werc,es sculdich te stane an
8,te enighes drapeniers huus; omme,WERK,NOUN,weerc,te beiaghene. jof omme enighen
9,staet die die mester gheen,WERK,NOUN,weerc,gheuen ne wille; hie moet


Unnamed: 0,left context,lemma 0,pos 0,word 0,right context
0,van der vorseider stede. ten,GEWERK,NOUN,ghewarke,boef. van der fermerien van
1,graf. Ende gheft aldaer ten,GEWERK,NOUN,ghewerke.,vander kerken vijf pont vlaemsche.
2,deelne. vord gaf soe den,GEWERK,NOUN,ghewerke,van onser vrouwen kerke. twintich
3,kerke. twintich sceleghe vlaemsche. den,GEWERK,NOUN,ghewerke,van sinte saluators kerke. twintich
4,sceleghe vlaemsche. vord te elken,GEWERK,NOUN,ghewerke,van allen kerken die binne
5,ende xv. ᨮᨬ te haren,GEWERK,NOUN,"ghewerke,",Vort gheuic broder pauwels vanine
6,Tonser vrouwen in brugghe ten,GEWERK,NOUN,ghewerke,xl. ᨣ. Aldar tsinte saluatoers
7,saluatoers xl. ᨣ also. Ten,GEWERK,NOUN,ghewerke,te sinte baues bute brugghe
8,engeen man en ware met,GEWERK,NOUN,gewerke.,hi en waert selue dan
9,wille te wederstane Ende gode,GEWERK,NOUN,ghewerke,in elke stede Te begharne


## Case study (parallel) 3: Find corpus words not in lexicon; list most frequent ones.
* Only parallel if you can ask the lexicon a list of all words.
* Currently only working: ask DiaMaNT list of words (limited at 10000)

In [12]:
# Query lexicon to give list of all words
lexicon="anw"
df_lexicon = search_lexicon_alllemmata(lexicon)
## TODO: Why do double words appear?
lexicon_set = sorted( set([w.lower() for w in df_lexicon["writtenForm"]]) )
display(lexicon_set)

df_corpus = search_corpus_allwords("gysseling", None)
display(df_corpus)
len(df_corpus)



TypeError: search_lexicon_alllemmata() missing 1 required positional argument: 'pos'

## Case study (sequential) 4: Find occurences of attributive adjectives not ending with -e, even though they are preceeded by a definite article

In [None]:
corpus_to_search="opensonar"
lexicon_to_search="molex"

# CORPUS: get [article + attributive adjective + nouns] combinations in which the adjective does not end with -e
print('Stap 1:')
df_corpus = search_corpus(r'[lemma="de|het"][word="^g(.+)[^e]$" & pos="ADJ"][pos="NOUN"]', corpus=corpus_to_search)
display(df_corpus)

# LEXICON: get adjectives the lemma of which does not end with -e
query=lexicon_query('^g(.+)[^e]$', 'ADJ', lexicon_to_search)
df_lexicon = search_lexicon(query, lexicon_to_search)
display(df_lexicon)

# LEXICON: get adjectives having a final -e in definite attributive use
print('Filtering lexicon results')
final_e_condition=df_lexicon.wordform.str.contains('e$')
df = df_lexicon[final_e_condition]
display(df)

# RESULT: get the records out of our first list in which the -e-less-adjectives match the lemma form of our last list
print('Wanted list:')
eless_forms = list(df.lemma)
no_final_e_condition = df_corpus['word 1'].isin(eless_forms)
display( df_corpus[no_final_e_condition] )

## Case study (sequential) 5: (morphosyntactic lexicon and possibly unannotated corpus) Look up inflected forms and spelling variants for a given lemma in a corpus

In [None]:
lexicon_to_search="molex"
corpus_to_search="chn"

##############################################
# TODO  zelfde met meerdere lemmata en gegroepeerd 
##############################################

lemma_to_look_for="denken"

# LEXICON: Search for the inflected forms of a lemma in a morphosyntactic lexicon
query=lexicon_query(lemma_to_look_for, None, lexicon_to_search)
df_lexicon = search_lexicon(query, lexicon_to_search)
display(df_lexicon)

# Put all inflected forms into a list
inflected_wordforms = list(df_lexicon.wordform)

# CORPUS: Look up the inflected forms in a (possibly unannotated) corpus
# beware: If the corpus is not annotated, all we can do is searching for the inflected words
#         But if the corpus is lemmatized, we have to make sure we're retrieving correct data by specifying the lemma as well
annotated_corpus = True
query = r'[lemma="'+lemma_to_look_for+r'" & word="'+r"|".join(inflected_wordforms)+r'"]' if annotated_corpus else r'[word="'+r"|".join(inflected_wordforms)+r'"]'
df_corpus = search_corpus(query, corpus=corpus_to_search)
display(df_corpus)

## Case study : Build frequency table of some corpus, based on lemma list of a given lexicon

In [None]:
base_lexicon="anw"
corpus_to_search1="opensonar"
corpus_to_search2="chn"

# build frequency tables of two corpora

df_frequency_list1 = get_frequency_list(base_lexicon, "NOUN", corpus_to_search1)

display(df_frequency_list1.sort_values(ascending=False,by=['raw_freq']))

df_frequency_list2 = get_frequency_list(base_lexicon, "NOUN", corpus_to_search2)

display(df_frequency_list2.sort_values(ascending=False,by=['raw_freq']))

# compute the rank diff of lemmata in frequency tables

df_rankdiffs = get_rank_diff(df_frequency_list1, df_frequency_list2)

display(df_rankdiffs.sort_values(by=['rank_diff']))