# Chaining search


## Library functions: Search
 

In [11]:
import wx
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import json
import urllib
from IPython.display import FileLink, FileLinks
# Corpus identifiers accepted by the corpus search functions and offered in the corpus dropdown
AVAILABLE_CORPORA = ['chn', 'opensonar', 'zeebrieven', 'gysseling', 'nederlab']
# Number of records per result page
RECORDS_PER_PAGE = 1000

# Get rid of ellipsis in display (otherwise relevant data might not be shown)
pd.set_option('display.max_colwidth',1000)

# Search methods

def search_corpus_allwords(corpus):
    """Search `corpus` with a CQL pattern that matches every single token.

    Returns whatever `search_corpus` returns for that pattern.
    """
    return search_corpus(r'[word=".*"]', corpus)

def search_corpus(query, corpus, start_position=1):
    """Run a CQL query against a corpus and return all hits as one DataFrame.

    The request goes to the federated content search (FCS) endpoint, so every
    corpus yields the same output format. Result pages are fetched one after
    another until the endpoint no longer reports a next record position.

    Args:
        query: CQL query string.
        corpus: one of AVAILABLE_CORPORA.
        start_position: 1-based position of the first record to retrieve.

    Returns:
        DataFrame of keyword-in-context rows as produced by `parse_xml`.

    Raises:
        ValueError: if the corpus is unknown, or if anything goes wrong while
            requesting or parsing the results.
    """
    # Local import: the file only imports the top-level `urllib` package, which
    # does not guarantee that the `parse` submodule is loaded.
    from urllib.parse import quote

    # Validate first, so no wait indicator is created (and leaked) for bad input
    if corpus not in AVAILABLE_CORPORA:
        raise ValueError("Unknown corpus: " + corpus)

    # show wait indicator
    app = wx.App()
    msg_to_user = wx.BusyInfo('Searching '+corpus+' corpus')
    try:
        pages = []
        position = start_position
        while True:
            # startRecord makes the endpoint return the requested page; without
            # it every request would return the first page again.
            url = ("http://portal.clarin.inl.nl/fcscorpora/clariah-fcs-endpoints/sru"
                   "?operation=searchRetrieve&queryType=fcs"
                   "&maximumRecords=" + str(RECORDS_PER_PAGE) +
                   "&startRecord=" + str(position) +
                   "&x-fcs-context=" + corpus +
                   "&query=" + quote(query))
            print(url)
            response_text = requests.get(url).text
            df_page, next_page = parse_xml(response_text)
            # show message out of xml, if some error has occurred (prevents empty output)
            show_error_if_any(response_text)
            pages.append(df_page)
            # Stop when there is no next page, or when the endpoint does not
            # advance (guards against an endless loop).
            if next_page <= position:
                break
            position = next_page
        if len(pages) == 1:
            return pages[0]
        return pd.concat(pages, ignore_index=True)
    except Exception as e:
        raise ValueError("An error occured when searching corpus " + corpus + ": "+ str(e)) from e
    finally:
        # remove wait indicator
        del msg_to_user

def search_corpus_multiple(queries, corpus):
    """Run every query in `queries` against `corpus`.

    Returns a dict mapping each query string to the result of `search_corpus`
    for that query.
    """
    return {cql: search_corpus(cql, corpus) for cql in queries}
   

def search_lexicon_allwords(lexicon):
    """Fetch all words of `lexicon` by running its "all words" query."""
    return search_lexicon(lexicon_query_allwords(lexicon), lexicon)

def search_lexicon(query, lexicon):
    """POST a SPARQL query to the lexicon endpoint and return the bindings as a DataFrame.

    Each cell of the returned frame holds the "value" of the corresponding
    binding, or an empty string when the binding is absent.

    Raises:
        ValueError: if the request or the conversion of the response fails.
    """
    # show wait indicator, so the user knows what's happening
    app = wx.App()
    msg_to_user = wx.BusyInfo('Searching '+lexicon+' lexicon')

    # diamant has its own endpoint; every other lexicon uses the default one
    if lexicon == "diamant":
        endpoint = "http://svprre02:8080/fuseki/tdb/sparql"
    else:
        endpoint = "http://172.16.4.56:8890/sparql"

    try:
        # Accept header is needed for virtuoso, it isn't otherwise!
        response = requests.post(
            endpoint,
            data={"query": query},
            headers={"Accept": "application/sparql-results+json"},
        )
        bindings = json.loads(response.text)["results"]["bindings"]
        frame = pd.read_json(json.dumps(bindings), orient="records")

        # make sure cells containing NULL are added too, otherwise we'll end up with ill-formed data
        # TODO: maybe this can be replaced by:
        # df = df.fillna('')
        return frame.applymap(lambda cell: '' if pd.isnull(cell) else cell["value"])
    except Exception as e:
        raise ValueError("An error occured when searching lexicon " + lexicon + ": "+ str(e))
    finally:
        # remove wait indicator
        del msg_to_user
        

# Processing methods

def column_difference(df_column1, df_column2):
    """Compare the distinct values of two columns.

    Returns a tuple of three sets: values only in the first column, values
    only in the second column, and values present in both.
    """
    left, right = set(df_column1), set(df_column2)
    return left - right, right - left, left & right

def diamant_get_synonyms(df):
    """Collect synonyms from a diamant result frame.

    Depending on the result type, the lemma or the definition text is taken:
    rows with inputMode "defText" contribute their written representation,
    rows with inputMode "lemma" contribute their synonym definition text.
    Returns the union of both as a set.
    """
    mode = df["inputMode"]
    written_reps = df.loc[mode == "defText", "n_ontolex_writtenRep"]
    definition_texts = df.loc[mode == "lemma", "n_syndef_definitionText"]
    return set(written_reps).union(definition_texts)

def parse_xml(text):
    """Parse an FCS/SRU search response into keyword-in-context rows.

    Only DataViews of the "hits" type are used; metadata and segmenting
    DataViews are ignored. Each row consists of the text before the first hit,
    the text of every hit element, and the text after the last hit.

    Args:
        text: XML response as a string.

    Returns:
        A tuple (DataFrame, next_pos). The DataFrame has the columns
        "left context", "word 0" .. "word k-1", "right context", where k is
        the largest number of hit words found in any record; shorter records
        are padded with empty strings so the right context stays aligned.
        next_pos is the value of nextRecordPosition in the response, or 0
        when that element is absent or empty.
    """
    # TODO: should we secure against untrusted XML?
    root = ET.fromstring(text)
    parsed = []  # one (left_context, hit_words, right_context) tuple per record
    for entry in root.iter("{http://clarin.eu/fcs/resource}ResourceFragment"):
        for dataView in entry.findall("{http://clarin.eu/fcs/resource}DataView"):
            # We only take into account hits, ignore metadata and segmenting dataViews
            if dataView.get("type") != "application/x-clarin-fcs-hits+xml":
                continue
            result = dataView.find("{http://clarin.eu/fcs/dataview/hits}Result")
            if result is None:
                # A hits DataView without a Result element carries no data
                continue
            hits = list(result)
            if not hits:
                print([w for w in result.itertext()])
                print("no hit in kwic, skip")
                continue
            left_context = result.text if result.text is not None else ''
            # Text following the last hit element is the right context
            right_context = hits[-1].tail if hits[-1].tail is not None else ''
            parsed.append((left_context, [hit.text for hit in hits], right_context))

    # Size the word columns for the longest hit instead of assuming that every
    # hit has as many words as the first one.
    n_words_in_hit = max((len(words) for _, words, _ in parsed), default=0)
    records = [
        [left] + words + [''] * (n_words_in_hit - len(words)) + [right]
        for left, words, right in parsed
    ]
    columns = ["left context"] + ["word " + str(n) for n in range(n_words_in_hit)] + ["right context"]

    next_pos = 0
    next_record_position = root.find("{http://docs.oasis-open.org/ns/search-ws/sruResponse}nextRecordPosition")
    if next_record_position is not None and next_record_position.text:
        next_pos = int(next_record_position.text)

    return pd.DataFrame(records, columns = columns), next_pos

def show_error_if_any(text):
    """Print the diagnostic messages contained in an SRU response, if any.

    All message texts found inside diagnostic elements are printed on one
    line, separated by "; ". Nothing is printed when there are none.
    """
    ns = "{http://docs.oasis-open.org/ns/search-ws/diagnostic}"
    messages = [
        message.text or ''
        for diagnostic in ET.fromstring(text).iter(ns + "diagnostic")
        for message in diagnostic.findall(ns + "message")
    ]
    if messages:
        print("; ".join(messages))

# View methods

# results: dict of df's
# labels: list of label corresponding to the df's in results
def view_multiple_results(results, labels):
    """Display every non-empty result frame under its label.

    Args:
        results: dict of DataFrames, keyed by query.
        labels: list of labels, one per entry of `results`, in the same order.

    Raises:
        AssertionError: if `labels` and `results` differ in length.
    """
    assert len(labels)==len(results)
    to_show = [(label, df) for label, df in zip(labels, results.values()) if not df.empty]
    if not to_show:
        return
    # Imported here because the file never imports HTML at module level;
    # IPython is only needed once there is something to render.
    from IPython.display import HTML, display
    for label, df in to_show:
        display(HTML('Resultaten voor <b>' + label + "</b>:"))
        display(df)

## Library functions: UI

In [2]:

import ipywidgets as widgets
from IPython.display import display
import tkinter as tk
from tkinter import filedialog
from pathlib import Path
from IPython.display import Javascript
# Initial value of the CQL query field in the corpus UI
DEFAULT_QUERY = r'[lemma="boek" & pos="verb"]' #r'[lemma="boeken" pos="verb"]'
# Corpus pre-selected in the corpus dropdown of the corpus UI
DEFAULT_CORPUS = "chn"


# $$$$$$$$$$$ THIS DOES NOT WORK YET $$$$$$$$$$$
def create_run_cell_ui(cell_id):
    """Display a button that runs the notebook cell identified by `cell_id`.

    The cell id is stored on the button, so the click handler can read it.
    """
    executebutton = widgets.Button(
        description='Uitvoeren',
        disabled=False,
        button_style='info', 
        tooltip='Voer de volgende code uit',  
        icon=''
    )
    executebutton.cell_id = cell_id
    # The click handler defined in this file is `_run_cell`
    executebutton.on_click( _run_cell )
    executebuttonBox = widgets.HBox([executebutton])
    display(executebuttonBox)
    
def _run_cell(button):
    """Click handler: ask the notebook front end to run the cell stored on `button`.

    Reads `button.cell_id` and passes it to Jupyter.notebook.execute_cells.
    """
    cell_id = button.cell_id
    # https://stackoverflow.com/questions/47567834/execute-a-jupyter-notebook-cell-programmatically
    # The Javascript object only runs in the browser once it is displayed;
    # str() allows the cell id to be given as a number.
    display(Javascript("Jupyter.notebook.execute_cells(["+str(cell_id)+"])"))
# $$$$$$$$$$$ THIS DOES NOT WORK YET $$$$$$$$$$$



def create_corpus_ui():
    """Show the corpus search form and return its input widgets.

    The form consists of a text field for the CQL query and a dropdown for the
    corpus. Returns (query field, corpus dropdown), so their contents are
    accessible from the global namespace of the notebook.
    """
    query_field = widgets.Text(description="<b>CQL query:</b>", value=DEFAULT_QUERY)
    corpus_dropdown = widgets.Dropdown(
        options=AVAILABLE_CORPORA,
        value=DEFAULT_CORPUS,
        description='<b>Corpus:</b>',
    )
    # Disabled search button, kept for reference:
    # corpusSearchButton = widgets.Button(
    #     description='Search',
    #     button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    #     tooltip='Search',
    # )
    # corpusSearchButton.on_click(corpus_search)

    # Stack both inputs vertically and render them
    display(widgets.VBox([query_field, corpus_dropdown]))
    return query_field, corpus_dropdown

def create_lexicon_ui():
    """Show the lexicon search form and return its input widgets.

    The form consists of a text field for the search word and a dropdown for
    the lexicon. Returns (search word field, lexicon dropdown).
    """
    word_field = widgets.Text(description="<b>Word:</b>", value='boek')
    lexicon_dropdown = widgets.Dropdown(
        options=['anw', 'celex', 'diamant', 'duelme', 'molex'],
        value="diamant",
        description='<b>Lexicon:</b>',
    )
    # Disabled search button, kept for reference:
    # lexSearchButton = widgets.Button(
    #     description='Search',
    #     button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    #     tooltip='Search',
    # )
    # lexSearchButton.on_click(lexicon_search)

    # Stack both inputs vertically and render them
    display(widgets.VBox([word_field, lexicon_dropdown]))
    return word_field, lexicon_dropdown

def create_save_dataframe_ui(df):
    """Show a caption, a file name field and a save button for `df`.

    The DataFrame is attached to the button, and the button's tooltip mirrors
    the file name field, so the click handler can read both from the button.
    """
    initial_name = 'mijn_resultaten.csv'
    caption = widgets.Label(value='Sla uw resultaten op:')
    name_field = widgets.Text(value=initial_name)
    savebutton = widgets.Button(
        description='Bestand opslaan',
        disabled=False,
        button_style='warning',
        tooltip=initial_name,  # trick to pass filename to button widget
        icon=''
    )
    # inject dataframe into button object
    savebutton.df = df
    # keep the tooltip in sync with whatever the user types as file name
    widgets.jslink((name_field, 'value'), (savebutton, 'tooltip'))
    # click event with callback
    savebutton.on_click(_save_dataframe)
    display(widgets.HBox([caption, name_field, savebutton]))
    
def _save_dataframe(button):
    fileName = button.tooltip
    # The result files can be saved locally or on the server:
    # If result files are to be offered as downloads, set to True; otherwise set to False    
    fileDownloadable = False
    # specify paths here, if needed:
    filePath_onServer = ''  # could be /path/to
    filePath_default = ''
    # compute full path given chosen mode
    fullFileName = (filePath_onServer if fileDownloadable else filePath_default ) + fileName
        
    try:
        button.df.to_csv( fullFileName, index=False)
        # confirm it all went well
        print(fileName + " saved")    
        button.button_style = 'success'
        button.icon = 'check'
        # trick: https://stackoverflow.com/questions/31893930/download-csv-from-an-ipython-notebook
        if (fileDownloadable):
            downloadableFiles = FileLinks(filePath_onServer)
            display(downloadableFiles)
    except Exception as e:
        button.button_style = 'danger'
        raise ValueError("An error occured when saving " + fileName + ": "+ str(e))    

    
    
    
# $$$$$$$$$$$$$$$$$$ THIS DOES NOT WORK $$$$$$$$$$$$$$$$$$
def create_load_dataframe_ui():
    """Show a button that lets the user pick a saved CSV file and load it.

    A click handler cannot return a value to this function: the click happens
    long after this function has returned. The loaded DataFrame is therefore
    stored on the button as `df` (None until a file has been loaded).

    Returns:
        The load button; read its `df` attribute after the user has clicked.
    """
    # https://stackoverflow.com/questions/9319317/quick-and-easy-file-dialog-in-python
    root = tk.Tk()
    root.withdraw()
    # build ui for loading saved results
    DEFAULT_FILENAME = 'mijn_resultaten.csv'
    loadResultsCaption = widgets.Label(value='Laad uw opgeslagen resultaten op:')
    loadbutton = widgets.Button(
        description='Bestand laden',
        disabled=False,
        button_style='warning', 
        tooltip=DEFAULT_FILENAME,  # trick to pass filename to button widget
        icon='check'
    )
    # nothing loaded yet
    loadbutton.df = None

    def _store_loaded_dataframe(button):
        # keep the result of the click handler, so the notebook can reach it
        button.df = _load_dataframe(button)

    # click event with callback
    loadbutton.on_click( _store_loaded_dataframe )
    loadResultsBox = widgets.HBox([loadResultsCaption, loadbutton])
    display(loadResultsBox)
    return loadbutton

# subroutine of create_load_dataframe_ui
def _load_dataframe(button):
    """Click handler: let the user pick a file and load it as a DataFrame.

    Returns the loaded DataFrame, or None when the user cancels the dialog.
    The button is only marked as successful when a file was actually loaded.
    """
    # https://stackoverflow.com/questions/9319317/quick-and-easy-file-dialog-in-python
    filepath = filedialog.askopenfilename(initialdir="/", title="Select file")
    if not filepath:
        # dialog was cancelled: there is nothing to load
        return None
    df = load_dataframe(filepath)
    # confirm it all went well
    button.button_style = 'success'
    button.icon = 'check'
    return df
# $$$$$$$$$$$$$$$$$$ THIS DOES NOT WORK $$$$$$$$$$$$$$$$$$




def load_dataframe(filepath):
    """Load a CSV file into a DataFrame.

    Args:
        filepath: path of the CSV file.

    Returns:
        The DataFrame read from the file.

    Raises:
        ValueError: if the file cannot be read or parsed.
    """
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        # No `return` in a `finally` clause here: that would discard this
        # error and fail on the unassigned `df` instead.
        raise ValueError("An error occured when loading " + str(filepath) + ": "+ str(e)) from e
    print(str(filepath) + " loaded successfully")
    return df

## Library functions: Queries

In [40]:
import re

def containsRegex(word):
    """Tell whether `word` looks like a regular expression.

    A word counts as a regular expression when it contains '^' or '$', a
    parenthesised group, a bracketed character class, or one of the
    quantifiers '+' and '*' anywhere in the string.

    Returns:
        True if one of these constructs is present, False otherwise.
    """
    # re.search instead of re.match: the constructs may occur at any position,
    # not just at the start of the word (e.g. "boek.*" or "b[oa]ek").
    return bool(
        '^' in word
        or '$' in word
        or re.search(r"\(.+?\)", word)
        or re.search(r"\[.+?\]", word)
        or re.search(r"[+*]", word)
    )
                     
def lexicon_query(word, pos, lexicon):
    """Build the SPARQL query that looks up `word` in `lexicon`.

    When `word` contains no regular expression (see `containsRegex`), the
    query matches the word literally through a `values` clause; otherwise it
    matches through a `regex` filter.

    Args:
        word: search word or regular expression.
        pos: part-of-speech filter; only used for the "molex" lexicon, ignored
            when None or empty.
        lexicon: one of "anw", "diamant", "molex", "duelme", "celex".

    Returns:
        The SPARQL query as a string.

    Raises:
        ValueError: if `lexicon` is not one of the supported lexica.
    """
    # NOTE(review): `word` and `pos` are inserted into the query text without
    # any escaping, so a quote character in the input alters the query.
    if (lexicon=="anw"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?definition, \""""+word+"""\") ) . """
        if exactsearch:
            subpart =  """
                { { ?lemId rdfs:label ?lemma .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                UNION
                { ?definitionId lemon:value ?definition .
                values ?definition { \""""+word+"""\"@nl \""""+word+"""\" } } } .
                """               
        query = """PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                  PREFIX anw: <http://rdf.ivdnt.org/lexica/anw>
                  PREFIX anwsch: <http://rdf.ivdnt.org/schema/anw/>
                  PREFIX lemon: <http://lemon-model.net/lemon#>
                  
                  SELECT ?lemId ?lemma ?writtenForm ?definition concat('', ?definitionComplement) as ?definitionComplement
                  FROM <http://rdf.ivdnt.org/lexica/anw>
                  WHERE {
                      ?lemId rdfs:label ?lemma .
                      ?lemId ontolex:sense ?senseId .
                      ?senseId lemon:definition ?definitionId .
                      ?definitionId lemon:value ?definition .
                      OPTIONAL { ?definitionId anwsch:definitionComplement ?definitionComplement .}
                      OPTIONAL { ?lemId ontolex:canonicalForm ?lemCFId . 
                          ?lemCFId ontolex:writtenRepresentation ?writtenForm . }
                      """+subpart+"""
                      }"""
    elif (lexicon=="diamant"):
        exactsearch = (not containsRegex(word))
        subpart1 = """?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
            FILTER regex(?n_ontolex_writtenRep, \""""+word+"""\") . """
        # The definition branch must filter on the definition text, mirroring
        # the exact-search variant of subpart2 below.
        subpart2 = """?n_syndef diamant:definitionText ?n_syndef_definitionText .  
            FILTER regex(?n_syndef_definitionText, \""""+word+"""\") . """
        if exactsearch:
            subpart1 =  """
                { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
                values ?n_ontolex_writtenRep { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """                
            subpart2 = """
                { ?n_syndef diamant:definitionText ?n_syndef_definitionText . 
                values ?n_syndef_definitionText { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """
        query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select ?n_entry ?n_form ?n_ontolex_writtenRep ?n_syndef ?n_sensedef ?n_sensedef_definitionText ?n_syndef_definitionText ?n_sense ?inputMode ?wy_f_show ?wy_t_show
        where
        {
        graph ?g
        {
        {
            """ + subpart1 + """
            { ?n_entry a ontolex:LexicalEntry} .
            { ?n_form a ontolex:Form} .
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_syndef diamant:definitionText ?n_syndef_definitionText } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
              ?n_sense diamant:attestation ?n_attest_show .
              ?n_sense diamant:attestation ?n_attest_filter .
              ?n_attest_show diamant:text ?n_q_show .
              ?n_attest_filter diamant:text ?n_q_filter .
              ?n_attest_show a diamant:Attestation .
              ?n_attest_filter a diamant:Attestation .
              ?n_q_filter a diamant:Quotation .
              ?n_q_show a diamant:Quotation .
              ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
              ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
              ?n_q_show diamant:witnessYearFrom ?wy_f_show .
              ?n_q_show diamant:witnessYearTo ?wy_t_show .
              FILTER (xsd:integer(?wy_f_show) >= 1200)
              FILTER (xsd:integer(?wy_t_show) >= 1200)
              FILTER (xsd:integer(?wy_f_show) <= 2018)
              FILTER (xsd:integer(?wy_t_show) <= 2018)
            { bind("lemma" as ?inputMode) } .
            } UNION
          {
            """ + subpart2 + """
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_form a ontolex:Form} .
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep } .  { ?n_entry a ontolex:LexicalEntry} .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            ?n_sense diamant:attestation ?n_attest_show .
            ?n_sense diamant:attestation ?n_attest_filter .
            ?n_attest_filter diamant:text ?n_q_filter .
            ?n_attest_show diamant:text ?n_q_show .
            ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
            ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
            ?n_q_show diamant:witnessYearFrom ?wy_f_show .
            ?n_q_show diamant:witnessYearTo ?wy_t_show .
            ?n_attest_show a diamant:Attestation .
            ?n_attest_filter a diamant:Attestation .
            ?n_q_filter a diamant:Quotation .
            ?n_q_show a diamant:Quotation .
            FILTER (xsd:integer(?wy_f_show) >= 1200)
            FILTER (xsd:integer(?wy_t_show) >= 1200)
            FILTER (xsd:integer(?wy_f_show) <= 2018)
            FILTER (xsd:integer(?wy_t_show) <= 2018)
          { bind("defText" as ?inputMode) } .
            }
        }
        }"""
    elif (lexicon=="molex"):
        exactsearch = (not containsRegex(word))
        subpart1 = """"""
        subpart2 = """"""
        subpartPos = """"""
        # an empty word means: no restriction on lemma or word form
        if (word != ''):
            if exactsearch:
                subpart1 =  """
                    { ?lemCFId ontolex:writtenRep ?lemma . 
                    values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } } 
                    UNION
                    { ?wordformId ontolex:writtenRep ?wordform . 
                    values ?wordform { \""""+word+"""\"@nl \""""+word+"""\" } } .
                    """        
            else:
                subpart2 = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) . """
        if (pos is not None and pos != ''):
            subpartPos = """FILTER ( regex(?lemPos, \""""+pos+"""$\") ) ."""
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://universaldependencies.org/u/>
            PREFIX diamant: <http://rdf.ivdnt.org/schema/diamant#>
            
            SELECT ?lemEntryId ?lemma ?lemPos ?wordformId ?wordform ?hyphenation ?wordformPos ?Gender ?Number
            FROM <http://rdf.ivdnt.org/lexica/molex>
            WHERE
            {
            ?lemEntryId ontolex:canonicalForm ?lemCFId .
            ?lemCFId ontolex:writtenRep ?lemma .
            """+subpart1+"""
            OPTIONAL {?lemEntryId UD:Gender ?Gender .}
            OPTIONAL {?lemEntryId UD:VerbForm ?verbform .}
            ?lemEntryId UD:pos ?lemPos .
            """+subpartPos+"""
            ?lemEntryId ontolex:lexicalForm ?wordformId .
            ?wordformId UD:pos ?wordformPos .
            OPTIONAL {?wordformId UD:Number ?Number .}
            OPTIONAL {?wordformId ontolex:writtenRep ?wordform .}
            OPTIONAL {?wordformId diamant:hyphenation ?hyphenation .}
            """+subpart2+"""
            }
        """
    elif (lexicon=="duelme"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) ."""
        if exactsearch:
            subpart =  """
                { ?y lmf:hasLemma ?dl .  
                values ?dl { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX duelme: <http://rdf.ivdnt.org/lexica/duelme>
            PREFIX intskos: <http://ivdnt.org/schema/lexica#>
            PREFIX lmf: <http://www.lexinfo.net/lmf>
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            
            SELECT ?exampleSentence ?lemma ?gender ?number
            WHERE  {
                  ?d intskos:ExampleSentence ?exampleSentence .
                  ?d lmf:ListOfComponents [lmf:Component ?y] .
                  ?y lmf:hasLemma ?lemma . 
                  OPTIONAL {?y UD:Gender ?gender}
                  OPTIONAL {?y UD:Number ?number}
            """+subpart+"""
            }
        """
    elif (lexicon=="celex"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") ) . """
        if exactsearch:
            subpart =  """
                { ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX celex: <http://rdf.ivdnt.org/lexica/celex>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            PREFIX decomp: <http://www.w3.org/ns/lemon/decomp#>
            PREFIX gold: <http://purl.org/linguistics/gold#>
            
            SELECT DISTINCT ?lemmaId ?lemma ?wordformId ?wordform ?number ?gender concat('',?subLemmata) AS ?subLemmata
            WHERE  {
                ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .
                """+subpart+"""
                BIND( ?lemmaId AS ?lemmaIdIRI ).
                ?lemmaId ontolex:lexicalForm ?wordformId .
                ?wordformId ontolex:writtenRep ?wordform .
                OPTIONAL {?wordformId UD:Number ?number} .
                OPTIONAL {
                    ?lemmaId UD:Gender ?g . 
                        bind( 
                            if(?g = UD:Fem_Gender, 
                            UD:Com_Gender, 
                                if(?g = UD:Masc_Gender,
                                    UD:Com_Gender,
                                    UD:Neut_Gender
                                )
                            )
                            AS ?gender
                        )
                }
                OPTIONAL {
                    SELECT ?lemmaIdIRI (group_concat(DISTINCT concat(?partNr,":",?subLemma);separator=" + ") as ?subLemmata)
                    WHERE {
                        SELECT ?lemmaIdIRI ?celexComp ?aWordformId ?subLemma ?partNr
                        WHERE {
                                {
                                ?lemmaIdIRI ontolex:lexicalForm ?aWordformId . 
                                ?lemmaIdIRI decomp:constituent ?celexComp .
                                OPTIONAL { ?celexComp gold:stem [ontolex:writtenRep ?subLemma] . }
                                OPTIONAL { ?celexComp decomp:correspondsTo [ ontolex:canonicalForm [ontolex:writtenRep ?subLemma]] . }
                                }
                                {
                                    {
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_1> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_2> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_3> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_4> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_5> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_6> ?celexComp .}                                        
                                    }
                                ?lemmaIdIRI ?rdfsynt ?celexComp .
                                BIND(IF(STRSTARTS(str(?rdfsynt), "http://www.w3.org/1999/02/22-rdf-syntax-ns#"), replace(STRAFTER(str(?rdfsynt), "#"), "_", ""), "999") AS ?partNr) .
                                MINUS {
                                    ?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#0> ?celexComp .
                                    }
                                }
                            FILTER (?partNr != "999") .
                            }
                            ORDER BY ?partNr
                            }
                        GROUP BY ?aWordformId ?lemmaIdIRI
                    }
            }
        """
    else:
        # Without this branch an unknown lexicon would fail on the unassigned
        # `query` variable below.
        raise ValueError("Lexicon " + str(lexicon) + " not supported for querying.")

    return query

def corpus_query_lemma(word):
    """Return the CQL query that matches every token whose lemma is `word`."""
    prefix, suffix = r'[lemma="', r'"]'
    return prefix + word + suffix

def lexicon_query_allwords(lexicon):
    """Return the SPARQL query that lists all written forms of `lexicon`.

    Every query exposes its result in a single `writtenForm` column.

    Raises:
        ValueError: if `lexicon` is not one of "anw", "celex", "diamant",
            "duelme" or "molex".
    """
    if lexicon == "anw":
        return """PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                  PREFIX anw: <http://rdf.ivdnt.org/lexica/anw>                  
                  SELECT DISTINCT ?writtenForm
                  FROM <http://rdf.ivdnt.org/lexica/anw>
                  WHERE {
                      ?lemId rdfs:label ?lemma .
                      ?lemId ontolex:canonicalForm ?lemCFId . 
                      ?lemCFId ontolex:writtenRepresentation ?writtenForm .
                      }
                      ORDER BY ?writtenForm"""
    if lexicon == "celex":
        return """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            
            SELECT DISTINCT ?lemma AS ?writtenForm
            WHERE  {
                ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .                
                }
            ORDER BY ?lemma"""
    if lexicon == "diamant":
        # the only query that caps the number of results (LIMIT 10000)
        return """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select DISTINCT ?n_ontolex_writtenRep AS ?writtenForm
        where
        {
        graph ?g
        {
        {
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep} .
            { ?n_form a ontolex:Form} .
        }
        }
        }
        ORDER BY ?n_ontolex_writtenRep
        LIMIT 10000
        """
    if lexicon == "duelme":
        return """
            PREFIX lmf: <http://www.lexinfo.net/lmf>            
            SELECT DISTINCT ?lemma AS ?writtenForm
            WHERE  {
                  ?y lmf:hasLemma ?lemma . 
            }
            ORDER BY ?lemma"""
    if lexicon == "molex":
        return """
                PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                SELECT DISTINCT ?lemma AS ?writtenForm
                FROM <http://rdf.ivdnt.org/lexica/molex>
                WHERE
                {
                ?lemEntryId ontolex:canonicalForm ?lemCFId .
                ?lemCFId ontolex:writtenRep ?lemma .                
                }
                 ORDER BY ?lemma"""
    raise ValueError("Lexicon " + lexicon + " not supported for querying all words.")

## Corpus search

* Run the cell below to show the UI, and fill in your search query

In [19]:
#from chaininglib import ui

# Show the corpus search form; keep references to the query field and the
# corpus dropdown, so a later cell can read what the user entered
corpusQueryField, corpusField = create_corpus_ui()

VBox(children=(Text(value='[lemma="boek" & pos="verb"]', description='<b>CQL query:</b>'), Dropdown(descriptio…

 * Click the cell below and press Run to perform the given query

In [21]:
#from chaininglib import search
# Read the query and the corpus the user entered in the form above
query= corpusQueryField.value
corpus = corpusField.value
# Run the search; the result is a DataFrame of hits with their context
df_corpus = search_corpus(query,corpus)
#df_corpus = load_dataframe('mijn_resultaten.csv')
display(df_corpus)
# Offer a file name field and a button to save the results as CSV
create_save_dataframe_ui(df_corpus)



Unnamed: 0,left context,right context


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

## Lexicon search

* Run the cell below to show the UI, and fill in your search query in the UI

In [22]:
#from chaininglib import ui
# Show the lexicon search form; keep references to the word field and the
# lexicon dropdown, so a later cell can read what the user entered
searchWordField, lexiconField = create_lexicon_ui()

VBox(children=(Text(value='boek', description='<b>Word:</b>'), Dropdown(description='<b>Lexicon:</b>', index=2…

 * Click the cell below and press Run to perform the given query

In [41]:
# from chaininglib import queries, search

# Read the word and the lexicon chosen in the lexicon UI.
search_word = searchWordField.value
lexicon = lexiconField.value

# USER: this generated query can be replaced by a custom query of your own
query = lexicon_query(word=search_word, pos='', lexicon=lexicon)
df_lexicon = search_lexicon(query, lexicon)

# Select all columns by name, then show the resulting table.
df_columns_list = list(df_lexicon.columns)
df_lexicon_in_columns = df_lexicon[df_columns_list]
display(df_lexicon_in_columns)

Unnamed: 0,gender,lemma,lemmaId,number,subLemmata,wordform,wordformId
0,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Com_Gender,heldendaad,http://rdf.ivdnt.org/lexica/celex#celex_lemma_39339,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Sing_Number,1:held + 2: + 3:daad,heldendaad,http://rdf.ivdnt.org/lexica/celex#celex_wform_123231
1,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/celex#celex_lemma_39339,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Sing_Number,1:held + 2: + 3:daad,heldendaad,http://rdf.ivdnt.org/lexica/celex#celex_wform_123231
2,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Com_Gender,heldendaad,http://rdf.ivdnt.org/lexica/celex#celex_lemma_39339,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Plur_Number,1:held + 2: + 3:daad,heldendaden,http://rdf.ivdnt.org/lexica/celex#celex_wform_123232
3,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/celex#celex_lemma_39339,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Plur_Number,1:held + 2: + 3:daad,heldendaden,http://rdf.ivdnt.org/lexica/celex#celex_wform_123232
4,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/diamant/entry/molex/31207,,,heldendaden,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/119586
5,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/diamant/entry/molex/31207,,,heldendaden,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/119586
6,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/diamant/entry/molex/31207,,,heldendaden,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/119586
7,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/diamant/entry/molex/31207,,,heldendaden,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/119586
8,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/diamant/entry/molex/31207,,,heldendaden,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/119586
9,http://rdf.ivdnt.org/vocabs/UniversalDependencies2#Neut_Gender,heldendaad,http://rdf.ivdnt.org/lexica/diamant/entry/molex/31207,,,heldendaden,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/119586


## Case study 1 (parallel): Frequency of *puur*+verb and *zuiver*+verb compared
* The cell below searches for *puur*+verb and for *zuiver*+verb in the CHN corpus
* Compare frequencies

In [28]:
#from chaininglib import search
# display/HTML belong to the public IPython.display API; importing display
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Word 1: puur
word1 = "puur"
df_corpus1 = search_corpus(r'[word="' + word1 + r'"][pos="verb"]', corpus="chn")
display(HTML('<b>' + word1 + '</b>'))
display(df_corpus1)

# Word 2: zuiver
word2 = "zuiver"
df_corpus2 = search_corpus(r'[word="' + word2 + r'"][pos="verb"]', corpus="chn")
display(HTML('<b>' + word2 + '</b>'))
display(df_corpus2)

# Compare the verbs following each word: "word 1" is the column holding the
# second token of every hit, i.e. the verb matched by [pos="verb"].
diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
# Elements of 1 that are not in 2
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_left)))
# Elements of 2 that are not in 1 (label order swapped accordingly; it used to
# repeat the word1/word2 order of the line above and so mislabelled this set)
display(HTML('Werkwoorden voor <b>' + word2 + '</b> niet in <b>' + word1 + '</b>: ' + ", ".join(diff_right)))
# Elements both in 1 and 2
display(HTML('Werkwoorden zowel voor <b>' + word1 + '</b> als voor <b>' + word2 + '</b>: ' + ", ".join(intersec)))

Unnamed: 0,left context,word 0,word 1,right context
0,stad omdat de cultuur daar,puur,is,
1,succes dan wanneer de coalitie,puur,gestoeld,is op een parlementaire meerderheid
2,de andere jongeren die gewoon,puur,willen,werken Maar financieel zijn zij
3,gratis zijn De deelname is,puur,gebaseerd,op interesse In de middaguren
4,de natuur De natuur is,puur,vertelt,de kunstenaar Een tijger jaagt
5,van een licentieovereenkomst maar het,puur,gaat,om de oorspronkelijke eis van
6,gevecht voor ons voorouderlijk land,puur,bedoeld,is om het menselijk leven
7,het SZF maar waar het,puur,gaat,om winsten te maken zegt
8,redactrice Inge SchelstraeteHET zou poëzie,puur,worden,waarschuwde Piet Piryns ons Maar
9,bepaalde zender te adverteren is,puur,gebaseerd,op marketing én op het


Unnamed: 0,left context,word 0,word 1,right context
0,baby Ik wil het contact,zuiver,beperken,tot de baby en het
1,zo zul je als mens,zuiver,moeten,zijn om in aanmerking te
2,adviezen van mensen Ik heb,zuiver,vastgelegd,wat ik zelf spraakmakend vond
3,karaoke je ding Daarvoor is,zuiver,zingen,niet nodig Dat is juist
4,aan als het gevoel erachter,zuiver,is,Met deze woorden besloot Loes
5,Maar als je je gevoelens,zuiver,houdt,en goede wensen creëert dan
6,Cronie maar die was te,zuiver,genomen,Bij The Scorpions benutten Misiedjan
7,Woudman op Cairo Deze werd,zuiver,genomen,door Dwight Tempico 2-1 Een
8,de leerlingen herkenbaar en is,zuiver,beweert,de directeur Ook enkele leerkrachten
9,bakzeil De strafschop was te,zuiver,genomen,


## Case study 2 (sequential): Retrieve synonyms from DiaMaNT, look up in Gysseling
* The cell below searches for the term "boek" in DiaMaNT and looks up all variants in Gysseling

In [29]:
search_word = "boek"
lexicon = "diamant"
corpus = "gysseling"

# Step 1 (lexicon): look the word up in DiaMaNT and collect its synonyms.
query = lexicon_query(word=search_word, pos='', lexicon=lexicon)
df_lexicon = search_lexicon(query, lexicon)
syns = diamant_get_synonyms(df_lexicon)
syns.add(search_word)  # the search word itself is looked up as well
synonym_listing = ", ".join(syns)
display(HTML('Synoniemen voor <b>' + search_word + '</b>: ' + synonym_listing))

# Step 2 (corpus): build one lemma query per synonym, run them all against
# the corpus, and show each result labelled with its synonym.
syns_queries = list(map(corpus_query_lemma, syns))
result_dict = search_corpus_multiple(syns_queries, corpus)
view_multiple_results(result_dict, labels=list(syns))



Unnamed: 0,left context,word 0,right context
0,ende met .iiij. draden roder,lijsten,in .ij. euelten die hier ieghen
1,lib. het ne ware die,lijste,vanden lakene. Vort eist ghecuert
2,ᨮᨬ. het ne ware die,lijste,vanden lakene [3] vort es
3,langher ende dat met ere,lijste,Ende in so wat cammen dat
4,van wits te moreideine die,lijste,ne laghe binnen diere ieghen dade
5,.xxx. pond ende met .ij.,lijsten,.ix. drade vp elken egh
6,drade vp elken egh blawer,lijsten,ende der in ghesceert diere ieghen dade
7,.xxviij. pond met .i. roder,lijste,Ende dese vorseide saye salmen
8,elc warpin say ene blaeuwe,"lijste,",ende tweueline ne ghene. Ende so wie
9,ne ghene. Ende so wie die blaeuwe,lijste,scerde an weuelin say ouer


Unnamed: 0,left context,word 0,right context
0,.iiii. ghecorne gulde broeders die de,boeke,oudenden sin. si moghen elc
1,viere ghecorne guldebroeders die de,boeke,houden si moghen elc haren
2,.iiii. ghecorne guldebroeders die de,boke,ouden si moghen elc haren
3,secundi willelmus de lapide willelmus,boec,Jn elst. arnulphus de keelne
4,"Heren M CC LXXX, due wart det",buec,begonnen. Desen csens es mer
5,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
6,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
7,"van poschen, due wart dit",buc,begonnen. Dese pegte es mer
8,"van poschen, due wart dit",buc,begonnen. Dit blift den bruderen
9,"van poschen, due wart dit",buc,begonnen. Dit sin degene die


Unnamed: 0,left context,word 0,right context
0,Voert; dat een weuera die,werc,"ghenoch heft ter volre weken,"
1,met den weueren die en ghen,werc,en hebben te weuene; hine
2,platse hout inde weke ende,werc,heft; hi es sculdech en
3,van vresen van vinders. Dat,werc,es verbord; Ende hi moet
4,dade weuen; hi verborde dat,werc,ende .iij. lb. So wat ambochts man;
5,dade weuen hie verbord dat,werc,ende .iij. lb. Dit mach
6,es .v. ᨣ. [2] Dat,werc,es sculdich te stane an
7,es .v. ᨣ. [2] Dat,werc,es sculdich te stane an
8,te enighes drapeniers huus; omme,weerc,te beiaghene. jof omme enighen
9,staet die die mester gheen,weerc,gheuen ne wille; hie moet


Unnamed: 0,left context,word 0,right context
0,didscher talen ende ic vten,texte,van den vire ewangelisten makde
1,in vele staden es de,text,van der ewangelien also donker
2,vele meerre sijn dan de,text,van der ewangelien alte male. Ende
3,didsche alse si in den,texte,"gescreuen sijn, so bleue dit"


Unnamed: 0,left context,word 0,right context
0,Ende roofde den tempel ende,sloten toe.,Dar na saen so starf hi
1,want haer mont was sekerleke,toe ghesloten,so starkeleke dat menne met


Unnamed: 0,left context,word 0,right context
0,.iiii. ghecorne gulde broeders die de,boeke,oudenden sin. si moghen elc
1,viere ghecorne guldebroeders die de,boeke,houden si moghen elc haren
2,.iiii. ghecorne guldebroeders die de,boke,ouden si moghen elc haren
3,secundi willelmus de lapide willelmus,boec,Jn elst. arnulphus de keelne
4,"Heren M CC LXXX, due wart det",buec,begonnen. Desen csens es mer
5,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
6,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
7,"van poschen, due wart dit",buc,begonnen. Dese pegte es mer
8,"van poschen, due wart dit",buc,begonnen. Dit blift den bruderen
9,"van poschen, due wart dit",buc,begonnen. Dit sin degene die


Unnamed: 0,left context,word 0,right context
0,van der vorseider stede. ten,ghewarke,boef. van der fermerien van
1,graf. Ende gheft aldaer ten,ghewerke.,vander kerken vijf pont vlaemsche.
2,deelne. vord gaf soe den,ghewerke,van onser vrouwen kerke. twintich
3,kerke. twintich sceleghe vlaemsche. den,ghewerke,van sinte saluators kerke. twintich
4,sceleghe vlaemsche. vord te elken,ghewerke,van allen kerken die binne
5,ende xv. ᨮᨬ te haren,"ghewerke,",Vort gheuic broder pauwels vanine
6,Tonser vrouwen in brugghe ten,ghewerke,xl. ᨣ. Aldar tsinte saluatoers
7,saluatoers xl. ᨣ also. Ten,ghewerke,te sinte baues bute brugghe
8,engeen man en ware met,gewerke.,hi en waert selue dan
9,wille te wederstane Ende gode,ghewerke,in elke stede Te begharne


Unnamed: 0,left context,word 0,right context
0,uan haueleker scult ende negene,orconden,ne heuet; die beclaghet es
1,tue eruahtege man heuet in,orconden,hi sal winnen sinen houestoel.
2,yemene dinghet ende hem uermet,orconden,die hi iegenwordech heuet ende
3,cateil. Ende dinghet hi sonder,orconden;,deghene dar hi up dinghet
4,hebben tue eruahteghe man in,orconden.,ende si sullen hem helpen
5,hebbe tue eruahteghe man te,orconden,dat hem uergolden si. Neware
6,uan doder hant ende negene,orconden,ne heuet hi biede sinen
7,hebben tue eruahteghe man te,orconden.,Doed en uremde man enen
8,Tiemen hem meer dar neghene,orconden,ne sien; met enen eruahtegen
9,dan enen eruahteghen man in,orconden;,die orconde moet sueren binnen


## Case study (parallel) 3: Find corpus words not in lexicon; list most frequent ones.
* This is only parallel if you can ask the lexicon for a list of all its words.
* Currently only working: asking DiaMaNT for its list of words (limited to 10000)

In [30]:
# Ask the lexicon for the list of all its words.
lexicon = "anw"
df_lexicon = search_lexicon_allwords(lexicon)
# TODO: find out why the same word can appear more than once
# Lower-case every written form and collapse the result into a set.
lexicon_set = {written_form.lower() for written_form in df_lexicon["writtenForm"]}
display(lexicon_set)

# Fetch the words of the Gysseling corpus with the match-every-word query.
df_corpus = search_corpus_allwords("gysseling")
display(df_corpus)
len(df_corpus)



['heemstedenaar',
 'heemsteedse',
 'heen-en-weerkind',
 'heenmatch',
 'heenwedstrijd',
 'heerdenaar',
 'heerdense',
 'heerdese',
 'heerenveense',
 'heerenvener',
 'heerlenaar',
 'heerlense',
 'heerlijk',
 'heethoofd',
 'heffingsperiode',
 'heidesafari',
 'heideschaap',
 'heilbot',
 'heiligendag',
 'heiloose',
 'heilooër',
 'heimwee',
 'heisessie',
 'heistenaar',
 'heistse',
 'heldenaar',
 'heldendood',
 'heldense',
 'heldentenor',
 'heldergroen',
 'helgroen',
 'heli',
 'helicon',
 'heligate',
 'helihaven',
 'helikopter',
 'helikoptergeld',
 'helikopterouder',
 'helikoptertaxi',
 'heliodruk',
 'heliogravure',
 'helitaxi',
 'helix',
 'hellehond',
 'helm',
 'helmdraad',
 'helmonder',
 'helmondse',
 'helper',
 'helperssyndroom',
 'helpie',
 'helpster',
 'hematologie',
 'hematologisch',
 'hematoloog',
 'hemd',
 'hemelvaartsdag',
 'hemelwaarts',
 'hen',
 'henegouwer',
 'henegouwse',
 'hengel',
 'hengelaar',
 'hengelclub',
 'hengelose',
 'hengeloër',
 'hengelsportvereniging',
 'hengelsportwin

Unnamed: 0,left context,word 0,right context
0,,[hand],[A] Reinerus .filius. arnulfi Bake
1,[hand],[A],Reinerus .filius. arnulfi Bake quem
2,[hand] [A],Reinerus,.filius. arnulfi Bake quem symon
3,[hand] [A] Reinerus,.filius.,arnulfi Bake quem symon Bake
4,[hand] [A] Reinerus .filius.,arnulfi,Bake quem symon Bake habet
5,[hand] [A] Reinerus .filius. arnulfi,Bake,quem symon Bake habet in
6,[A] Reinerus .filius. arnulfi Bake,quem,symon Bake habet in aduocacia
7,Reinerus .filius. arnulfi Bake quem,symon,Bake habet in aduocacia inuadiauit
8,.filius. arnulfi Bake quem symon,Bake,habet in aduocacia inuadiauit terram
9,arnulfi Bake quem symon Bake,habet,in aduocacia inuadiauit terram ecgart


1000

## Case study (sequential) 4: Find occurrences of attributive adjectives not ending with -e, even though they are preceded by a definite article

In [12]:
corpus_to_search = "opensonar"
lexicon_to_search = "molex"

# CORPUS: fetch [article + attributive adjective + noun] sequences whose
# adjective does not end in -e.
df_corpus = search_corpus(
    r'[lemma="de|het"][word="^g(.+)[^e]$" & pos="ADJ"][pos="NOUN"]',
    corpus=corpus_to_search,
)
display(df_corpus)

# LEXICON: fetch adjectives whose lemma does not end in -e.
query = lexicon_query('^g(.+)[^e]$', 'ADJ', lexicon_to_search)
df_lexicon = search_lexicon(query, lexicon_to_search)
display(df_lexicon)

# LEXICON: keep only the rows whose word form ends in -e, i.e. the form used
# in definite attributive position.
print('Filtering lexicon results')
final_e_condition = df_lexicon["wordform"].str.contains('e$')
df = df_lexicon[final_e_condition]
display(df)

# RESULT: corpus hits whose e-less adjective (column "word 1") equals the
# lemma of one of the adjectives kept above.
print('Wanted list:')
eless_forms = df["lemma"].tolist()
no_final_e_condition = df_corpus['word 1'].isin(eless_forms)
display(df_corpus[no_final_e_condition])

Unnamed: 0,left context,word 0,word 1,word 2,right context
0,die job gaan aanpakken owv,de,gratis,koffie,... Er zijn plaatsen waar
1,jaar geleden dood vloog tegen,de,glazen,gevel,van het Natuurhistorisch Museum in
2,eend vloog in 1995 tegen,de,glazen,gevel,"van het museum , viel"
3,de aandelen overnemen . Vanwaar,het,groot,verschil,? Dat ING een gezond
4,"manier doen , nu was",het,gewoon,zeeeeer,traaaaaaaaaaaaag .
5,"de goede wil hebben ,",het,gezond,verstand,en bereidheid om moeite te
6,doet me toch twijfelen aan,het,gezond,verstand,van Vlaanderen . [ /
7,een proximus kaart ! Bel,het,gratis,nr,080010200 en volg de instructies
8,dat hij bereid is voor,de,gratis,software,die hij kan downloaden nog
9,absurde . Als oplossing voor,het,groot,aantal,mensen zou je mensen om


Unnamed: 0,hyphenation,lemEntryId,lemPos,lemma,wordform,wordformId,wordformPos
0,ge/wa/fel/de,http://rdf.ivdnt.org/lexica/diamant/entry/molex/27460,http://universaldependencies.org/u/pos/ADJ,gewafeld,gewafelde,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/102144,http://universaldependencies.org/u/pos/ADJ
1,gra/na/ten,http://rdf.ivdnt.org/lexica/diamant/entry/molex/28905,http://universaldependencies.org/u/pos/ADJ,granaten,granaten,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/102404,http://universaldependencies.org/u/pos/ADJ
2,ge/re/gle/men/teer/de,http://rdf.ivdnt.org/lexica/diamant/entry/molex/26787,http://universaldependencies.org/u/pos/ADJ,gereglementeerd,gereglementeerde,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/102662,http://universaldependencies.org/u/pos/ADJ
3,gras/rij/ke,http://rdf.ivdnt.org/lexica/diamant/entry/molex/28966,http://universaldependencies.org/u/pos/ADJ,grasrijk,grasrijke,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/104989,http://universaldependencies.org/u/pos/ADJ
4,geilst,http://rdf.ivdnt.org/lexica/diamant/entry/molex/25753,http://universaldependencies.org/u/pos/ADJ,geil,geilst,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/105604,http://universaldependencies.org/u/pos/ADJ
5,ge/lijk/draads,http://rdf.ivdnt.org/lexica/diamant/entry/molex/26107,http://universaldependencies.org/u/pos/ADJ,gelijkdraads,gelijkdraads,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/107966,http://universaldependencies.org/u/pos/ADJ
6,gro/te/re,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,grotere,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/108065,http://universaldependencies.org/u/pos/ADJ
7,geil,http://rdf.ivdnt.org/lexica/diamant/entry/molex/25753,http://universaldependencies.org/u/pos/ADJ,geil,geil,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/110851,http://universaldependencies.org/u/pos/ADJ
8,gra/du/e/le,http://rdf.ivdnt.org/lexica/diamant/entry/molex/28815,http://universaldependencies.org/u/pos/ADJ,gradueel,graduele,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/111804,http://universaldependencies.org/u/pos/ADJ
9,grijs/bruin,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29179,http://universaldependencies.org/u/pos/ADJ,grijsbruin,grijsbruin,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/112688,http://universaldependencies.org/u/pos/ADJ


Filtering lexicon results


Unnamed: 0,hyphenation,lemEntryId,lemPos,lemma,wordform,wordformId,wordformPos
0,ge/wa/fel/de,http://rdf.ivdnt.org/lexica/diamant/entry/molex/27460,http://universaldependencies.org/u/pos/ADJ,gewafeld,gewafelde,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/102144,http://universaldependencies.org/u/pos/ADJ
2,ge/re/gle/men/teer/de,http://rdf.ivdnt.org/lexica/diamant/entry/molex/26787,http://universaldependencies.org/u/pos/ADJ,gereglementeerd,gereglementeerde,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/102662,http://universaldependencies.org/u/pos/ADJ
3,gras/rij/ke,http://rdf.ivdnt.org/lexica/diamant/entry/molex/28966,http://universaldependencies.org/u/pos/ADJ,grasrijk,grasrijke,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/104989,http://universaldependencies.org/u/pos/ADJ
6,gro/te/re,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,grotere,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/108065,http://universaldependencies.org/u/pos/ADJ
8,gra/du/e/le,http://rdf.ivdnt.org/lexica/diamant/entry/molex/28815,http://universaldependencies.org/u/pos/ADJ,gradueel,graduele,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/111804,http://universaldependencies.org/u/pos/ADJ
10,goed/ge/bek/te,http://rdf.ivdnt.org/lexica/diamant/entry/molex/28432,http://universaldependencies.org/u/pos/ADJ,goedgebekt,goedgebekte,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/112821,http://universaldependencies.org/u/pos/ADJ
14,ge/we/tens/vol/le,http://rdf.ivdnt.org/lexica/diamant/entry/molex/27546,http://universaldependencies.org/u/pos/ADJ,gewetensvol,gewetensvolle,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/119192,http://universaldependencies.org/u/pos/ADJ
15,ge/in/cri/mi/neer/de,http://rdf.ivdnt.org/lexica/diamant/entry/molex/25762,http://universaldependencies.org/u/pos/ADJ,geïncrimineerd,geïncrimineerde,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/11979,http://universaldependencies.org/u/pos/ADJ
16,ge/ruis/ar/me,http://rdf.ivdnt.org/lexica/diamant/entry/molex/26864,http://universaldependencies.org/u/pos/ADJ,geruisarm,geruisarme,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/126222,http://universaldependencies.org/u/pos/ADJ
17,ge/bles/seer/de,http://rdf.ivdnt.org/lexica/diamant/entry/molex/25186,http://universaldependencies.org/u/pos/ADJ,geblesseerd,geblesseerde,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/126717,http://universaldependencies.org/u/pos/ADJ


Wanted list:


Unnamed: 0,left context,word 0,word 1,word 2,right context
3,de aandelen overnemen . Vanwaar,het,groot,verschil,? Dat ING een gezond
4,"manier doen , nu was",het,gewoon,zeeeeer,traaaaaaaaaaaaag .
5,"de goede wil hebben ,",het,gezond,verstand,en bereidheid om moeite te
6,doet me toch twijfelen aan,het,gezond,verstand,van Vlaanderen . [ /
9,absurde . Als oplossing voor,het,groot,aantal,mensen zou je mensen om


In [21]:
## Case study (sequential) 5: (morphosyntactic lexicon and possibly unannotated corpus) Look up inflected forms and spelling variants for a given lemma in a corpus

In [35]:
lexicon_to_search = "molex"
corpus_to_search = "chn"

lemma_to_look_for = "denken"

# LEXICON: Search for the inflected forms of a lemma in a morphosyntactic lexicon
query = lexicon_query(lemma_to_look_for, None, lexicon_to_search)
df_lexicon = search_lexicon(query, lexicon_to_search)
display(df_lexicon)

# Put all inflected forms into a list. The lexicon result can hold the same
# word form in several rows, so keep each form once (in first-seen order) to
# avoid redundant alternatives in the corpus query.
inflected_wordforms = list(df_lexicon.wordform.drop_duplicates())

# CORPUS: Look up the inflected forms in a (possibly unannotated) corpus
# beware: If the corpus is not annotated, all we can do is search for the inflected words.
#         But if the corpus is lemmatized, we have to make sure we're retrieving correct data
#         by specifying the lemma as well.
annotated_corpus = True
wordform_alternatives = r"|".join(inflected_wordforms)
if annotated_corpus:
    query = r'[lemma="' + lemma_to_look_for + r'" & word="' + wordform_alternatives + r'"]'
else:
    query = r'[word="' + wordform_alternatives + r'"]'
df_corpus = search_corpus(query, corpus=corpus_to_search)
display(df_corpus)

Unnamed: 0,Number,hyphenation,lemEntryId,lemPos,lemma,wordform,wordformId,wordformPos
0,http://universaldependencies.org/u/feat/Number.html#Plur,den/ken,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denken,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256472,http://universaldependencies.org/u/pos/VERB
1,http://universaldependencies.org/u/feat/Number.html#Sing,dacht,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dacht,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256464,http://universaldependencies.org/u/pos/VERB
2,http://universaldependencies.org/u/feat/Number.html#Sing,denk,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denk,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/365942,http://universaldependencies.org/u/pos/VERB
3,http://universaldependencies.org/u/feat/Number.html#Plur,dach/ten,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dachten,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256462,http://universaldependencies.org/u/pos/VERB
4,http://universaldependencies.org/u/feat/Number.html#Sing,denkt,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denkt,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256476,http://universaldependencies.org/u/pos/VERB
5,http://universaldependencies.org/u/feat/Number.html#Sing,denk,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denk,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/804045,http://universaldependencies.org/u/pos/VERB
6,http://universaldependencies.org/u/feat/Number.html#Sing,dacht,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dacht,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256468,http://universaldependencies.org/u/pos/VERB
7,http://universaldependencies.org/u/feat/Number.html#Plur,dach/ten,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dachten,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256461,http://universaldependencies.org/u/pos/VERB
8,http://universaldependencies.org/u/feat/Number.html#Plur,den/ken,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denken,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256470,http://universaldependencies.org/u/pos/VERB
9,http://universaldependencies.org/u/feat/Number.html#Sing,denkt,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denkt,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256477,http://universaldependencies.org/u/pos/VERB


Unnamed: 0,left context,word 0,right context
0,het Caribisch Gebied heeft Ik,denk,dat wij ook soortgelijke talenten
1,zich helemaal daarin vinden Hij,denkt,dat zijn departement heel veel
2,in de regio De samenleving,denkt,echter nog steeds dat het
3,Tijd om aan vakantiebesteding te,denken,Misschien heeft u al duidelijke
4,Suriname of naar het buitenland,Denkt,u wel aan uw medicatie
5,faciliteiten van het VCC Ik,denk,dat na de oplevering 15
6,boven gebracht Een kleine misstap,dacht,ik terwijl ik achteruit stapte
7,staan en niet na te,denken,over je eigen sterfelijkheid.Nyiragongo zit
8,ook aan een belangrijk moment,gedacht,De Anitri geloofshelden die dienstbaar
9,reële verwachtingen hebt dus niet,denkt,dat je er door dit
