# Chaining search


## Library functions: Search
 

In [142]:

import requests
import pandas as pd
import xml.etree.ElementTree as ET
import json
import urllib
AVAILABLE_CORPORA = ['chn', 'opensonar', 'zeebrieven', 'gysseling', 'nederlab']

# Get rid of ellipsis in display (otherwise relevant data might not be shown)
pd.set_option('display.max_colwidth',1000)

# Search methods

def search_corpus(query, corpus):
    if corpus not in AVAILABLE_CORPORA:
        raise ValueError("Unknown corpus: " + corpus)
    # Do request to federated content search corpora, so we get same output format for every corpus
    url = "http://portal.clarin.inl.nl/fcscorpora/clariah-fcs-endpoints/sru?operation=searchRetrieve&queryType=fcs&x-fcs-context=" + corpus + "&maximumRecords=20&query=" + urllib.parse.quote(query)
    response = requests.get(url)
    response_text = response.text
    df = parse_xml(response_text)
    return df

def search_corpus_multiple(queries, corpus):
    result_dict = {}
    for query in queries:
        result_dict[query] = search_corpus(query,corpus)
    return result_dict
   
def search_lexicon(query,corpus):
    endpoint = "http://172.16.4.56:8890/sparql"
    if (corpus=="diamant"):
        endpoint = "http://svprre02:8080/fuseki/tdb/sparql"
    
    # Accept header is needed for virtuoso, it isn't otherwise!
    response = requests.post(endpoint, data={"query":query}, headers = {"Accept":"application/sparql-results+json"})
        
    response_json = json.loads(response.text)
    records_json = response_json["results"]["bindings"]
    records_string = json.dumps(records_json)    
    df = pd.read_json(records_string, orient="records")
    
    # make sure cells containing NULL are added too, otherwise we'll end up with ill-formed data
    df = df.applymap(lambda x: '' if pd.isnull(x) else x["value"])
    
    return df

# Processing methods

def column_difference(df_column1, df_column2):
    set_df1 = set(df_column1)
    set_df2 = set(df_column2)
    diff_left = set_df1.difference(set_df2)
    diff_right = set_df2.difference(set_df1)
    intersec = set_df1.intersection(set_df2)
    return diff_left, diff_right, intersec

def diamant_get_synonyms(df):
    # Depending on the result type, we return the lemma or the definition text
    lemmas = set(df[df["inputMode"]=="defText"]["n_ontolex_writtenRep"])
    defTexts = set(df[df["inputMode"]=="lemma"]["n_syndef_definitionText"])
    return lemmas|defTexts

def parse_xml(text):
    # TODO: should we secure against untrusted XML?
    root = ET.fromstring(text)
    records = []
    n_words_in_hit = 0
    computed_nwih = False
    for entry in root.iter("{http://clarin.eu/fcs/resource}ResourceFragment"):
        for dataView in entry.findall("{http://clarin.eu/fcs/resource}DataView"):
            # We only take into account hits, ignore metadata and segmenting dataViews
            if (dataView.get("type")=="application/x-clarin-fcs-hits+xml"):
                result = dataView.find("{http://clarin.eu/fcs/dataview/hits}Result")
                left_context = result.text if result.text is not None else ''
                hits = list(result)
                last_hit = hits[-1]
                right_context = last_hit.tail if last_hit.tail is not None else ''
                hit_words = [hit.text for hit in hits]
                
                if not computed_nwih:
                    n_words_in_hit = len(hits)
                    computed_nwih=True
                kwic = [left_context] + hit_words + [right_context]
                records.append(kwic)
    columns = ["left context"] + ["word " + str(n) for n in range(n_words_in_hit)] + ["right context"]
    return pd.DataFrame(records, columns = columns)


# View methods

# results: dict of df's
# labels: list of label corresponding to the df's in results
def view_multiple_results(results, labels):
    assert len(labels)==len(results)
    for n,query in enumerate(results):
        df = results[query]
        if not df.empty:
            display(HTML('Resultaten voor <b>' + labels[n] + "</b>:"))
            display(df)

## Library functions: UI

In [143]:

import ipywidgets as widgets
from IPython.display import display
DEFAULT_QUERY = r'[lemma="boek" & pos="verb"]' #r'[lemma="boeken" pos="verb"]'
DEFAULT_CORPUS = "chn"

def create_corpus_ui():
    # Create UI elements
    corpusQueryField = widgets.Text(description="<b>CQL query:</b>", value=DEFAULT_QUERY)
    corpusField = widgets.Dropdown(
        options=AVAILABLE_CORPORA,
        value=DEFAULT_CORPUS,
        description='<b>Corpus:</b>',
    )
    '''corpusSearchButton = widgets.Button(
        description='Search',
        button_style='info', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Search',
    )
    # Handle events
    corpusSearchButton.on_click(corpus_search)'''

    # Stack UI elements in vertical box and display
    corpusUiBox = widgets.VBox([corpusQueryField,corpusField])
    display(corpusUiBox)
    
    # Return fields, so their contents are accessible from the global namespace of the Notebook
    return corpusQueryField, corpusField

def create_lexicon_ui():
    DEFAULT_SEARCHWORD = 'boek'
    DEFAULT_LEXICON = "diamant"

    # Create UI elements
    searchWordField = widgets.Text(description="<b>Word:</b>", value=DEFAULT_SEARCHWORD)
    lexiconField = widgets.Dropdown(
        options=['anw', 'celex', 'diamant', 'duelme', 'molex'],
        value=DEFAULT_LEXICON,
        description='<b>Lexicon:</b>',
    )
    '''lexSearchButton = widgets.Button(
        description='Search',
        button_style='info', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Search',
    )
    # Handle events
    lexSearchButton.on_click(lexicon_search)'''
    # Stack UI elements in vertical box and display
    lexUiBox = widgets.VBox([searchWordField,lexiconField])
    display(lexUiBox)
    return searchWordField, lexiconField

## Library functions: Queries

In [144]:
import re

def containsRegex(word):
    return ( word.find('^')>-1 or
            word.find('$')>-1 or 
            re.match("\(.+?\)", word) or
            re.match("\[.+?\]", word) or
            re.match("[\+*]", word) )
                     
def lexicon_query(word, lexicon):
    if (lexicon=="anw"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?definition, \""""+word+"""\") ) . """
        if (exactsearch == True):
              subpart =  """
                { { ?lemId rdfs:label ?lemma .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                UNION
                { ?definitionId lemon:value ?definition .
                values ?definition { \""""+word+"""\"@nl \""""+word+"""\" } } } .
                """               
        query = """PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                  PREFIX anw: <http://rdf.ivdnt.org/lexica/anw>
                  PREFIX anwsch: <http://rdf.ivdnt.org/schema/anw/>
                  PREFIX lemon: <http://lemon-model.net/lemon#>
                  
                  SELECT ?lemId ?lemma ?writtenForm ?definition concat('', ?definitionComplement) as ?definitionComplement
                  FROM <http://rdf.ivdnt.org/lexica/anw>
                  WHERE {
                      ?lemId rdfs:label ?lemma .
                      ?lemId ontolex:sense ?senseId .
                      ?senseId lemon:definition ?definitionId .
                      ?definitionId lemon:value ?definition .
                      OPTIONAL { ?definitionId anwsch:definitionComplement ?definitionComplement .}
                      OPTIONAL { ?lemId ontolex:canonicalForm ?lemCFId . 
                          ?lemCFId ontolex:writtenRepresentation ?writtenForm . }
                      """+subpart+"""
                      }"""
    elif (lexicon=="diamant"):
        exactsearch = (not containsRegex(word))
        subpart1 = """?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
            FILTER regex(?n_ontolex_writtenRep, \""""+word+"""\") . """
        subpart2 = """?n_syndef diamant:definitionText ?n_syndef_definitionText .  
            FILTER regex(?n_ontolex_writtenRep, \""""+word+"""\") . """
        if (exactsearch == True):
            subpart1 =  """
                { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
                values ?n_ontolex_writtenRep { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """                
            subpart2 = """
                { ?n_syndef diamant:definitionText ?n_syndef_definitionText . 
                values ?n_syndef_definitionText { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """
        query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select ?n_entry ?n_form ?n_ontolex_writtenRep ?n_syndef ?n_sensedef ?n_sensedef_definitionText ?n_syndef_definitionText ?n_sense ?inputMode ?wy_f_show ?wy_t_show
        where
        {
        graph ?g
        {
        {
            """ + subpart1 + """
            { ?n_entry a ontolex:LexicalEntry} .
            { ?n_form a ontolex:Form} .
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_syndef diamant:definitionText ?n_syndef_definitionText } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
              ?n_sense diamant:attestation ?n_attest_show .
              ?n_sense diamant:attestation ?n_attest_filter .
              ?n_attest_show diamant:text ?n_q_show .
              ?n_attest_filter diamant:text ?n_q_filter .
              ?n_attest_show a diamant:Attestation .
              ?n_attest_filter a diamant:Attestation .
              ?n_q_filter a diamant:Quotation .
              ?n_q_show a diamant:Quotation .
              ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
              ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
              ?n_q_show diamant:witnessYearFrom ?wy_f_show .
              ?n_q_show diamant:witnessYearTo ?wy_t_show .
              FILTER (xsd:integer(?wy_f_show) >= 1200)
              FILTER (xsd:integer(?wy_t_show) >= 1200)
              FILTER (xsd:integer(?wy_f_show) <= 2018)
              FILTER (xsd:integer(?wy_t_show) <= 2018)
            { bind("lemma" as ?inputMode) } .
            } UNION
          {
            """ + subpart2 + """
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_form a ontolex:Form} .
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep } .  { ?n_entry a ontolex:LexicalEntry} .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            ?n_sense diamant:attestation ?n_attest_show .
            ?n_sense diamant:attestation ?n_attest_filter .
            ?n_attest_filter diamant:text ?n_q_filter .
            ?n_attest_show diamant:text ?n_q_show .
            ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
            ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
            ?n_q_show diamant:witnessYearFrom ?wy_f_show .
            ?n_q_show diamant:witnessYearTo ?wy_t_show .
            ?n_attest_show a diamant:Attestation .
            ?n_attest_filter a diamant:Attestation .
            ?n_q_filter a diamant:Quotation .
            ?n_q_show a diamant:Quotation .
            FILTER (xsd:integer(?wy_f_show) >= 1200)
            FILTER (xsd:integer(?wy_t_show) >= 1200)
            FILTER (xsd:integer(?wy_f_show) <= 2018)
            FILTER (xsd:integer(?wy_t_show) <= 2018)
          { bind("defText" as ?inputMode) } .
            }
        }
        }"""
    elif (lexicon=="molex"):
        exactsearch = (not containsRegex(word))
        subpart1 = """"""
        subpart2 = """"""
        if (exactsearch == True):
            subpart1 =  """
                { ?lemCFId ontolex:writtenRep ?lemma . 
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } } 
                UNION
                { ?wordformId ontolex:writtenRep ?wordform . 
                values ?wordform { \""""+word+"""\"@nl \""""+word+"""\" } } .
                """        
        else:
            subpart2 = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) . """
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://universaldependencies.org/u/>
            PREFIX diamant: <http://rdf.ivdnt.org/schema/diamant#>
            
            SELECT ?lemEntryId ?lemma ?lemPos ?wordformId ?wordform ?hyphenation ?wordformPos ?Gender ?Number
            FROM <http://rdf.ivdnt.org/lexica/molex>
            WHERE
            {
            ?lemEntryId ontolex:canonicalForm ?lemCFId .
            ?lemCFId ontolex:writtenRep ?lemma .
            """+subpart1+"""
            OPTIONAL {?lemEntryId UD:Gender ?Gender .}
            OPTIONAL {?lemEntryId UD:VerbForm ?verbform .}
            ?lemEntryId UD:pos ?lemPos .
            ?lemEntryId ontolex:lexicalForm ?wordformId .
            ?wordformId UD:pos ?wordformPos .
            OPTIONAL {?wordformId UD:Number ?Number .}
            OPTIONAL {?wordformId ontolex:writtenRep ?wordform .}
            OPTIONAL {?wordformId diamant:hyphenation ?hyphenation .}
            """+subpart2+"""
            }
        """
    elif (lexicon=="duelme"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) ."""
        if (exactsearch == True):
            subpart =  """
                { ?y lmf:hasLemma ?dl .  
                values ?dl { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX duelme: <http://rdf.ivdnt.org/lexica/duelme>
            PREFIX intskos: <http://ivdnt.org/schema/lexica#>
            PREFIX lmf: <http://www.lexinfo.net/lmf>
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            
            SELECT ?exampleSentence ?lemma ?gender ?number
            WHERE  {
                  ?d intskos:ExampleSentence ?exampleSentence .
                  ?d lmf:ListOfComponents [lmf:Component ?y] .
                  ?y lmf:hasLemma ?lemma . 
                  OPTIONAL {?y UD:Gender ?gender}
                  OPTIONAL {?y UD:Number ?number}
            """+subpart+"""
            }
        """
    elif (lexicon=="celex"):
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") ) . """
        if (exactsearch == True):
            subpart =  """
                { ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX celex: <http://rdf.ivdnt.org/lexica/celex>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            PREFIX decomp: <http://www.w3.org/ns/lemon/decomp#>
            
            SELECT DISTINCT ?lemmaId ?lemma ?wordformId ?wordform ?number ?gender concat('',?subLemmata) AS ?subLemmata
            WHERE  {
                ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .
                """+subpart+"""
                BIND( ?lemmaId AS ?lemmaIdIRI ).
                ?lemmaId ontolex:lexicalForm ?wordformId .
                ?wordformId ontolex:writtenRep ?wordform .
                OPTIONAL {?wordformId UD:Number ?number} .
                OPTIONAL {
                    ?lemmaId UD:Gender ?g . 
                        bind( 
                            if(?g = UD:Fem_Gender, 
                            UD:Com_Gender, 
                                if(?g = UD:Masc_Gender,
                                    UD:Com_Gender,
                                    UD:Neut_Gender
                                )
                            )
                            AS ?gender
                        )
                }
                OPTIONAL {
                    SELECT ?lemmaIdIRI (group_concat(DISTINCT concat(?partNr,":",?subLemma);separator=" + ") as ?subLemmata)
                    WHERE {
                        SELECT ?lemmaIdIRI ?celexComp ?aWordformId ?subLemma ?partNr
                        WHERE {
                                {
                                ?lemmaIdIRI decomp:constituent ?celexComp .
                                ?celexComp decomp:correspondsTo ?subLemmaId .
                                BIND( ?subLemmaId AS ?subLemmaIdIRI ) .
                                ?subLemmaIdIRI ontolex:canonicalForm [ontolex:writtenRep ?subLemma] .
                                BIND( ?lemmaIdIRI AS ?mainLemmaId ) .
                                ?mainLemmaId ontolex:lexicalForm ?aWordformId .
                                }
                                {
                                    {
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_1> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_2> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_3> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_4> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_5> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_6> ?celexComp .}                                        
                                    }
                                ?lemmaIdIRI ?rdfsynt ?celexComp .
                                BIND(IF(STRSTARTS(str(?rdfsynt), "http://www.w3.org/1999/02/22-rdf-syntax-ns#"), replace(STRAFTER(str(?rdfsynt), "#"), "_", ""), "999") AS ?partNr) .
                                MINUS {
                                    ?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#0> ?celexComp .
                                    }
                                }
                            FILTER (?partNr != "999") .
                            }
                            ORDER BY ?partNr
                            }
                        GROUP BY ?aWordformId ?lemmaIdIRI
                    }
            }
        """
        
    return query

def corpus_query_lemma(word):
    return r'[lemma="'+ word + r'"]'

def lexicon_query_allwords(lexicon):
    if (lexicon=="diamant"):
        query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select ?n_ontolex_writtenRep
        where
        {
        graph ?g
        {
        {
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep} .
            { ?n_form a ontolex:Form} .
        }
        }
        }
        LIMIT 10000"""
    return query

## Corpus search

* Run the cell below to show the UI, and fill in your search query

In [72]:
#from chaininglib import ui

# Create corpus UI, creates references to field contents
corpusQueryField, corpusField = create_corpus_ui()

VBox(children=(Text(value='[lemma="boek" & pos="verb"]', description='<b>CQL query:</b>'), Dropdown(descriptio…

 * Click the cell below and press Run to perform the given query

In [73]:
#from chaininglib import search

query= corpusQueryField.value
corpus = corpusField.value
df_corpus = search_corpus(query,corpus)
display(df_corpus)



Unnamed: 0,left context,word 0,right context
0,of Imam Zij staat te,boek,als het eerste zwarte topmodel
1,Rihanna 4 Ze staat te,boek,als het eerste zwarte model
2,te pakken Gilaard staat te,boek,als een goed militair maar
3,verklaard Scotto d'Abusco staat te,boek,als een beloftevolle Italiaan van
4,als La Gioconda staat te,boek,als het portret van de
5,dat niet Beatrix staat te,boek,als een eigengereide koppige koningin
6,realpolitische belegging staat vandaag te,boek,als een van hun grootste
7,en biograaf Bastet staat te,boek,als een groot kenner van
8,Il Magnifico 1449-1492 staat te,boek,als een van de grootste
9,zou zijn Danneels staat te,boek,als een van de intelligentere


## Lexicon search

* Run the cell below to show the UI, and fill in your search query in the UI

In [134]:
#from chaininglib import ui
searchWordField, lexiconField = create_lexicon_ui()

VBox(children=(Text(value='boek', description='<b>Word:</b>'), Dropdown(description='<b>Lexicon:</b>', index=2…

 * Click the cell below and press Run to perform the given query

In [147]:
#from chaininglib import queries, search

search_word = searchWordField.value
lexicon = lexiconField.value
# USER: can replace this by own custom query
query = lexicon_query(word=search_word, lexicon=lexicon)

df_lexicon = search_lexicon(query, lexicon)
df_columns_list = list(df_lexicon.columns.values)
df_lexicon_in_columns = df_lexicon[df_columns_list]
display(df_lexicon_in_columns)

Unnamed: 0,Gender,Number,hyphenation,lemEntryId,lemPos,lemma,wordform,wordformId,wordformPos
0,http://universaldependencies.org/u/feat/Gender.html#Masc,http://universaldependencies.org/u/feat/Number.html#Plur,gro/ten,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29607,http://universaldependencies.org/u/pos/NOUN,groot,groten,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/131193,http://universaldependencies.org/u/pos/NOUN
1,http://universaldependencies.org/u/feat/Gender.html#Masc,http://universaldependencies.org/u/feat/Number.html#Sing,groot,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29607,http://universaldependencies.org/u/pos/NOUN,groot,groot,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/99632,http://universaldependencies.org/u/pos/NOUN
2,,,gro/te/re,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,grotere,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/108065,http://universaldependencies.org/u/pos/ADJ
3,,,groot/ste,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,grootste,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/141124,http://universaldependencies.org/u/pos/ADJ
4,,,gro/ter,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,groter,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/182423,http://universaldependencies.org/u/pos/ADJ
5,,,grootst,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,grootst,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/28226,http://universaldependencies.org/u/pos/ADJ
6,,,groots,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,groots,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/356749,http://universaldependencies.org/u/pos/ADJ
7,,,gro/te,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,grote,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/81126,http://universaldependencies.org/u/pos/ADJ
8,,,groot,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29606,http://universaldependencies.org/u/pos/ADJ,groot,groot,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/99633,http://universaldependencies.org/u/pos/ADJ
9,,,groot,http://rdf.ivdnt.org/lexica/diamant/entry/molex/29607,http://universaldependencies.org/u/pos/NOUN,groot,groot,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/99632,http://universaldependencies.org/u/pos/NOUN


## Case study 1 (parallel): Frequency of *puur*+verb and *zuiver*+verb compared
* Below cell searches for *puur*+verb and for *zuiver*+verb in the CHN corpus
* Compare frequencies

In [5]:
#from chaininglib import search
from IPython.core.display import display, HTML

# Word 1: puur
word1= "puur"
df_corpus1 = search_corpus(r'[word="' + word1 + r'"][pos="verb"]',corpus="chn")
display(HTML('<b>' + word1 + '</b>'))
display(df_corpus1)

# Word 2: zuiver
word2 = "zuiver"
df_corpus2 = search_corpus(r'[word="' + word2 + r'"][pos="verb"]',"chn")
display(HTML('<b>' + word2 + '</b>'))
display(df_corpus2)

# Compute difference
diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
# Elements of 1 that are not in 2
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_left)))
# Elements of 2 that are not in 1
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_right)))
# Elements both in 1 and 2
display(HTML('Werkwoorden zowel voor <b>' + word1 + '</b> als voor <b>' + word2 + '</b>: ' + ", ".join(intersec)))

Unnamed: 0,left context,word 0,word 1,right context
0,dik tevreden Ze hebben echt,goed,gepresteerd,Dit had ik niet verwacht
1,zouden zij deze nog heel,goed,moeten,kunnen gebruiken Er zullen korte
2,gevraagd Ook de meningsuiting werd,goed,benut,door 22 deelnemers en er
3,drank betreft is de bar,goed,voorzien,De wand van de bar
4,Het team volgens de assistent-trainer,goed,gemotiveerd,Waar we kracht ontbreken pakken
5,vinden juist dat ze mekaar,goed,aanvullen,en er niets op elkaars
6,probleem is dat we niet,goed,luisteren,Hij geeft iedereen een boodschap
7,te zorgen dat de stembusgang,goed,gaat,Om zeven uur s avonds
8,carrièrepad heeft ze dan ook,goed,uitgestippeld,Ze is op het Ministerie
9,kulturu wenkri Een missie die,goed,aansluit,bij de huidige plannen rond


Unnamed: 0,left context,word 0,word 1,right context
0,baby Ik wil het contact,zuiver,beperken,tot de baby en het
1,zo zul je als mens,zuiver,moeten,zijn om in aanmerking te
2,adviezen van mensen Ik heb,zuiver,vastgelegd,wat ik zelf spraakmakend vond
3,karaoke je ding Daarvoor is,zuiver,zingen,niet nodig Dat is juist
4,aan als het gevoel erachter,zuiver,is,Met deze woorden besloot Loes
5,Maar als je je gevoelens,zuiver,houdt,en goede wensen creëert dan
6,Cronie maar die was te,zuiver,genomen,Bij The Scorpions benutten Misiedjan
7,Woudman op Cairo Deze werd,zuiver,genomen,door Dwight Tempico 2-1 Een
8,de leerlingen herkenbaar en is,zuiver,beweert,de directeur Ook enkele leerkrachten
9,bakzeil De strafschop was te,zuiver,genomen,


## Case study 2 (sequential): Retrieve synonyms from DiaMaNT, look up in Gysseling
* Below cell searches for term "boek" in DiaMaNT, and looks up all variants in Gysseling

In [9]:
search_word = "boek"
lexicon = "diamant"
corpus= "gysseling"

# First, lookup synonyms in DiaMaNT
query = lexicon_query(word=search_word, lexicon=lexicon)
df_lexicon = search_lexicon(query, lexicon)
syns = diamant_get_synonyms(df_lexicon) 
syns.add(search_word) # Also add search word itself
display(HTML('Synoniemen voor <b>' + search_word + '</b>: ' + ", ".join(syns)))

# Search for all synonyms in corpus
## Create queries: search by lemma
syns_queries = [corpus_query_lemma(syn) for syn in syns]
## Search for all synonyms in corpus
result_dict = search_corpus_multiple(syns_queries, corpus)
view_multiple_results(result_dict, labels=list(syns))



Unnamed: 0,left context,word 0,right context
0,.iiii. ghecorne gulde broeders die de,boeke,oudenden sin. si moghen elc
1,viere ghecorne guldebroeders die de,boeke,houden si moghen elc haren
2,.iiii. ghecorne guldebroeders die de,boke,ouden si moghen elc haren
3,secundi willelmus de lapide willelmus,boec,Jn elst. arnulphus de keelne
4,"Heren M CC LXXX, due wart det",buec,begonnen. Desen csens es mer
5,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
6,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
7,"van poschen, due wart dit",buc,begonnen. Dese pegte es mer
8,"van poschen, due wart dit",buc,begonnen. Dit blift den bruderen
9,"van poschen, due wart dit",buc,begonnen. Dit sin degene die


Unnamed: 0,left context,word 0,right context
0,Ende roofde den tempel ende,sloten toe.,Dar na saen so starf hi
1,want haer mont was sekerleke,toe ghesloten,so starkeleke dat menne met


Unnamed: 0,left context,word 0,right context
0,Voert; dat een weuera die,werc,"ghenoch heft ter volre weken,"
1,met den weueren die en ghen,werc,en hebben te weuene; hine
2,platse hout inde weke ende,werc,heft; hi es sculdech en
3,van vresen van vinders. Dat,werc,es verbord; Ende hi moet
4,dade weuen; hi verborde dat,werc,ende .iij. lb. So wat ambochts man;
5,dade weuen hie verbord dat,werc,ende .iij. lb. Dit mach
6,es .v. ᨣ. [2] Dat,werc,es sculdich te stane an
7,es .v. ᨣ. [2] Dat,werc,es sculdich te stane an
8,te enighes drapeniers huus; omme,weerc,te beiaghene. jof omme enighen
9,staet die die mester gheen,weerc,gheuen ne wille; hie moet


Unnamed: 0,left context,word 0,right context
0,uan haueleker scult ende negene,orconden,ne heuet; die beclaghet es
1,tue eruahtege man heuet in,orconden,hi sal winnen sinen houestoel.
2,yemene dinghet ende hem uermet,orconden,die hi iegenwordech heuet ende
3,cateil. Ende dinghet hi sonder,orconden;,deghene dar hi up dinghet
4,hebben tue eruahteghe man in,orconden.,ende si sullen hem helpen
5,hebbe tue eruahteghe man te,orconden,dat hem uergolden si. Neware
6,uan doder hant ende negene,orconden,ne heuet hi biede sinen
7,hebben tue eruahteghe man te,orconden.,Doed en uremde man enen
8,Tiemen hem meer dar neghene,orconden,ne sien; met enen eruahtegen
9,dan enen eruahteghen man in,orconden;,die orconde moet sueren binnen


Unnamed: 0,left context,word 0,right context
0,didscher talen ende ic vten,texte,van den vire ewangelisten makde
1,in vele staden es de,text,van der ewangelien also donker
2,vele meerre sijn dan de,text,van der ewangelien alte male. Ende
3,didsche alse si in den,texte,"gescreuen sijn, so bleue dit"


Unnamed: 0,left context,word 0,right context
0,ende met .iiij. draden roder,lijsten,in .ij. euelten die hier ieghen
1,lib. het ne ware die,lijste,vanden lakene. Vort eist ghecuert
2,ᨮᨬ. het ne ware die,lijste,vanden lakene [3] vort es
3,langher ende dat met ere,lijste,Ende in so wat cammen dat
4,van wits te moreideine die,lijste,ne laghe binnen diere ieghen dade
5,.xxx. pond ende met .ij.,lijsten,.ix. drade vp elken egh
6,drade vp elken egh blawer,lijsten,ende der in ghesceert diere ieghen dade
7,.xxviij. pond met .i. roder,lijste,Ende dese vorseide saye salmen
8,elc warpin say ene blaeuwe,"lijste,",ende tweueline ne ghene. Ende so wie
9,ne ghene. Ende so wie die blaeuwe,lijste,scerde an weuelin say ouer


Unnamed: 0,left context,word 0,right context
0,van der vorseider stede. ten,ghewarke,boef. van der fermerien van
1,graf. Ende gheft aldaer ten,ghewerke.,vander kerken vijf pont vlaemsche.
2,deelne. vord gaf soe den,ghewerke,van onser vrouwen kerke. twintich
3,kerke. twintich sceleghe vlaemsche. den,ghewerke,van sinte saluators kerke. twintich
4,sceleghe vlaemsche. vord te elken,ghewerke,van allen kerken die binne
5,ende xv. ᨮᨬ te haren,"ghewerke,",Vort gheuic broder pauwels vanine
6,Tonser vrouwen in brugghe ten,ghewerke,xl. ᨣ. Aldar tsinte saluatoers
7,saluatoers xl. ᨣ also. Ten,ghewerke,te sinte baues bute brugghe
8,engeen man en ware met,gewerke.,hi en waert selue dan
9,wille te wederstane Ende gode,ghewerke,in elke stede Te begharne


Unnamed: 0,left context,word 0,right context
0,uan haueleker scult ende negene,orconden,ne heuet; die beclaghet es
1,tue eruahtege man heuet in,orconden,hi sal winnen sinen houestoel.
2,yemene dinghet ende hem uermet,orconden,die hi iegenwordech heuet ende
3,cateil. Ende dinghet hi sonder,orconden;,deghene dar hi up dinghet
4,hebben tue eruahteghe man in,orconden.,ende si sullen hem helpen
5,hebbe tue eruahteghe man te,orconden,dat hem uergolden si. Neware
6,uan doder hant ende negene,orconden,ne heuet hi biede sinen
7,hebben tue eruahteghe man te,orconden.,Doed en uremde man enen
8,Tiemen hem meer dar neghene,orconden,ne sien; met enen eruahtegen
9,dan enen eruahteghen man in,orconden;,die orconde moet sueren binnen


Unnamed: 0,left context,word 0,right context
0,Ende roofde den tempel ende,sloten toe.,Dar na saen so starf hi
1,want haer mont was sekerleke,toe ghesloten,so starkeleke dat menne met


Unnamed: 0,left context,word 0,right context
0,.iiii. ghecorne gulde broeders die de,boeke,oudenden sin. si moghen elc
1,viere ghecorne guldebroeders die de,boeke,houden si moghen elc haren
2,.iiii. ghecorne guldebroeders die de,boke,ouden si moghen elc haren
3,secundi willelmus de lapide willelmus,boec,Jn elst. arnulphus de keelne
4,"Heren M CC LXXX, due wart det",buec,begonnen. Desen csens es mer
5,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
6,"van poschen, due wart det",buec,begonnen. Desen pagt es mer
7,"van poschen, due wart dit",buc,begonnen. Dese pegte es mer
8,"van poschen, due wart dit",buc,begonnen. Dit blift den bruderen
9,"van poschen, due wart dit",buc,begonnen. Dit sin degene die


Unnamed: 0,left context,word 0,right context
0,didscher talen ende ic vten,texte,van den vire ewangelisten makde
1,in vele staden es de,text,van der ewangelien also donker
2,vele meerre sijn dan de,text,van der ewangelien alte male. Ende
3,didsche alse si in den,texte,"gescreuen sijn, so bleue dit"


## Case study (parallel) 3: Find corpus words not in lexicon; list most frequent ones.
* Only parallel if you can ask the lexicon a list of all words.
* Currently only working: ask DiaMaNT list of words (limited at 10000)

In [22]:
# Query lexicon to give list of all words
lexicon="diamant"
query = lexicon_query_allwords(lexicon)
df_lexicon = search_lexicon(query, lexicon)
display(df_lexicon)

Unnamed: 0,n_ontolex_writtenRep
0,gaardenaar
1,gardeniers
2,gairdenairs
3,gardenaer
4,gaerdenaer
5,gaerdenere
6,onwetig
7,onwetich
8,onwettech
9,sturing
