# Chaining search



## Sphinx documentation: https://pythonhosted.org/an_example_pypi_project/sphinx.html
## TODO: include handy Python functions in the examples,
## such as `.sort_values(ascending=False, by=['raw_freq'])`, `list`, etc.


## Corpus search

* Run the cell below to show the UI, and fill in your search query

In [None]:
from chaininglib.ui.search import create_corpus_ui

# Build the corpus search UI. The two returned field objects are kept so that
# the next cell can read the entered query and the chosen corpus from their
# .value attributes.
corpusQueryField, corpusField = create_corpus_ui()

 * Click the cell below and press Run to perform the given query

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.ui.dfui import display_df

# Read the corpus name and the query pattern from the UI fields created above
corpus_name = corpusField.value
query = corpusQueryField.value

# Configure the corpus query first, then run it and fetch the results
cq = create_corpus(corpus_name).pattern(query).method("blacklab")
df_corpus = cq.search().kwic()

# Alternative (presumably loads earlier saved results instead of querying):
#df_corpus = load_dataframe('mijn_resultaten.csv')

display_df(df_corpus, labels="Results")



## Lexicon search

* Run the cell below to show the UI, and fill in your search query in the UI

In [None]:
from chaininglib.ui.search import create_lexicon_ui

# Build the lexicon search UI and keep both field objects; the next cell reads
# the entered search word and the chosen lexicon from their .value attributes.
searchWordField, lexiconField = create_lexicon_ui()

 * Click the cell below and press Run to perform the given query

In [None]:
from chaininglib.search.LexiconQuery import *
from chaininglib.ui.dfui import display_df

# Pick up the lexicon and the search word entered in the UI above
lexicon_name = lexiconField.value
search_word = searchWordField.value

# USER: the lemma query below can be replaced by your own custom query
lex = create_lexicon(lexicon_name).lemma(search_word)
lex = lex.search()

# Fetch the results and show them
df_lexicon = lex.kwic()
display_df(df_lexicon)

# Optional: display the results with an explicit list of columns
#df_columns_list = list(df_lexicon.columns.values)
#df_lexicon_in_columns = df_lexicon[df_columns_list]
#display(df_lexicon_in_columns)

## Case study 1 (parallel): Frequency of *puur*+verb and *zuiver*+verb compared
* Below cell searches for *puur*+verb and for *zuiver*+verb in the CHN corpus
* Compare frequencies

In [None]:
#from chaininglib import search
from IPython.core.display import display, HTML
from chaininglib.search.CorpusQuery import *
from chaininglib.ui.dfui import display_df
from chaininglib.utils.dfops import column_difference

# Search the CHN corpus for each of the two words immediately followed by a verb.

# Word 1: puur
word1 = "puur"
cq1 = create_corpus("chn").pattern(r'[word="' + word1 + r'"][pos="verb"]')
df_corpus1 = cq1.search().kwic()
display_df(df_corpus1, word1)

# Word 2: zuiver
word2 = "zuiver"
cq2 = create_corpus("chn").pattern(r'[word="' + word2 + r'"][pos="verb"]')
df_corpus2 = cq2.search().kwic()
display_df(df_corpus2, word2)

# Compare the "word 1" columns of both result sets
# (presumably the second token of each hit, i.e. the verb -- verify against the KWIC layout)
diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])

# Elements of 1 that are not in 2
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_left)))
# Elements of 2 that are not in 1 (the label names word2 first, matching the list shown)
display(HTML('Werkwoorden voor <b>' + word2 + '</b> niet in <b>' + word1 + '</b>: ' + ", ".join(diff_right)))
# Elements both in 1 and 2
display(HTML('Werkwoorden zowel voor <b>' + word1 + '</b> als voor <b>' + word2 + '</b>: ' + ", ".join(intersec)))

## Case study 2 (sequential): Retrieve synonyms from DiaMaNT, look up in Gysseling
* Below cell searches for term "boek" in DiaMaNT, and looks up all variants in Gysseling

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *
from IPython.core.display import display, HTML
from chaininglib.search.corpusQueries import corpus_query
from chaininglib.process.lexicon import get_diamant_synonyms
from chaininglib.ui.dfui import display_df

lexicon_name = "diamant"
corpus = "gysseling"
search_word = "boek"

# Step 1 (lexicon): look the search word up in the lexicon and derive its synonyms
lq = create_lexicon(lexicon_name).lemma(search_word).search()
df_lexicon = lq.kwic()
syns = get_diamant_synonyms(df_lexicon)
# The search word itself has to be looked up in the corpus as well
syns.add(search_word)
synonyms_html = 'Synoniemen voor <b>' + search_word + '</b>: ' + ", ".join(syns)
display(HTML(synonyms_html))

# Step 2 (corpus): build one lemma query per synonym and search them all
syns_queries = [corpus_query(lemma=syn) for syn in syns]
cq = create_corpus(corpus).pattern(syns_queries)
df = cq.search().kwic()
display_df(df)



## Case study 3: Build a frequency list of the lemma of some corpus output

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.process.corpus import *
from chaininglib.ui.dfui import *

# Corpus search: request part-of-speech NOUN with detailed context
corpus_to_search = "chn"
cq = create_corpus(corpus_to_search).detailed_context(True).pos("NOUN")
df_corpus = cq.search().kwic()
display_df(df_corpus)

# Turn the corpus results into a frequency table and show it
freq_df = get_frequency_list(df_corpus)
display_df(freq_df)

## Case study (sequential) 4: Find occurrences of attributive adjectives not ending with -e, even though they are preceded by a definite article

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *
from chaininglib.utils.dfops import df_filter
from chaininglib.ui.dfui import display_df

corpus_to_search = "opensonar"
lexicon_to_search = "molex"

# CORPUS: get [article + attributive adjective + noun] combinations in which the
# adjective does not end with -e. The pattern additionally restricts the
# adjective to words starting with "g".
print('Get occurences of attributive adjectives not ending with -e')
cq = create_corpus(corpus_to_search).pattern(r'[lemma="de|het"][word="^g(.+)[^e]$" & pos="ADJ"][pos="NOUN"]')
df_corpus = cq.search().kwic()

# LEXICON: get adjectives whose lemma matches the same pattern (no final -e).
# search() is called exactly once here; the results are then read with kwic().
lq = create_lexicon(lexicon_to_search).lemma('^g(.+)[^e]$').pos('ADJ').search()
df_lexicon = lq.kwic()

# LEXICON: keep the entries whose wordform does end in -e
print('Filtering lexicon results')
final_e_condition = df_filter(df_lexicon["wordform"], 'e$')
df_lexicon_form_e = df_lexicon[final_e_condition]

# RESULT: keep the corpus hits whose adjective ("word 1" column) equals the lemma
# of one of the lexicon entries that has a wordform in -e
print('List of attributive adjectives not ending with -e even though they should have a final -e:')
e_forms = set(df_lexicon_form_e.lemma)
no_final_e_condition = df_filter(df_corpus["word 1"], query=e_forms, method="isin")
result_df = df_corpus[no_final_e_condition]
display_df(result_df)

## Case study (sequential) 5: (morphosyntactic lexicon and possibly unannotated corpus) Look up inflected forms and spelling variants for a given lemma in a corpus

In [None]:
from chaininglib.ui.dfui import display_df
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *

corpus_to_search = "chn"
lexicon_to_search = "molex"

# TODO: do the same for several lemmata at once, with grouped results

lemma_to_look_for = "denken"

# LEXICON: look up the lemma in the lexicon and show what was found
lq = create_lexicon(lexicon_to_search).lemma(lemma_to_look_for).search()
df_lexicon = lq.kwic()
display_df(df_lexicon)

# Collect the wordforms of the lexicon results and join them into one alternation
inflected_wordforms = list(df_lexicon.wordform)
wordforms_alternation = r"|".join(inflected_wordforms)

# CORPUS: look up the inflected forms in a (possibly unannotated) corpus.
# Beware: in an unannotated corpus all we can do is search for the wordforms;
# in a lemmatized corpus the lemma is added to the query as well, so that only
# hits belonging to the requested lemma are retrieved.
annotated_corpus = True
if annotated_corpus:
    query = r'[lemma="' + lemma_to_look_for + r'" & word="' + wordforms_alternation + r'"]'
else:
    query = r'[word="' + wordforms_alternation + r'"]'

cq = create_corpus(corpus_to_search).pattern(query).search()
df_corpus = cq.kwic()
display_df(df_corpus)

## Case study 6:
## Build a function with which we can gather all lemmata
## of a lexicon with a given part-of-speech,
## and use that function to build a frequency list of those lemmata in a corpus

In [None]:
from chaininglib.search.LexiconQuery import *
from chaininglib.search.CorpusQuery import *
from chaininglib.process.corpus import get_frequency_list
from chaininglib.ui.dfui import display_df
import numpy as np


# build a function as required. We will run it afterwards

def get_frequency_list_given_a_corpus(lexicon, pos, corpus):
    """Build a frequency list of a lexicon's lemmata as they occur in a corpus.

    The lexicon is queried for all entries with the given part-of-speech. The
    lower-cased 'writtenForm' values of the results are then looked up as
    lemmata in the corpus, in batches, and the 'lemma 0' column of all corpus
    hits is gathered into a one-column DataFrame ('lemma') that is handed to
    ``get_frequency_list``.

    Args:
        lexicon: lexicon name, passed to ``create_lexicon``.
        pos: part-of-speech used to filter the lexicon.
        corpus: corpus name, passed to ``create_corpus``.

    Returns:
        The result of ``get_frequency_list`` for the gathered lemma
        occurrences. If the corpus returned no hits at all, an empty 'lemma'
        DataFrame is passed to ``get_frequency_list`` instead of failing here.
    """
    # LEXICON: get a lemmata list to work with
    lq = create_lexicon(lexicon).pos(pos).search()
    df_lexicon = lq.kwic()

    # Put the results into a list, so we can loop through the found lemmata
    lexicon_lemmata_arr = [w.lower() for w in df_lexicon["writtenForm"]]

    # CORPUS: query the corpus batch by batch and collect the lemma column of
    # each batch; everything is concatenated once after the loop, which avoids
    # re-copying the growing result on every iteration.
    lemma_columns = []

    # It's a good idea to query more than one lemma at a time,
    # but not too many, otherwise the server will get overloaded!
    nr_of_lemmata_to_query_atonce = 100

    for i in range(0, len(lexicon_lemmata_arr), nr_of_lemmata_to_query_atonce):

        # slice to a small list of lemmata to query at once
        small_lemmata_arr = lexicon_lemmata_arr[i : i+nr_of_lemmata_to_query_atonce]

        # join the lemmata into one alternation to send them in a single query
        # NOTE(review): this inserts two backslashes before each single quote.
        # The recorded output of case study 8 in this notebook shows queries
        # containing such sequences being rejected by the server -- confirm
        # which escaping the query language expects.
        lemmata_list = "|".join(small_lemmata_arr).replace("'", "\\\\'")
        cq = create_corpus(corpus).pattern(r'[lemma="' + lemmata_list + r'"]').search()
        df_corpus = cq.kwic()

        # a batch without hits has nothing to contribute
        if len(df_corpus) > 0:
            lemma_columns.append(df_corpus["lemma 0"])

    # The column must be called 'lemma', as required by get_frequency_list
    if lemma_columns:
        df_full_list = pd.concat(lemma_columns).to_frame(name='lemma')
    else:
        df_full_list = pd.DataFrame(columns=['lemma'])

    # we're done with querying, build the frequency list now
    return get_frequency_list(df_full_list)

    
# Run the function defined above for part-of-speech "CONJ"
corpus_to_search = "chn"
lexicon = "molex"
pos = "CONJ"

freq_df = get_frequency_list_given_a_corpus(lexicon=lexicon, pos=pos, corpus=corpus_to_search)

# Show the resulting frequency list
display_df(freq_df)

## Case study 7: Build a frequency table of some corpus, based on lemma list of a given lexicon

In [None]:
from chaininglib.utils.dfops import get_rank_diff
from chaininglib.ui.dfui import display_df

# NOTE: this cell needs get_frequency_list_given_a_corpus(), which is defined in
# the previous case study -- run that cell first.

base_lexicon = "molex"
corpus_to_search1 = "opensonar"
corpus_to_search2 = "chn"


def _show_top25(df_frequency_list, chart_label):
    """Show the 25 lowest-'rank' rows as a table and the 25 highest 'token count' values as a chart."""
    df_top25_ascending = df_frequency_list.sort_values(ascending=True, by=['rank']).head(25)
    df_top25_descending = df_frequency_list.sort_values(ascending=False, by=['token count']).head(25)
    display_df(df_top25_ascending)
    display_df(df_top25_descending['token count'], labels=chart_label, mode='chart')


# Build, sort and display the frequency tables of the two corpora
df_frequency_list1 = get_frequency_list_given_a_corpus(base_lexicon, "NOUN", corpus_to_search1)
_show_top25(df_frequency_list1, 'chart df1')

df_frequency_list2 = get_frequency_list_given_a_corpus(base_lexicon, "NOUN", corpus_to_search2)
_show_top25(df_frequency_list2, 'chart df2')


# TODO: show the lemmata that are missing from table 1 or table 2

# Compute the rank differences of the lemmata between both frequency tables
df_rankdiffs = get_rank_diff(df_frequency_list1, df_frequency_list2)

# Table: the 25 rows with the lowest 'rank_diff' values
display_df(df_rankdiffs.sort_values(by=['rank_diff']).head(25))

# Chart: the 25 highest 'rank_diff' values
df_top25_descending = df_rankdiffs.sort_values(ascending=False, by=['rank_diff']).head(25)
display_df(df_top25_descending['rank_diff'], labels='chart large diff', mode='chart')

# Chart: the 25 lowest 'rank_diff' values
df_top25_ascending = df_rankdiffs.sort_values(ascending=True, by=['rank_diff']).head(25)
display_df(df_top25_ascending['rank_diff'], labels='chart small diff', mode='chart')

## Case study 8: search in a corpus for wordforms of a lemma which are not included in this lemma's paradigm in a lexicon

In [1]:
from chaininglib.search.LexiconQuery import *
from chaininglib.search.CorpusQuery import *
from chaininglib.ui.dfui import display_df



# Let's build a function to do the job:
# The function will require a lexicon name and a part-of-speech to limit the search to, and the name of a corpus to be searched.
# It will return a Pandas DataFrame associating lemmata to their paradigms ('known_wordforms' column) and
# missing wordforms found in the corpus ('unknown_wordforms' column).

def get_missing_wordforms(lexicon, pos, corpus):
    """Find corpus wordforms of lexicon lemmata that the lexicon does not list.

    The lexicon is queried for all entries with the given part-of-speech; the
    lower-cased 'writtenForm' values serve as the lemmata list. The corpus is
    queried for these lemmata in batches. For every lemma with corpus hits,
    the lower-cased corpus wordforms ('word 0' where 'lemma 0' equals the
    lemma) are compared with the lower-cased 'wordform' values the lexicon
    returns for that lemma and part-of-speech.

    Args:
        lexicon: lexicon name, passed to ``create_lexicon``.
        pos: part-of-speech, used both in the lexicon and in the corpus query.
        corpus: corpus name, passed to ``create_corpus``.

    Returns:
        A pandas DataFrame with columns 'lemma', 'pos', 'known_wordforms' and
        'unknown_wordforms' (the last two hold sets), indexed by lemma + pos
        (index name 'lemmata'). Only lemmata for which at least one unknown
        wordform was found are included.
    """
    print('Finding missing wordforms in a lexicon can take a long time...')

    # LEXICON: get a lemmata list having the given part-of-speech
    lq = create_lexicon(lexicon).pos(pos).search()
    df_lexicon = lq.kwic()

    # Lower-case the lemmata and drop duplicates (keeping order), so that each
    # lemma is processed once and yields at most one output row
    lexicon_lemmata_arr = list(dict.fromkeys(w.lower() for w in df_lexicon["writtenForm"]))

    # Output rows are collected here and turned into a DataFrame at the end
    output_columns = ['lemma', 'pos', 'known_wordforms', 'unknown_wordforms']
    output_keys = []
    output_rows = []

    # CORPUS: loop through the lemmata list, query the corpus for each batch,
    # and compute the paradigm differences between corpus and lexicon.

    # It's a good idea to work with more than one lemma at a time (speed)!
    nr_of_lemmata_to_query_atonce = 100

    for i in range(0, len(lexicon_lemmata_arr), nr_of_lemmata_to_query_atonce):

        # slice to a small list of lemmata to query at once
        small_lemmata_arr = lexicon_lemmata_arr[i : i+nr_of_lemmata_to_query_atonce]

        # join the lemmata into one alternation to send them in a single query
        # NOTE(review): this inserts two backslashes before each single quote.
        # The output recorded below this cell shows queries containing such
        # sequences being rejected by the server -- confirm which escaping the
        # query language expects.
        lemmata_list = "|".join(small_lemmata_arr).replace("'", "\\\\'")
        cq = create_corpus(corpus).pattern(r'[lemma="' + lemmata_list + r'" & pos="'+pos+'"]').search()
        df_corpus = cq.kwic()

        # nothing to compare if the corpus gave no results for this batch
        if len(df_corpus) == 0:
            continue

        for one_lemma in small_lemmata_arr:

            # gather the corpus wordforms of this lemma in a set
            corpus_lemma_filter = (df_corpus['lemma 0'] == one_lemma)
            corpus_wordforms = set(df_corpus[corpus_lemma_filter]['word 0'].str.lower())

            # without corpus wordforms there can be no unknown wordforms,
            # so the lexicon lookup can be skipped
            if not corpus_wordforms:
                continue

            # look up the known wordforms in the lexicon
            ql = create_lexicon(lexicon).lemma(one_lemma).pos(pos).search()
            df_known_wordforms = ql.kwic()

            # no lexicon paradigm to compare with
            if len(df_known_wordforms) == 0:
                continue

            known_wordforms = set(df_known_wordforms['wordform'].str.lower())

            # all the corpus wordforms that cannot be found among the lexicon wordforms
            unknown_wordforms = corpus_wordforms.difference(known_wordforms)
            if not unknown_wordforms:
                continue

            # The key consists of lemma + part-of-speech. Part-of-speech is
            # needed to distinguish homonyms with different grammatical
            # categories. A truly correct job would take glosses into account
            # too, but that is not done here.
            output_keys.append(one_lemma + pos)
            output_rows.append({
                'lemma': one_lemma,
                'pos': pos,
                'known_wordforms': known_wordforms,
                'unknown_wordforms': unknown_wordforms,
            })

    # only cases in which unknown wordforms were found have been collected
    return pd.DataFrame(output_rows, columns=output_columns, index=pd.Index(output_keys, name='lemmata'))


# Run the function defined above for part-of-speech "VERB"

base_lexicon="molex"
corpus_to_search="opensonar"

df = get_missing_wordforms(base_lexicon, "VERB", corpus_to_search)

# After such a heavy process, it's a good idea to save the results.
# index=False leaves the index out of the CSV file.

df.to_csv( "missing_wordforms.csv", index=False)

display_df(df)


Finding missing wordforms in a lexicon can take a long time...
[Fno viable alternative at input '[lemma="3d-printen|4d-printen|bbq\\'en|bmx\\'en|rt\\'en|twittervasten|aaien|aanaarden|aanbakken|aanbelanden|aanbelangen|aanbellen|aanbenen|aanbesteden|aanbetalen|aanbevelen|aanbidden|aanbieden|aanbijten|aanbinden|aanblaffen|aanblazen|aanblijven|aanblikken|aanboren|aanbouwen|aanbraden|aanbranden|aanbreien|aanbreken|aanbrengen|aandammen|aandampen|aandienen|aandijken|aandikken|aandoen|aandraaien|aandragen|aandraven|aandrijven|aandringen|aandrukken|aanduiden|aandurven|aanduwen|aandweilen|aaneenflansen|aaneengrenzen|aaneengroeien|aaneenhangen|aaneenknopen|aaneenpraten|aaneenrijgen|aaneenschakelen|aaneenschrijven|aaneensluiten|aaneensmeden|aanerven|aanfietsen|aanflitsen|aanfloepen|aanfruiten|aangaan|aangalopperen|aangapen|aangespen|aangeven|aangieten|aangloeien|aangooien|aangorden|aangrijnzen|aangrijpen|aangroeien|aanhaken|aanhalen|aanhangen|aanharken|aanhebben|aanhechten|aanheffen|aanhelen|aanh

[Fno viable alternative at input '[lemma="authenticeren|authentificeren|autoclaveren|autocrossen|autodaten|autodelen|autokamperen|autoklonen|automatiseren|automutileren|autopetten|autoracen|autorijden|autoriseren|autosurfen|avaleren|avanceren|aviveren|avondmalen|avonturen|azen|baanderen|baantjerijden|baanwielrennen|babbelen|babysitten|babyzwemmen|back-uppen|backen|backflippen|backpacken|backspacen|backspinnen|badderen|baden|badgen|badineren|badmintonnen|bagatelliseren|baggeren|bakenen|bakeren|bakkeleien|bakken|baksen|balanceren|balderen|balen|balkaniseren|balken|ballasten|ballen|balletdansen|ballonvaren|balloteren|balsemen|baltsen|balwerpen|bammen|bamzaaien|banaliseren|bandstoten|banen|banjeren|bankdrukken|banken|bankhangen|bankieren|banksparen|bankverzekeren|bankwerken|bankzitten|bannen|banvloeken|barbakotten|barbecueën|barebacken|baren|barplassen|barreren|barricaderen|barsten|baseballen|basejumpen|basen|baseren|bashen|basketballen|basketten|bassen|bastaarderen|bastioneren|baten|bati

[Fno viable alternative at input '[lemma="boren|borgen|borneren|borrelen|borstelen|borstvoeden|borstzwemmen|bosseleren|bossen|boswonen|botaniseren|botdotteren|boten|boteren|botoxen|botsen|bottelen|botten|bottlenecken|botvieren|boucharderen|bouderen|boulderen|bouncen|bouwen|bouwputgluren|bovenblijven|bovendrijven|bovenhalen|bovenkomen|bovenkopen|bovenliggen|bovenwinkelwonen|bowlen|boxhoppen|boycotten|brabbelen|braden|braderen|brainstormen|brainwashen|braiseren|braken|brallen|bramen|branden|brandingkanoën|brandmerken|brandschatten|brandschilderen|brandstichten|braseren|brassen|braveren|breakdancen|breakdansen|breedparkeren|breeuwen|breidelen|breien|breinbruisen|breinjoggen|breinstormen|breken|brengen|brevetteren|brevieren|bricoleren|bridgen|briefen|briefschrijven|briesen|brillen|britsen|brocheren|broddelen|broebelen|broeden|broeien|broekzakbellen|broezen|brokkelen|brokken|brommen|bronzen|bronzeren|broodroven|brossen|brouilleren|brouwen|browsen|bruiken|bruinbakken|bruinen|bruineren|bruis

[Fno viable alternative at input '[lemma="doctoreren|documenteren|doddelen|doden|doe-het-zelven|doedelen|doelen|doelpunten|doemdenken|doemen|doen|doezelen|doggydansen|dogmatiseren|dokken|dokteren|doldraaien|dolen|doleren|dollariseren|dollen|domeinkapen|domen|domesticeren|domiciliëren|domineren|dominoën|dommelen|dompelen|dompen|donderen|donderjagen|donderstenen|donderstralen|doneren|donkeren|doodbijten|doodblijven|doodbloeden|doodbranden|doodconcurreren|dooddoen|dooddrinken|dooddrukken|doodergeren|doodfluiten|doodgaan|doodgooien|doodhakken|doodhongeren|doodkappen|doodknijpen|doodknuffelen|doodknuppelen|doodlachen|doodleggen|doodliggen|doodlopen|doodmaken|doodmartelen|doodmeppen|doodpesten|doodpolderen|doodranselen|doodrijden|doodschamen|doodschieten|doodschoppen|doodschrikken|doodschudden|doodslaan|doodspuiten|doodsteken|doodtrappen|doodvallen|doodvechten|doodverklaren|doodverven|doodvriezen|doodwerken|doodzwijgen|doofpotten|dooien|doorademen|dooragenderen|doorakkeren|doorbehandelen|do

[Fno viable alternative at input '[lemma="fotograferen|fotokopiëren|fotoshoppen|fotozetten|fouilleren|fourneren|foutparkeren|foxtrotten|fracken|fractioneren|fraggen|fragmentariseren|fragmenteren|franchisen|frankeren|frapperen|fraseren|frauderen|frazelen|freaken|freefighten|freelancen|freeriden|freerunnen|freestylen|freestyleskiën|freewheelen|frequenteren|frescoschilderen|fretten|fretteren|frezen|frictioneren|friemelen|frijnen|frisbeeën|friseren|frituren|frommelen|fronsen|frontloaden|frotten|frotteren|fruiten|frummelen|frunniken|frustreren|frutselen|frutten|fröbelen|ftp\\'en|fucken|fuifroeien|fuiven|fulmineren|functioneren|fundamentaliseren|funderen|fundraisen|fungeren|funken|funshoppen|fuseren|fusilleren|fusioneren|futselen|fêteren|föhnen|gaaibollen|gaaischieten|gaan|gaarkoken|gaarsmoren|gaarstoven|gadeslaan|gaffelen|gaggelen|gakken|gallen|galmen|galonneren|galopperen|galvaniseren|gamen|gangbangen|gangmaken|gansrijden|gansslaan|ganstrekken|ganzenborden|gapen|gappen|garanderen|garen|ga

[Fno viable alternative at input '[lemma="indrammen|indrijven|indringen|indrinken|indrogen|indruisen|indrukken|indrummen|indruppelen|induceren|induffelen|induiken|industrialiseren|indutten|induwen|ineenduiken|ineenflansen|ineengrijpen|ineenknutselen|ineenkrimpen|ineenlopen|ineenschrompelen|ineenschuiven|ineenslaan|ineensteken|ineenstorten|ineenstrengelen|ineenstuiken|ineenvlechten|ineenvloeien|ineenvouwen|ineenzakken|ineenzetten|ineenzijgen|ineenzitten|inenten|inertiseren|infaden|infantiliseren|infecteren|infiltreren|inflateren|influenceren|influisteren|informatiseren|informeren|infrezen|infunderen|infuseren|ingaan|ingeven|ingieten|ingipsen|inglijden|inglippen|ingooien|ingraven|ingraveren|ingriffen|ingrijpen|ingroeien|ingroeven|inhaken|inhakken|inhalen|inhaleren|inhebben|inheien|inhiberen|inhouden|inhouwen|inhuizen|inhuldigen|inhuren|ininkten|initialiseren|initiëren|injagen|injecteren|inkaderen|inkakken|inkalven|inkankeren|inkapselen|inkepen|inkeren|inkerven|inkijken|inklappen|inklare

[Fno viable alternative at input '[lemma="leegeten|leeggieten|leeggooien|leeghalen|leegkieperen|leegkopen|leegleasen|leeglepelen|leeglopen|leegmaken|leegplukken|leegplunderen|leegpompen|leegrijden|leegroven|leegruimen|leegschenken|leegscheppen|leegschieten|leegschrapen|leegschudden|leegspuiten|leegstaan|leegstelen|leegstorten|leegstromen|leegtrekken|leegvissen|leegzuigen|leerlooien|leertouwen|leewieken|legaliseren|legateren|legen|legeren|leggen|legitimeren|legoën|leiden|leidinggeven|leken|lekken|lekkerbekken|lekprikken|lekrijden|lekschieten|lekslaan|leksteken|lellen|lemen|lenen|lengen|lenigen|lenzen|lepelen|leppen|leraren|leren|lernen|lesgeven|lessen|letsen|letten|letteren|lettergieten|letterzetten|letterziften|leunen|leunstoelreizen|leuren|leuteren|levelen|leven|levensloopsparen|leveren|lezen|liberaliseren|licenseren|licentiëren|lichten|liefhebben|liefhebberen|liefkozen|liegen|liften|ligfietsen|liggen|lijden|lijken|lijmen|lijndansen|lijnen|lijntekenen|lijntrekken|lijsten|lijstshoppen

[Fno viable alternative at input '[lemma="ogen|ohaën|oliën|omarmen|ombatterijen|omberen|ombinden|ombladeren|omblazen|omboeken|omboorden|ombouwen|ombrengen|ombuigen|omcirkelen|omdelen|omdenken|omdijken|omdoen|omdolen|omdonderen|omdopen|omdraaien|omdragen|omduikelen|omduwen|omdwalen|omfietsen|omflikkeren|omfloersen|omgaan|omgespen|omgeven|omgieten|omgooien|omgorden|omgrachten|omgraven|omgrenzen|omhakken|omhalen|omhangen|omhebben|omheinen|omhelzen|omhoogbrengen|omhoogdoen|omhoogdrijven|omhoogdrukken|omhoogduwen|omhooggaan|omhoogglijden|omhooggooien|omhooghalen|omhooghangen|omhoogheffen|omhooghijsen|omhooghouden|omhoogkijken|omhoogklappen|omhoogklimmen|omhoogkomen|omhoogkruipen|omhoogkrullen|omhooglopen|omhoognivelleren|omhoogpersen|omhoogpompen|omhoogrijden|omhoogschieten|omhoogschroeven|omhoogschuiven|omhoogslaan|omhoogspringen|omhoogspuiten|omhoogsteken|omhoogstijgen|omhoogstuwen|omhoogtakelen|omhoogtillen|omhoogtrekken|omhoogvallen|omhoogvliegen|omhoogvoeren|omhoogwerken|omhoogwijzen|

[Fno viable alternative at input '[lemma="opsmeren|opsmukken|opsmullen|opsnijden|opsnoeien|opsnoepen|opsnorren|opsnuiven|opsodemieteren|opsolferen|opsommen|opsouperen|opspannen|opsparen|opspatten|opspelden|opspelen|opspeuren|opspicen|opspieën|opspitsen|opspitten|opsplitsen|opspoelen|opsporen|opspringen|opspuiten|opspuwen|opstaan|opstapelen|opstappen|opstarten|opsteken|opstellen|opstijgen|opstijven|opstikken|opstoken|opstomen|opstoppen|opstormen|opstoten|opstoven|opstralen|opstrijken|opstromen|opstropen|opstuiken|opstuiten|opstuiven|opsturen|opstuwen|optakelen|optassen|optekenen|optellen|opteren|optiefen|optikken|optillen|optimaliseren|optimmeren|optomen|optooien|optoppen|optornen|optreden|optrekken|optrommelen|optuigen|optutten|opvallen|opvangen|opvaren|opvatten|opvegen|opveren|opverven|opvijzelen|opvissen|opvlammen|opvliegen|opvoeden|opvoeren|opvolgen|opvorderen|opvouwen|opvragen|opvreten|opvriezen|opvrijen|opvrolijken|opvullen|opwaaien|opwaarderen|opwachten|opwandelen|opwarmen|opwas

[Fno viable alternative at input '[lemma="recycleren|redden|redderen|redekavelen|reden|redeneren|redetwisten|redevoeren|redigeren|redirecten|redoubleren|redresseren|reduceren|reeuwen|referentiëren|refereren|reflecteren|reformeren|refreshen|refuseren|regarderen|regelen|regelruimen|regenen|regenereren|regeren|regionaliseren|regisseren|registeren|registreren|reglementeren|regresseren|regulariseren|reguleren|rehabiliteren|rehydreren|reien|reiken|reikhalzen|reilen|reinigen|reizen|rekenen|rekeningrijden|rekestreren|rekken|rekruteren|rekwestreren|rekwireren|relanceren|relateren|relativeren|relaxen|relaxeren|relayeren|releasen|releveren|relibeleggen|relinen|rellen|relokaliseren|relschoppen|remasteren|rembourseren|remediëren|remigreren|remiseren|remitteren|remixen|remmen|remodelleren|remonteren|remplaceren|remunereren|renationaliseren|renderen|rennen|renommeren|renonceren|renoveren|renseigneren|rentabiliseren|renten|rentenieren|renumereren|renvooieren|reorganiseren|repareren|repasseren|repatri

[Fno viable alternative at input '[lemma="smukken|smullen|smulpapen|smurfen|snaaien|snabbelen|snacken|snakken|snapchatten|snappen|snateren|snauwen|snebberen|sneeuwballen|sneeuwen|sneeuwruimen|sneeuwschoenlopen|sneeuwschoenwandelen|snellen|snellezen|snelschaken|snelwandelen|sneren|snerken|snerpen|sneukelen|sneuken|sneupen|sneuvelen|sneven|snibben|snieren|sniffen|snijden|snikken|snipperen|snoeien|snoeken|snoepen|snoeren|snoeven|snoezelen|snokken|snookeren|snoozen|snorkelen|snorkelskiën|snorken|snorren|snotteren|snowboarden|snowcatskiën|snowferen|snuffelen|snuffen|snuisteren|snuiten|snuiven|snurken|socialiseren|socializen|sodemieteren|sodomiseren|soebatten|soezen|sofisticeren|softballen|soggen|soigneren|solderen|solemniseren|soleren|solfegiëren|solidariseren|sollen|solliciteren|solveren|somberen|sombermansen|sommeren|sonderen|sonjabakkeren|soppen|sorteren|souffleren|soundchecken|soundmixen|souperen|souteneren|sovjetiseren|spaarbellen|spachtelputzen|spaden|spalken|spammen|spanen|spannen|s

[Fno viable alternative at input '[lemma="toesnellen|toesnijden|toesnoeren|toespelden|toespelen|toespitsen|toespreken|toespringen|toestaan|toestappen|toesteken|toestemmen|toestoppen|toestormen|toestromen|toesturen|toetakelen|toetasten|toeten|toeteren|toetreden|toetrekken|toetsen|toevallen|toeven|toevertrouwen|toevliegen|toevloeien|toevoegen|toevoeren|toevouwen|toewaaien|toewenden|toewenken|toewensen|toewerken|toewerpen|toewijden|toewijzen|toewuiven|toezeggen|toezenden|toezien|toezingen|toezwaaien|togen|toiletteren|tokkelen|tokken|tolereren|tolken|tollen|tomateren|tomen|toneelspelen|tonen|tongen|tongschrapen|tongworstelen|tongzoenen|tonifiëren|tonnen|tooien|toonzetten|toornen|toosten|top-downleren|topcoaten|tophoppen|toppen|toproeien|torderen|torenen|tormenteren|tornen|tornooien|torpederen|torsen|tortelen|tossen|totaliseren|toucheren|touperen|touren|touwen|touwklimmen|touwtjespringen|touwtrekken|toveren|traanogen|traceren|trachten|trainen|traineren|trakteren|traliën|trammen|trampolines

[Fno viable alternative at input '[lemma="vergruizen|verguizen|vergulden|vergunnen|verhaasten|verhagen|verhakkelen|verhakken|verhakselen|verhakstukken|verhalen|verhalvezolen|verhandelen|verhangen|verhapstukken|verharden|verharen|verhaspelen|verheerlijken|verheffen|verheimelijken|verhelderen|verhelen|verhelpen|verheugen|verhevigen|verhinderen|verhippen|verhitten|verhoeden|verhoeren|verhogen|verhollandsen|verhonderdvoudigen|verhongeren|verhopen|verhoren|verhouden|verhovaardigen|verhuizen|verhullen|verhuren|verhypothekeren|verifiëren|verijdelen|verindischen|verinnerlijken|verinnigen|verjagen|verjaren|verjongen|verkalken|verkankeren|verkassen|verkavelen|verkazen|verkennen|verkeren|verkerven|verketteren|verkiezen|verkijken|verkillen|verkindsen|verklanken|verklappen|verklaren|verkleden|verkleinen|verkletsen|verkleumen|verkleuren|verkleven|verklikken|verklinken|verkloeken|verkloten|verklungelen|verknallen|verknechten|verkneukelen|verkneuteren|verkniezen|verknippen|verknoeien|verknollen|verkn

[Fno viable alternative at input '[lemma="voortstormen|voortstrompelen|voortstuwen|voortsudderen|voortsukkelen|voorttrekken|voortvaren|voortverkopen|voortvertellen|voortvloeien|voortwerken|voortwoeden|voortwoekeren|voortzeggen|voortzetten|voortzeulen|voortzieken|vooruitbetalen|vooruitblikken|vooruitbranden|vooruitbrengen|vooruitdenken|vooruitgaan|vooruitgrijpen|vooruithelpen|vooruitkijken|vooruitkomen|vooruitlopen|vooruitmaken|vooruitrijden|vooruitschuiven|vooruitsnellen|vooruitspringen|vooruitsteken|vooruitsturen|vooruitwerken|vooruitwerpen|vooruitwijzen|vooruitzetten|vooruitzien|voorvallen|voorverkopen|voorvertonen|voorverwarmen|voorvoelen|voorwenden|voorwerpen|voorzeggen|voorzetten|voorzien|voorzingen|voorzitten|voorzuiveren|vorderen|vormen|vormgeven|vorsen|vossen|voteren|votsen|vousvoyeren|vouwen|vozen|vragen|vreemdgaan|vreten|vrezen|vriesdrogen|vriezen|vrijbuiten|vrijdenken|vrijen|vrijgeven|vrijhouden|vrijkomen|vrijkopen|vrijlaten|vrijlopen|vrijloten|vrijmaken|vrijpleiten|vrijroo

Unnamed: 0_level_0,lemma,pos,known_wordforms,unknown_wordforms
lemmata,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

## Case study 9: Train a tagger with data from an annotated corpus, and do something cool

In [None]:
from chaininglib.ui.dfui import display_df
from chaininglib.process.corpus import get_tagger
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *

import pandas as pd

base_lexicon = "molex"

# The word we are interested in, let's say: "loop"
some_word = "loop"

# Look the word up as a lemma in the lexicon and show the result
lq = create_lexicon(base_lexicon).lemma(some_word).search()
df_paradigm = lq.kwic()
display_df(df_paradigm)

# Gather occurrences of our word, with detailed context, from every available corpus
dfs_all_corpora = []

for one_corpus in get_available_corpora():
    print('querying '+one_corpus+'...')
    cq = create_corpus(one_corpus).word(some_word).detailed_context(True)
    df_corpus = cq.search().kwic()

    # keep the results of this corpus
    dfs_all_corpora.append(df_corpus)

# Get a tagger built from the collected corpus data
tagger = get_tagger(dfs_all_corpora)

# Use the tagger on a new sentence.
# tag() takes a list of tokens, e.g. tagger.tag(['today','is','a','beautiful','day'])
sentence = 'Mijn buurman kijkt door de loop van zijn geweer'
tokens = sentence.split()
tagged_sentence = tagger.tag(tokens)

print(tagged_sentence)


# Next step (not implemented here): lemmatize each occurrence of our lemma in new sentences

## Case study 10: Search in corpus and filter on metadata
First, we request all available metadata fields of the corpus. Then, we issue a search query, and request all metadata fields for the result. Finally, we filter on metadata values.

In [None]:
from chaininglib.search.metadata import get_available_metadata
from chaininglib.utils.dfops import df_filter
from chaininglib.ui.dfui import display_df
from chaininglib.search.CorpusQuery import *


corpus_name="zeebrieven"
query=r'[lemma="boek"]'
# Request all metadata fields from corpus
fields = get_available_metadata(corpus_name)
# Perform the query and request the fields listed under "document" for every result
c = create_corpus(corpus_name).pattern(query).extra_fields_doc(fields["document"]).search()
df_corpus = c.kwic()

# Filter on year: keep rows whose witnessYear_from, cast to int32, is > 1700
df_filter_year = df_corpus[df_corpus["witnessYear_from"].astype('int32') > 1700] 
display_df(df_filter_year, labels="After 1700")

# Filter on sender birth place Amsterdam
# NOTE(review): df_filter is called here with regex_or_set=..., while case study 4
# calls it with query=... and method=...; check which keyword names df_filter accepts.
condition = df_filter(df_corpus["afz_geb_plaats"], regex_or_set="Amsterdam")
df_filter_place = df_corpus[ condition ]
display_df(df_filter_place, labels="Sender born in Amsterdam")


# Frequencies for column afz_loc_plaats (sender location, going by the label below)
# NOTE(review): property_freq is not imported explicitly in this cell; confirm that
# the star import above provides it, otherwise this line raises a NameError.
df = property_freq(df_corpus,"afz_loc_plaats")
display_df(df, labels="Most frequent sender locations")

## Case study 11: Visualizing h-dropping

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.search.metadata import get_available_metadata
from chaininglib.ui.dfui import display_df

corpus_to_search = "chn"

# Request the available metadata fields, so they can be added to the results
fields = get_available_metadata(corpus_to_search)

# h-dropping: the lemma starts with h + a/e/o, but the word starts with the bare vowel
df_corpus1 = create_corpus(corpus_to_search).pattern(r'[lemma="h[aeo].*" & word="[aeo].*"]').extra_fields_doc(fields["document"]).search().kwic()
# normal: both the lemma and the word start with h + a/e/o.
# This result gets its own name, so it does not overwrite the h-dropping result.
df_corpus2 = create_corpus(corpus_to_search).pattern(r'[lemma="h[aeo].*" & word="h[aeo].*"]').extra_fields_doc(fields["document"]).search().kwic()

display_df(df_corpus1)
display_df(df_corpus2)

# Chart both result sets, grouped by the "Region" and "Date" columns
display_df(df_corpus1.groupby(["Region", "Date"]), labels="h-dropping", mode='chart')
display_df(df_corpus2.groupby(["Region", "Date"]), labels="normal", mode='chart')

## Case study 12: gather data from several corpora and generate a lexicon out of that

In [None]:
from chaininglib.ui.dfui import display_df
from chaininglib.process.corpus import extract_lexicon
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *

# Query every available corpus for the lemma "lopen" (with detailed context)
# and keep one result set per corpus
dfs_all_corpora = []

for one_corpus in get_available_corpora():
    print('querying '+one_corpus+'...')
    cq = create_corpus(one_corpus).lemma("lopen").detailed_context(True)
    df_corpus = cq.search().kwic()

    # keep the results of this corpus
    dfs_all_corpora.append(df_corpus)

# Extract a lexicon from all collected results and show it
extracted_lexicon = extract_lexicon(dfs_all_corpora, posColumnName="universal_dependency")
display(extracted_lexicon)

## Case study 13: search treebank with some pattern

In [None]:
from chaininglib.search.TreebankQuery import *


print("search...")

# The pattern to search for: a 'pp' node containing an 'ap' node that contains an 'np' node
xquery_pattern = "xquery //node[@cat='pp' and node[@cat='ap' and node[@cat='np']]]"
# Alternative pattern:
#xquery_pattern = "xquery //node[@cat='ap' and node[@cat='np']]"
tbq = create_treebank().pattern(xquery_pattern).search()

print("get XML...")

# The XML of the results (print it to inspect it)
xml = tbq.xml()
#print(xml)

print("get trees and their string representations...")

# Show the string form of every tree, with posTag=True
trees = tbq.trees()
for tree in trees:
    display(tree.toString(posTag=True))

# Finally show what results() returns
df = tbq.results()
display(df)