# Chaining search



## Sphinx documentatie: https://pythonhosted.org/an_example_pypi_project/sphinx.html
## in voorbeelden handige python functies opnemen
## zoals ; .sort_values(ascending=False,by=['raw_freq']));  list enz


## Corpus search

* Run the cell below to show the UI, and fill in your search query

In [None]:
from chaininglib.ui.search import create_corpus_ui

# Create corpus UI, creates references to field contents
corpusQueryField, corpusField = create_corpus_ui()

 * Click the cell below and press Run to perform the given query

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.ui.dfui import display_df

#from chaininglib import search
query= corpusQueryField.value
corpus_name = corpusField.value
df_corpus = create_corpus(corpus_name).pattern(query).method("blacklab").results()
#df_corpus = load_dataframe('mijn_resultaten.csv')
display_df(df_corpus, labels="Results")



## Lexicon search

* Run the cell below to show the UI, and fill in your search query in the UI

In [None]:
from chaininglib.ui.search import create_lexicon_ui

#from chaininglib import ui
searchWordField, lexiconField = create_lexicon_ui()

 * Click the cell below and press Run to perform the given query

In [None]:
from chaininglib.search.LexiconQuery import *
from chaininglib.ui.dfui import display_df

search_word = searchWordField.value
lexicon_name = lexiconField.value
# USER: can replace this by own custom query
df_lexicon = create_lexicon(lexicon_name).lemma(search_word).results()
display_df(df_lexicon)
#df_columns_list = list(df_lexicon.columns.values)
#df_lexicon_in_columns = df_lexicon[df_columns_list]
#display(df_lexicon_in_columns)

## Case study 1 (parallel): Frequency of *puur*+verb and *zuiver*+verb compared
* Below cell searches for *puur*+verb and for *zuiver*+verb in the CHN corpus
* Compare frequencies

In [None]:
#from chaininglib import search
from IPython.core.display import display, HTML
from chaininglib.search.CorpusQuery import *
from chaininglib.ui.dfui import display_df
from chaininglib.utils.dfops import column_difference

# Word 1: puur
word1= "puur"
cq1 = create_corpus("chn").pattern(r'[word="' + word1 + r'"][pos="verb"]')
df_corpus1 = cq1.results()
display_df(df_corpus1, word1)

# Word 2: zuiver
word2 = "zuiver"
cq2 = create_corpus("chn").pattern(r'[word="' + word2 + r'"][pos="verb"]')
df_corpus2 = cq2.results()
display_df(df_corpus2, word2)

# Compute difference
diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
# Elements of 1 that are not in 2
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_left)))
# Elements of 2 that are not in 1
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_right)))
# Elements both in 1 and 2
display(HTML('Werkwoorden zowel voor <b>' + word1 + '</b> als voor <b>' + word2 + '</b>: ' + ", ".join(intersec)))

## Case study 2 (sequential): Retrieve synonyms from DiaMaNT, look up in Gysseling
* Below cell searches for term "boek" in DiaMaNT, and looks up all variants in Gysseling

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *
from IPython.core.display import display, HTML
from chaininglib.search.corpusQueries import corpus_query
from chaininglib.process.lexicon import get_diamant_synonyms
from chaininglib.ui.dfui import display_df

search_word = "boek"
lexicon_name = "diamant"
corpus= "gysseling"

# First, lookup synonyms in DiaMaNT
lq = create_lexicon(lexicon_name).lemma(search_word)
df_lexicon = lq.results()
syns = get_diamant_synonyms(df_lexicon) 
syns.add(search_word) # Also add search word itself
display(HTML('Synoniemen voor <b>' + search_word + '</b>: ' + ", ".join(syns)))

# Search for all synonyms in corpus
## Create queries: search by lemma
syns_queries = [corpus_query(lemma=syn) for syn in syns]
## Search for all synonyms in corpus
cq = create_corpus(corpus).pattern(syns_queries)
result_dict = cq.results()
display_df(result_dict, labels=list(syns))



## Case study 3: Build a frequency list of the lemma of some corpus output

In [8]:
from chaininglib.search.CorpusQuery import *
from chaininglib.utils.dfops import *
from chaininglib.ui.dfui import *

# do some corpus search

corpus_to_search="chn"
df_corpus = create_corpus(corpus_to_search).detailed_context(True).pos("NOUN").results()
display_df(df_corpus)

# compute and display a table of the frequencies of the lemmata

freq_df = get_frequency_list(df_corpus)
display_df(freq_df)

Unnamed: 0,lemma 0,universal_dependency 0,word 0,lemma 1,universal_dependency 1,word 1,lemma 2,universal_dependency 2,word 2,lemma 3,...,word 7,lemma 8,universal_dependency 8,word 8,lemma 9,universal_dependency 9,word 9,lemma 10,universal_dependency 10,word 10
0,1,NUM,1,kennen,VERB,Ken,je,PRON,je,het,...,Ravales?2,hoe,ADV,Hoe,worden,VERB,wordt,,,
1,robin,PROPN,Robin,ravales?2,PROPN,Ravales?2,hoe,ADV,Hoe,worden,...,de,volksmond,NOUN,volksmond,genoemd?3,NOUN,genoemd?3,in,ADP,In
2,worden,VERB,wordt,de,,de,palmentuin,NOUN,Palmentuin,in,...,In,welk,DET,welk,district,NOUN,district,zijn,VERB,is
3,de,,de,palmentuin,NOUN,Palmentuin,in,ADP,in,de,...,welk,district,NOUN,district,zijn,VERB,is,de,,de
4,de,,de,volksmond,NOUN,volksmond,genoemd?3,NOUN,genoemd?3,in,...,de,dichter,NOUN,dichter,michael,PROPN,Michael,slory,PROPN,Slory
5,in,ADP,In,welk,DET,welk,district,NOUN,district,zijn,...,Slory,geboren?4,PROPN,geboren?4,hoe,ADV,Hoe,worden,VERB,wordt
6,hoe,ADV,Hoe,worden,VERB,wordt,meerzorg,PROPN,Meerzorg,in,...,5,wie,PRON,Wie,zijn,VERB,was,de,,de
7,noemen,VERB,genoemd,5,SYM,5,wie,PRON,Wie,zijn,...,het,eerste,NUM,eerste,Jeugdparlement?6,NOUN,Jeugdparlement?6,hoe,ADV,Hoe
8,de,,de,voorzitter,NOUN,voorzitter,van,ADP,van,het,...,noemt,men,PRON,men,de,,de,indira,PROPN,Indira
9,men,PRON,men,de,,de,indira,PROPN,Indira,gandhiweg,...,Wanica,in,ADP,in,de,,de,volksmond?7,NOUN,volksmond?7


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

Unnamed: 0,lemmata,token count,perc,rank
0,de,17,0.155963,1
1,in,10,0.091743,2
2,zijn,6,0.055046,3
3,hoe,6,0.055046,3
4,volksmond,5,0.045872,5
5,worden,5,0.045872,5
6,van,4,0.036697,9
7,het,4,0.036697,9
8,genoemd?3,4,0.036697,9
9,welk,4,0.036697,9


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

## Case study (sequential) 4: Find occurences of attributive adjectives not ending with -e, even though they are preceeded by a definite article

In [9]:
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *
from chaininglib.utils.dfops import df_filter
from chaininglib.ui.dfui import display_df

corpus_to_search="opensonar"
lexicon_to_search="molex"

# CORPUS: get [article + attributive adjective + nouns] combinations in which the adjective does not end with -e
print('Get occurences of attributive adjectives not ending with -e')
cq = create_corpus(corpus_to_search).pattern(r'[lemma="de|het"][word="^g(.+)[^e]$" & pos="ADJ"][pos="NOUN"]')
df_corpus = cq.results()

# LEXICON: get adjectives the lemma of which does not end with -e
lq = create_lexicon(lexicon_to_search).lemma('^g(.+)[^e]$').pos('ADJ')
df_lexicon = lq.results()

# LEXICON: get adjectives having a final -e in definite attributive use
print('Filtering lexicon results')
final_e_condition = df_filter(df_lexicon["wordform"], 'e$')
df_lexicon_form_e = df_lexicon[ final_e_condition ]

# RESULT: get the records out of our first list in which the -e-less-adjectives match the lemma form of our last list
print('List of attributive adjectives not ending with -e even though they should have a final -e:')
e_forms = list(df_lexicon_form_e.lemma)
no_final_e_condition = df_filter(df_corpus["word 1"], e_forms, method="isin")
result_df = df_corpus[ no_final_e_condition ]
display_df( result_df )

Get occurences of attributive adjectives not ending with -e
Filtering lexicon results
List of attributive adjectives not ending with -e even though they should have a final -e:


Unnamed: 0,left context,lemma 0,universal_dependency 0,word 0,lemma 1,universal_dependency 1,word 1,lemma 2,universal_dependency 2,word 2,right context
3,de aandelen overnemen . Vanwaar,het,DET,het,groot,ADJ,groot,verschil,NOUN,verschil,? Dat ING een gezond
4,"manier doen , nu was",het,PRON,het,gewoon,ADJ,gewoon,zeeeeer,NOUN,zeeeeer,traaaaaaaaaaaaag .
5,"de goede wil hebben ,",het,DET,het,gezond,ADJ,gezond,verstand,NOUN,verstand,en bereidheid om moeite te
6,doet me toch twijfelen aan,het,DET,het,gezond,ADJ,gezond,verstand,NOUN,verstand,van Vlaanderen . [ /
9,absurde . Als oplossing voor,het,DET,het,groot,ADJ,groot,aantal,NOUN,aantal,mensen zou je mensen om


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

## Case study (sequential) 5: (morphosyntactic lexicon and possibly unannotated corpus) Look up inflected forms and spelling variants for a given lemma in a corpus

In [7]:
from chaininglib.ui.dfui import display_df
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *

lexicon_to_search="molex"
corpus_to_search="chn"

##############################################
# TODO  zelfde met meerdere lemmata en gegroepeerd 
##############################################

lemma_to_look_for="denken"

# LEXICON: Search for the inflected forms of a lemma in a morphosyntactic lexicon
lq = create_lexicon(lexicon_to_search).lemma(lemma_to_look_for)
df_lexicon = lq.results()
display_df(df_lexicon)

# Put all inflected forms into a list
inflected_wordforms = list(df_lexicon.wordform)

# CORPUS: Look up the inflected forms in a (possibly unannotated) corpus
# beware: If the corpus is not annotated, all we can do is searching for the inflected words
#         But if the corpus is lemmatized, we have to make sure we're retrieving correct data by specifying the lemma as well
annotated_corpus = True
query = r'[lemma="'+lemma_to_look_for+r'" & word="'+r"|".join(inflected_wordforms)+r'"]' if annotated_corpus else r'[word="'+r"|".join(inflected_wordforms)+r'"]'
cq = create_corpus(corpus_to_search).pattern(query)
df_corpus = cq.results() 
display_df(df_corpus)

Unnamed: 0,Number,hyphenation,lemEntryId,lemPos,lemma,wordform,wordformId,wordformPos
0,http://universaldependencies.org/u/feat/Number.html#Plur,den/ken,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denken,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256472,http://universaldependencies.org/u/pos/VERB
1,http://universaldependencies.org/u/feat/Number.html#Sing,dacht,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dacht,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256464,http://universaldependencies.org/u/pos/VERB
2,http://universaldependencies.org/u/feat/Number.html#Sing,denk,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denk,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/365942,http://universaldependencies.org/u/pos/VERB
3,http://universaldependencies.org/u/feat/Number.html#Plur,dach/ten,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dachten,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256462,http://universaldependencies.org/u/pos/VERB
4,http://universaldependencies.org/u/feat/Number.html#Sing,denkt,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denkt,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256476,http://universaldependencies.org/u/pos/VERB
5,http://universaldependencies.org/u/feat/Number.html#Sing,denk,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denk,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/804045,http://universaldependencies.org/u/pos/VERB
6,http://universaldependencies.org/u/feat/Number.html#Sing,dacht,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dacht,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256468,http://universaldependencies.org/u/pos/VERB
7,http://universaldependencies.org/u/feat/Number.html#Plur,dach/ten,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,dachten,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256461,http://universaldependencies.org/u/pos/VERB
8,http://universaldependencies.org/u/feat/Number.html#Plur,den/ken,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denken,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256470,http://universaldependencies.org/u/pos/VERB
9,http://universaldependencies.org/u/feat/Number.html#Sing,denkt,http://rdf.ivdnt.org/lexica/diamant/entry/molex/105055,http://universaldependencies.org/u/pos/VERB,denken,denkt,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/256477,http://universaldependencies.org/u/pos/VERB


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

Unnamed: 0,left context,lemma 0,universal_dependency 0,word 0,right context
0,het Caribisch Gebied heeft Ik,denken,VERB,denk,dat wij ook soortgelijke talenten
1,zich helemaal daarin vinden Hij,denken,VERB,denkt,dat zijn departement heel veel
2,in de regio De samenleving,denken,VERB,denkt,echter nog steeds dat het
3,Tijd om aan vakantiebesteding te,denken,VERB,denken,Misschien heeft u al duidelijke
4,Suriname of naar het buitenland,denken,VERB,Denkt,u wel aan uw medicatie
5,faciliteiten van het VCC Ik,denken,VERB,denk,dat na de oplevering 15
6,boven gebracht Een kleine misstap,denken,VERB,dacht,ik terwijl ik achteruit stapte
7,staan en niet na te,denken,VERB,denken,over je eigen sterfelijkheid.Nyiragongo zit
8,ook aan een belangrijk moment,denken,VERB,gedacht,De Anitri geloofshelden die dienstbaar
9,reële verwachtingen hebt dus niet,denken,VERB,denkt,dat je er door dit


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

## Case study 6:
## Build a function with which we can gather all lemmata 
## of a lexicon with a given part-of-speech,
## and use that function to build a frequecy list of those lemmata in a corpus

In [6]:
from chaininglib.search.LexiconQuery import *
from chaininglib.search.CorpusQuery import *
from chaininglib.process.corpus import get_frequency_list
from chaininglib.ui.dfui import display_df
import numpy as np


# build a function as required. We will run it afterwards

def get_frequency_list_given_a_corpus(lexicon, pos, corpus):
    
    # LEXICON: get a lemmata list to work with

    # query the lexicon
    lq = create_lexicon(lexicon).pos(pos)
    df_lexicon = lq.results()

    # Put the results into an array, so we can loop through the found lemmata
    lexicon_lemmata_arr = [w.lower() for w in df_lexicon["writtenForm"]]

    # Instantiate a DataFrame, in which we will gather all single lemmata occurences
    df_full_list = pd.DataFrame()


    # CORPUS: loop through the lemmata list, query the corpus with each lemma, and count the results

    # It's a good idea to query more than one lemma at at the time,
    # but not too many, otherwise the server will get overloaded!
    nr_of_lemmata_to_query_atonce = 100

    # loop over lemmata list 
    for i in range(0, len(lexicon_lemmata_arr), nr_of_lemmata_to_query_atonce):
        
        # slice to small array of lemmata to query at once
        small_lemmata_arr = lexicon_lemmata_arr[i : i+nr_of_lemmata_to_query_atonce] 

        # join set of lemmata to send them in a query all at once
        # beware: single quotes need escaping
        lemmata_list = "|".join(small_lemmata_arr).replace("'", "\\\\'")
        cq = create_corpus(corpus).pattern(r'[lemma="' + lemmata_list + r'"]')
        df_corpus = cq.results()

        # add the results to the full list
        df_full_list = pd.concat( [df_full_list, df_corpus["lemma 0"]] )     
        

    # make sure the columnswith contains the lemmata is called 'lemma', as it is required by the get_frequency_list function
    df_full_list.columns = ['lemma']

    # we're done with querying, build the frequency list now
    freq_df = get_frequency_list(df_full_list)

    return freq_df

    
# run it!

lexicon="molex"
corpus_to_search="chn"
pos="CONJ"

freq_df = get_frequency_list_given_a_corpus(lexicon, pos, corpus_to_search)

display_df(freq_df)

Unnamed: 0,lemmata,token count,perc,rank
0,en,4,0.4,1
1,al,1,0.1,4
2,als,1,0.1,4
3,dan,1,0.1,4
4,dat,1,0.1,4
5,dus,1,0.1,4
6,waar,1,0.1,4


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

## Case study 7: Build a frequency table of some corpus, based on lemma list of a given lexicon

In [None]:
from chaininglib.utils.dfops import get_rank_diff
from chaininglib.ui.dfui import display_df

# For this case study, we need to run the previous case study first, because it generates a function we need here

base_lexicon="molex"
corpus_to_search1="opensonar"
corpus_to_search2="chn"

# build frequency tables of two corpora

df_frequency_list1 = get_frequency_list_given_a_corpus(base_lexicon, "NOUN", corpus_to_search1)
# sort and display
df_top25_descending = df_frequency_list1.sort_values(ascending=False,by=['token count']).head(25)
df_top25_ascending =  df_frequency_list1.sort_values(ascending=True, by=['rank']).head(25)
display_df( df_top25_ascending )
print(type(df_top25_descending['token count']))
display_df( df_top25_descending['token count'], labels='chart df1', mode='chart' )

df_frequency_list2 = get_frequency_list_given_a_corpus(base_lexicon, "NOUN", corpus_to_search2)
# sort and display
df_top25_descending = df_frequency_list2.sort_values(ascending=False,by=['token count']).head(25)
df_top25_ascending =  df_frequency_list2.sort_values(ascending=True, by=['rank']).head(25)
display_df( df_top25_ascending )
display_df( df_top25_descending['token count'], labels='chart df2', mode='chart' )


# TODO: lemmata tonen die in 1 of 2 ontbreken

# compute the rank diff of lemmata in frequency tables

# sort and display
df_rankdiffs = get_rank_diff(df_frequency_list1, df_frequency_list2)

display_df(df_rankdiffs.sort_values(by=['rank_diff']).head(25))

df_top25_descending = df_rankdiffs.sort_values(ascending=False, by=['rank_diff']).head(25)
display_df( df_top25_descending['rank_diff'], labels='chart large diff', mode='chart' )

df_top25_ascending = df_rankdiffs.sort_values(ascending=True, by=['rank_diff']).head(25)
display_df( df_top25_ascending['rank_diff'], labels='chart small diff', mode='chart' )

## Case study 8: search in a corpus for wordforms of a lemma, which are not included in this lemma's paramadigm in a lexicon

In [None]:
from chaininglib.search.LexiconQuery import *
from chaininglib.search.CorpusQuery import *
from chaininglib.process.combined import get_missing_wordforms
from chaininglib.ui.dfui import display_df



# Let's build a function to do the job:
# The function will require a lexicon name and a part-of-speech to limit the search to, and the name of a corpus to be searched.
# It will return a Pandas DataFrame associating lemmata to their paradigms ('known_wordforms' column) and
# missing wordforms found in the corpus ('unknown_wordforms' column).

def get_missing_wordforms(lexicon, pos, corpus):    
    
    print('Finding missing wordforms in a lexicon can take a long time...');
    
    # LEXICON: 
    # get a lemmata list having a given part-of-speech
    
    lq = create_lexicon(lexicon).pos(pos)
    df_lexicon = lq.results()
    
    # Put the results into an array, so we can loop through the list of lemmata
    lexicon_lemmata_arr = [w.lower() for w in df_lexicon["writtenForm"]]
    
    # Prepare the output:
    # instantiate a DataFrame for storing lemmata and mssing wordforms
    df_enriched_lexicon = pd.DataFrame(index=lexicon_lemmata_arr, columns=['lemma', 'pos', 'known_wordforms', 'unknown_wordforms'])
    df_enriched_lexicon.index.name = 'lemmata'
    
    # CORPUS: 
    # loop through the lemmata list, query the corpus for each lemma, 
    # and compute paradigms differences between both

    
    # loop through the lemmata list
    # and query the corpus for occurances of the lemmata
    
    # It's a good idea to work with more than one lemma at the time (speed)!
    nr_of_lemmata_to_query_atonce = 100
    
    for i in range(0, len(lexicon_lemmata_arr), nr_of_lemmata_to_query_atonce):
        
        # slice to small array of lemmata to query at once
        small_lemmata_arr = lexicon_lemmata_arr[i : i+nr_of_lemmata_to_query_atonce]
        
        # join set of lemmata to send them in a query all at once
        # beware: single quotes need escaping
        lemmata_list = "|".join(small_lemmata_arr).replace("'", "\\\\'")
        cq = create_corpus(corpus).pattern(r'[lemma="' + lemmata_list + r'" pos="'+pos+'"]')
        df_corpus = cq.results()
        
        # if the corpus gave results,
        # query the lexicon for the same lemmata
        # and compare the paradigms!
        
        if (len(df_corpus)>0):
            for one_lemma in small_lemmata_set: 
                
                # look up the known wordforms in the lexicon
                ql = create_lexicon(lexicon).lemma(one_lemma).pos(pos)
                df_known_wordforms = ql.results()
                
                # we have a lexicon paradigm to compare, do the job now
                if (len(df_known_wordforms) != 0):
                    
                    # gather the lexicon wordforms in a set
                    known_wordforms = set( df_known_wordforms['wordform'].str.lower() )
                    
                    # gather the corpus wordforms (of the same lemma) in a set too
                    corpus_lemma_filter = (df_corpus['lemma 0'] == one_lemma)
                    corpus_wordforms = set( (df_corpus[ corpus_lemma_filter ])['word 0'].str.lower() )
                    
                    # Now compute the differences:
                    # gather in a set all the corpus wordforms that cannot be found in the lexicon wordforms 
                    unknown_wordforms = corpus_wordforms.difference(known_wordforms)

                    # If we found some missing wordforms, add the results to the output!
                    
                    if (len(unknown_wordforms) !=0):                        
                        # The index of our results will be a key consisting of lemma + part-of-speech
                        # Part-of-speech is needed to distinguish homonyms with different grammatical categories.
                        # Of course, we need to take glosses into account too to do a truely correct job
                        # But we didn't do it here
                        key = one_lemma + pos
                        df_enriched_lexicon.at[key, 'lemma'] = one_lemma
                        df_enriched_lexicon.at[key, 'pos'] = pos
                        df_enriched_lexicon.at[key, 'known_wordforms'] = known_wordforms
                        df_enriched_lexicon.at[key, 'unknown_wordforms'] = unknown_wordforms
                
    # return non-empty results, t.i. cases in which we found some wordforms
    return df_enriched_lexicon[ df_enriched_lexicon['unknown_wordforms'].notnull() ]


# Run the function!

base_lexicon="molex"
corpus_to_search="opensonar"

df = get_missing_wordforms(base_lexicon, "VERB", corpus_to_search)

# After such a heavy process, it's a good idea to save the results

df.to_csv( "missing_wordforms.csv", index=False)

display_df(df)


## Case study 9: Train a tagger with data from an annotated corpus, and do something cool

In [3]:
from chaininglib.ui.dfui import display_df
from chaininglib.process.corpus import get_tagger
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *

import pandas as pd

base_lexicon="molex"

# we have a given word, let's say: "loop"
some_word = "loop"

# get the paradigm of the lemma our word is a part of
l = create_lexicon(base_lexicon).lemma(some_word)
df_paradigm = l.results()
display_df(df_paradigm)

# gather some pattern including our word, out of annotated corpora
# here: DET + ADJ + 'loop'

dfs_all_corpora = []

for one_corpus in get_available_corpora():
    print('querying '+one_corpus+'...')
    c = create_corpus(one_corpus).word(some_word).detailed_context(True)
    df_corpus = c.results() 
    
    # store the results
    dfs_all_corpora.append(df_corpus)

# get a tagger trained with our corpus data
tagger = get_tagger(dfs_all_corpora)

# Use the trained tagger to tag unknown sentences
# The input must be like: tagger.tag(['today','is','a','beautiful','day'])

sentence = 'Mijn buurman kijkt door de loop van zijn geweer'
tagged_sentence = tagger.tag( sentence.split() )

print(tagged_sentence)


# Know we can lemmatize each occurence of our lemma in the new sentences

3


Unnamed: 0,Gender,Number,hyphenation,lemEntryId,lemPos,lemma,wordform,wordformId,wordformPos
0,http://universaldependencies.org/u/feat/Gender.html#Masc,http://universaldependencies.org/u/feat/Number.html#Sing,loop,http://rdf.ivdnt.org/lexica/diamant/entry/molex/45573,http://universaldependencies.org/u/pos/NOUN,loop,loop,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/89078,http://universaldependencies.org/u/pos/NOUN
1,http://universaldependencies.org/u/feat/Gender.html#Masc,http://universaldependencies.org/u/feat/Number.html#Plur,lo/pen,http://rdf.ivdnt.org/lexica/diamant/entry/molex/45573,http://universaldependencies.org/u/pos/NOUN,loop,lopen,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/156030,http://universaldependencies.org/u/pos/NOUN
2,http://universaldependencies.org/u/feat/Gender.html#Masc,http://universaldependencies.org/u/feat/Number.html#Sing,loop,http://rdf.ivdnt.org/lexica/diamant/entry/molex/45573,http://universaldependencies.org/u/pos/NOUN,loop,loop,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/89078,http://universaldependencies.org/u/pos/NOUN
3,,http://universaldependencies.org/u/feat/Number.html#Sing,loop,http://rdf.ivdnt.org/lexica/diamant/entry/molex/106637,http://universaldependencies.org/u/pos/VERB,lopen,loop,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/560193,http://universaldependencies.org/u/pos/VERB
4,,http://universaldependencies.org/u/feat/Number.html#Sing,loop,http://rdf.ivdnt.org/lexica/diamant/entry/molex/106637,http://universaldependencies.org/u/pos/VERB,lopen,loop,http://rdf.ivdnt.org/lexica/diamant/wordform/molex/822354,http://universaldependencies.org/u/pos/VERB


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='mijn_resultaten.csv'), Button(button_style='w…

querying chn...
querying opensonar...
querying zeebrieven...
querying gysseling...
querying nederlab...
[('Mijn', 'PRON OR DET'), ('buurman', 'SCONJ OR CCONJ'), ('kijkt', 'NOUN'), ('door', 'ADP'), ('de', 'DET'), ('loop', 'NOUN'), ('van', 'ADP'), ('zijn', 'DET'), ('geweer', 'NOUN')]


## Case study 10: Search in corpus and filter on metadata
First, we request all available metadata fields of the corpus. Then, we issue a search query, and request all metadata fields for the result. Finally, we filter on metadata values.

In [2]:
from chaininglib.search.metadata import get_available_metadata
from chaininglib.utils.dfops import df_filter
from chaininglib.ui.dfui import display_df
from chaininglib.search.CorpusQuery import *


corpus_name="zeebrieven"
query=r'[lemma="boek"]'
# Request all metadata fields from corpus
fields = get_available_metadata(corpus_name)
# Perform query and ask all metadata
c = create_corpus(corpus_name).pattern(query).extra_fields_doc(fields["document"]) 
df_corpus = c.results()

# Filter on year: > 1700
df_filter_year = df_corpus[df_corpus["witnessYear_from"].astype('int32') > 1700] 
display_df(df_filter_year, labels="After 1700")

# Filter on sender birth place Amsterdam
condition = df_filter(df_corpus["afz_geb_plaats"], regex_or_set="Amsterdam")
df_filter_place = df_corpus[ condition ]
display_df(df_filter_place, labels="Sender born in Amsterdam")


# Group by birth place
df = property_freq(df_corpus,"afz_loc_plaats")
display_df(df, labels="Most frequent sender locations")

Unnamed: 0,left context,lemma 0,universal_dependency 0,word 0,aantal_paginas,aantal_woorden,adr_beroep,adr_bijzonderheden,adr_geb_decennium,adr_geb_jaar,...,signatuur,status,subcorpus,title,trans_bestand,trans_datum,type_brief,witnessYear_from,witnessYear_to,right context
3,Schrijfpampier a 6 Sr t,boek,NOUN,boek.,2,440,,,unknown,unknown,...,HCA 30-322,6,"18V, 18Dbc","To J.H. Martens, 11 november 1780",07-01-2009 032-034-TR-def,2009-04-15 00:00:00,business,1780,1780,50 ditto ongsneeden ditto a
4,"voorne missive, dat UEDs mijn",boek,NOUN,boeken,2,291,secretaris,,1750,1751,...,HCA 30-323,6,18D,"To Gabriel Lixraaven, 10 januari 1781",08-01-2009 109-110-TR-def,2009-11-21 00:00:00,private,1781,1781,"bij berntrop laat verkoopen, als"
5,"verbeeld hebbe, dat wat geleerde",boek,NOUN,boeken,2,291,secretaris,,1750,1751,...,HCA 30-323,6,18D,"To Gabriel Lixraaven, 10 januari 1781",08-01-2009 109-110-TR-def,2009-11-21 00:00:00,private,1781,1781,"aan gaat, schouten altoos geprafereert"
6,"beijde, accuratesse goede order der",boek,NOUN,"boecken,",2,559,reder,,1750,1757,...,HCA 30-323,6,18D,"To David Wendorp, 16 december 1780",08-01-2009 133-135-TR-def,2009-09-09 00:00:00,private,1780,1780,"& Voorsigtigheijd, die prijse ik"
7,rollen matten MR & Eenige,boek,NOUN,boeken,3,844,,,1730,1736,...,HCA 30-323,6,18D,"To Magdalena Wendorp-Bagge, 16 december 1780",08-01-2009 149-151-TR-def,2009-06-10 00:00:00,private,1780,1780,Maar also die twee Scheepen
8,"te Senden, & wat de",boek,NOUN,boeken,3,844,,,1730,1736,...,HCA 30-323,6,18D,"To Magdalena Wendorp-Bagge, 16 december 1780",08-01-2009 149-151-TR-def,2009-06-10 00:00:00,private,1780,1780,aan betreft Zo Gelieft maar
9,laaten wagten tot ik mijn,boek,NOUN,boeken,2,630,,,unknown,unknown,...,HCA 30-319,6,"18S, 18A, 18Da","To Christiaan Frederik Coller, december 1780",1108-1110-TR-def,2009-01-22 00:00:00,private,1780,1780,"heeft, want weet anders niet"
10,& houw nu alle de,boek,NOUN,boeken,2,630,,,unknown,unknown,...,HCA 30-319,6,"18S, 18A, 18Da","To Christiaan Frederik Coller, december 1780",1108-1110-TR-def,2009-01-22 00:00:00,private,1780,1780,& bonte & die sijn
11,gantsche regel die in myn,boek,NOUN,Boek,3,1448,,,unknown,unknown,...,HCA 30-368,6,"18S, 18Da","To Christina Bakker-Smits, 26 september 1780",1156-1159-TR-def,2008-09-11 00:00:00,private,1780,1780,"niet te vinden is, en"
16,van concept en wilde syn,boek,NOUN,boeken,2,669,koopman,,unknown,unknown,...,HCA 30-353,6,18Dbc,"To Johan Godfried Meeler, 5 januari 1781",17-06-2009 324-326-TR-def,2009-10-22 00:00:00,business,1781,1781,absolut ter secretarij versorgen Eyndelyk




Unnamed: 0,left context,lemma 0,universal_dependency 0,word 0,aantal_paginas,aantal_woorden,adr_beroep,adr_bijzonderheden,adr_geb_decennium,adr_geb_jaar,...,signatuur,status,subcorpus,title,trans_bestand,trans_datum,type_brief,witnessYear_from,witnessYear_to,right context
0,heeft 4 gl 0 Aen,boek,NOUN,boeken,2,496,schipper,,1620,1623,...,HCA 32-1845-2,6,17B,"To Lukas Pruijs, 13 januari 1661",06-01-2009 243-245-TR,2009-05-11 00:00:00,private,1661,1661,en pampier en pennen en
1,lijeue man stelt alles te,boek,NOUN,boeck,1,900,schipper,,1620,1623,...,HCA 32-1845-2,6,"17A, 17B","To Lukas Pruijs, 6 juni 1664",06-01-2009 249-252-TR-def,2009-04-09 00:00:00,private,1664,1664,waet ghij uijt geft dat
4,"voorne missive, dat UEDs mijn",boek,NOUN,boeken,2,291,secretaris,,1750,1751,...,HCA 30-323,6,18D,"To Gabriel Lixraaven, 10 januari 1781",08-01-2009 109-110-TR-def,2009-11-21 00:00:00,private,1781,1781,"bij berntrop laat verkoopen, als"
5,"verbeeld hebbe, dat wat geleerde",boek,NOUN,boeken,2,291,secretaris,,1750,1751,...,HCA 30-323,6,18D,"To Gabriel Lixraaven, 10 januari 1781",08-01-2009 109-110-TR-def,2009-11-21 00:00:00,private,1781,1781,"aan gaat, schouten altoos geprafereert"
11,gantsche regel die in myn,boek,NOUN,Boek,3,1448,,,unknown,unknown,...,HCA 30-368,6,"18S, 18Da","To Christina Bakker-Smits, 26 september 1780",1156-1159-TR-def,2008-09-11 00:00:00,private,1780,1780,"niet te vinden is, en"
19,"ik u bij dese, 2",boek,NOUN,"Boecken,",1,351,onderschrijver,,1650,1652,...,HCA 30-228,6,17B,"To Martinus Bruno, 7 november 1672",3-1-2008 185-TR-def,2010-02-19 00:00:00,private,1672,1672,die bij mij gedrukt en
20,seggen dat in ander mans,boek,NOUN,bocken,2,367,,,unknown,unknown,...,HCA 30-642-1,6,17B,"To Elsje Schoonhoven, 9 december 1664",3b-1-2008 161-162-TR-def,2009-06-02 00:00:00,private,1664,1664,quadt studeren is voor waert
30,ik het wissiltie wegens die,boek,NOUN,boeken,1,229,kapitein,,1710,1718,...,HCA 30-336,6,"18A, 18Da","To Gerrit Harmeijer, 7 oktober 1780",KB 336-018-TR-def,2008-11-21 00:00:00,private,1780,1780,die onse Zoon Gerrit heeft
31,ik myn liede man wat,boek,NOUN,bokken,1,176,derde waak,,1740,1748,...,HCA 30-750,6,18Da,"To Christiaan de Cerff, december 1779",KB c14-1-c14-2-TR-def,2010-12-06 00:00:00,private,1779,1779,wat kastengen wat nuyten wat


HBox(children=(Label(value='Sla uw resultaten op:'), Text(value='SenderborninAmsterdam.csv'), Button(button_st…

NameError: name 'property_freq' is not defined

## Case study 11: Visualizing h-dropping

In [None]:
from chaininglib.search.CorpusQuery import *
from chaininglib.search.metadata import get_available_metadata
from chaininglib.ui.dfui import display_df

corpus_to_search="chn"

fields = get_available_metadata(corpus_to_search)


df_corpus1 = create_corpus(corpus_to_search).pattern(r'[lemma="h[aeo].*" & word="[aeo].*"]').extra_fields_doc(fields["document"]).results()
df_corpus1 = create_corpus(corpus_to_search).pattern(r'[lemma="h[aeo].*" & word="h[aeo].*"]').extra_fields_doc(fields["document"]).results()

display_df( df_corpus1)
display_df( df_corpus2)

display_df( df_corpus1.groupby(["Region", "Date"]), labels="h-dropping", mode='chart')
display_df( df_corpus2.groupby(["Region", "Date"]), labels="normal", mode='chart')

## Case study 12: gather data from several corpora and generate a lexicon out of that

In [1]:
from chaininglib.ui.dfui import display_df
from chaininglib.process.corpus import extract_lexicon
from chaininglib.search.CorpusQuery import *
from chaininglib.search.LexiconQuery import *

dfs_all_corpora = []

for one_corpus in get_available_corpora():
    print('querying '+one_corpus+'...')
    c = create_corpus(one_corpus).lemma("lopen").detailed_context(True)
    df_corpus = c.results() 
    
    # store the results
    dfs_all_corpora.append(df_corpus)

# extract lexicon and show the result
extracted_lexicon = extract_lexicon(dfs_all_corpora, posColumnName="universal_dependency")
display(extracted_lexicon)

querying chn...
querying opensonar...
querying zeebrieven...
querying gysseling...
querying nederlab...
extracting lexicon...


Unnamed: 0,lemma,universal_dependency,word
0,aan,ADP,an
1,aandoen,VERB,an ghedaen.
2,aankomen,VERB,koomen
3,aanlopen,VERB,aen
4,aanlopen,VERB,loopen
5,aarde,NOUN,arden
6,aarde,NOUN,erde
7,accijns,NOUN,assize
8,achtentwintigste,NUM,28
9,af,ADP,af
