# Lesk algorithm
http://www.nltk.org/howto/wsd.html

In [1]:
from nltk.wsd import lesk

In [2]:
sent = 'a customer has a first name'.split(' ')
print(lesk(sent, 'have'))

Synset('rich_person.n.01')


In [3]:
from nltk.corpus import wordnet as wn

In [4]:
for ss in wn.synsets('have'):
    print(ss, ss.definition())

Synset('rich_person.n.01') a person who possesses great material wealth
Synset('have.v.01') have or possess, either in a concrete or an abstract sense
Synset('have.v.02') have as a feature
Synset('experience.v.03') go through (mental or physical states or experiences)
Synset('own.v.01') have ownership or possession of
Synset('get.v.03') cause to move; cause to be in a certain position or condition
Synset('consume.v.02') serve oneself to, or consume regularly
Synset('have.v.07') have a personal or business relationship with someone
Synset('hold.v.03') organize or be responsible for
Synset('have.v.09') have left
Synset('have.v.10') be confronted with
Synset('have.v.11') undergo
Synset('have.v.12') suffer from; be ill with
Synset('induce.v.02') cause to do; cause to act in a specified manner
Synset('accept.v.02') receive willingly something given or offered
Synset('receive.v.01') get something; come into possession of
Synset('suffer.v.02') undergo (as of injuries and illnesses)
Synset('ha

# ACL 2016 SuperSenses
https://github.com/UKPLab/acl2016-supersense-embeddings

# Rungsted
https://github.com/coastalcph/rungsted

# AMALGrAM 2.0
https://github.com/nschneid/pysupersensetagger

In [5]:
import spacy
import pandas as pd
import os
from nltk import tokenize

nlp = spacy.load("en_core_web_sm")

In [6]:
def get_supersenses(sent):
    doc = nlp(sent)
    target_file = '../../pysupersensetagger/input'

    if os.path.exists(target_file):
        os.remove(target_file)
        
    with open(target_file, 'a') as inp_file:
        for token in doc:
            inp_file.write(f'{token}\t{token.tag_}\n')
    
    
    os.system(f'cd {"/".join(target_file.split("/")[:-1])} && sh sst.sh {target_file.split("/")[-1]}')
    
    return pd.read_csv('../../pysupersensetagger/input.pred.tags', sep='\t', names=['token', 'lemma', 'POS-tag', 'MWE+supersense tag', 'MWE parent offset', 'MWE attachment strength', 'supersense label', 'sentence ID'])

In [7]:
def is_important_for_class(supersense_df):
    # Rule 1: there needs to be at least two nouns in the sentences
    if len(supersense_df[supersense_df['POS-tag'] == 'NN']) > 1:
        return True
    else:
        return False
    
def get_class_metadata(supersense_df):
    
    

def apply_bucketing(summary):
    sentences = tokenize.sent_tokenize(summary)
    
    sentences_supersenses = [get_supersenses(sentence) for sentence in sentences]
    
    bucketed_data = {
        'class': list(filter(lambda x: is_important_for_class(x), sentences_supersenses)),
    }
    
    metadata = {
        'class': map(lambda x: get_class_metadata(x), bucketed_data['class']),
    }
    
    return ({
        bucket: [' '.join(df['token'].values) for df in bucketed_data[bucket]]
        for bucket in bucketed_data.keys()
    }, )
    

IndentationError: expected an indented block (<ipython-input-7-c18d74828d13>, line 12)

In [None]:
apply_bucketing('A reservation is made for a specific time , date and number of people . The reservation also captures the name and phone number of the person making the reservation . Each reservation is assigned a unique reservation number .')

{'classes': ['A reservation is made for a specific time , date and number of people .',
  'The reservation also captures the name and phone number of the person making the reservation .',
  'Each reservation is assigned a unique reservation number .']}

In [None]:
tester = get_supersenses('A reservation is made for a specific time , date and number of people .')

In [None]:
supersense_df = tester.copy()

In [None]:
mask = (tester['POS-tag'] == 'DT')


import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp(' '.join(tester[~mask]['token'].to_list()))

for chunk in doc.noun_chunks:
    print(chunk.text)

reservation
specific time
date
number
people


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp('A reservation is made for a specific time , date and number of people . The reservation also captures the name and phone number of the person making the reservation . Each reservation is assigned a unique reservation number .')

for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

A reservation reservation nsubjpass made
a specific time time pobj for
date date conj time
number number conj date
people people pobj of
The reservation reservation nsubj captures
the name name dobj captures
phone number number conj name
the person person pobj of
the reservation reservation dobj making
Each reservation reservation nsubjpass assigned
a unique reservation number number dobj assigned


In [None]:
supersense_df = tester.copy()

prepositions = supersense_df[supersense_df['POS-tag'] == 'IN']

metadata = {
    'objects': []
}

# Check if there are groups of nouns
if len(prepositions) > 0:
    # Define target groups
    targets = ['NN', 'NNS']
        
    for pps in prepositions.iterrows():
        # If ending or beginning position, it can't be a group of nouns
        if not (pps[0] == 0 or pps[0] >= len(supersense_df) - 1):
            # Define surroundings
            preword = supersense_df.loc[pps[0] - 1]
            postword = supersense_df.loc[pps[0] + 1]
            
            # Check if surroundings are in target groups
            if preword['POS-tag'] in targets and postword['POS-tag'] in targets:
                # Add grouping to metadata
                metadata['objects'].append(preword.token + ' ' + pps[1].token + ' ' + postword.token)
                
                # Drop extracted groups from selection
                mask = supersense_df.index.isin(list(range(pps[0] -1, pps[0] + 2)))
                supersense_df = supersense_df[~mask]
        
    # Now get all other nouns and add to metadata
    nouns = supersense_df[supersense_df['POS-tag'].isin(targets)]
    metadata['objects'] = metadata['objects'] + nouns['token'].to_list()

{'objects': ['number of people', 'reservation', 'time', 'date']}


In [None]:
tester

Unnamed: 0,token,lemma,POS-tag,MWE+supersense tag,MWE parent offset,MWE attachment strength,supersense label,sentence ID
1,A,a,DT,O,0,,,
2,reservation,reservation,NN,O-COGNITION,0,,COGNITION,
3,is,be,VBZ,O-`a,0,,`a,
4,made,make,VBN,O-creation,0,,creation,
5,for,for,IN,O,0,,,
6,a,a,DT,O,0,,,
7,specific,specific,JJ,O,0,,,
8,time,time,NN,O-TIME,0,,TIME,
9,",",",",",",O,0,,,
10,date,date,NN,O-TIME,0,,TIME,


In [None]:
get_supersenses('A reservation is made for a specific time , date and number of people . The reservation also captures the name and phone number of the person making the reservation . Each reservation is assigned a unique reservation number .')

Unnamed: 0,token,lemma,POS-tag,MWE+supersense tag,MWE parent offset,MWE attachment strength,supersense label,sentence ID
1,A,a,DT,O,0,,,
2,reservation,reservation,NN,O-COGNITION,0,,COGNITION,
3,is,be,VBZ,O-`a,0,,`a,
4,made,make,VBN,O-creation,0,,creation,
5,for,for,IN,O,0,,,
6,a,a,DT,O,0,,,
7,specific,specific,JJ,O,0,,,
8,time,time,NN,O-TIME,0,,TIME,
9,",",",",",",O,0,,,
10,date,date,NN,O-TIME,0,,TIME,


### Use cases

In [None]:
tester = 'A user can buy a beverage, perform scheduled maintenance, make repairs and load items.'
get_supersenses(tester)

Unnamed: 0,token,lemma,POS-tag,MWE+supersense tag,MWE parent offset,MWE attachment strength,supersense label,sentence ID
1,A,a,DT,O,0,,,
2,user,user,NN,O-PERSON,0,,PERSON,
3,can,can,MD,O,0,,,
4,buy,buy,VB,O-possession,0,,possession,
5,a,a,DT,O,0,,,
6,beverage,beverage,NN,O-FOOD,0,,FOOD,
7,",",",",",",O,0,,,
8,perform,perform,VB,O-`a,0,,`a,
9,scheduled,schedule,VBN,O-social,0,,social,
10,maintenance,maintenance,NN,O-ACT,0,,ACT,


### Activities

In [None]:
tester = 'If the bill is higher than 3000 euros, the bill is sent to the finance department.'
get_supersenses(tester)

Unnamed: 0,token,lemma,POS-tag,MWE+supersense tag,MWE parent offset,MWE attachment strength,supersense label,sentence ID
1,If,if,IN,O,0,,,
2,the,the,DT,O,0,,,
3,bill,bill,NN,O-POSSESSION,0,,POSSESSION,
4,is,be,VBZ,O-stative,0,,stative,
5,higher,higher,JJR,O,0,,,
6,than,than,IN,O,0,,,
7,3000,3000,CD,O,0,,,
8,euros,euro,NNS,O-QUANTITY,0,,QUANTITY,
9,",",",",",",O,0,,,
10,the,the,DT,O,0,,,


### Classes

In [None]:
tester = 'Each person has name and birthdate and may work at a job.'
get_supersenses(tester)

Unnamed: 0,token,lemma,POS-tag,MWE+supersense tag,MWE parent offset,MWE attachment strength,supersense label,sentence ID
1,Each,each,DT,O,0,,,
2,person,person,NN,O-PERSON,0,,PERSON,
3,has,have,VBZ,O-stative,0,,stative,
4,name,name,NN,O-COMMUNICATION,0,,COMMUNICATION,
5,and,and,CC,O,0,,,
6,birthdate,birthdate,NN,O-COMMUNICATION,0,,COMMUNICATION,
7,and,and,CC,O,0,,,
8,may,may,MD,O,0,,,
9,work,work,VB,O-social,0,,social,
10,at,at,IN,O,0,,,


# BookNLP

In [7]:
!python3 -m pip install booknlp

Defaulting to user installation because normal site-packages is not writeable




In [40]:
from booknlp.booknlp import BookNLP
import pandas as pd
import numpy as np

In [51]:
model_params={
    "pipeline": "entity,supersense,event", 
    "model": "small"
}

booknlp=BookNLP("en", model_params)

def get_annotated_df(filename):
    # File identifier
    file_id = '.'.join(filename.split('.')[:-1])

    # Input file to process
    input_file=f'./../../PurePlainDataset/output/{filename}'

    # Output directory to store resulting files in
    output_directory=f'output/{file_id}/'

    # Publish results
    booknlp.process(input_file, output_directory, file_id)

    # Gather results
    df = pd.read_csv(f'{output_directory}{file_id}.tokens', sep='\t')

    # Gather supersenses
    supersenses = pd.read_csv(f'{output_directory}{file_id}.supersense', sep='\t')

    # Begin merging both dataframes
    df['supersense_category'] = ''

    # Loop over supersense df
    for index, row in supersenses.iterrows():
        # Add supersense to all rows in range of starting and ending tokens
        for token in range(row.start_token, row.end_token + 1):
            df.loc[df.token_ID_within_document == token, 'supersense_category'] = row.supersense_category
    
    # Gather entities
    supersenses = pd.read_csv(f'{output_directory}{file_id}.entities', sep='\t')

    # Begin merging entities
    df['prop'] = ''
    df['cat'] = ''
    df['COREF'] = np.nan

    # Loop over entity df
    for index, row in supersenses.iterrows():
        # Add entities to all rows in range of starting and ending tokens
        for token in range(row.start_token, row.end_token + 1):
            df.loc[df.token_ID_within_document == token, ['prop', 'cat', 'COREF']] = [row.prop, row['cat'], row.COREF]
    
    sent_groups = df.groupby('sentence_ID')
    return [sent_groups.get_group(x) for x in sent_groups.groups]

{'pipeline': 'entity,supersense,event', 'model': 'small'}
--- startup: 7.021 seconds ---


In [52]:
dfs = get_annotated_df('0000 - cctns.txt')

--- spacy: 0.816 seconds ---
--- entities: 7.964 seconds ---
--- quotes: 0.010 seconds ---
--- name coref: 0.001 seconds ---
--- TOTAL (excl. startup): 11.654 seconds ---, 5430 words
