In [1]:
import requests
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
# Free-text requirements description of the Romano's restaurant domain. This
# string is the raw input that is posted to the summarization service.
# NOTE(review): the text appears to contain transcription artifacts (e.g.
# "tal4 number(s)", "food item-is", "waiter ,bartender"); it is sent to the
# service exactly as written here — confirm whether that is intended.
input_text = '''
Romano's is the finest Italian restaurant in the city.
Unless you are a celebrity or a good friend of Romano you will need a reservation.
A reservation is made for a specific time, date and number of people.
The reservation also captures the name and phone number of the person making the reservation.
Each reservation is assigned a unique reservation number.
There are two categories of reservations at Romano's: individual reservations and banquet reservations.
Additional reservation information captured when an individual makes a reservation includes seating preference (inside or patio) and smoking preference (smoking or nonsmoking).
Additional reservation information captured for banquet reservations includes the group name and the method of payment.
Seating at Romano's is limited.
Romano's has a fixed number of tables.
Each table is identified by a unique table number.
Each of the tables is further described by a unique free form description such as "located by the North window", "located in front of the fountain", "by the kitchen door".
Each table is classified as a 2-person, 4-person or 6-person table.
When a reservation is made, Romano associates a specific tal4 number(s) to the reservation.
A table can be utilized many times over the evening by many reservations.
Romano tends to overbook tables.
Therefore, there can be overlapping table reservations.
The management structure at Romano's is hierarchical.
There are several restaurant managers who report to Romano.
The managers are responsible for managing the Maitre'd and the chefs as well as ensuring that the guests have a pleasant dining experience.
The Maitre'd is responsible for managing the waiters, bartenders and bus personnel.
The Chefs are responsible for managing the cooks and dishwashers.
Each person working for Romano's must be classified as either a manager, Maitre'd, waiter ,bartender, chef, cook, bus person or dishwasher.
Additional information maintained by Romano's for each person includes the persons name, date of birth and drivers license number.
When the reservation party arrives at Romano's the reservation is assigned to one waiter.
A waiter can be assigned to many reservations during the course of the evening.
The menu at Romano's is exquisite.
There are many exciting and exotic items.
Each menu item is identified by a unique menu item number.
Information maintained by Romano's for each menu item includes an item description of (e.g. "chicken marsala", "fish soup", "endive salad","1988 Merlot wine", etc.), and item prep time.
Each menu item is classified by Romano's as "appetizer", "entree", "dessert" or "beverage".
The price of each menu item can vary based on the time of day.
For example, some of the menu items have different lunch and dinner prices.
Some of the menu items change prices for happy hour.
In order to calculate the check at the end of the dinner, the waiter maintains a list, by reservation number, of the menu items ordered and the time that the menu item was ordered.
In other words, each reservation can be associated with many menu items and a menu item can be associated with many reservations.
In addition to menu items, Romano's maintains a list of the food items that are utilized by the restaurant such as chicken, mushrooms, bread sticks, red sauce, cream sauce, etc.
Food items are utilized in the preparation of menu items.
Each food item-is identified by a unique food item number.
'''

## Summarization

In [6]:
# Post the raw requirements text to the summarization service. The payload is
# sent form-encoded with the text under the 'target' key.
url = 'http://192.168.2.8:5000/summarize'
data = {'target': input_text}

summary_res = requests.post(url, data=data)
# Surface an HTTP failure as requests.HTTPError here, rather than as a
# JSONDecodeError/KeyError when the error body is parsed below.
summary_res.raise_for_status()
# The service answers with a JSON object whose 'summary' entry is kept.
summary = summary_res.json()['summary']

In [7]:
summary

['A reservation is made for a specific time , date and number of people .',
 'reservation also captures the name and phone number of the person making the reservation .',
 'Each reservation is assigned a unique reservation number .']

## Bucketing
### Supersense tagging

In [8]:
def get_supersenses(sentence, url='http://localhost:5001/tag', timeout=None):
    """Tag one sentence with the supersense tagging service.

    The sentence is POSTed form-encoded under the ``target`` key. The JSON
    response must contain a ``tags`` list of tab-separated lines; each line is
    split into the nine columns named below.

    Parameters
    ----------
    sentence
        Text to tag (sent as-is to the service).
    url : str, optional
        Endpoint of the tagging service. Defaults to the local tagger.
    timeout : float or None, optional
        Passed to ``requests.post``; ``None`` (the default) waits indefinitely,
        which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per tagged line with the columns ``word_no``, ``token``,
        ``lemma``, ``POS-tag``, ``extended_supersense``, ``skip1``, ``skip2``,
        ``supersense`` and ``skip3``. Rows without a ``token`` value are
        dropped.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    """
    tagging_res = requests.post(url, data={'target': sentence}, timeout=timeout)
    # Fail with the real HTTP error instead of a confusing JSON decode error.
    tagging_res.raise_for_status()
    tags = tagging_res.json()['tags']

    columns = ['word_no', 'token', 'lemma', 'POS-tag', 'extended_supersense',
               'skip1', 'skip2', 'supersense', 'skip3']
    splitted_tags = [line.split('\t') for line in tags]
    tag_df = pd.DataFrame(splitted_tags, columns=columns)

    # Lines that are too short to carry a token end up with a missing value in
    # that column; they are not real tokens, so filter them out.
    return tag_df[tag_df['token'].notna()]

# Tag every summary sentence in order; tqdm renders a notebook progress bar.
supersenses = list(map(get_supersenses, tqdm(summary)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [9]:
supersenses[0]

Unnamed: 0,word_no,token,lemma,POS-tag,extended_supersense,skip1,skip2,supersense,skip3
0,1,A,a,DT,O,0,,,
1,2,reservation,reservation,NN,O-COGNITION,0,,COGNITION,
2,3,is,be,VBZ,O-`a,0,,`a,
3,4,made,make,VBN,O-creation,0,,creation,
4,5,for,for,IN,O,0,,,
5,6,a,a,DT,O,0,,,
6,7,specific,specific,JJ,O,0,,,
7,8,time,time,NN,O-TIME,0,,TIME,
8,9,",",",",",",O,0,,,
9,10,date,date,NN,O-TIME,0,,TIME,


### Rule-based bucketing

In [10]:
def get_metadata(supersense_df):
    """Extract candidate object names from one tagged sentence.

    Two kinds of entries are collected into ``metadata['objects']``:

    1. Noun groups of the form ``<noun> <preposition> <noun>``, i.e. an ``IN``
       token whose direct neighbours are both tagged ``NN`` or ``NNS``. The
       three tokens are joined with single spaces.
    2. Every other ``NN``/``NNS`` token that is not part of such a group, in
       sentence order.

    Rows are addressed by position, so the index labels of the frame do not
    matter. A token belongs to at most one group: a preposition whose
    neighbour was already used by an earlier group is skipped. Nouns are
    collected even when the sentence contains no preposition at all.

    Parameters
    ----------
    supersense_df : pandas.DataFrame
        Needs the columns ``token`` and ``POS-tag``. The frame is not modified.

    Returns
    -------
    dict
        ``{'objects': [...]}`` with the noun groups first, followed by the
        remaining single nouns.
    """
    # Define target groups
    targets = ['NN', 'NNS']

    # Plain positional lists avoid any dependence on the frame's index labels.
    tokens = supersense_df['token'].to_list()
    pos_tags = supersense_df['POS-tag'].to_list()
    last_position = len(pos_tags) - 1

    objects = []
    consumed = set()  # positions already claimed by a noun group

    for position, pos_tag in enumerate(pos_tags):
        if pos_tag != 'IN':
            continue
        # A preposition at either end of the sentence cannot join two nouns.
        if position == 0 or position >= last_position:
            continue

        window = (position - 1, position, position + 1)
        # Do not reuse tokens that an earlier group already extracted.
        if consumed.intersection(window):
            continue

        if pos_tags[position - 1] in targets and pos_tags[position + 1] in targets:
            objects.append(
                tokens[position - 1] + ' ' + tokens[position] + ' ' + tokens[position + 1]
            )
            consumed.update(window)

    # Now add all nouns that are not part of an extracted group.
    objects.extend(
        token
        for position, (token, pos_tag) in enumerate(zip(tokens, pos_tags))
        if pos_tag in targets and position not in consumed
    )

    return {'objects': objects}


def is_important_for_class(supersense_df):
    """Decide whether a tagged sentence is relevant for class extraction.

    Rule 1: the sentence must contain at least two tokens whose ``POS-tag`` is
    exactly ``'NN'``. Other noun tags (for example ``'NNS'``) are not counted.

    Returns ``True`` when the rule holds, ``False`` otherwise.
    """
    singular_nouns = supersense_df[supersense_df['POS-tag'] == 'NN']
    return len(singular_nouns) >= 2

def apply_bucketing(sentences_supersenses):
    """Sort tagged sentences into buckets.

    There is a single bucket, ``'classes'``: it holds, in input order, every
    sentence frame for which ``is_important_for_class`` returns a truthy value.
    """
    class_sentences = [
        sentence_df
        for sentence_df in sentences_supersenses
        if is_important_for_class(sentence_df)
    ]
    return {'classes': class_sentences}

In [11]:
# Bucket the tagged summary sentences; the 'classes' bucket keeps only the
# sentences that pass the class-relevance rule.
buckets = apply_bucketing(supersenses)

In [12]:
buckets['classes']

[   word_no        token        lemma POS-tag extended_supersense skip1 skip2  \
 0        1            A            a      DT                   O     0         
 1        2  reservation  reservation      NN         O-COGNITION     0         
 2        3           is           be     VBZ                O-`a     0         
 3        4         made         make     VBN          O-creation     0         
 4        5          for          for      IN                   O     0         
 5        6            a            a      DT                   O     0         
 6        7     specific     specific      JJ                   O     0         
 7        8         time         time      NN              O-TIME     0         
 8        9            ,            ,       ,                   O     0         
 9       10         date         date      NN              O-TIME     0         
 10      11          and          and      CC                   O     0         
 11      12       number    

## Metadata classification

In [31]:
# NOTE(review): `class_df` is not assigned in any cell visible in this part of
# the notebook — presumably left over from an earlier kernel session. Confirm
# it is defined before this cell, otherwise a fresh "Run All" fails here.
class_df

Unnamed: 0,word_no,token,lemma,POS-tag,extended_supersense,skip1,skip2,supersense,skip3
0,1,A,a,DT,O,0,,,
1,2,reservation,reservation,NN,O-COGNITION,0,,COGNITION,
2,3,is,be,VBZ,O-`a,0,,`a,
3,4,made,make,VBN,O-creation,0,,creation,
4,5,for,for,IN,O,0,,,
5,6,a,a,DT,O,0,,,
6,7,specific,specific,JJ,O,0,,,
7,8,time,time,NN,O-TIME,0,,TIME,
8,9,",",",",",",O,0,,,
9,10,date,date,NN,O-TIME,0,,TIME,


In [52]:
# Reduce every class-relevant sentence to plain rows of
# [token, lemma, POS-tag, supersense].
target_dfs = [
    sentence_df[['token', 'lemma', 'POS-tag', 'supersense']].values.tolist()
    for sentence_df in buckets['classes']
]

url = 'http://localhost:5002/retrieve-metadata'
data = {'target': target_dfs}

# NOTE(review): `data=` form-encodes the payload, which does not preserve the
# nested list structure of `target_dfs`. If the service expects a JSON body,
# this call should use `json=data` instead — confirm against the service.
metadata_res = requests.post(url, data=data)
# Raise the real HTTP error instead of failing later with
# "JSONDecodeError: Expecting value" on a non-JSON error response.
metadata_res.raise_for_status()
metadata = metadata_res.json()['metadata']

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [46]:
target_dfs[0]

[['A', 'a', 'DT', ''],
 ['reservation', 'reservation', 'NN', 'COGNITION'],
 ['is', 'be', 'VBZ', '`a'],
 ['made', 'make', 'VBN', 'creation'],
 ['for', 'for', 'IN', ''],
 ['a', 'a', 'DT', ''],
 ['specific', 'specific', 'JJ', ''],
 ['time', 'time', 'NN', 'TIME'],
 [',', ',', ',', ''],
 ['date', 'date', 'NN', 'TIME'],
 ['and', 'and', 'CC', ''],
 ['number', 'number', 'NN', 'QUANTITY'],
 ['of', 'of', 'IN', ''],
 ['people', 'person', 'NNS', 'PERSON'],
 ['.', '.', '.', '']]

In [47]:
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Every row of ``sent`` is read by index as
    ``[token, lemma, POS-tag, supersense, ...]``.

    The result always contains the token, its last three and last two
    characters, its POS tag, the first two characters of that tag, the lemma
    and the supersense. If a previous token exists, its POS tag and tag prefix
    are added under the ``-1:`` keys, otherwise ``BOS`` is set to ``True``.
    If a next token exists, its POS tag and tag prefix are added under the
    ``+1:`` keys, otherwise ``EOS`` is set to ``True``.
    """
    current = sent[i]
    token = current[0]
    tag = current[2]

    features = dict([
        ('word', token),
        ('word[-3:]', token[-3:]),
        ('word[-2:]', token[-2:]),
        ('postag', tag),
        ('postag[:2]', tag[:2]),
        ('lemma', current[1]),
        ('supersense', current[3]),
    ])

    # Left context: POS tag of the preceding token, or the sentence-start flag.
    has_previous = i > 0
    if has_previous:
        previous_tag = sent[i - 1][2]
        features['-1:postag'] = previous_tag
        features['-1:postag[:2]'] = previous_tag[:2]
    else:
        features['BOS'] = True

    # Right context: POS tag of the following token, or the sentence-end flag.
    has_next = i < len(sent) - 1
    if has_next:
        next_tag = sent[i + 1][2]
        features['+1:postag'] = next_tag
        features['+1:postag[:2]'] = next_tag[:2]
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one ``word2features`` dictionary per token of ``sent``, in order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the label (fifth field) of every row in ``sent``, in order.

    Each row must unpack into exactly five values:
    ``(token, lemma, postag, supersense, label)``.
    """
    labels = []
    for _token, _lemma, _postag, _supersense, tag_label in sent:
        labels.append(tag_label)
    return labels

In [48]:
sent2features(target_dfs[0])

[{'word': 'A',
  'word[-3:]': 'A',
  'word[-2:]': 'A',
  'postag': 'DT',
  'postag[:2]': 'DT',
  'lemma': 'a',
  'supersense': '',
  'BOS': True,
  '+1:postag': 'NN',
  '+1:postag[:2]': 'NN'},
 {'word': 'reservation',
  'word[-3:]': 'ion',
  'word[-2:]': 'on',
  'postag': 'NN',
  'postag[:2]': 'NN',
  'lemma': 'reservation',
  'supersense': 'COGNITION',
  '-1:postag': 'DT',
  '-1:postag[:2]': 'DT',
  '+1:postag': 'VBZ',
  '+1:postag[:2]': 'VB'},
 {'word': 'is',
  'word[-3:]': 'is',
  'word[-2:]': 'is',
  'postag': 'VBZ',
  'postag[:2]': 'VB',
  'lemma': 'be',
  'supersense': '`a',
  '-1:postag': 'NN',
  '-1:postag[:2]': 'NN',
  '+1:postag': 'VBN',
  '+1:postag[:2]': 'VB'},
 {'word': 'made',
  'word[-3:]': 'ade',
  'word[-2:]': 'de',
  'postag': 'VBN',
  'postag[:2]': 'VB',
  'lemma': 'make',
  'supersense': 'creation',
  '-1:postag': 'VBZ',
  '-1:postag[:2]': 'VB',
  '+1:postag': 'IN',
  '+1:postag[:2]': 'IN'},
 {'word': 'for',
  'word[-3:]': 'for',
  'word[-2:]': 'or',
  'postag': 'IN