# Restaurant

## Import

In [61]:
import os
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from neo4j.v1 import GraphDatabase, basic_auth
from sklearn.metrics import accuracy_score

In [2]:
# Load the reference utterances and the restaurant property rows.
# NOTE(review): unpickling can execute code stored in the file; only load
# pickles that come from a trusted source.
refs_utts, props = pd.read_pickle('utts_refs.pkl'), pd.read_pickle('restaurants_props.pkl')
(len(refs_utts), len(props))

(414, 52256)

In [3]:
refs_utts[:5]

Unnamed: 0,text,bot,o,ind,mask,gid,target
0,"[i, want, a, moderately, priced, restaurant, i...",api_call R_cuisine west moderate,trn,2,True,2,prezzo
2,"[cheap, restaurant, in, the, north, part, of, ...",api_call R_cuisine north cheap,trn,2,True,11,da_vinci_pizzeria
3,"[cheap, restaurant, in, the, south, part, of, ...",api_call R_cuisine south cheap,trn,2,True,12,the_lucky_star
4,"[cheap, restaurant, serving, indian, food]",api_call indian R_location cheap,trn,2,True,15,the_gandhi
5,"[thai, food]",api_call thai R_location R_price,trn,2,True,22,bangkok_city


In [4]:
props[:5]

Unnamed: 0,rname,attr_key,attr_value
3,saint_johns_chop_house,R_cuisine,british
4,saint_johns_chop_house,R_location,west
7,saint_johns_chop_house,R_price,moderate
10,prezzo,R_cuisine,italian
11,prezzo,R_location,west


## Process Text

In [5]:
stemmer = nltk.stem.snowball.EnglishStemmer()

def stem(sentence):
    """Return a list holding the stemmed form of every token in ``sentence``."""
    return list(map(stemmer.stem, sentence))

In [6]:
# Stem every utterance token and every frame slot. The first token of the bot
# reply (``api_call`` in the sample rows shown above) is dropped before stemming.
test = pd.DataFrame({
    'text': [stem(tokens) for tokens in refs_utts.text],
    'frame': [tuple(stem(reply.split()[1:])) for reply in refs_utts.bot],
})
len(test)

414

In [7]:
# Remove poorly formatted frames: keep only the rows whose frame has exactly
# three slots. `.copy()` makes the filtered result an independent DataFrame, so
# the column assignments made in later cells do not write into a slice of the
# unfiltered frame (which raises pandas' SettingWithCopyWarning).
test = test[test.frame.map(len) == 3].copy()
len(test)

405

In [8]:
test[:5]

Unnamed: 0,text,frame
0,"[i, want, a, moder, price, restaur, in, the, w...","(r_cuisin, west, moder)"
1,"[cheap, restaur, in, the, north, part, of, town]","(r_cuisin, north, cheap)"
2,"[cheap, restaur, in, the, south, part, of, town]","(r_cuisin, south, cheap)"
3,"[cheap, restaur, serv, indian, food]","(indian, r_locat, cheap)"
4,"[thai, food]","(thai, r_locat, r_price)"


In [9]:
# Knowledge rows: restaurant name plus stemmed attribute key and value,
# keeping the row index of `props`.
knowledge = pd.DataFrame({
    'restaurant': props.rname.copy(),
    'key': list(map(stemmer.stem, props.attr_key)),
    'value': list(map(stemmer.stem, props.attr_value)),
})

In [10]:
knowledge[:5]

Unnamed: 0,restaurant,key,value
3,saint_johns_chop_house,r_cuisin,british
4,saint_johns_chop_house,r_locat,west
7,saint_johns_chop_house,r_price,moder
10,prezzo,r_cuisin,italian
11,prezzo,r_locat,west


In [11]:
# Map every attribute key to a tuple of the distinct values it can take.
# In this instance, keys form mutually exclusive lists of values
types = {
    key: tuple(set(values))
    for key, values in knowledge.groupby('key')['value']
}

In [12]:
types['r_cuisin'][:5]

('asian_orient', 'vietnames', 'lebanes', 'african', 'thai')

In [13]:
types['r_locat']

('centr', 'south', 'west', 'east', 'north')

In [14]:
types['r_price']

('expens', 'moder', 'cheap')

## Create Knowledge Graph

In [15]:
# Create the neo4j driver (sessions are opened from it where needed).
# Connection settings are read from the environment so real credentials never
# have to be stored in the notebook; the fallbacks are the local defaults that
# were previously hard-coded here.
driver = GraphDatabase.driver(
    os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    auth=basic_auth(
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD', 'neo4j'),
    ),
)

In [16]:
# WARNING: This will clear the database when run!
def reset_db():
    """Delete every node, together with its relationships, from the database.

    The session is opened in a ``with`` block so that it is closed even when
    the query fails, and the result is consumed so the delete has completed
    before the function returns.
    """
    with driver.session() as session:
        session.run('MATCH (n) DETACH DELETE n').consume()

In [17]:
reset_db()

In [18]:
# MERGE is idempotent, so repeated (restaurant, key, value) rows add nothing to
# the graph; dropping them first saves one database round trip per duplicate.
triples = knowledge.drop_duplicates()

# The `with` block closes the session once loading finishes or fails.
with driver.session() as session:
    for row in triples.itertuples(index=False):
        session.run('''
            MERGE (s:SUBJECT {name: $subject})
            MERGE (o:OBJECT  {name: $obj})
            MERGE (s)-[r:RELATION {name: $relation}]->(o)
        ''', {
            'subject': row.restaurant,
            'relation': row.key,
            'obj': row.value
        })

## Test
#### Baseline
The baseline is the slot accuracy obtained by assuming that no frame value is known for any sentence, i.e. by predicting the placeholder (the slot's own key name) for every slot.

In [19]:
# Placeholder frame: each slot holds its own key name, i.e. "value not known".
dont_know = tuple(types)
dont_know

('r_cuisin', 'r_locat', 'r_price')

In [20]:
# Predict the placeholder frame for every sentence and flatten both the
# predictions and the gold frames into slot-aligned lists.
base_predicted = [slot for _ in range(len(test)) for slot in dont_know]
base_actual = [slot for gold in test.frame for slot in gold]

In [21]:
accuracy_score(base_actual, base_predicted)

0.45267489711934156

#### Accuracy

In [91]:
# Cache properties from DB
# Running this query will obtain all properties at this point in time
def get_properties():
    """Return the distinct names of all OBJECT nodes targeted by a RELATION.

    The record is read inside the ``with`` block, while the session is still
    open; the block then closes the session instead of leaving it dangling.
    """
    with driver.session() as session:
        record = session.run('''
            MATCH ()-[r:RELATION]->(o:OBJECT)
            RETURN collect(distinct o.name) AS properties
        ''').single()
        return record['properties']

In [92]:
# def get_types():
#     session = driver.session()
#     result = session.run('''
#         MATCH ()-[r:RELATION]->(o:OBJECT) 
#         RETURN collect(distinct [r.name, o.name]) AS pair
#     ''').single()[0]
    
#     g_types = defaultdict(lambda: [])
#     for k,v in result:
#         g_types[k].append(v)
#     return g_types

In [115]:
# Keep the property names in a set; is_hotword() tests membership against it.
properties = set(get_properties())

In [116]:
# Hotword listener
def is_hotword(word):
    """Return True when ``word`` is one of the names cached in ``properties``."""
    recognised = word in properties
    return recognised

In [117]:
is_hotword('british'), is_hotword('python')

(True, False)

In [122]:
# Issue DB queries
def find_slot(prop):
    """Run the slot lookup for the OBJECT named ``prop`` on the current ``session``.

    Returns the result of ``session.run``; its single record carries the
    distinct ``[relation name, object name]`` pairs under ``properties``.
    """
    params = {'name': prop}
    return session.run('''
        MATCH (s:SUBJECT)-[r:RELATION]->(o:OBJECT {name:$name}) 
        RETURN collect(distinct [r.name, o.name]) AS properties
    ''', params)

def extract(result):
    """Return the first entry of ``properties`` from the result's single record.

    Raises IndexError when the ``properties`` list is empty.
    """
    record = result.single()
    pairs = record['properties']
    return pairs[0]

In [123]:
session = driver.session()
extract(find_slot('west'))

['r_locat', 'west']

In [167]:
# Look up the slot of every hotword in every sentence. find_slot() uses the
# module-level `session`, which the `with` statement binds here; the results
# are read while the session is still open, and the session is closed afterwards.
with driver.session() as session:
    all_slots = [[find_slot(word) for word in sentence if is_hotword(word)] for sentence in test.text]
    extracted_slots = [[tuple(extract(slot)) for slot in slots] for slots in all_slots]
test['slots'] = extracted_slots

In [168]:
def to_frame(slots):
    """Build a frame tuple from ``(key, value)`` slot pairs.

    Each position of ``dont_know`` is replaced by the value found for that key
    in ``slots``; keys without a value keep their placeholder.
    """
    found = dict(slots)
    return tuple(found[key] if key in found else key for key in dont_know)

In [169]:
# Turn each sentence's extracted slots into a predicted frame.
test['predicted'] = list(map(to_frame, test.slots))

In [170]:
test[:5]

Unnamed: 0,text,frame,slots,predicted
0,"[i, want, a, moder, price, restaur, in, the, w...","(r_cuisin, west, moder)","[(r_price, moder), (r_locat, west)]","(r_cuisin, west, moder)"
1,"[cheap, restaur, in, the, north, part, of, town]","(r_cuisin, north, cheap)","[(r_price, cheap), (r_locat, north)]","(r_cuisin, north, cheap)"
2,"[cheap, restaur, in, the, south, part, of, town]","(r_cuisin, south, cheap)","[(r_price, cheap), (r_locat, south)]","(r_cuisin, south, cheap)"
3,"[cheap, restaur, serv, indian, food]","(indian, r_locat, cheap)","[(r_price, cheap), (r_cuisin, indian)]","(indian, r_locat, cheap)"
4,"[thai, food]","(thai, r_locat, r_price)","[(r_cuisin, thai)]","(thai, r_locat, r_price)"


In [171]:
# Flatten predicted and gold frames into slot-aligned lists for scoring.
base_predicted = [slot for guess in test.predicted for slot in guess]
base_actual = [slot for gold in test.frame for slot in gold]

In [172]:
accuracy_score(base_actual, base_predicted)

0.96954732510288066