# Process the text

To run this notebook, the following python packages are required (all installable via pip): `pandas`, `numpy`, `nltk`, `neo4j-driver`

In [223]:
import pandas as pd
import numpy as np
import nltk

In [224]:
# Read the bAbI data as CSV
data = pd.read_csv('src/main/resources/qa1_single-supporting-fact_train.txt', delimiter='\t', names=['sentence', 'answer', 'fact']).fillna('')

# Use NLTK to tokenize the sentences into arrays of words
data['sentence'] = data.apply(lambda row: nltk.word_tokenize(row['sentence'])[1:], axis=1)

# Create a DataFrame with just the statements
statements = data[data['answer'] == '']
statements = statements.reset_index(drop=True)
statements = statements.drop('answer', axis=1)
statements = statements.drop('fact', axis=1)

# Create a DataFrame with just the questions
questions = data[data['answer'] != '']
questions = questions.reset_index(drop=True)

In [225]:
statements[:5]

Unnamed: 0,sentence
0,"[Mary, moved, to, the, bathroom, .]"
1,"[John, went, to, the, hallway, .]"
2,"[Daniel, went, back, to, the, hallway, .]"
3,"[Sandra, moved, to, the, garden, .]"
4,"[John, moved, to, the, office, .]"


In [226]:
questions[:5]

Unnamed: 0,sentence,answer,fact
0,"[Where, is, Mary, ?]",bathroom,1
1,"[Where, is, Daniel, ?]",hallway,4
2,"[Where, is, Daniel, ?]",hallway,4
3,"[Where, is, Daniel, ?]",office,11
4,"[Where, is, Sandra, ?]",bathroom,8


In [227]:
statements['tag'] = statements.apply(lambda row: nltk.pos_tag(row['sentence']), axis=1)

In [228]:
def extract_triple(tags):
    subject, relation, obj = '', '', ''
    for word,tag in tags:
        if tag == 'NNP':
            subject = word
        elif tag == 'VBD':
            relation = word
        elif tag == 'NN':
            obj = word
    return (subject, relation, obj)

In [229]:
statements['triple'] = statements.apply(lambda row: extract_triple(row['tag']), axis=1)

In [230]:
statements[:5]

Unnamed: 0,sentence,tag,triple
0,"[Mary, moved, to, the, bathroom, .]","[(Mary, NNP), (moved, VBD), (to, TO), (the, DT...","(Mary, moved, bathroom)"
1,"[John, went, to, the, hallway, .]","[(John, NNP), (went, VBD), (to, TO), (the, DT)...","(John, went, hallway)"
2,"[Daniel, went, back, to, the, hallway, .]","[(Daniel, NNP), (went, VBD), (back, RB), (to, ...","(Daniel, went, hallway)"
3,"[Sandra, moved, to, the, garden, .]","[(Sandra, NNP), (moved, VBD), (to, TO), (the, ...","(Sandra, moved, garden)"
4,"[John, moved, to, the, office, .]","[(John, NNP), (moved, VBD), (to, TO), (the, DT...","(John, moved, office)"


### Debug Functions

In [274]:
def person_data(person):
    return statements[statements['triple'].map(lambda t: t[0] == person)]

In [275]:
person_data('Mary')[:5]

Unnamed: 0,sentence,tag,triple
0,"[Mary, moved, to, the, bathroom, .]","[(Mary, NNP), (moved, VBD), (to, TO), (the, DT...","(Mary, moved, bathroom)"
6,"[Mary, moved, to, the, hallway, .]","[(Mary, NNP), (moved, VBD), (to, TO), (the, DT...","(Mary, moved, hallway)"
12,"[Mary, went, to, the, bedroom, .]","[(Mary, NNP), (went, VBD), (to, TO), (the, DT)...","(Mary, went, bedroom)"
20,"[Mary, went, to, the, bedroom, .]","[(Mary, NNP), (went, VBD), (to, TO), (the, DT)...","(Mary, went, bedroom)"
29,"[Mary, moved, to, the, office, .]","[(Mary, NNP), (moved, VBD), (to, TO), (the, DT...","(Mary, moved, office)"


# Run Queries

In [368]:
from neo4j.v1 import GraphDatabase, basic_auth

In [369]:
# Create a neo4j session
driver = GraphDatabase.driver("bolt://localhost:7687", auth=basic_auth("neo4j", "neo4j"))

In [370]:
# WARNING: This will clear the database!
def reset_db():
    session = driver.session()
    session.run("MATCH (n) DETACH DELETE n")

In [371]:
# Create the graph based on each triple
def create(query):
    session = driver.session()
    for subject,relation,obj in statements['triple']:
        session.run(query, { 
            'subject': subject,
            'relation': relation,
            'obj': obj
        })

### V1: Direct relationships

In [372]:
reset_db()

In [373]:
# Create a direct relationship between subject and object
create("""
    MERGE (s:SUBJECT {name: $subject}) 
    MERGE (o:OBJECT  {name: $obj}) 
    MERGE (s)-[r:RELATION {name: $relation}]->(o)
""")

### V2: Nodes for relationships

In [374]:
reset_db()

In [375]:
# Represent each relation as a node
create("""
    MERGE (s:SUBJECT {name: $subject})
    MERGE (o:OBJECT  {name: $obj})
    CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)
""")

### V3: Linked list of relationships

In [377]:
reset_db()

In [378]:
# Represent each relation as a node, ordered by a linked list (per subject)
create("""
    MERGE (s:SUBJECT {name: $subject})
    MERGE (o:OBJECT  {name: $obj})
    CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)

    WITH s,r,o

    MATCH (s)-[:R0]->(r2:RELATION)
    WHERE r2 <> r AND NOT (r2)-[:NEXT]->() 
    CREATE (r2)-[:NEXT]->(r)
""")

#### Find the room a person is currently in

In [379]:
def find_person(name):
    session = driver.session()
    return session.run("""
        MATCH (s:SUBJECT {name:$name})-->(r:RELATION)-->(o:OBJECT)
        WHERE NOT (r)-[:NEXT]->()
        RETURN s,r,o
    """, {
        'name': name
    })

In [380]:
# Where is Mary?
record = find_person('Mary').single()
print(record['o'].get('name'))

kitchen


In [381]:
# Verify that this is true
person_data('Mary')[-1:]

Unnamed: 0,sentence,tag,triple
1994,"[Mary, journeyed, to, the, kitchen, .]","[(Mary, NNP), (journeyed, VBD), (to, TO), (the...","(Mary, journeyed, kitchen)"


#### Find the list of rooms a person was in, ordered by recency

In [423]:
def find_person_history(name, n=100):
    session = driver.session()
    return session.run("""
        MATCH (s:SUBJECT {name:$name})-->(r:RELATION)-->(o:OBJECT)
        WHERE NOT (r)-[:NEXT]->()
        
        WITH s,r,o
        
        MATCH (s)-->(r_prev:RELATION)-[k*1..""" + str(n) + """]->(r), (r_prev)-->(o_prev:OBJECT)
        
        WITH size(k) AS dist, r, o, r_prev, o_prev
        ORDER BY size(k)
        
        WITH r, o, r_prev, o_prev
        RETURN [r.name] + collect(r_prev.name) AS relation, [o.name] + collect(o_prev.name) AS obj
    """, {
        'name': name
    })

In [424]:
# Where has John been recently?
record = find_person_history('John', n=10).single()
history = list(zip(record['relation'], record['obj']))

In [425]:
history[:5]

[('went', 'bedroom'),
 ('went', 'garden'),
 ('went', 'office'),
 ('', 'bedroom'),
 ('travelled', 'hallway')]

In [426]:
# Verify that John has been to to those places, in that order
person_data('John')[-5:].iloc[::-1]

Unnamed: 0,sentence,tag,triple
1995,"[John, went, back, to, the, bedroom, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, bedroom)"
1989,"[John, went, back, to, the, garden, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, garden)"
1986,"[John, went, back, to, the, office, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, office)"
1982,"[John, journeyed, to, the, bedroom, .]","[(John, NNP), (journeyed, NN), (to, TO), (the,...","(John, , bedroom)"
1979,"[John, travelled, to, the, hallway, .]","[(John, NNP), (travelled, VBD), (to, TO), (the...","(John, travelled, hallway)"
