# Process the text

To run this notebook, the following python packages are required (install via pip): `pandas`, `numpy`, `nltk`, `neo4j-driver`

In [589]:
import pandas as pd
import numpy as np
import nltk

In [590]:
# Read the bAbI data as CSV
filename = 'src/main/resources/qa1_single-supporting-fact_train.txt'
data = pd.read_csv(filename, delimiter='\t', names=['sentence', 'answer', 'factid'])
data = data.fillna('')

# Use NLTK to tokenize the sentences into arrays of words
tokenize = lambda row: nltk.word_tokenize(row['sentence'])[1:]
data['sentence'] = data.apply(tokenize, axis=1)

# Create a DataFrame with just the statements
statements = data[data['answer'] == ''] \
    .reset_index(drop=True) \
    .drop('answer', axis=1) \
    .drop('factid', axis=1)

# Create a DataFrame with just the questions
questions = data[data['answer'] != ''] \
    .reset_index(drop=True)

In [591]:
data[:6]

Unnamed: 0,sentence,answer,factid
0,"[Mary, moved, to, the, bathroom, .]",,
1,"[John, went, to, the, hallway, .]",,
2,"[Where, is, Mary, ?]",bathroom,1.0
3,"[Daniel, went, back, to, the, hallway, .]",,
4,"[Sandra, moved, to, the, garden, .]",,
5,"[Where, is, Daniel, ?]",hallway,4.0


In [592]:
statements[:4]

Unnamed: 0,sentence
0,"[Mary, moved, to, the, bathroom, .]"
1,"[John, went, to, the, hallway, .]"
2,"[Daniel, went, back, to, the, hallway, .]"
3,"[Sandra, moved, to, the, garden, .]"


In [593]:
questions[:2]

Unnamed: 0,sentence,answer,factid
0,"[Where, is, Mary, ?]",bathroom,1
1,"[Where, is, Daniel, ?]",hallway,4


In [594]:
# Tag each token as a part of speech
pos_tag = lambda row: nltk.pos_tag(row['sentence'])
statements['tag'] = statements.apply(pos_tag, axis=1)

In [595]:
# Extract a (subject, relation, object) triple from each sentence based on the POS tags
def extract_triple(tags):
    subject, relation, obj = '', '', ''
    for word,tag in tags:
        if tag == 'NNP':
            subject = word
        elif tag == 'VBD':
            relation = word
        elif tag == 'NN':
            obj = word
    return (subject, relation, obj)

In [596]:
statements['triple'] = statements.apply(lambda row: extract_triple(row['tag']), axis=1)

In [597]:
statements[:5]

Unnamed: 0,sentence,tag,triple
0,"[Mary, moved, to, the, bathroom, .]","[(Mary, NNP), (moved, VBD), (to, TO), (the, DT...","(Mary, moved, bathroom)"
1,"[John, went, to, the, hallway, .]","[(John, NNP), (went, VBD), (to, TO), (the, DT)...","(John, went, hallway)"
2,"[Daniel, went, back, to, the, hallway, .]","[(Daniel, NNP), (went, VBD), (back, RB), (to, ...","(Daniel, went, hallway)"
3,"[Sandra, moved, to, the, garden, .]","[(Sandra, NNP), (moved, VBD), (to, TO), (the, ...","(Sandra, moved, garden)"
4,"[John, moved, to, the, office, .]","[(John, NNP), (moved, VBD), (to, TO), (the, DT...","(John, moved, office)"


### Debug Functions

In [598]:
# Get all statements that refer to the specified person
def person_data(person):
    return statements[statements['triple'].map(lambda t: t[0] == person)]

In [599]:
person_data('Sandra')[:3]

Unnamed: 0,sentence,tag,triple
3,"[Sandra, moved, to, the, garden, .]","[(Sandra, NNP), (moved, VBD), (to, TO), (the, ...","(Sandra, moved, garden)"
5,"[Sandra, journeyed, to, the, bathroom, .]","[(Sandra, NNP), (journeyed, VBD), (to, TO), (t...","(Sandra, journeyed, bathroom)"
10,"[Sandra, travelled, to, the, office, .]","[(Sandra, NNP), (travelled, VBD), (to, TO), (t...","(Sandra, travelled, office)"


In [600]:
# Get the n most recent statements that refer to the
# specified person in reverse chronological order
def most_recent(person, n=5):
    return person_data(person)[-n:].iloc[::-1]

In [601]:
most_recent('Daniel', n=3)

Unnamed: 0,sentence,tag,triple
1999,"[Daniel, went, to, the, garden, .]","[(Daniel, NNP), (went, VBD), (to, TO), (the, D...","(Daniel, went, garden)"
1996,"[Daniel, travelled, to, the, kitchen, .]","[(Daniel, NNP), (travelled, VBD), (to, TO), (t...","(Daniel, travelled, kitchen)"
1992,"[Daniel, moved, to, the, office, .]","[(Daniel, NNP), (moved, VBD), (to, TO), (the, ...","(Daniel, moved, office)"


# Run Queries

In [602]:
from neo4j.v1 import GraphDatabase, basic_auth

In [603]:
# Create a neo4j session
driver = GraphDatabase.driver('bolt://localhost:7687', auth=basic_auth('neo4j', 'neo4j'))

In [604]:
# WARNING: This will clear the database when run!
def reset_db():
    session = driver.session()
    session.run('MATCH (n) DETACH DELETE n')

In [605]:
# Create a graph based on each triple
def create(query):
    session = driver.session()
    for subject,relation,obj in statements['triple']:
        session.run(query, { 
            'subject': subject,
            'relation': relation,
            'obj': obj
        })

### V1: Direct relationships

In [606]:
reset_db()

In [607]:
# Create a direct relationship between subject and object
create('''
    MERGE (s:SUBJECT {name: $subject}) 
    MERGE (o:OBJECT  {name: $obj}) 
    MERGE (s)-[r:RELATION {name: $relation}]->(o)
''')

### V2: Nodes for relationships

In [608]:
reset_db()

In [609]:
# Represent each relation as a node
create('''
    MERGE (s:SUBJECT {name: $subject})
    MERGE (o:OBJECT  {name: $obj})
    CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)
''')

### V3: Linked list of relationships

In [610]:
reset_db()

In [611]:
# Represent each relation as a node, ordered by a linked list (per subject)
create('''
    MERGE (s:SUBJECT {name: $subject})
    MERGE (o:OBJECT  {name: $obj})
    CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)

    WITH s,r,o

    MATCH (s)-[:R0]->(r2:RELATION)
    WHERE r2 <> r AND NOT (r2)-[:NEXT]->() 
    CREATE (r2)-[:NEXT]->(r)
''')

In [612]:
# Find the room a person is currently in
def find_person(person):
    query = '''
        MATCH (s:SUBJECT {name:$name})-->(r:RELATION)-->(o:OBJECT)
        WHERE NOT (r)-[:NEXT]->()
        RETURN s,r,o
    '''
    session = driver.session()
    return session.run(query, {'name': person})

In [613]:
# Where is Mary?
record = find_person('Mary').single()
print(record['o'].get('name'))

office


In [614]:
# Verify that this is true
most_recent('Mary', n=1)

Unnamed: 0,sentence,tag,triple
1994,"[Mary, journeyed, to, the, kitchen, .]","[(Mary, NNP), (journeyed, VBD), (to, TO), (the...","(Mary, journeyed, kitchen)"


In [615]:
# Find the list of rooms a person was in, ordered by recency
def find_person_history(person, n=100):
    length = str(n) if n >= 1 else ''
    
    query = '''
        MATCH (s:SUBJECT {name:$name})-->(r:RELATION)-->(o:OBJECT)
        WHERE NOT (r)-[:NEXT]->()
        
        WITH s,r,o
        
        MATCH (s)-->(r_prev:RELATION)-[k*1..%s]->(r), (r_prev)-->(o_prev:OBJECT)
        
        WITH size(k) AS dist, r, o, r_prev, o_prev
        ORDER BY size(k)
        
        WITH r, o, r_prev, o_prev
        RETURN [r.name] + collect(r_prev.name) AS relation, [o.name] + collect(o_prev.name) AS obj
    '''
    query = query % length
    
    session = driver.session()
    record = session.run(query, {'name': person}).single()
    history = list(zip(record['relation'], record['obj']))[:-1]
    
    return history

In [616]:
# Where has John been recently?
history = find_person_history('John', n=5)

In [617]:
history

[('went', 'bedroom'),
 ('went', 'garden'),
 ('went', 'office'),
 ('', 'bedroom'),
 ('travelled', 'hallway')]

In [618]:
# Verify that John has been to to those places, in that order
most_recent('John', n=5)

Unnamed: 0,sentence,tag,triple
1995,"[John, went, back, to, the, bedroom, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, bedroom)"
1989,"[John, went, back, to, the, garden, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, garden)"
1986,"[John, went, back, to, the, office, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, office)"
1982,"[John, journeyed, to, the, bedroom, .]","[(John, NNP), (journeyed, NN), (to, TO), (the,...","(John, , bedroom)"
1979,"[John, travelled, to, the, hallway, .]","[(John, NNP), (travelled, VBD), (to, TO), (the...","(John, travelled, hallway)"
