In [1]:
from collections import defaultdict
import json
from operator import *
import pickle
import re

import matplotlib.pylab as pl
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# Root directory of the inputs read by this notebook (`dataset.parquet` and `splits/*.json`).
input_dir = '/data/yu_gai/cfq'
# Root directory that receives the generated `.npz` archives and vocabulary pickles.
output_dir = '/work/yu_gai/cfq'

# The CFQ dataset

In [3]:
# Load the dataset, order the rows by their `index` column and persist the result for reuse.
# NOTE(review): `sqlCtx` is never defined in this notebook — presumably injected by the PySpark kernel; confirm.
df = sqlCtx.read.parquet(f'{input_dir}/dataset.parquet').sort('index').persist()

## Some samples

In [4]:
# Display the `question` field of the first ten rows.
df.rdd.map(lambda r: r['question']).take(10)

["Did  Jackie's female actor edit and produce Rad Plaid",
 "Did !Women Art Revolution's writer, director, editor, and star direct, produce, and write Conceiving Ada",
 'Did #FMV Flymen Vision employ a spouse of a character',
 "Did 'Gog' Helen''s writer, editor, cinematographer, producer, and director marry and influence Jacques Demy",
 "Did 'Murder' Legendre's male actor marry Lillian Lugosi",
 "Did (Buried) Over the Roofs' Spanish executive producer edit Shepherdesses",
 "Did ...And Justice for All's Canadian director executive produce Mokhtar",
 "Did 1 Night in Paris' male cinematographer's Canadian spouse's spouse marry Mayte Garcia",
 "Did 1/3/10's cinematographer and star edit, direct, and write Nakounine, 78 RPM, and The Subject of the Picture",
 "Did 1/3/10's star, director, editor, cinematographer, and writer write and direct Photographic Memory"]

In [5]:
# Print the first `n` (question pattern, query pattern) pairs, with one blank line
# after every pair whose position is below `n - 1`.
n = 5
rs = df.rdd.map(lambda r: [r['questionPatternModEntities'], r['sparqlPatternModEntities']]).take(n)
for i, (question, query) in enumerate(rs):
    trailer = '\n' if i < n - 1 else ''
    print(f'{question}\n{query}{trailer}')

Did M1 's female actor edit and produce M0
SELECT count(*) WHERE {
?x0 ns:film.actor.film/ns:film.performance.character M1 .
?x0 ns:film.editor.film M0 .
?x0 ns:film.producer.film|ns:film.production_company.films M0 .
?x0 ns:people.person.gender ns:m.02zsn
}

Did M0 's writer , director , editor , and star direct , produce , and write M1
SELECT count(*) WHERE {
?x0 ns:film.actor.film/ns:film.performance.film M0 .
?x0 ns:film.director.film M0 .
?x0 ns:film.director.film M1 .
?x0 ns:film.editor.film M0 .
?x0 ns:film.producer.film|ns:film.production_company.films M1 .
?x0 ns:film.writer.film M0 .
?x0 ns:film.writer.film M1
}

Did M1 employ a spouse of a character
SELECT count(*) WHERE {
?x0 ns:people.person.spouse_s/ns:people.marriage.spouse|ns:fictional_universe.fictional_character.married_to/ns:fictional_universe.marriage_of_fictional_characters.spouses ?x1 .
?x1 a ns:fictional_universe.fictional_character .
FILTER ( ?x0 != ?x1 ) .
M1 ns:business.employer.employees/ns:business.employmen

## SPARQL syntax

In [6]:
# Collect every distinct run of upper-case ASCII letters that occurs in the raw `sparql` strings.
p = re.compile(r'[A-Z]+')
df.rdd.flatMap(lambda r: re.findall(p, r['sparql'])).distinct().collect()

['WHERE', 'SELECT', 'FILTER', 'DISTINCT']

In [7]:
# Split every `sparqlPatternModEntities` query into its lines; cached because the
# triple and filter analyses below read `rdd` again.
rdd = df.rdd.map(lambda r: r['sparqlPatternModEntities'].split('\n')).cache()
# Distinct first lines and distinct last lines over all queries.
rdd.map(lambda r: r[0]).distinct().collect(), rdd.map(lambda r: r[-1]).distinct().collect()  

(['SELECT DISTINCT ?x0 WHERE {', 'SELECT count(*) WHERE {'], ['}'])

## Interrogatives

In [8]:
# Pair the first space-separated word of each question pattern with the first line of its query pattern.
interrogatives = df.rdd.map(lambda r: (r['questionPatternModEntities'].split(' ')[0], r['sparqlPatternModEntities'].split('\n')[0])).cache()

In [9]:
# Distinct (first question word, first query line) combinations.
interrogatives.distinct().collect()

[('Were', 'SELECT count(*) WHERE {'),
 ('Who', 'SELECT DISTINCT ?x0 WHERE {'),
 ('Did', 'SELECT count(*) WHERE {'),
 ('Was', 'SELECT count(*) WHERE {'),
 ('What', 'SELECT DISTINCT ?x0 WHERE {'),
 ('Which', 'SELECT DISTINCT ?x0 WHERE {')]

In [10]:
# Number of questions per first word.
interrogatives.map(lambda r: r[0]).countByValue()

defaultdict(int,
            {'Did': 43051,
             'Was': 68063,
             'Were': 19457,
             'What': 56616,
             'Which': 26466,
             'Who': 25704})

## SPARQL triples

### Triple syntax

In [11]:
# Distinct non-FILTER lines found between the first and the last line of each query.
triples = rdd.flatMap(lambda r: [l for l in r[1 : -1] if 'FILTER' not in l]).distinct()
# Three space-separated fields (subject, relation, object) plus an optional
# two-character suffix made of a space and one arbitrary character.
p = re.compile(r'^([^ ]+) ([^ ]+) ([^ ]+)( .)?$')
triples = triples.map(lambda r: re.findall(p, r)).cache()
# Every line must yield exactly one match.
assert triples.map(lambda r: len(r) == 1).reduce(and_)
# Keep the single match: a 4-tuple (subject, relation, object, suffix).
triples = triples.map(lambda r: r[0])
assert triples.map(lambda r: r[-1] in ['', ' .']).reduce(and_)  # conjunction only

### Subject

In [12]:
# Distinct first fields (subjects) of the parsed triples.
sorted(triples.map(lambda r: r[0]).distinct().collect())

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9']

### Relation

In [13]:
# Distinct second fields (relations) of the parsed triples.
sorted(triples.map(lambda r: r[1]).distinct().collect())

['^ns:people.person.gender',
 '^ns:people.person.nationality',
 'a',
 'ns:business.employer.employees/ns:business.employment_tenure.person',
 'ns:film.actor.film/ns:film.performance.character',
 'ns:film.actor.film/ns:film.performance.film',
 'ns:film.cinematographer.film',
 'ns:film.director.film',
 'ns:film.editor.film',
 'ns:film.film.cinematography',
 'ns:film.film.costume_design_by',
 'ns:film.film.directed_by',
 'ns:film.film.distributors/ns:film.film_film_distributor_relationship.distributor',
 'ns:film.film.edited_by',
 'ns:film.film.executive_produced_by',
 'ns:film.film.film_art_direction_by',
 'ns:film.film.prequel',
 'ns:film.film.produced_by|ns:film.film.production_companies',
 'ns:film.film.sequel',
 'ns:film.film.starring/ns:film.performance.actor',
 'ns:film.film.written_by',
 'ns:film.film_art_director.films_art_directed',
 'ns:film.film_costumer_designer.costume_design_for_film',
 'ns:film.film_distributor.films_distributed/ns:film.film_film_distributor_relationship.f

### Object

In [14]:
# Distinct third fields (objects) of the parsed triples.
sorted(triples.map(lambda r: r[2]).distinct().collect())

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'ns:business.employer',
 'ns:fictional_universe.fictional_character',
 'ns:film.actor',
 'ns:film.cinematographer',
 'ns:film.director',
 'ns:film.editor',
 'ns:film.film',
 'ns:film.film_art_director',
 'ns:film.film_costumer_designer',
 'ns:film.film_distributor',
 'ns:film.producer',
 'ns:film.production_company',
 'ns:film.writer',
 'ns:m.02zsn',
 'ns:m.0345h',
 'ns:m.03_3d',
 'ns:m.03rjj',
 'ns:m.059j2',
 'ns:m.05zppz',
 'ns:m.06mkj',
 'ns:m.07ssc',
 'ns:m.09c7w0',
 'ns:m.0b90_r',
 'ns:m.0d05w3',
 'ns:m.0d060g',
 'ns:m.0d0vqn',
 'ns:m.0f8l9c',
 'ns:people.person']

In [15]:
# Distinct objects of the triples whose relation is not `a`.
sorted(triples.filter(lambda r: r[1] != 'a').map(lambda r: r[2]).distinct().collect())

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'ns:m.02zsn',
 'ns:m.0345h',
 'ns:m.03_3d',
 'ns:m.03rjj',
 'ns:m.059j2',
 'ns:m.05zppz',
 'ns:m.06mkj',
 'ns:m.07ssc',
 'ns:m.09c7w0',
 'ns:m.0b90_r',
 'ns:m.0d05w3',
 'ns:m.0d060g',
 'ns:m.0d0vqn',
 'ns:m.0f8l9c']

In [16]:
# Peek at one parsed triple: (subject, relation, object, suffix).
triples.take(1)

[('M1', 'ns:film.film.starring/ns:film.performance.actor', 'M5', '')]

In [17]:
# Relations other than `a` whose object starts with `ns`.
triples.filter(lambda r: r[1] != 'a' and r[2].startswith('ns')).map(lambda r: r[1]).distinct().collect()

['ns:people.person.gender', 'ns:people.person.nationality']

### "Instance of" triples

In [18]:
# Distinct triples whose relation is `a`.
is_ = triples.filter(lambda r: r[1] == 'a').distinct()

#### Entities

In [19]:
# Distinct subjects of the `a` triples.
sorted(is_.map(lambda r: r[0]).distinct().collect())

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6']

#### Concepts

In [20]:
# Distinct objects of the `a` triples.
sorted(is_.map(lambda r: r[2]).distinct().collect())

['ns:business.employer',
 'ns:fictional_universe.fictional_character',
 'ns:film.actor',
 'ns:film.cinematographer',
 'ns:film.director',
 'ns:film.editor',
 'ns:film.film',
 'ns:film.film_art_director',
 'ns:film.film_costumer_designer',
 'ns:film.film_distributor',
 'ns:film.producer',
 'ns:film.production_company',
 'ns:film.writer',
 'ns:people.person']

## SPARQL filters

In [21]:
# Distinct query lines that contain the substring 'FILTER'.
filters = rdd.flatMap(lambda r: [l for l in r if 'FILTER' in l]).distinct()
p = re.compile(r'^FILTER \( ([^ ]+) != ([^ ]+) \)( .)?$')  # not equal only
# One list of matches per filter line; each match is (lhs, rhs, suffix).
xs = filters.map(lambda r: re.findall(p, r)).cache()
# Every filter line must match the inequality pattern exactly once ...
assert xs.map(lambda r: len(r) == 1).reduce(and_)
# ... and may only be followed by nothing or by ' .'.
assert xs.map(lambda r: r[0][-1] in ['', ' .']).reduce(and_)

### LHS

In [22]:
# Distinct left-hand operands of the `!=` filters.
sorted(xs.map(lambda r: r[0][0]).distinct().collect())

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7']

### RHS

In [23]:
# Distinct right-hand operands of the `!=` filters.
sorted(xs.map(lambda r: r[0][1]).distinct().collect())

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8']

## Preprocessing

In [24]:
split_ids = !ls {input_dir}/splits | grep json
for split_id in [s.replace('.json', '') for s in split_ids]:
    split = json.load(open(f'{input_dir}/splits/{split_id}.json'))
    np.savez(f'{output_dir}/splits/{split_id}', **{k : np.array(v) for k, v in split.items()})
    print(split_id, len(split['trainIdxs']), len(split['devIdxs']), len(split['testIdxs']), df.count())

mcd1 95743 11968 11968 239357
mcd2 95743 11968 11968 239357
mcd3 95743 11968 11968 239357
query_complexity_split 100654 9512 9512 239357
query_pattern_split 94600 12489 12589 239357
question_complexity_split 98999 10339 10340 239357
question_pattern_split 95654 12115 11909 239357
random_split 95744 11967 11967 239357


In [25]:
# Check that every question pattern consists only of tokens made of letters, digits,
# commas and apostrophes, each optionally followed by a single space.
p = re.compile(r"^([A-Za-z0-9,']+[ ]?)+$")
# `Match.string` is just the string that was passed in, so comparing it with the input
# was always true; test the match itself so a non-conforming question yields False
# instead of raising AttributeError on `None`.
df.rdd.map(lambda r: re.match(p, r['questionPatternModEntities']) is not None).reduce(and_)

True

In [26]:
# Accumulates the named arrays that are later written with `np.savez(f'{output_dir}/data', **d)`.
d = {}

In [27]:
def get(rdd, i):
    """Project every record of `rdd` onto its `i`-th component."""
    return rdd.map(lambda r: r[i])


def collect(rdd):
    """Collect the records of `rdd` and wrap them in a NumPy array."""
    return np.array(rdd.collect())


def fcollect(rdd):
    """Flatten `rdd` by one level (each record must be iterable) and collect it as a NumPy array."""
    return np.array(rdd.flatMap(lambda r: r).collect())

In [28]:
SEP = '[SEP]'
PAD = '[PAD]'
def mapper(r):
    """Build the model input sequence and its token-type masks for one row.

    The sequence is ``question tokens + [SEP] + concepts + [SEP] + variables`` where
    * question tokens are `questionPatternModEntities` split on single spaces,
    * concepts are the sorted distinct `ns:`-prefixed objects of the non-FILTER
      lines between the first and last line of `sparqlPatternModEntities`,
    * variables are the sorted distinct `?x<digit>` names found in the query.

    Returns:
        (seq, isconcept, isvariable): three lists of equal length; the masks flag
        the concept and variable positions of `seq` respectively.

    Raises:
        ValueError: if a non-FILTER query line does not match the triple pattern.
    """
    toks = r['questionPatternModEntities'].split(' ')
    variables = sorted(set(re.findall(r'\?x[0-9]', r['sparqlPatternModEntities'])))
    concepts = set()
    for line in r['sparqlPatternModEntities'].split('\n')[1 : -1]:
        if 'FILTER' not in line:
            [[concept, *_]] = re.findall(r'^[^ ]+ [^ ]+ ([^ ]+)( .)?$', line)
            if concept.startswith('ns:'):
                concepts.add(concept)
    concepts = sorted(concepts)
    seq = toks + [SEP] + concepts + [SEP] + variables
    # Each mask needs an explicit False for both [SEP] positions; without them the
    # masks are two elements shorter than `seq` and shifted relative to it.
    isconcept = len(toks) * [False] + [False] + len(concepts) * [True] + [False] + len(variables) * [False]
    isvariable = len(toks) * [False] + [False] + len(concepts) * [False] + [False] + len(variables) * [True]
    return seq, isconcept, isvariable

# Build the sequences once and cache them; `seq_rdd` is reused for the vocabulary and the arrays.
rdd = df.rdd.map(mapper).cache()
seq_rdd, isconcept_rdd, isvariable_rdd = get(rdd, 0).cache(), get(rdd, 1), get(rdd, 2)
# `distinct().collect()` returns tokens in no guaranteed order; sort them (as is done for
# the relation and concept vocabularies) so token ids are reproducible across runs.
# [PAD] is appended afterwards and therefore always has the largest id.
idx2tok = sorted(seq_rdd.flatMap(lambda r: r).distinct().collect()) + [PAD]
tok2idx = {tok : idx for idx, tok in enumerate(idx2tok)}

# Sequence lengths plus the flattened token ids and masks of all sequences.
d['n_tok'] = collect(seq_rdd.map(len))
d['seq'] = fcollect(seq_rdd.map(lambda r: [tok2idx[tok] for tok in r]))
d['isconcept'], d['isvariable'] = fcollect(isconcept_rdd), fcollect(isvariable_rdd)

In [29]:
# Select three kinds of tokens from the vocabulary by surface form:
# `M<digit>`, anything starting with `ns:`, and `?x<digit>`.
entities = sorted(tok for tok in idx2tok if re.match(r'^M[0-9]$', tok))
concepts = sorted(tok for tok in idx2tok if tok.startswith('ns:'))
variables = sorted(tok for tok in idx2tok if re.match(r'^\?x[0-9]$', tok))
# Broadcast the combined set via `sc` so the `mapper` defined next can test membership.
sp_toks = sc.broadcast(set(entities + concepts + variables))
def mapper(r):
    """Locate the special tokens (members of `sp_toks`) inside one token sequence.

    Args:
        r: a sequence of token strings.

    Returns:
        (n, tok, n_idx, idx) where, with the distinct special tokens taken in sorted order,
        `n` is their number, `tok` their vocabulary ids (via `tok2idx`), `n_idx` the number
        of occurrences of each, and `idx` the concatenated occurrence positions.
    """
    # Named `positions` rather than `d` to avoid shadowing the notebook-level array dict.
    positions = defaultdict(list)
    for i, tok in enumerate(r):
        if tok in sp_toks.value:
            positions[tok].append(i)

    # Sort once and reuse the order for all three outputs.
    keys = sorted(positions)
    n = len(keys)
    tok = [tok2idx[k] for k in keys]
    n_idx = [len(positions[k]) for k in keys]
    # Flat comprehension instead of sum(lists, []), which copies the accumulator on every step.
    idx = [i for k in keys for i in positions[k]]

    return n, tok, n_idx, idx

# Per sequence: number of distinct special tokens (`n`), their vocabulary ids (`tok`),
# their occurrence counts (`n_idx`) and occurrence positions (`idx`).
# `n` keeps one entry per sequence; the other three are flattened across sequences.
rdd = seq_rdd.map(mapper).cache()
d['n'] = collect(get(rdd, 0))
d['tok'] = fcollect(get(rdd, 1))
d['n_idx'] = fcollect(get(rdd, 2))
d['idx'] = fcollect(get(rdd, 3))

In [30]:
def mapper(r):
    """Turn the body of one query into an edge list over locally numbered nodes.

    Every line between the first and last line of `sparqlPatternModEntities` becomes
    one edge: FILTER lines give `lhs -'!='-> rhs`, all other lines give
    `subject -relation-> object`. Node names are replaced by their rank in the sorted
    array of distinct names occurring in this query.

    Returns:
        (src, rel, dst): integer arrays `src`/`dst` and the list of relation strings `rel`.
    """
    filter_pattern = r'^FILTER \( ([^ ]+) != ([^ ]+) \)( .)?$'
    triple_pattern = r'^([^ ]+) ([^ ]+) ([^ ]+)( .)?$'

    edges = []
    for clause in r['sparqlPatternModEntities'].split('\n')[1 : -1]:
        if 'FILTER' in clause:
            [[head, tail, *_]] = re.findall(filter_pattern, clause)
            edges.append((head, '!=', tail))
        else:
            [[head, label, tail, *_]] = re.findall(triple_pattern, clause)
            edges.append((head, label, tail))

    heads = [edge[0] for edge in edges]
    rel = [edge[1] for edge in edges]
    tails = [edge[2] for edge in edges]

    # Number the nodes over heads and tails jointly, then split the ids back into two halves.
    nodes, inverse = np.unique(heads + tails, return_inverse=True)
    src, dst = np.split(np.arange(len(nodes))[inverse], 2)
    return src, rel, dst

# One (src, rel, dst) edge list per query.
rdd = df.rdd.map(mapper).cache()
# Edge endpoints, flattened across queries.
d['src'], d['dst'] = fcollect(get(rdd, 0)), fcollect(get(rdd, 2))
rel_rdd = get(rdd, 1).cache()
# Number of edges per query.
d['m'] = collect(rel_rdd.map(len))
# From here on `rel_rdd` holds one relation string per edge rather than one list per query.
rel_rdd = rel_rdd.flatMap(lambda r: r).cache()
# Sorted relation vocabulary and the flattened relation ids.
idx2rel = sorted(rel_rdd.distinct().collect())
rel2idx = {rel : idx for idx, rel in enumerate(idx2rel)}
d['rel'] = collect(rel_rdd.map(lambda r: rel2idx[r]))

In [31]:
# Persist both vocabularies and the array bundle. Context managers make sure the
# pickle files are flushed and closed as soon as each dump finishes.
with open(f'{output_dir}/tok-vocab.pickle', 'wb') as f:
    pickle.dump([idx2tok, tok2idx], f)
with open(f'{output_dir}/rel-vocab.pickle', 'wb') as f:
    pickle.dump([idx2rel, rel2idx], f)
np.savez(f'{output_dir}/data', **d)

### Maximum multiplicity

In [32]:
def mapper(r):
    """Return how many times the most frequent (src, dst) pair occurs in one edge list.

    Args:
        r: a `(src, rel, dst)` triple; the relation labels are ignored.
    """
    heads, _labels, tails = r
    endpoints = np.vstack([heads, tails])
    # Distinct columns of `endpoints` are distinct (src, dst) pairs; keep only their counts.
    counts = np.unique(endpoints, return_counts=True, axis=1)[1]
    return counts.max()

# Largest per-query pair multiplicity over all edge lists in `rdd`.
rdd.map(mapper).reduce(max)

9

## Variable prediction

In [33]:
# Fresh array dict for the variable-count dataset (rebinds the notebook-level `d`).
d = {}
# Question-pattern tokens mapped to vocabulary ids (question only, no [SEP] suffix parts).
tok_rdd = df.rdd.map(lambda r: [tok2idx[tok] for tok in r['questionPatternModEntities'].split(' ')])
# Flattened token ids and the per-question token counts.
d['seq'] = fcollect(tok_rdd)
d['n_tok'] = collect(tok_rdd.map(len))
# Number of distinct `?x<digit>` variables in each query pattern.
d['n_var'] = collect(df.rdd.map(lambda r: len(set(re.findall(r'\?x[0-9]', r['sparqlPatternModEntities'])))))
np.savez(f'{output_dir}/nvar', **d)

## Concept prediction

In [34]:
def mapper(r):
    """Return the set of `ns:`-prefixed objects used by one query.

    Only the lines between the first and last line of `sparqlPatternModEntities`
    are inspected, and lines containing 'FILTER' are skipped.

    Raises:
        ValueError: if an inspected line does not match the triple pattern.
    """
    found = set()
    clauses = r['sparqlPatternModEntities'].split('\n')[1 : -1]
    for clause in clauses:
        if 'FILTER' in clause:
            continue
        [[obj, *_]] = re.findall(r'^[^ ]+ [^ ]+ ([^ ]+)( .)?$', clause)
        if obj.startswith('ns:'):
            found.add(obj)
    return found

In [35]:
# Fresh array dict for the concept-prediction dataset (rebinds the notebook-level `d`).
d = {}
# Question-pattern tokens mapped to vocabulary ids.
tok_rdd = df.rdd.map(lambda r: [tok2idx[tok] for tok in r['questionPatternModEntities'].split(' ')])
d['seq'] = fcollect(tok_rdd)
d['n_tok'] = collect(tok_rdd.map(len))

# One set of `ns:`-prefixed objects per query.
con = df.rdd.map(mapper).cache()
d['n_con'] = collect(con.map(len))
idx2con = sorted(con.flatMap(lambda r: r).distinct().collect())
# The comprehension variable `con` is local to the comprehension; the RDD bound to `con` is unaffected.
con2idx = {con : idx for idx, con in enumerate(idx2con)}
# Sorted concept ids per query, flattened across queries.
d['con'] = fcollect(con.map(lambda r: sorted(con2idx[con] for con in r)))

np.savez(f'{output_dir}/concept', **d)