In [134]:
from collections import defaultdict
import json
from operator import *
import re

import numpy as np
from scipy.sparse import csr_matrix

# The CFQ dataset

In [2]:
# Read the CFQ dataset from Parquet through the notebook's SQL context
# and show which columns it provides.
df = sqlCtx.read.parquet('/data/yu_gai/cfq/dataset.parquet/')
df.columns

['complexityMeasures',
 'expectedResponse',
 'expectedResponseWithMids',
 'question',
 'questionPatternModEntities',
 'questionWithBrackets',
 'questionWithMids',
 'ruleIds',
 'ruleTree',
 'sparql',
 'sparqlPattern',
 'sparqlPatternModEntities']

## Some samples

In [3]:
# Print the first n (question pattern, SPARQL pattern) pairs.
n = 2
rs = (
    df.rdd
    .map(lambda row: [row['questionPatternModEntities'], row['sparqlPatternModEntities']])
    .take(n)
)
for i, [question, query] in enumerate(rs):
    print(question, query, sep='\n')
    # Blank separator after every sample except the n-th one.
    if i + 1 < n:
        print()

Were M2 and M5 executive produced by a British executive producer of M0 and executive produced by M3 and M4
SELECT count(*) WHERE {
?x0 ns:film.producer.films_executive_produced M0 .
?x0 ns:people.person.nationality ns:m.07ssc .
M2 ns:film.film.executive_produced_by ?x0 .
M2 ns:film.film.executive_produced_by M3 .
M2 ns:film.film.executive_produced_by M4 .
M5 ns:film.film.executive_produced_by ?x0 .
M5 ns:film.film.executive_produced_by M3 .
M5 ns:film.film.executive_produced_by M4
}

Were M1 , M2 , and M3 influenced by a film producer
SELECT count(*) WHERE {
?x0 a ns:film.producer .
M1 ns:influence.influence_node.influenced_by ?x0 .
M2 ns:influence.influence_node.influenced_by ?x0 .
M3 ns:influence.influence_node.influenced_by ?x0
}


## SPARQL syntax

In [4]:
# Every distinct run of capital letters that occurs in the raw SPARQL strings.
p = re.compile(r'[A-Z]+')
(
    df.rdd
    .flatMap(lambda row: p.findall(row['sparql']))
    .distinct()
    .collect()
)

['SELECT', 'FILTER', 'WHERE', 'DISTINCT']

In [5]:
# One list of lines per query; cached because several later cells reuse it.
rdd = df.rdd.map(lambda row: row['sparqlPatternModEntities'].split('\n')).cache()
# Distinct first lines and distinct last lines across all queries.
(
    rdd.map(lambda lines: lines[0]).distinct().collect(),
    rdd.map(lambda lines: lines[-1]).distinct().collect(),
)

(['SELECT count(*) WHERE {', 'SELECT DISTINCT ?x0 WHERE {'], ['}'])

## SPARQL triples

### Triple syntax

In [30]:
# Distinct query body lines (first and last line of each query dropped)
# that do not contain 'FILTER'.
triples = rdd.flatMap(
    lambda lines: [line for line in lines[1:-1] if 'FILTER' not in line]
).distinct()
# Three space-separated fields plus an optional two-character tail.
p = re.compile(r'^([^ ]+) ([^ ]+) ([^ ]+)( .)?$')
triples = triples.map(lambda line: p.findall(line)).cache()
# Every line must match the pattern exactly once.
assert triples.map(lambda found: len(found) == 1).reduce(and_)
triples = triples.map(lambda found: found[0])
assert triples.map(lambda triple: triple[-1] in ('', ' .')).reduce(and_)  # conjunction only

### Subject

In [31]:
# Distinct first fields (subjects) of the parsed triples, sorted.
sorted(
    triples
    .map(lambda triple: triple[0])
    .distinct()
    .collect()
)

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9']

### Relation

In [33]:
# Distinct second fields (relations) of the parsed triples, sorted.
sorted(
    triples
    .map(lambda triple: triple[1])
    .distinct()
    .collect()
)

['^ns:people.person.gender',
 '^ns:people.person.nationality',
 'a',
 'ns:business.employer.employees/ns:business.employment_tenure.person',
 'ns:film.actor.film/ns:film.performance.character',
 'ns:film.actor.film/ns:film.performance.film',
 'ns:film.cinematographer.film',
 'ns:film.director.film',
 'ns:film.editor.film',
 'ns:film.film.cinematography',
 'ns:film.film.costume_design_by',
 'ns:film.film.directed_by',
 'ns:film.film.distributors/ns:film.film_film_distributor_relationship.distributor',
 'ns:film.film.edited_by',
 'ns:film.film.executive_produced_by',
 'ns:film.film.film_art_direction_by',
 'ns:film.film.prequel',
 'ns:film.film.produced_by|ns:film.film.production_companies',
 'ns:film.film.sequel',
 'ns:film.film.starring/ns:film.performance.actor',
 'ns:film.film.written_by',
 'ns:film.film_art_director.films_art_directed',
 'ns:film.film_costumer_designer.costume_design_for_film',
 'ns:film.film_distributor.films_distributed/ns:film.film_film_distributor_relationship.f

### Object

In [34]:
# Distinct third fields (objects) of the parsed triples, sorted.
sorted(
    triples
    .map(lambda triple: triple[2])
    .distinct()
    .collect()
)

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'ns:business.employer',
 'ns:fictional_universe.fictional_character',
 'ns:film.actor',
 'ns:film.cinematographer',
 'ns:film.director',
 'ns:film.editor',
 'ns:film.film',
 'ns:film.film_art_director',
 'ns:film.film_costumer_designer',
 'ns:film.film_distributor',
 'ns:film.producer',
 'ns:film.production_company',
 'ns:film.writer',
 'ns:m.02zsn',
 'ns:m.0345h',
 'ns:m.03_3d',
 'ns:m.03rjj',
 'ns:m.059j2',
 'ns:m.05zppz',
 'ns:m.06mkj',
 'ns:m.07ssc',
 'ns:m.09c7w0',
 'ns:m.0b90_r',
 'ns:m.0d05w3',
 'ns:m.0d060g',
 'ns:m.0d0vqn',
 'ns:m.0f8l9c',
 'ns:people.person']

In [35]:
# Distinct objects of the triples whose relation is not 'a', sorted.
sorted(
    triples
    .filter(lambda triple: triple[1] != 'a')
    .map(lambda triple: triple[2])
    .distinct()
    .collect()
)

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'ns:m.02zsn',
 'ns:m.0345h',
 'ns:m.03_3d',
 'ns:m.03rjj',
 'ns:m.059j2',
 'ns:m.05zppz',
 'ns:m.06mkj',
 'ns:m.07ssc',
 'ns:m.09c7w0',
 'ns:m.0b90_r',
 'ns:m.0d05w3',
 'ns:m.0d060g',
 'ns:m.0d0vqn',
 'ns:m.0f8l9c']

In [37]:
# Relations other than 'a' whose object starts with 'ns'.
(
    triples
    .filter(lambda triple: triple[1] != 'a' and triple[2].startswith('ns'))
    .map(lambda triple: triple[1])
    .distinct()
    .collect()
)

['ns:people.person.gender', 'ns:people.person.nationality']

### "Instance of" triples

In [10]:
# Triples whose relation is 'a'.
# Built from `triples` (already unpacked to one tuple per line) rather than `xs`:
# `xs` is only assigned further down, where it holds FILTER matches, so reading it
# here fails on a fresh top-to-bottom run.
is_ = triples.filter(lambda r: r[1] == 'a').distinct()

#### Entities

In [11]:
# Distinct subjects of the 'a' triples, sorted.
sorted(
    is_
    .map(lambda triple: triple[0])
    .distinct()
    .collect()
)

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6']

#### Concepts

In [12]:
# Distinct objects of the 'a' triples, sorted.
sorted(
    is_
    .map(lambda triple: triple[2])
    .distinct()
    .collect()
)

['ns:business.employer',
 'ns:fictional_universe.fictional_character',
 'ns:film.actor',
 'ns:film.cinematographer',
 'ns:film.director',
 'ns:film.editor',
 'ns:film.film',
 'ns:film.film_art_director',
 'ns:film.film_costumer_designer',
 'ns:film.film_distributor',
 'ns:film.producer',
 'ns:film.production_company',
 'ns:film.writer',
 'ns:people.person']

## SPARQL Filters

In [13]:
# Distinct query lines that contain 'FILTER'.
filters = rdd.flatMap(
    lambda lines: [line for line in lines if 'FILTER' in line]
).distinct()
p = re.compile(r'^FILTER \( ([^ ]+) != ([^ ]+) \)( .)?$')  # not equal only
xs = filters.map(lambda line: p.findall(line)).cache()
# Every FILTER line must match the pattern exactly once ...
assert xs.map(lambda found: len(found) == 1).reduce(and_)
# ... and its optional tail is either absent or ' .'.
assert xs.map(lambda found: found[0][-1] in ('', ' .')).reduce(and_)

### LHS

In [14]:
# Distinct left-hand operands of the FILTER inequalities, sorted.
sorted(
    xs
    .map(lambda found: found[0][0])
    .distinct()
    .collect()
)

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7']

### RHS

In [15]:
# Distinct right-hand operands of the FILTER inequalities, sorted.
sorted(
    xs
    .map(lambda found: found[0][1])
    .distinct()
    .collect()
)

['?x0',
 '?x1',
 '?x2',
 '?x3',
 '?x4',
 '?x5',
 'M0',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8']

## Preprocessing

In [16]:
# Load the MCD1 split indices; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open('/data/yu_gai/cfq/splits/mcd1.json') as f:
    mcd1 = json.load(f)
# Split sizes next to the total number of rows in the dataset.
len(mcd1['trainIdxs']), len(mcd1['devIdxs']), len(mcd1['testIdxs']), df.count()

(95743, 11968, 11968, 239357)

In [17]:
# Check that every question pattern is made only of [A-Za-z0-9,'] tokens separated by single spaces.
p = re.compile(r"^([A-Za-z0-9,']+[ ]?)+$")
# Test the match object itself: `Match.string` is always the input string, so comparing
# it with the input says nothing, and a non-matching question would raise AttributeError
# on None instead of producing False.
df.rdd.map(lambda r: re.match(p, r['questionPatternModEntities']) is not None).reduce(and_)

True

In [146]:
# Accumulator for the arrays that are saved at the end of the notebook.
d = dict()

In [147]:
def mapper(r):
    """Build the token sequence and type masks for one example.

    Parameters
    ----------
    r : mapping
        Must provide the string fields 'questionPatternModEntities' and
        'sparqlPatternModEntities'.

    Returns
    -------
    tuple (seq, isconcept, isvariable)
        seq        : question tokens, followed by the sorted distinct 'ns:' objects of
                     the non-FILTER body lines, followed by the sorted distinct
                     '?x<digit>' variables found anywhere in the query.
        isconcept  : list of bool aligned with seq, True on the 'ns:' objects.
        isvariable : list of bool aligned with seq, True on the variables.

    Raises
    ------
    ValueError
        If a non-FILTER body line does not match the triple pattern (the unpacking
        below expects exactly one match).
    """
    question = r['questionPatternModEntities']
    sparql = r['sparqlPatternModEntities']

    toks = question.split(' ')
    # No separate entity extraction: the returned sequence only appends concepts and
    # variables, and the former unused `entities` local used the mistyped pattern 'M[0-0]'.
    variables = sorted(set(re.findall(r'\?x[0-9]', sparql)))

    found = set()
    # Skip the first line and the last line of the query; only body lines are inspected.
    for line in sparql.split('\n')[1 : -1]:
        if 'FILTER' in line:
            continue
        [[concept, *_]] = re.findall(r'^[^ ]+ [^ ]+ ([^ ]+)( .)?$', line)
        if concept.startswith('ns:'):
            found.add(concept)
    concepts = sorted(found)

    seq = toks + concepts + variables
    isconcept = len(toks) * [False] + len(concepts) * [True] + len(variables) * [False]
    isvariable = len(toks) * [False] + len(concepts) * [False] + len(variables) * [True]
    return seq, isconcept, isvariable

# Run the sequence mapper once and split its three outputs into separate RDDs.
rdd = df.rdd.map(mapper).cache()


def get(rdd, i):
    """Return an RDD holding the i-th component of every record of `rdd`."""
    return rdd.map(lambda r: r[i])


seq_rdd, isconcept_rdd, isvariable_rdd = get(rdd, 0).cache(), get(rdd, 1), get(rdd, 2)

# The vocabulary comes from seq_rdd; `seq` exists only as a local inside mapper,
# so referencing it here raised NameError.
# NOTE(review): ids follow the order returned by distinct().collect(), and id2tok is
# not among the arrays stored in `d` — confirm the vocabulary is persisted elsewhere.
id2tok = seq_rdd.flatMap(lambda r: r).distinct().collect()
tok2id = {tok: i for i, tok in enumerate(id2tok)}


def collect(rdd):
    """Flatten an RDD of lists into a single 1-D numpy array on the driver."""
    return np.array(rdd.flatMap(lambda r: r).collect())


d['n_tok'] = np.array(seq_rdd.map(len).collect())  # sequence length per example
d['seq'] = collect(seq_rdd.map(lambda r: [tok2id[tok] for tok in r]))
d['isconcept'], d['isvariable'] = collect(isconcept_rdd), collect(isvariable_rdd)

In [150]:
# Vocabulary entries that are entity placeholders, 'ns:' tokens or query variables.
entities = sorted(filter(lambda tok: re.match(r'^M[0-9]$', tok), id2tok))
concepts = sorted(filter(lambda tok: tok.startswith('ns:'), id2tok))
variables = sorted(filter(lambda tok: re.match(r'^\?x[0-9]$', tok), id2tok))
# Broadcast their union so the mapper below can look tokens up on the workers.
sp_toks = sc.broadcast({*entities, *concepts, *variables})
def mapper(r):
    """Index the positions of the special tokens inside one token sequence.

    Parameters
    ----------
    r : sequence of str
        Tokens of one example.

    Returns
    -------
    tuple (n, n_idx, idx)
        n     : number of distinct special tokens (members of sp_toks.value) in r.
        n_idx : cumulative group sizes starting at 0, one group per distinct special
                token taken in sorted token order.
        idx   : positions in r of each group's occurrences, concatenated group by group.
    """
    positions = {}
    for pos, tok in enumerate(r):
        if tok in sp_toks.value:
            positions.setdefault(tok, []).append(pos)

    # One list of positions per distinct special token, in sorted token order.
    groups = [positions[tok] for tok in sorted(positions)]
    n = len(groups)
    n_idx = np.cumsum([0] + [len(group) for group in groups]).tolist()
    idx = [pos for group in groups for pos in group]
    return n, n_idx, idx

# Apply the position mapper to every sequence and store its three outputs.
rdd = seq_rdd.map(mapper).cache()
d['n'] = np.array(get(rdd, 0).collect())
d.update(
    n_idx=collect(get(rdd, 1)),
    idx=collect(get(rdd, 2)),
)

In [151]:
# Write every array collected in `d` to a single uncompressed archive, keyed by its
# dict key (np.savez appends '.npz' to the file name when it is missing).
np.savez('/data/yu_gai/cfq/data', **d)