# (Residue from the web file viewer, kept as comments so the program parses.)
# Permalink
# Fetching contributors…
# Cannot retrieve contributors at this time
# 245 lines (211 sloc) 7.03 KB
## Random variable to predict #################################################
# This application's goal is to predict whether a given pair of person mentions
# indicates a spouse relationship or not.
# Variable relation (note the `?`): one variable per (p1_id, p2_id) pair.
# Both key columns reference person_mention.mention_id, under the aliases
# "p1" and "p2" respectively.
@extraction
has_spouse?(
    @key
    @references(relation="person_mention", column="mention_id", alias="p1")
    p1_id text,
    @key
    @references(relation="person_mention", column="mention_id", alias="p2")
    p2_id text
).
## Input Data #################################################################
# Input articles. `id` is both the key and the distribution column;
# `content` is annotated as searchable.
@source
articles(
    @key
    @distributed_by
    id      text,
    @searchable
    content text
).
## NLP markup #################################################################
# One row per sentence, keyed by (doc_id, sentence_index).
# The seven markup columns are stored as json; each carries a @search_type
# annotation naming the array type (text[] or int[]) to use for search.
@source
sentences(
    @key
    @distributed_by
    # XXX This breaks the search index. @source should not be derived from another @source
    #@references(relation="articles", column="id")
    doc_id         text,
    @key
    sentence_index int,
    @search_type("text[]")
    tokens         json,
    @search_type("text[]")
    lemmas         json,
    @search_type("text[]")
    pos_tags       json,
    @search_type("text[]")
    ner_tags       json,
    @search_type("int[]")
    doc_offsets    json,
    @search_type("text[]")
    dep_types      json,
    @search_type("int[]")
    dep_tokens     json
).
# UDF declaration: takes (doc_id, content) and returns rows shaped like
# `sentences`. Implemented by udf/nlp_markup.sh, exchanging tsj lines.
function nlp_markup over (
        doc_id  text,
        content text
    ) returns rows like sentences
    implementation "udf/nlp_markup.sh" handles tsj lines.

# Feed every article to nlp_markup and append its output to `sentences`.
sentences += nlp_markup(doc_id, content) :-
    articles(doc_id, content).
## Candidate mapping ##########################################################
# Person mentions, keyed by mention_id. (doc_id, sentence_index) reference
# the `sentences` relation under the shared alias "appears_in"; doc_id is
# also the distribution column. begin_index/end_index are plain int columns.
@extraction
person_mention(
    @key
    mention_id     text,
    @searchable
    mention_text   text,
    @distributed_by
    @references(relation="sentences", column="doc_id", alias="appears_in")
    doc_id         text,
    @references(relation="sentences", column="sentence_index", alias="appears_in")
    sentence_index int,
    begin_index    int,
    end_index      int
).
# UDF declaration: takes a sentence's identifiers plus its tokens and NER
# tags, and returns rows shaped like `person_mention`.
# Implemented by udf/map_person_mention.py, exchanging tsj lines.
function map_person_mention over (
        doc_id         text,
        sentence_index int,
        tokens         text[],
        ner_tags       text[]
    ) returns rows like person_mention
    implementation "udf/map_person_mention.py" handles tsj lines.

# Run the UDF over every sentence; only the tokens (3rd column) and
# ner_tags (6th column) of `sentences` are passed along with the key.
person_mention += map_person_mention(
    doc_id, sentence_index, tokens, ner_tags
) :-
    sentences(doc_id, sentence_index, tokens, _, _, ner_tags, _, _, _).
# Candidate pairs of person mentions, with the mention text of each side.
spouse_candidate(
    p1_id   text,
    p1_name text,
    p2_id   text,
    p2_name text
).

# Number of person mentions found in each sentence.
num_people(doc_id, sentence_index, COUNT(mention)) :-
    person_mention(mention, _, doc_id, sentence_index, _, _).

# Pair up two mentions from the same sentence when:
#   - the sentence has fewer than 5 person mentions,
#   - p1 sorts before p2 (so each unordered pair is emitted once),
#   - the two mention texts differ, and
#   - the two mentions start at different positions.
spouse_candidate(p1, p1_name, p2, p2_name) :-
    num_people(doc, sent, mention_count),
    person_mention(p1, p1_name, doc, sent, p1_start, _),
    person_mention(p2, p2_name, doc, sent, p2_start, _),
    mention_count < 5,
    p1 < p2,
    p1_name != p2_name,
    p1_start != p2_start.
## Feature Extraction #########################################################
# Feature extraction (using DDLIB via a UDF) at the relation level
# One row per (p1_id, p2_id, feature); all three columns form the key.
# p1_id/p2_id reference the matching has_spouse columns under the alias
# "has_spouse".
@extraction
spouse_feature(
    @key
    @references(relation="has_spouse", column="p1_id", alias="has_spouse")
    p1_id   text,
    @key
    @references(relation="has_spouse", column="p2_id", alias="has_spouse")
    p2_id   text,
    @key
    feature text
).
# UDF declaration: takes the two mention ids and their begin/end indexes,
# the sentence key, and the sentence's markup arrays; returns rows shaped
# like `spouse_feature`.
# Implemented by udf/extract_spouse_features.py, exchanging tsj lines.
function extract_spouse_features over (
        p1_id          text,
        p2_id          text,
        p1_begin_index int,
        p1_end_index   int,
        p2_begin_index int,
        p2_end_index   int,
        doc_id         text,
        sent_index     int,
        tokens         text[],
        lemmas         text[],
        pos_tags       text[],
        ner_tags       text[],
        dep_types      text[],
        dep_tokens     int[]
    ) returns rows like spouse_feature
    implementation "udf/extract_spouse_features.py" handles tsj lines.

# Run the UDF for every pair of person mentions that share a sentence,
# joined with that sentence's markup (doc_offsets, the 7th column of
# `sentences`, is not passed).
spouse_feature += extract_spouse_features(
    p1_id, p2_id, p1_begin_index, p1_end_index, p2_begin_index, p2_end_index,
    doc_id, sent_index, tokens, lemmas, pos_tags, ner_tags, dep_types, dep_tokens
) :-
    person_mention(p1_id, _, doc_id, sent_index, p1_begin_index, p1_end_index),
    person_mention(p2_id, _, doc_id, sent_index, p2_begin_index, p2_end_index),
    sentences(
        doc_id, sent_index,
        tokens, lemmas, pos_tags, ner_tags, _, dep_types, dep_tokens
    ).
## Distant Supervision ########################################################
# Supervision labels for mention pairs, keyed by (p1_id, p2_id), which
# reference the matching has_spouse columns. `label` holds the integer vote
# and `rule_id` names the rule that produced it; both are navigable.
@extraction
spouse_label(
    @key
    @references(relation="has_spouse", column="p1_id", alias="has_spouse")
    p1_id   text,
    @key
    @references(relation="has_spouse", column="p2_id", alias="has_spouse")
    p2_id   text,
    @navigable
    label   int,
    @navigable
    rule_id text
).
# Give every candidate pair a neutral vote (label 0, no rule id) so that all
# pairs in spouse_candidate are considered as unsupervised examples.
spouse_label(p1, p2, 0, NULL) :-
    spouse_candidate(p1, _, p2, _).
# Distant supervision using data from DBpedia: pairs of names, both columns
# forming the key.
@source
spouses_dbpedia(
    @key
    person1_name text,
    @key
    person2_name text
).

# Cast a positive vote (label 1, rule "from_dbpedia") for a candidate whose
# two mention texts match a DBpedia pair, in either order, compared
# case-insensitively via lower().
spouse_label(p1, p2, 1, "from_dbpedia") :-
    spouse_candidate(p1, p1_name, p2, p2_name),
    spouses_dbpedia(kb_name1, kb_name2),
    [ lower(kb_name1) = lower(p1_name), lower(kb_name2) = lower(p2_name) ;
      lower(kb_name2) = lower(p1_name), lower(kb_name1) = lower(p2_name) ].
# supervision by heuristic rules in a UDF
#
# Input columns, in order: the two mention ids with their begin/end indexes,
# the sentence key (doc_id, sentence_index), and the sentence's markup arrays.
# This list mirrors the 14 columns passed by the `spouse_label += supervise(...)`
# rule; `sentences` has no sentence_text column, so none is declared here.
#
# Output columns: (p1_id, p2_id, label, rule_id), i.e. one labelling vote.
# Implemented by udf/supervise_spouse.py, exchanging tsj lines.
function supervise over (
        p1_id text, p1_begin int, p1_end int,
        p2_id text, p2_begin int, p2_end int,
        doc_id         text,
        sentence_index int,
        tokens         text[],
        lemmas         text[],
        pos_tags       text[],
        ner_tags       text[],
        dep_types      text[],
        dep_tokens     int[]
    ) returns (
        p1_id text, p2_id text, label int, rule_id text
    )
    implementation "udf/supervise_spouse.py" handles tsj lines.
# Run the supervise UDF on every candidate pair and append its votes to
# spouse_label. The sentence is located through the first mention (p1_id);
# the second mention contributes only its begin/end indexes. doc_offsets
# (7th column of `sentences`) is not passed to the UDF.
spouse_label += supervise(
    p1_id, p1_begin, p1_end,
    p2_id, p2_begin, p2_end,
    doc_id, sentence_index,
    tokens, lemmas, pos_tags, ner_tags, dep_types, dep_token_indexes
) :-
    spouse_candidate(p1_id, _, p2_id, _),
    person_mention(p1_id, _, doc_id, sentence_index, p1_begin, p1_end),
    person_mention(p2_id, _, _,      _,              p2_begin, p2_end),
    sentences(
        doc_id, sentence_index,
        tokens, lemmas, pos_tags, ner_tags, _, dep_types, dep_token_indexes
    ).
# Resolve multiple labels per pair by majority vote: sum the votes
# (labels in {-1,0,1}) cast for each (p1_id, p2_id).
spouse_label_resolved(p1_id, p2_id, SUM(vote)) :-
    spouse_label(p1_id, p2_id, vote, _).

# Assign the resolved labels for the spouse relation:
# positive sum -> TRUE, negative sum -> FALSE, zero -> NULL (unlabelled).
@materialize
has_spouse(p1_id, p2_id) =
    if vote_sum > 0 then TRUE
    else if vote_sum < 0 then FALSE
    else NULL end :-
        spouse_label_resolved(p1_id, p2_id, vote_sum).
## Inference Rules ############################################################
# Features
# Connects each has_spouse(p1_id, p2_id) variable to every feature f extracted
# for that pair; the weight is parameterized by the feature value f.
@weight(f)
has_spouse(p1_id, p2_id) :-
spouse_feature(p1_id, p2_id, f).
# Inference rule: Symmetry
# Fixed weight 3.0 on: if (p1_id, p2_id) is a spouse pair then so is
# (p2_id, p1_id). The body is TRUE, so no extra condition restricts the pairs.
# NOTE(review): spouse_candidate only emits pairs with p1 < p2, so the reversed
# variable may not exist unless the supervise UDF emits it — confirm.
@weight(3.0)
has_spouse(p1_id, p2_id) => has_spouse(p2_id, p1_id) :-
TRUE.
# Inference rule: Only one marriage
# Fixed negative weight -1.0 on: has_spouse(p1_id, p2_id) implying
# has_spouse(p1_id, p3_id), i.e. the same first mention with another partner.
# NOTE(review): p3_id is unconstrained (body is TRUE), so it can presumably
# equal p2_id; confirm whether a p2_id != p3_id condition is intended.
@weight(-1.0)
has_spouse(p1_id, p2_id) => has_spouse(p1_id, p3_id) :-
TRUE.