-
Notifications
You must be signed in to change notification settings - Fork 543
/
without_ddlib.py
55 lines (45 loc) · 1.77 KB
/
without_ddlib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#! /usr/bin/env python
# File: udf/ext_has_spouse_features.py
import sys, json
# Reads one JSON object per line from stdin and writes one JSON object per
# extracted feature to stdout.
#
# Keys read from each input object:
#   "relation_id"                     - copied verbatim into every output row
#   "words", "lemma"                  - sequences that are sliced/indexed by
#                                       position (presumably parallel token
#                                       lists for one sentence; TODO confirm
#                                       against the extractor's input query)
#   "p1.start_position", "p1.length"  - span of the first mention
#   "p2.start_position", "p2.length"  - span of the second mention
#
# TODO: Sample data and the full input schema.

# Lemmas that are reported as a feature when they occur between the mentions.
MARRIED_WORDS = ('marry', 'widow')

# Gaps with fewer words than this are reported with their exact size.
MANY_WORDS_THRESHOLD = 5


def extract_features(obj):
    """Return the set of feature strings for one candidate mention pair.

    ``obj`` is a decoded input row (see the key list at the top of this
    section). The result always contains exactly one of
    ``"num_words_between=<n>"`` or ``"many_words_between"``, plus optional
    ``"important_word=<lemma>"`` and ``"potential_last_name_match"`` entries.

    Raises ``KeyError`` if a required key is missing and ``IndexError`` if a
    span ends outside ``obj["words"]``.
    """
    # Library/DSL??? This is a span, it should be an object.
    p1_start = obj["p1.start_position"]
    p1_end = p1_start + obj["p1.length"]
    p2_start = obj["p2.start_position"]
    p2_end = p2_start + obj["p2.length"]

    # Half-open range [left_idx, right_idx) of the tokens strictly between
    # the two mentions, whichever one comes first. If the spans overlap the
    # range is empty and the slices below yield nothing.
    left_idx = min(p1_end, p2_end)
    right_idx = max(p1_start, p2_start)

    words = obj["words"]
    features = set()

    # Feature 1: Find out if a lemma of marry occurs.
    # A better feature would ensure this is on the dependency path between
    # the two.
    lemma_between = obj["lemma"][left_idx:right_idx]
    for married_word in MARRIED_WORDS:
        if married_word in lemma_between:
            features.add("important_word=%s" % married_word)

    # Feature 2: The number of words between the two phrases.
    # Intuition: if they are close by, the link may be stronger.
    num_between = len(words[left_idx:right_idx])
    if num_between < MANY_WORDS_THRESHOLD:
        features.add("num_words_between=%s" % num_between)
    else:
        features.add("many_words_between")

    # Feature 3: Check if the last name matches heuristically, i.e. the
    # final token of each mention is identical.
    if words[p1_end - 1] == words[p2_end - 1]:
        features.add("potential_last_name_match")

    # TODO: Add more features, look at dependency paths, etc
    return features


def feature_rows(lines):
    """Yield one JSON-encoded output row per feature of every input line.

    ``lines`` is any iterable of JSON strings (one object per item). Blank
    lines are skipped so that a trailing newline in the input does not abort
    the run. Features of a single input row are emitted in sorted order to
    keep the output deterministic.
    """
    for line in lines:
        if not line.strip():
            continue
        obj = json.loads(line)
        for feature in sorted(extract_features(obj)):
            yield json.dumps({
                "relation_id": obj["relation_id"],
                "feature": feature
            })


def main():
    """Stream rows from stdin to stdout."""
    for row in feature_rows(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(row)


if __name__ == "__main__":
    main()