In [57]:
# Imports
import regex as re
import sys
sys.path.append("../")
sys.path.append("../../")
sys.path.append("../../../")

from num2words import num2words
import nlp.en.segmenters
import nlp.en.tokens

In [186]:
# Duration units: singular base names; each one is also accepted with a trailing "s"
base_unit_list = [
    'second', 'minute', 'hour', 'day',
    'week', 'month', 'quarter', 'year',
    'annum',
]

# Plural forms come first in the list, followed by the singular forms
unit_list = [base + "s" for base in base_unit_list] + list(base_unit_list)

# Length of one unit expressed in days.
# Months and years use the round figures 30 and 365 (not 365.25-based averages).
unit_day_map = {
    "second": 1/(60.*60.*24.),
    "minute": 1/(60.*24),
    "hour": 1/24.,
    "day": 1,
    "week": 7,
    "month": 30,
    "quarter": 365/4.0,
    "year": 365,
    "annum": 365,
}

# A plural unit is worth exactly as many days as its singular form
unit_day_map.update(dict((base + "s", unit_day_map[base]) for base in base_unit_list))


# Setup written words: spelled-out forms of 0..365 plus a lookup back to the integer
word_list = []
word_number_map = {}
for number in range(366):
    # num2words output, and the same text with hyphens replaced by spaces
    hyphenated = num2words(number)
    spaced = hyphenated.replace("-", " ")

    # Both spellings map to the same value
    word_number_map[hyphenated] = number
    word_number_map[spaced] = number

    # List the num2words spelling first; add the spaced variant only when it differs
    word_list.append(hyphenated)
    if spaced != hyphenated:
        word_list.append(spaced)

# Setup digits: an integer with optional comma groups and optional decimal part,
# or a bare decimal such as ".5". At least one digit is required so that
# sentence punctuation ("," or ".") is never captured as a number.
DIGITS_PATTERN = r'\d+(?:,\d+)*(?:\.\d+)?|\.\d+'

# Setup delimiter pattern: separators allowed between the parts of a duration
DELIMITERS_PATTERN = r'[/:\-,\s_+@()]+'

# Written numbers and units must not be glued to other letters, so that e.g.
# "second" inside "secondary" or "one" inside "someone" is not picked up.
# Digits may still touch a unit ("30days").
DURATION_PATTERN_TEMPLATE = r'''
    (
        (
            (?P<digits_written>(?<![a-z])(?:{digits_written_pattern})(?![a-z]))
            |
            (?P<digits>{digits_pattern})
            |
            (?P<delimiters>{delimiters_pattern})
            |
            (?P<units>(?<![a-z])(?:{unit_pattern})(?![a-z]))
        ){{2,}}
    )'''

# Alternation is first-match-wins, so longer alternatives must come first
# ("twenty-one" before "twenty", "days" before "day"). Each alternative is
# escaped because the pattern is compiled with re.VERBOSE, which would
# otherwise discard the spaces inside entries such as "twenty one".
digits_written_pattern = "|".join(
    re.escape(word) for word in sorted(word_list, key=len, reverse=True))
unit_pattern = "|".join(
    re.escape(unit) for unit in sorted(unit_list, key=len, reverse=True))

duration_pattern = DURATION_PATTERN_TEMPLATE.format(
    digits_written_pattern=digits_written_pattern,
    digits_pattern=DIGITS_PATTERN,
    delimiters_pattern=DELIMITERS_PATTERN,
    unit_pattern=unit_pattern)
re_duration = re.compile(duration_pattern, re.IGNORECASE | re.MULTILINE | re.UNICODE | re.DOTALL | re.VERBOSE)

In [187]:
def get_durations(text):
    """
    Return durations found within text.

    Every match of the module-level ``re_duration`` pattern produces one
    tuple ``(unit, count, days)``:

    * ``unit``  - lower-cased unit word when exactly one unit was captured
      in the match, otherwise None.
    * ``count`` - quantity as a float. Taken from the numeric capture when
      there is exactly one, otherwise from the written number when there is
      exactly one; None when neither applies or the text cannot be parsed.
    * ``days``  - ``unit_day_map[unit] * count``, or None when the unit or
      the count is missing.

    :param text: string to search
    :return: list of (unit, count, days) tuples, in match order
    """
    durations = []

    for match in re_duration.finditer(text):
        # Every substring captured by each named group across the repetition
        captures = match.capturesdict()

        # A unit is only used when the match contains exactly one
        unit = None
        base_days = None
        if len(captures["units"]) == 1:
            unit = captures["units"][0].lower()
            base_days = unit_day_map[unit]

        # The digits group may capture bare punctuation (the "," in
        # "thirty days, and" or the "." ending "30 days."). Such tokens are
        # not numbers: keeping them would either crash float() or hide the
        # real number by making the capture count differ from one.
        numeric_tokens = [token for token in captures["digits"]
                          if any(ch.isdigit() for ch in token)]

        # Find the number to multiply base
        count = None
        if len(numeric_tokens) == 1:
            try:
                count = float(numeric_tokens[0].replace(',', ''))
            except ValueError:
                # Digit-bearing but malformed text such as "1.2.3"
                count = None
        elif len(captures["digits_written"]) == 1:
            written = captures["digits_written"][0].lower()
            if written in word_number_map:
                count = float(word_number_map[written])

        # Compare against None explicitly so that a count of zero still
        # yields a (zero-length) duration instead of being treated as missing
        if base_days is not None and count is not None:
            durations.append((unit, count, base_days * count))
        else:
            durations.append((unit, count, None))

    return durations

In [189]:
# Labelled samples: (input text, expected list of (unit, count, days) tuples)
examples = [
    # Written numbers
    ("no more than five days thereafter",
     [('days', 5.0, 5.0)]),
    ("without at least thirty days of delay",
     [('days', 30.0, 30.0)]),
    # Plain digits
    ("at most 90 days shall",
     [('days', 90.0, 90.0)]),
    # Months
    ("within a period of one month after",
     [('month', 1.0, 30.0)]),
    ("before two months pass",
     [('months', 2.0, 60.0)]),
    # Written number followed by the same number in parentheses
    ("no more than thirty (30) days",
     [('days', 30.0, 30.0)]),
    ("no more than two (2) quarters",
     [('quarters', 2.0, 182.5)]),
    # Upper case and title case
    ("BEFORE TWO MONTHS PASS",
     [('months', 2.0, 60.0)]),
    ("Before Two Months Pass",
     [('months', 2.0, 60.0)]),
    # Two durations in one sentence
    ("At Least One Month And No More Than Two Months",
     [('month', 1.0, 30.0), ('months', 2.0, 60.0)]),
    ("After The Passage Of At Least One Year There Can Be",
     [('year', 1.0, 365.0)]),
    ("from a period of two years to five years",
     [("years", 2, 730), ("years", 5, 1825)]),
    # Fractional count
    ("After The Passage Of At Least 2.5 Years There Can Be",
     [('years', 2.5, 912.5)]),
]

In [190]:
# Score get_durations against the labelled examples
total = 0
correct = 0

for sample_text, expected in examples:
    found = get_durations(sample_text)

    # Compare as sets, so ordering and repeated tuples are ignored
    unexpected = set(found) - set(expected)
    missing = set(expected) - set(found)

    if not unexpected and not missing:
        correct += 1
    else:
        # Show the failing sample with both sides of the difference
        print(sample_text)
        print((unexpected, missing, expected))
        print("-"*16)
    total += 1

print("Accuracy: {0}% on {1} samples".format(100.*float(correct)/total, total))

Accuracy: 100.0% on 13 samples
