In [1]:
from __future__ import print_function

In [2]:
import re
import spacy

from collections import Counter
from simpleTransform import SimpleTransform
from sympy import postorder_traversal, Symbol
from sympy.parsing.sympy_parser import _token_splittable
from sympy.parsing.sympy_parser import convert_xor
from sympy.parsing.sympy_parser import factorial_notation
from sympy.parsing.sympy_parser import function_exponentiation
from sympy.parsing.sympy_parser import implicit_application
from sympy.parsing.sympy_parser import implicit_multiplication
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.sympy_parser import split_symbols_custom
from sympy.parsing.sympy_parser import standard_transformations
from sympy.parsing.sympy_tokenize import ENDMARKER
from sympy.parsing.sympy_tokenize import NAME
from sympy.parsing.sympy_tokenize import NUMBER
from sympy.parsing.sympy_tokenize import OP
from sympy.parsing.sympy_tokenize import STRING

In [3]:
def rewrite_number(text):
    """Rewrite number phrases in ``text`` via ``SimpleTransform.rewriteNumber``.

    The text is scanned left to right. At each position the remaining
    characters (as a list) are handed to ``rewriteNumber``; when it reports a
    positive match length, its replacement string is emitted and the scan
    skips that many characters. Otherwise the current run of non-whitespace
    characters is copied verbatim, together with the single whitespace
    character that terminates it (if any), and scanning resumes after it.

    Returns the rewritten string.
    """
    transformer = SimpleTransform()
    length = len(text)
    pieces = []
    cursor = 0
    while cursor < length:
        consumed, replacement = transformer.rewriteNumber(list(text[cursor:]))
        if consumed > 0:
            pieces.append(replacement)
            cursor += consumed
            continue
        # No number starts here: find the end of the current word ...
        word_end = cursor
        while word_end < length and not text[word_end].isspace():
            word_end += 1
        # ... and copy it along with the one whitespace character after it
        # (slicing past the end is harmless when the word ends the text).
        pieces.append(text[cursor:word_end + 1])
        cursor = word_end + 1
    return ''.join(pieces)

In [4]:
# Load the spaCy 'en' pipeline once and keep only its tokenizer component;
# the functions below call it to split text into tokens.
tokenizer = spacy.load('en').tokenizer
# Three or more dashes, each optionally followed by spaces (e.g. "- - -----").
arrow_str = re.compile(r'(-( )*){3,}')
# A dollar sign immediately followed by a digit ("$240").  The '$' has to be
# escaped: unescaped it is the end-of-string anchor and the pattern can never
# match.
doller_num = re.compile(r'(\$)(\d)')
# A letter and a period immediately followed by a digit ("x.5").  The class is
# [A-Za-z] rather than [A-z], because the latter also covers the punctuation
# between 'Z' and 'a' ('[', '\', ']', '^', '_', '`') and would split "2^.5".
stock_num = re.compile(r'([A-Za-z]\.)(\d)')
# A comma-grouped number fragment immediately followed by a letter ("1,000km").
num_unit = re.compile(r'(\d,\d\d\d)([A-Za-z])')
# A parenthesised integer immediately followed by a digit ("(3)4").
power_notation = re.compile(r'(\(\d+\))(\d)')

def rewrite_with_tokenization(text):
    """Normalise ``text`` with regex rewrites, number rewriting and tokenization.

    Steps, in order:
      1. ``power_notation`` matches get a '^' inserted between their two groups.
      2. ``arrow_str`` matches are replaced by the arrow character U+2192.
      3. ``doller_num``, ``stock_num`` and ``num_unit`` matches get a space
         inserted between their two groups.
      4. ``rewrite_number`` is applied.
      5. The result is tokenized and the tokens are joined with single spaces.
      6. ``rewrite_number`` is applied again to the space-joined text.

    Returns the final string.
    """
    split_groups = r'\g<1> \g<2>'
    substitutions = (
        (power_notation, r'\g<1>^\g<2>'),
        (arrow_str, '\u2192'),
        (doller_num, split_groups),
        (stock_num, split_groups),
        (num_unit, split_groups),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    tokens = tokenizer(rewrite_number(text))
    return rewrite_number(' '.join(str(token) for token in tokens))

In [5]:
def combinatorial_notation(tokens, local_dict, global_dict):
    """Token transformation turning ``n C<k>`` into ``binomial(n, k)``.

    Whenever a NUMBER token is immediately followed by a NAME token made of
    'c' or 'C' plus one or more digits, the pair is replaced by the token
    sequence for ``binomial(<number>, <digits>)``.  All other tokens are
    passed through unchanged.

    Args:
        tokens: iterable of ``(toknum, tokval)`` pairs.
        local_dict, global_dict: accepted for the transformation signature;
            not used here.

    Returns:
        A new list of ``(toknum, tokval)`` pairs.
    """
    result = []
    last_toknum = None
    for toknum, tokval in tokens:
        is_choose = (
            last_toknum == NUMBER
            and toknum == NAME
            and len(tokval) > 1
            and tokval[0] in ('c', 'C')
            and tokval[1:].isdigit()
        )
        if is_choose:
            # The previous token is the NUMBER operand; wrap it.
            n_token = result.pop()
            result.extend([
                (NAME, 'binomial'), (OP, '('),
                n_token,
                (OP, ','),
                (NUMBER, tokval[1:]),
                (OP, ')'),
            ])
            # The last emitted token is now ')'.  Record that, otherwise a
            # directly following 'C<digits>' name would be treated as if it
            # still followed a NUMBER and would wrap the ')' token.
            last_toknum = OP
        else:
            result.append((toknum, tokval))
            last_toknum = toknum
    return result

In [6]:
# Unicode operator characters and the ASCII operators that replace them.
operator_dict = {
    '\u00f7': '/',  # DIVISION SIGN
    '\u00d7': '*',  # MULTIPLICATION SIGN
}
def unicode_operator(tokens, local_dict, global_dict):
    """Token transformation replacing Unicode operators with ASCII ones.

    Any token whose value is a key of ``operator_dict`` becomes an OP token
    carrying the mapped value; every other token is copied as is.
    ``local_dict`` and ``global_dict`` are not used.

    Returns a new list of ``(toknum, tokval)`` pairs.
    """
    return [
        (OP, operator_dict[tokval]) if tokval in operator_dict else (toknum, tokval)
        for toknum, tokval in tokens
    ]

In [7]:
def reject_symbols(symbols):
    """Build a token transformation that refuses the given names.

    The returned transformation raises ``NameError`` as soon as it meets a
    NAME token whose value is contained in ``symbols``; otherwise it returns
    the ``tokens`` object it was given, unmodified.
    """
    def transformation(tokens, local_dict, global_dict):
        forbidden = any(
            toknum == NAME and tokval in symbols
            for toknum, tokval in tokens
        )
        if forbidden:
            raise NameError()
        return tokens
    return transformation

In [8]:
def get_transformations(splittable_symbols):
    """Assemble the tuple of parser transformations passed to ``parse_expr``.

    The tuple is, in order: ``combinatorial_notation``, ``unicode_operator``,
    SymPy's ``standard_transformations``, then ``convert_xor``, a
    ``split_symbols_custom`` transformation, ``implicit_multiplication``,
    ``implicit_application`` and ``function_exponentiation``.

    The predicate given to ``split_symbols_custom`` accepts a symbol name only
    when every one of its characters is in ``splittable_symbols``.
    """
    def splittable(symbol):
        return set(symbol) <= set(splittable_symbols)

    leading = (combinatorial_notation, unicode_operator)
    trailing = (
        convert_xor,
        split_symbols_custom(splittable),
        implicit_multiplication,
        implicit_application,
        function_exponentiation,
    )
    return leading + standard_transformations + trailing

In [9]:
def all_symbols(expr):
    """Yield ``str(node)`` for each node of ``expr`` flagged as a symbol.

    Nodes are visited with ``postorder_traversal``; a node is reported when
    it has a truthy ``is_symbol`` attribute.  A symbol occurring several
    times in the expression may therefore be yielded several times.
    """
    for node in postorder_traversal(expr):
        if getattr(node, 'is_symbol', False):
            yield str(node)

In [10]:
def all_values(expr):
    """Yield the numeric value of every sub-expression of ``expr``.

    Each node visited by ``postorder_traversal`` is evaluated with
    ``evalf()``.  When the result reports ``is_number``, a pair
    ``(value, is_leaf)`` is yielded, where ``is_leaf`` is True when the node
    has no ``args``.
    """
    for node in postorder_traversal(expr):
        value = node.evalf()
        if not value.is_number:
            continue
        is_leaf = not node.args
        yield value, is_leaf

In [11]:
def parse(text, splittable_symbols=set(), local_dict={name: Symbol(name) for name in ('x', 'y', 'z', 'A', 'B', 'C')}):
    """Parse ``text`` into an unevaluated SymPy expression.

    Args:
        text: the candidate expression string.
        splittable_symbols: characters that multi-letter names may be split
            into (forwarded to ``get_transformations``).
        local_dict: name -> object mapping given to ``parse_expr``.  It is
            copied before use, so neither the caller's dict nor the shared
            default is modified.

    Returns:
        The expression produced by ``parse_expr(..., evaluate=False)``.

    Raises:
        NameError: when the parsed expression contains a symbol that fails
            the naming rules checked at the end of this function.
        Exception: whatever ``parse_expr`` raises for unparsable input.
    """
    from sympy import binomial, factorial, factorial2, Mul, Add
    # Work on a private copy: the entries added below (and anything the
    # parser itself stores) must not leak into the mutable default argument
    # or into a dict owned by the caller.
    local_dict = dict(local_dict)
    local_dict['binomial'] = lambda x, y: binomial(x, y, evaluate = False)
    local_dict['factorial'] = lambda x: factorial(x, evaluate = False)
    local_dict['factorial2'] = lambda x: factorial2(x, evaluate = False)
    # Save the originals before entering the try block so the finally clause
    # can always restore them.
    mul_identity = Mul.identity
    add_identity = Add.identity
    try:
        # NOTE(review): the identities are blanked only while parsing --
        # presumably so unevaluated products/sums keep explicit 1 / 0 terms;
        # confirm against the SymPy version in use.
        Mul.identity = None
        Add.identity = None
        expr = parse_expr(text, local_dict=local_dict, transformations=get_transformations(splittable_symbols), evaluate=False)
    finally:
        Mul.identity = mul_identity
        Add.identity = add_identity
    for symbol in all_symbols(expr):
        # Names such as 'p_n' (underscore-separated, no empty part) are accepted.
        if '_' in symbol and '' not in symbol.split('_'):
            continue
        if len(symbol) > 2:
            raise NameError('symbol name too long: %r' % symbol)
        # Two-character names are accepted only in letter+digit form, e.g. 'A1'.
        if len(symbol) == 2 and not symbol[-1].isdigit():
            raise NameError('two-character symbol must end in a digit: %r' % symbol)
    return expr

In [12]:
def try_parse(text, splittable_symbols=set(), local_dict=None):
    """Call ``parse`` and return its result, or ``None`` when parsing fails.

    Args:
        text: the candidate expression string.
        splittable_symbols: forwarded to ``parse``.
        local_dict: forwarded to ``parse`` when given; when ``None``,
            ``parse`` uses its own default.

    Returns:
        The parsed expression, or ``None`` if ``parse`` raised an exception.
    """
    kwargs = {'splittable_symbols': splittable_symbols}
    if local_dict is not None:
        kwargs['local_dict'] = local_dict
    try:
        return parse(text, **kwargs)
    except Exception:
        # Any parse/validation error means "not an expression".  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running extraction can still be interrupted.
        return None

In [13]:
def potential_expr(char):
    """Return True when ``char`` is a digit string or one of '+', '*', '/', '^'."""
    if char.isdigit():
        return True
    return char in ('+', '*', '/', '^')

In [14]:
def extract_exprs_from_line(line, splittable_symbols=set()):
    """Yield ``(start, end, expr)`` for the expressions found in ``line``.

    The line is scanned left to right.  At every character accepted by
    ``potential_expr`` a span containing that character is searched for:
    start positions are tried from the end of the previous match up to the
    current position, and for each start the end positions are tried from
    the end of the line downwards.  Span boundaries that would fall between
    two alphabetic characters are skipped.  The first span that
    ``try_parse`` accepts is yielded, and scanning resumes right after it.

    ``start`` and ``end`` are indices into ``line`` (``line[start:end]`` is
    the matched text); ``expr`` is the value returned by ``try_parse``.
    """
    if not line:
        # Ends the generator immediately; nothing is yielded for empty input.
        return []
    length = len(line)

    def inside_word(index):
        # True when cutting the line at ``index`` would split a run of letters.
        return line[index - 1].isalpha() and line[index].isalpha()

    def first_match(floor, anchor):
        # Search spans [begin, stop) with floor <= begin <= anchor < stop.
        for begin in range(floor, anchor + 1):
            if begin > floor and inside_word(begin):
                continue
            for stop in range(length, anchor, -1):
                if stop < length and inside_word(stop):
                    continue
                parsed = try_parse(line[begin:stop], splittable_symbols)
                if parsed is not None:
                    return begin, stop, parsed
        return None

    cursor = floor = 0
    while cursor < length:
        hit = first_match(floor, cursor) if potential_expr(line[cursor]) else None
        if hit is None:
            cursor += 1
            continue
        yield hit
        # Continue after the match; later spans may not start before it.
        cursor = floor = hit[1]

In [15]:
def extract_exprs_from_text(text, splittable_symbols=set(), delimiter=re.compile(r'(=|\n|,|>|<)')):
    """Yield ``(start, end, expr)`` for the expressions found in ``text``.

    ``text`` is split with ``delimiter``; because the default pattern has a
    capturing group, the delimiters themselves are part of the split result.
    Non-empty pieces that do not match ``delimiter`` are searched with
    ``extract_exprs_from_line``, and the positions it reports are shifted by
    the combined length of all preceding pieces so they index into ``text``.
    """
    offset = 0
    for piece in delimiter.split(text):
        is_content = len(piece) > 0 and delimiter.match(piece) is None
        if is_content:
            for start, end, expr in extract_exprs_from_line(piece, splittable_symbols):
                yield offset + start, offset + end, expr
        offset += len(piece)

In [16]:
def split_text_and_expr(text, splittable_symbols=set()):
    """Partition ``text`` into alternating plain-text and expression segments.

    Yields ``(start, end, segment)`` triples in order of position.  For each
    expression reported by ``extract_exprs_from_text`` the segment is the
    parsed expression; the text between consecutive expressions (and after
    the last one) is yielded as a string slice of ``text``.  Empty gaps are
    not yielded.
    """
    cursor = 0
    for start, end, expr in extract_exprs_from_text(text, splittable_symbols):
        if start != cursor:
            yield cursor, start, text[cursor:start]
        yield start, end, expr
        cursor = end
    total = len(text)
    if cursor < total:
        yield cursor, total, text[cursor:]

In [17]:
def parse_rationale(text):
    """Split a rationale into parsed expressions and lower-cased word tokens.

    Two passes are made over ``text`` with ``split_text_and_expr``:

    1. The first pass collects the names of all symbols appearing in the
       expressions it finds; the single-character names are kept.
    2. The second pass repeats the split with those names passed as
       ``splittable_symbols``.

    Returns a list of ``(is_expr, payload)`` pairs in text order: ``payload``
    is the parsed expression when ``is_expr`` is True, otherwise the list of
    lower-cased token strings produced by ``tokenizer`` for that text segment.
    """
    seen = set()
    for _, _, piece in split_text_and_expr(text):
        if type(piece) is not str:
            seen.update(all_symbols(piece))
    single_letter = {name for name in seen if len(name) == 1}

    parsed = []
    for _, _, piece in split_text_and_expr(text, splittable_symbols=single_letter):
        if type(piece) is not str:
            parsed.append((True, piece))
        else:
            words = [str(token).lower() for token in tokenizer(piece)]
            parsed.append((False, words))

    return parsed

In [18]:
# Sample rationale: dollar amounts and percentages.
# NOTE: the following sample cells reassign `text`, so when the notebook is
# run top to bottom this value is replaced before `text` is used.
text = "At the non-discounted price, each friend would pay $240, as $ 1200 divided by 5 friends is $240 per friend. But if the bill is 15% off, then each friend would pay 15% less. 15% of $240 is $36, so each friend saves $36 and pays the remaining $204\nCORRECT OPTION:OPTION E"

In [19]:
# Sample rationale: 'nCk' combination notation and factorials.
text = "Total 12 different Gifts, and 4 children.\nThus any one child gets 12C3 gifts,\nthen the other child gets 9C3 gifts(12 total - 3 already given),\nthen the third one gets 6C3 gifts,\nand the last child gets 3C3 gifts.\nSince order in which each child gets the gift is not imp, thus, ans :\n12C3 * 9C3 * 6C3 * 3C3 = 12! / (3!)^4\nAns : C."

In [20]:
# Sample rationale: algebraic equations with implicit multiplication and
# equations run together without separators.
text = "spotting the pattern of equations both are in form of (X+C)^2 so\nA1= (x+2)^2A2= (2x-3)^2\nL1= x+2L2= 2x-3\nP1 = 4( x+2)P2=4(2x-3)\nP1+P2=32\n4( x+2) +4(2x-3)=32..............> X=3\nAnswer: E"

In [21]:
# Sample rationale: long products of sums, subscripted names such as 'p_n',
# and ellipsis characters.
text = "For all positive integers n and m,\nA(n) = (1 + 1/2 + 1/2^2)(1 + 1/3 + 3^2)(1 + 1/5 + 5^2)\u2026(1 + 1/p_n + 1/p_n^2), where p_n is the nth smallest prime number,\nnote:i think there's a typo in the above function, A(n) could be (1 + 1/2 + 1/2^2)(1 + 1/3 +1/3^2)(1 + 1/5 +1/5^2)\u2026(1 + 1/p_n + 1/p_n^2)\nB(m) = sum of the reciprocals of all the positive integers from 1 through m, inclusive.\nA(5), here 5 represents the 5th smallest prime number; the 5th smallest prime number is 11 {2, 3, 5, 7, 11, ...}\nA(5) = (1 + 1/2 + 1/2^2)(1 + 1/3 + 3^2)(1 + 1/5 + 5^2)(1 + 1/7 + 1/7^2)(1 + 1/11 + 11^2)\nThe distributed expansion of A(5) = 1 + 1/2 + 1/3 + 1/4 + 1/5 + 1/6 +1/7 + 1/9+ 1/10 + 1/11 +1/12 + 1/14+ ...\nmissing numbers are 1/8, 1/13, 1/16, ....\nB(25) = (1 + 1/2 + 1/3 + 1/4 + ... + 1/16 + 1/17 + ... + 1/25)\nhere the largest reciprocal is 1 and the reciprocals are arranged in descending order based on their values\nThe largest reciprocal that present in B(25) but not in A(5) is 1/3\nAnswer A"

In [22]:
# Sample rationale: a probability written with 'nCk' terms inside a fraction.
text = "The required probability C= probability of choosing 6 balls out of the total 8 in such a way that we remove 4 out of 5 white and 2 out of 3 blue balls.\nWays to select 6 out of total 8 = 8C6\nWays to select 4 out of 5 white balls = 5C4\nWays to select 2 out of 3 blue balls = 3C2\nThus the required probability = (5C4*3C2)/8C6 = 15/28.\nD is thus the correct answer."

In [23]:
# Sample rationale: dash runs used as arrows ("- - -----"), double-arrow
# characters (U+21D2) and garbled equations.  This is the last assignment
# before `text` is used by the next cell.
text = "M + D + I = 74 - - ----- (1)\n(D + I) - M = 46 - - - (2)\nI = 410410 D \u21d2\u21d2 5I = 2D \u21d2\u21d2 I = 2D/5 - - - (3)\nAdding (1) and (2) we get 2D + 2I = 120\nSubstituting the value of I in the above equation,\n2D+2(2D5)=1202D+2(2D5)=120\n\u21d2\u21d2 14D = 600\n\u21d2\u21d2 D = 300/7 = 42.8\nAns:B"

In [24]:
# Run the current `text` sample through the pipeline: print the normalised
# text, then show the (is_expr, payload) segments as the cell result.
rewritten_text = rewrite_with_tokenization(text)
print(rewritten_text)
parse_rationale(rewritten_text)

M + D + I = 74 →(1 ) 
 ( D + I ) - M = 46 →(2 ) 
 I = 410410 D ⇒⇒ 5I = 2D ⇒⇒ I = 2D/5 →(3 ) 
 Adding ( 1 ) and ( 2 ) we get 2D + 2I = 120 
 Substituting the value of I in the above equation , 
 2D+2(2D5)=1202D+2(2D5)=120 
 ⇒⇒ 14D = 600 
 ⇒⇒ D = 300/7 = 42.8 
 Ans : B


[(True, D + M + I),
 (False, ['=']),
 (True, 74),
 (False, ['→']),
 (True, 1),
 (False, ['\n']),
 (True, D - M + I),
 (False, ['=']),
 (True, 46),
 (False, ['→']),
 (True, 2),
 (False, ['\n ', 'i', '=']),
 (True, 410410*D),
 (False, ['⇒⇒']),
 (True, 5*I),
 (False, ['=']),
 (True, 2*D),
 (False, ['⇒⇒', 'i', '=']),
 (True, 2*D/5),
 (False, ['→']),
 (True, 3),
 (False, ['\n ', 'adding']),
 (True, 1),
 (False, ['and']),
 (True, 2),
 (False, ['we', 'get']),
 (True, 2*D + 2*I),
 (False, ['=']),
 (True, 120),
 (False,
  ['\n ',
   'substituting',
   'the',
   'value',
   'of',
   'i',
   'in',
   'the',
   'above',
   'equation',
   ',',
   '\n']),
 (True, 2*D + 2*2*D5),
 (False, ['=']),
 (True, 1202*D + 2*2*D5),
 (False, ['=']),
 (True, 120),
 (False, ['\n ', '⇒⇒']),
 (True, 14*D),
 (False, ['=']),
 (True, 600),
 (False, ['\n ', '⇒⇒', 'd', '=']),
 (True, 300/7),
 (False, ['=']),
 (True, 42.8000000000000),
 (False, ['\n ', 'ans', ':', 'b'])]

In [25]:
# List (value, is_leaf) for every numeric sub-expression of an 'nCk' fraction.
list(all_values(parse('(5C4*3C2)/8C6')))

[(5.00000000000000, True),
 (4.00000000000000, True),
 (5.00000000000000, False),
 (3.00000000000000, True),
 (2.00000000000000, True),
 (3.00000000000000, False),
 (8.00000000000000, True),
 (6.00000000000000, True),
 (28.0000000000000, False),
 (-1.00000000000000, True),
 (0.0357142857142857, False),
 (0.535714285714286, False)]

In [26]:
# Same listing for the plain fraction 15/28, for comparison with the cell above.
list(all_values(parse('15/28')))

[(15.0000000000000, True),
 (28.0000000000000, True),
 (-1.00000000000000, True),
 (0.0357142857142857, False),
 (0.535714285714286, False)]

In [27]:
# Sample rationale: factorial notation, including '!' separated from its
# operand by a space.
text= "be 1*3!*2! + 1*3!*2! + 1*3!*2! + 1*3!*2! = 4 *1*3! *2 !\nSo probability is (4! * 2! ) / 5! = 2/5\nSolution 2:\nConsider C*E as 1 person ."

In [28]:
# List (value, is_leaf) for every numeric sub-expression of 1/2.
list(all_values(parse('1/2')))

[(1.00000000000000, True),
 (2.00000000000000, True),
 (-1.00000000000000, True),
 (0.500000000000000, False),
 (0.500000000000000, False)]

In [29]:
# Show the raw sample, its normalised form, and the parsed segments.
# The normalisation is computed twice here (once for printing, once for parsing).
print(text)
print(rewrite_with_tokenization(text))
parse_rationale(rewrite_with_tokenization(text))

be 1*3!*2! + 1*3!*2! + 1*3!*2! + 1*3!*2! = 4 *1*3! *2 !
So probability is (4! * 2! ) / 5! = 2/5
Solution 2:
Consider C*E as 1 person .
be 1 * 3!*2 ! + 1 * 3!*2 ! + 1 * 3!*2 ! + 1 * 3!*2 ! = 4 * 1 * 3 ! * 2 ! 
 So probability is ( 4 ! * 2 ! ) / 5 ! = 2/5 
 Solution 2 : 
 Consider C*E as 1 person .


[(False, ['be']),
 (True,
  factorial(2)*factorial(3) + factorial(2)*factorial(3) + factorial(2)*factorial(3) + factorial(2)*factorial(3)),
 (False, ['=']),
 (True, 4*factorial(2)*factorial(3)),
 (False, ['\n ', 'so', 'probability', 'is']),
 (True, factorial(2)*factorial(4)/factorial(5)),
 (False, ['=']),
 (True, 2/5),
 (False, ['\n ', 'solution']),
 (True, 2),
 (False, [':', '\n ', 'consider']),
 (True, E*C),
 (False, ['as']),
 (True, 1),
 (False, ['person', '.'])]