In [1]:
import os
import numpy as np
import sympy as sp
import torch

In [2]:
cd ../

/home/kpgelvan/SymbolicMathematics


In [3]:
from src.utils import AttrDict
from src.envs import build_env
from src.model import build_modules

from src.utils import to_cuda
from src.envs.sympy_utils import simplify

### 

In [4]:
OPERATORS = {
        # Elementary functions
        'add': 2,
        'sub': 2,
        'mul': 2,
        'div': 2,
        'pow': 2,
        'rac': 2,
        'inv': 1,
        'pow2': 1,
        'pow3': 1,
        'pow4': 1,
        'pow5': 1,
        'sqrt': 1,
        'exp': 1,
        'ln': 1,
        'abs': 1,
        'sign': 1,
        # Trigonometric Functions
        'sin': 1,
        'cos': 1,
        'tan': 1,
        'cot': 1,
        'sec': 1,
        'csc': 1,
        # Trigonometric Inverses
        'asin': 1,
        'acos': 1,
        'atan': 1,
        'acot': 1,
        'asec': 1,
        'acsc': 1,
        # Hyperbolic Functions
        'sinh': 1,
        'cosh': 1,
        'tanh': 1,
        'coth': 1,
        'sech': 1,
        'csch': 1,
        # Hyperbolic Inverses
        'asinh': 1,
        'acosh': 1,
        'atanh': 1,
        'acoth': 1,
        'asech': 1,
        'acsch': 1,
        # Derivative
        'derivative': 2,
        # custom functions
        'f': 1,
        'g': 2,
        'h': 3,
    }

symbols = ['I', 'INT+', 'INT-', 'INT', 'FLOAT', '-', '.', '10^', 'Y', "Y'", "Y''"]

In [5]:
constants = ['pi', 'E']
variables = ['x', 'y', 'z', 't']
functions = ['f', 'g', 'h']
elements = [str(i) for i in range(-10, 10)]
coefficients = [f'a{i}' for i in range(10)]

In [6]:
no_child_symbols = constants + variables + functions + elements + coefficients

###

In [7]:
from tqdm import tqdm
import queue

def get_ancestors(exp_list, exp_len):
    q = queue.LifoQueue()
    q.put(-1)                            # so last element gets this parent but doesn't save it

    ancestors = {0: []}
    node2parent = {}
    levels = {0: -1}

    parent = 0
    for i in range(exp_len):
        op_now = exp_list[i]

        node2parent[i] = parent
        levels[i] = levels[parent] + 1

        if op_now in OPERATORS or op_now in symbols:   # <=> node has children
            if op_now in OPERATORS and OPERATORS[op_now] == 2:    # <=> node has 2 children
                q.put(i)
            parent = i
        elif op_now in no_child_symbols:
            if op_now.isdigit() and i + 1 < exp_len and exp_list[i + 1].isdigit():   # e.x. 18
                parent = i
            else:
                parent = q.get()
        else:
            print(op_now)
            #raise(NotFound)
            return False
        ancestors[i] = [i] + ancestors[node2parent[i]]

    return ancestors, levels

In [8]:
def get_path(i, j):
    if i == j:
        return "<self>"
    anc_i = set(ancestors[i])
      
    for node in ancestors[j][-(levels[i] + 1) :]:
        if node in anc_i:
            up_n = levels[i] - levels[node]
            down_n = levels[j] - levels[node]
            return str(up_n + 0.001 * down_n)

In [9]:
def get_ud_masks(ancestors, levels, exp_len):
    path_rels = []
    for i in range(exp_len):
        path_rels.append(" ".join([get_path(i, j) for j in range(exp_len)]))
    
    return path_rels

###

In [10]:
F_prefix  = ['mul', 'x', 'tan', 'mul', 'pow', 'x', 'INT-', '1', 'exp', 'x']

In [13]:
ancestors, levels = get_ancestors(F_prefix, len(F_prefix))
rel_matrix = get_ud_masks(ancestors, levels, len(F_prefix))
rel_matrix

['<self> 0.001 0.001 0.002 0.003 0.004 0.004 0.005 0.003 0.004',
 '1.0 <self> 1.001 1.002 1.003 1.004 1.004 1.005 1.003 1.004',
 '1.0 1.001 <self> 0.001 0.002 0.003 0.003 0.004 0.002 0.003',
 '2.0 2.001 1.0 <self> 0.001 0.002 0.002 0.003 0.001 0.002',
 '3.0 3.001 2.0 1.0 <self> 0.001 0.001 0.002 1.001 1.002',
 '4.0 4.001 3.0 2.0 1.0 <self> 1.001 1.002 2.001 2.002',
 '4.0 4.001 3.0 2.0 1.0 1.001 <self> 0.001 2.001 2.002',
 '5.0 5.001 4.0 3.0 2.0 2.001 1.0 <self> 3.001 3.002',
 '3.0 3.001 2.0 1.0 1.001 1.002 1.002 1.003 <self> 0.001',
 '4.0 4.001 3.0 2.0 2.001 2.002 2.002 2.003 1.0 <self>']

In [23]:
import jsonlines
import json

In [27]:
from collections import Counter

def _insert(iterable):
    words = []
    for w in iterable:
        words.append(w)
    word_count.update(words)

word_count = Counter()
with jsonlines.open("data/rel_matrix_valid.jsonl") as f:
    for line in tqdm(f):
        matrix = json.loads(line)
        new_matrix = [line.split() for line in matrix]
        for tokens in new_matrix:
            for elem in tokens:
                _insert(elem.split("_"))

9985it [00:06, 1439.19it/s]


In [39]:
len(word_count)

548

In [38]:
special_tokens = "unk"
num_spec_tokens = len(special_tokens.split("_"))
# -2 to reserve spots for PAD and UNK token
dict_size = 500
dict_size = dict_size - num_spec_tokens if dict_size and dict_size > num_spec_tokens else dict_size
most_common = word_count.most_common(dict_size)

In [40]:
values = np.array(list(word_count.values()))
keys = np.array(list(word_count.keys()))
idxs = np.argsort(values)[::-1]
for i, (key, value) in enumerate(zip(keys[idxs], values[idxs])):
    print(value, key, i)

215235 <self> 0
205250 0.001 1
205250 1.0 2
195265 0.002 3
195265 2.0 4
185280 0.003 5
185280 3.0 6
180218 2.002 7
165652 4.0 8
165652 0.004 9
157936 2.003 10
157936 3.002 11
149647 2.001 12
149647 1.002 13
135328 0.005 14
135328 5.0 15
126700 1.003 16
126700 3.001 17
126184 3.003 18
125676 1.001 19
110286 2.004 20
110286 4.002 21
98612 0.006 22
98612 6.0 23
93767 4.001 24
93767 1.004 25
75788 4.003 26
75788 3.004 27
68211 2.005 28
68211 5.002 29
63435 0.007 30
63435 7.0 31
61036 1.005 32
61036 5.001 33
42788 3.005 34
42788 5.003 35
37129 6.002 36
37129 2.006 37
35786 8.0 38
35786 0.008 39
35132 4.004 40
34561 1.006 41
34561 6.001 42
21927 3.006 43
21927 6.003 44
18021 2.007 45
18021 7.002 46
17859 0.009000000000000001 47
17859 9.0 48
17232 1.007 49
17232 7.001 50
16357 5.004 51
16357 4.005 52
10241 3.007 53
10241 7.003 54
8004 0.01 55
8004 10.0 56
7820 2.008 57
7820 8.002 58
7687 1.008 59
7687 8.001 60
7364 6.004 61
7364 4.006 62
6746 5.005 63
4343 3.008 64
4343 8.003 65
3315 0.011 66

In [42]:
values = np.array(list(word_count.values()))
keys = np.array(list(word_count.keys()))
idxs = np.argsort(values)[::-1]
vocab = []
for i, (key, value) in enumerate(zip(keys[idxs], values[idxs])):
    print(value, key, i)
    vocab.append(key)
with open("rel_vocab.txt", "w") as fout:
    fout.write("\n".join(vocab))

215235 <self> 0
205250 0.001 1
205250 1.0 2
195265 0.002 3
195265 2.0 4
185280 0.003 5
185280 3.0 6
180218 2.002 7
165652 4.0 8
165652 0.004 9
157936 2.003 10
157936 3.002 11
149647 2.001 12
149647 1.002 13
135328 0.005 14
135328 5.0 15
126700 1.003 16
126700 3.001 17
126184 3.003 18
125676 1.001 19
110286 2.004 20
110286 4.002 21
98612 0.006 22
98612 6.0 23
93767 4.001 24
93767 1.004 25
75788 4.003 26
75788 3.004 27
68211 2.005 28
68211 5.002 29
63435 0.007 30
63435 7.0 31
61036 1.005 32
61036 5.001 33
42788 3.005 34
42788 5.003 35
37129 6.002 36
37129 2.006 37
35786 8.0 38
35786 0.008 39
35132 4.004 40
34561 1.006 41
34561 6.001 42
21927 3.006 43
21927 6.003 44
18021 2.007 45
18021 7.002 46
17859 0.009000000000000001 47
17859 9.0 48
17232 1.007 49
17232 7.001 50
16357 5.004 51
16357 4.005 52
10241 3.007 53
10241 7.003 54
8004 0.01 55
8004 10.0 56
7820 2.008 57
7820 8.002 58
7687 1.008 59
7687 8.001 60
7364 6.004 61
7364 4.006 62
6746 5.005 63
4343 3.008 64
4343 8.003 65
3315 0.011 66

###

In [33]:
model_path = '../checkpoint.pth'
assert os.path.isfile(model_path)

In [34]:
params = params = AttrDict({

    # environment parameters
    'env_name': 'char_sp',
    'int_base': 10,
    'balanced': False,
    'positive': True,
    'precision': 10,
    'n_variables': 1,
    'n_coefficients': 0,
    'leaf_probs': '0.75,0,0.25,0',
    'max_len': 512,
    'max_int': 5,
    'max_ops': 15,
    'max_ops_G': 15,
    'clean_prefix_expr': True,
    'rewrite_functions': '',
    'tasks': 'prim_fwd',
    'operators': 'add:10,sub:3,mul:10,div:5,sqrt:4,pow2:4,pow3:2,pow4:1,pow5:1,ln:4,exp:4,sin:4,cos:4,tan:4,asin:1,acos:1,atan:1,sinh:1,cosh:1,tanh:1,asinh:1,acosh:1,atanh:1',

    # model parameters
    'cpu': False,
    'emb_dim': 1024,
    'n_enc_layers': 6,
    'n_dec_layers': 6,
    'n_heads': 4,
    'dropout': 0,
    'attention_dropout': 0,
    'sinusoidal_embeddings': False,
    'share_inout_emb': True,
    'reload_model': model_path,
    'max_relative_pos':0,
    'use_neg_dist':False
})

In [35]:
env = build_env(params)
x = env.local_dict['x']

In [1]:
modules = build_modules(env, params)
encoder = modules['encoder']
decoder = modules['decoder']

"modules = build_modules(env, params)\nencoder = modules['encoder']\ndecoder = modules['decoder']"

###

In [12]:
import sys
sys.path.append('src/envs/')
sys.path.append('src')

In [14]:
#from char_sp import prefix_to_infix, infix_to_sympy

In [15]:
#from utils import create_logger
#from utils import bool_flag
#from utils import timeout, TimeoutError
#from char_sp import prefix_to_infix, infix_to_sympy


In [16]:
f_prefix = ['sub', "Y'", 'pow', 'x', 'INT+', '2']
x1_prefix = env.clean_prefix(['sub', 'derivative', 'f', 'x', 'x'] + f_prefix)
x1_prefix
#x1 = torch.LongTensor(
#    [env.eos_index] +
#    [env.word2id[w] for w in x1_prefix] +
#    [env.eos_index]
#).view(-1, 1)
#len1 = torch.LongTensor([len(x1)])
#x1, len1 = to_cuda(x1, len1)


NameError: name 'env' is not defined

In [None]:
import json
import jsonlines

for set_name in ['train']:
    with open('data/prim_fwd.' + set_name, 'r') as expressions:
        with jsonlines.open('data/rel_matrix_'+set_name+'.jsonl', 'w') as rel_matrix_json:
            for i, line in tqdm(enumerate(expressions)):
                #print(line)
                qa = line.split('|')[1].split('\t')
                if len(qa) == 2:
                    q, a = qa
                else:
                    print(i,'is broken')
                    continue
                #print(q, ';', a)
                
                q = q.split()
                #a = a.split()
                
                ance_lev = get_ancestors(q, len(q))
                if len(ance_lev) == 2:
                    ancestors, levels = ance_lev
                else:
                    print(i, 'is broken')
                    continue
                rel_matrix_q = get_ud_masks(ancestors, levels, len(q))              

                #ancestors, levels = get_ancestors(a, len(a))
                #rel_matrix_a = get_ud_masks(ancestors, levels, len(a))
                
                rel_matrix_json.write(json.dumps(rel_matrix_q, indent=0))

4349497it [27:02, 1921.71it/s]

4349372 is broken


4478596it [28:35, 1413.35it/s]

4478366 is broken


7430119it [1:04:07, 1394.79it/s]

In [20]:
with jsonlines.open('data/rel_matrix_test.jsonl') as reader:
    for i, obj in enumerate(reader):
        matrix = json.loads(obj)
        new_matrix = [line.split() for line in matrix]
        print(new_matrix[0])
        print('gg wp')
        if i > 3:
            break

['<self>', '0.001', '0.002', '0.003', '0.003', '0.004', '0.004', '0.005']
gg wp
['<self>', '0.001', '0.002', '0.003', '0.003', '0.004']
gg wp
['<self>', '0.001', '0.002', '0.003', '0.003', '0.004', '0.004', '0.005']
gg wp
['<self>', '0.001', '0.002', '0.003', '0.004', '0.005', '0.004', '0.003', '0.004']
gg wp
['<self>', '0.001', '0.002', '0.003', '0.003', '0.004', '0.004', '0.005']
gg wp
