In [3]:
import sys
sys.path.append("..")
sys.path.append("../../eqnet")

import gzip
import numpy as np
from expemb import EquivExpTokenizer

In [4]:
# Shared tokenizer instance; get_n_operators reads its `operators` attribute
# to decide which tokens count as operators.
tokenizer = EquivExpTokenizer()

In [5]:
def get_n_operators(prefix_eq):
    """Count the operator tokens in a space-separated expression string.

    The string is split on single spaces; a token is counted when it is a
    member of the module-level ``tokenizer.operators``.
    """
    n_operators = 0
    for tok in prefix_eq.split(" "):
        if tok in tokenizer.operators:
            n_operators += 1
    return n_operators

def get_seq_len(prefix_eq):
    """Return the number of tokens in a whitespace-separated expression string.

    Splitting with no explicit separator means an empty or blank string has
    length 0 (``"".split(" ")`` would yield ``[""]`` and report 1), and runs
    of repeated whitespace do not produce phantom empty tokens.
    """
    return len(prefix_eq.split())

def get_data_stats(filepath):
    r"""Summarise operator counts and sequence lengths for one dataset file.

    Reads the gzip-compressed text file at ``filepath`` line by line, treating
    every non-blank line as one expression, and prints how many expressions
    were read.

    Args:
        filepath: Path to a gzip-compressed text file with one expression per
            line.

    Returns:
        A 2-tuple of LaTeX math strings of the form ``$<mean> \pm <std>$``
        (two decimal places each): the first for the number of operators per
        expression, the second for the sequence length per expression.
    """
    n_ops_list = []
    seq_len_list = []

    with gzip.open(filepath, "rt") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line is not an expression; counting it would add a
                # spurious entry to both statistics.
                continue
            n_ops_list.append(get_n_operators(line))
            seq_len_list.append(get_seq_len(line))

    print(f"Number of expressions: {len(seq_len_list):,}")

    def _latex_mean_std(values):
        # `:.2f` already rounds to two decimals, so no separate round() call
        # is needed. The backslash is escaped explicitly: a bare "\p" is an
        # invalid escape sequence in a non-raw string literal.
        return f"${np.mean(values):.2f} \\pm {np.std(values):.2f}$"

    return _latex_mean_std(n_ops_list), _latex_mean_std(seq_len_list)

In [6]:
# Compute the formatted "mean \pm std" strings for each data split.
train_n_ops, train_seq_len = get_data_stats("../data/autoenc_5_ops.train.gz")
val_n_ops, val_seq_len = get_data_stats("../data/equivexp_5_ops.valid.gz")
test_n_ops, test_seq_len = get_data_stats("../data/equivexp_5_ops.test.gz")

# Print the body of a LaTeX table. Every backslash is escaped explicitly so
# that no string literal relies on an invalid escape sequence such as "\#".
print("\\toprule")
print("Data Split & \\# Operators & Sequence Length \\\\")
print("\\midrule")
for split_name, n_ops, seq_len in (
    ("Training", train_n_ops, train_seq_len),
    ("Validation", val_n_ops, val_seq_len),
    ("Test", test_n_ops, test_seq_len),
):
    print(f"{split_name} & {n_ops} & {seq_len} \\\\")
print("\\bottomrule")

Number of expressions: 2,744,824
Number of expressions: 2,000
Number of expressions: 5,000
\toprule
Data Split & \# Operators & Sequence Length \\
\midrule
Training & $5.68 \pm 1.32$ & $16.19 \pm 6.28$ \\
Validation & $5.60 \pm 1.29$ & $15.03 \pm 4.31$ \\
Test & $5.98 \pm 1.22$ & $16.20 \pm 4.13$ \\
\bottomrule


In [9]:
import numpy as np

In [10]:
# NOTE(review): `n_ops_list` is not assigned at notebook scope in the visible
# cells (it only appears as a local inside get_data_stats), so this cell
# presumably relied on leftover kernel state -- confirm, or remove the cell.
np.mean(n_ops_list)

5.679241729160048

In [11]:
# NOTE(review): `n_ops_list` is not assigned at notebook scope in the visible
# cells (it only appears as a local inside get_data_stats), so this cell
# presumably relied on leftover kernel state -- confirm, or remove the cell.
np.std(n_ops_list)

1.315812494489815

In [12]:
# NOTE(review): `n_ops_list` is not assigned at notebook scope in the visible
# cells (it only appears as a local inside get_data_stats), so this cell
# presumably relied on leftover kernel state -- confirm, or remove the cell.
n_ops_list[:10]

[6, 6, 5, 6, 5, 6, 5, 7, 6, 7]