In [1]:
%cd "/home/kera/workspace/Transformer-GB"

/data/kera/workspace/Transformer-GB


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
import yaml 
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import argparse
import re 
from transformers import AutoModelForSeq2SeqLM, AutoConfig

from transformers import PreTrainedTokenizerFast
from src.loader.data import load_data
from src.loader.checkpoint import load_trained_bag
from src.evalution.evaluators import eval_prediction

load('src/data/symbolic_utils.sage')
load('src/data/gbdataset.sage')

-f


# Katsura-n with tricks

### Multiplication (Tristan's idea)

In [162]:
field = 'F31'
field_ = GF(31)
n = 2
ring = PolynomialRing(field_, 'x', n, order='lex')


F_katsura, G_katsura = load_katsura(field, n, return_ring=False)
# F, G = ideal(F).basis, ideal(F).groebner_basis()
# change_map = coordinate_change_map(F.ring())
# F = coordinate_change(F, change_map)
# F, G = ideal(F).basis, ideal(F).groebner_basis()

F, G = matrix(F_katsura).T, matrix(G_katsura).T


builder = GBDataset_Builder(ring, 
                            max_rand_coeff=3, 
                            max_coeff=-1,
                            max_size=6, 
                            max_degree=4, 
                            max_num_terms=None, 
                            min_num_terms=1, 
                            max_Gdegree=4, 
                            max_num_Gterms=None, 
                            num_duplicants=1, 
                            density=1.0, 
                            with_permutation=True)


num_vars = F.nrows()
# m = num_vars
m = num_vars 
max_degree = 3
density = 1.0
d = None
# m = randint(0, max_size-num_vars) + num_vars
# d = randint(min_num_terms, max_num_terms) if max_num_terms is not None else None 
A = builder.random_umut_matrix(m, num_vars,  degree=max_degree, terms=d, density=density, num_bound=builder.max_rand_coeff)
U = builder.random_umut_matrix(m, m, degree=max_degree, terms=d, density=density, num_bound=builder.max_rand_coeff)
if builder.with_permutation:
    P = random_permutation_matrix(m) 
    U = U * P

F_new = U * A * F

In [161]:
for g in G_katsura: print(g)

x0 + 2*x1 + 2*x2 - 1
x1^2 - 3*x1
x1*x2 + 2*x1
x2^2 + 2*x2


In [151]:
F

[       x0 + 2*x1 + 2*x2 - 1]
[x0^2 - x0 + 2*x1^2 + 2*x2^2]
[     2*x0*x1 + 2*x1*x2 - x1]

In [163]:
load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
bag = load_trained_bag(load_dir, from_checkpoint=True)
model = bag['model'] 
tokenizer = bag['tokenizer']
params = bag['params']

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [164]:
F = [f[0] for f in F_new]
G = [g[0] for g in G]

In [166]:
F, G = ideal(F).basis, ideal(F).groebner_basis()

F_prefix = [poly_to_prefix(f) for f in F]
G_prefix = [poly_to_prefix(g) for g in G]
x_text = ' [SEP] '.join(F_prefix)
y_text = ' [SEP] '.join(G_prefix)

num_beams = 1
x = tokenizer(x_text, return_tensors='pt')['input_ids'].cuda()
# y = tokenizer(y_text, return_tensors='pt')['input_ids'].cuda()
output_ids = model.generate(x, max_length=1000, num_beams=num_beams, do_sample=False)
z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
G_pred = [prefix_to_poly(zt, F.ring()) for zt in z_text[0].split('[SEP]')]

print(f'{"F":6}: {F}')
print(f'{"G":6}: {G}')
print(f'{"G_pred":6}: {G_pred}')

F     : [-10*x0^2*x1^2 - x0^2 + 2*x0*x1^3 - x0*x1^2 + x0 + 13*x1^4 - 13*x1^3 - 10*x1^2 - 4*x1, 4*x0^4*x1^2 - 12*x0^4 - 14*x0^3*x1^4 + 11*x0^3*x1^3 - 5*x0^3*x1^2 + 8*x0^3*x1 - 7*x0^3 - 13*x0^2*x1^5 - 4*x0^2*x1^4 - 13*x0^2*x1^3 + 5*x0^2*x1^2 + 6*x0^2*x1 - 12*x0^2 + 4*x0*x1^6 + 12*x0*x1^5 - 4*x0*x1^4 - 3*x0*x1^3 - 3*x0*x1^2 - 14*x0*x1 + x0 + 10*x1^7 - 10*x1^6 + 9*x1^5 - 15*x1^4 + 2*x1 - 1]
G     : [x0 + 2*x1 - 1, x1^2 + 10*x1]
G_pred: [x0 + 2*x1 - 1, x1^2 + 3*x1]


### Coordinate change

In [96]:
def coordinate_change_map(ring, num_bound=None):
    while True:
        n = ring.ngens()
        if num_bound is not None:
            P = matrix.random(ring.base_ring(), n, n, num_bound=num_bound)
        else:
            P = matrix.random(ring.base_ring(), n, n)
            
        if P.rank()  == n: break

    gens = ring.gens()
    trans_gens = matrix(gens) * P
    trans_gens = trans_gens[0]
    change_map = dict(zip(gens, trans_gens))
    return change_map

def coordinate_change(F, change_map):
    return [f.subs(change_map) for f in F]



In [11]:
def experiement_coordinate_change(field, n, load_dir, num_beams=1):
    bag = load_trained_bag(load_dir, from_checkpoint=True)
    model = bag['model'] 
    tokenizer = bag['tokenizer']
    params = bag['params']
    
    F, G = load_katsura(field, n)

    # tricks
    change_map = coordinate_change_map(F.ring())
    F = coordinate_change(F, change_map)
    # G = coordinate_change(G, change_map)

    F, G = ideal(F).basis, ideal(F).groebner_basis()

    F_prefix = [poly_to_prefix(f) for f in F]
    G_prefix = [poly_to_prefix(g) for g in G]
    x_text = ' [SEP] '.join(F_prefix)
    y_text = ' [SEP] '.join(G_prefix)

    x = tokenizer(x_text, return_tensors='pt')['input_ids'].cuda()
    # y = tokenizer(y_text, return_tensors='pt')['input_ids'].cuda()
    output_ids = model.generate(x, max_length=1000, num_beams=num_beams, do_sample=False)
    z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    G_pred = [prefix_to_poly(zt, F.ring()) for zt in z_text[0].split('[SEP]')]
    
    print(f'{"F":6}: {F}')
    print(f'{"G":6}: {G}')
    print(f'{"G_pred":6}: {G_pred}')

In [12]:
field = 'QQ'
for n in range(2, 6):
    print(f' field = {field}, n = {n}')
    load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
    experiement_coordinate_change(field, n, load_dir)
    print('\n')

 field = QQ, n = 2


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.
  hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask)


F     : [5/2*x0 + 2*x1 - 1, 9/4*x0^2 + 3*x0*x1 - 1/2*x0 + 3/2*x1^2 - x1]
G     : [x0 + 4/5*x1 - 2/5, x1^2 - 14/9*x1 + 8/27]
G_pred: [x0, x1 - 1]


 field = QQ, n = 3


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


F     : [-4*x0 + 5*x1 - 1/2*x2 - 1, 6*x0^2 - 2*x0*x1 + 2*x0*x2 + 2*x0 + 17/2*x1^2 + 2*x1*x2 + 11/4*x2^2 - 1/2*x2, 4*x0^2 - 6*x0*x1 + 2*x0*x2 + x0 + 2*x1^2 - 3*x1*x2 - 1/2*x1 - 2*x2^2 + x2]
G     : [x0 - 791945/28428*x2^3 + 200255/9476*x2^2 - 99419/28428*x2 + 3211/14214, x1 - 158389/7107*x2^3 + 40051/2369*x2^2 - 41189/14214*x2 - 137/7107, x2^4 - 4/7*x2^3 + 67/847*x2^2 - 10/9317*x2 - 8/102487]
G_pred: [x0 + 1/2*x2, x1 + 3/2*x2 - 1, x2^2 + 1/2*x2]


 field = QQ, n = 4


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


F     : [3*x0 + 1/2*x1 - 2*x2 + x3 - 1, 3*x0^2 + 5*x0*x1 + 4*x0*x2 - x0 + 17/4*x1^2 + 6*x1*x2 - 1/2*x1 + 8*x2^2 + 6*x2*x3 - 2*x2 + 17/2*x3^2 - 2*x3, -2*x0*x1 - 6*x0*x2 + 3*x0*x3 - 2*x1^2 - 5*x1*x2 + 4*x1*x3 - 4*x2^2 + 4*x2*x3 + x2 + 3*x3^2 - x3, 2*x0^2 + 3*x0*x1 + 4*x0*x2 + 2*x0*x3 - x0 + x1^2 + 6*x1*x2 + x1*x3 - x1 + 3*x2^2 - 7*x2*x3 - 4*x3^2 + x3]
G     : [x0 + 1250854445864254762356134604848983606869/496960966998134478251388980167235*x3^7 - 976262319631636825880598786954804774087/496960966998134478251388980167235*x3^6 + 484216208749209173306912764929239849613/993921933996268956502777960334470*x3^5 - 30785563842459865907832276322083561881/1192706320795522747803333552401364*x3^4 - 43837053162493566398259414123705226363/8945297405966420608525001643010230*x3^3 + 1706148202972198538130734012522008276/4472648702983210304262500821505115*x3^2 + 5657834743003737065305511512863019/298176580198880686950833388100341*x3 - 570295325014615813485407093333406/496960966998134478251388980167235, x1 - 

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


F     : [-6*x0 + 10*x2 - x3 - x4 - 1, 18*x0^2 - 20*x0*x2 + 4*x0*x3 + 12*x0*x4 + 3/2*x1^2 - 2*x1*x2 + 4*x1*x4 - x1 + 35/2*x2^2 + 8*x2*x3 - 14*x2*x4 - x2 + 7*x3^2 - 6*x3*x4 - x3 + 7*x4^2 - x4, 8*x0^2 - 3*x0*x1 - 18*x0*x2 - 2*x0*x3 + 2*x0*x4 + 2*x0 + 7/2*x1*x2 + 2*x1*x3 - 3*x1*x4 + 8*x2^2 - 7*x2*x3 + 4*x2*x4 - 2*x2 - 4*x3^2 + 6*x3*x4 - 4*x4^2 + x4, -2*x0*x1 - 19*x0*x2 - 6*x0*x3 + 2*x0 - x1*x2 - 2*x1*x3 + x1*x4 + 13*x2^2 + 2*x2*x3 - 10*x2*x4 - 1/2*x2 - 6*x3*x4 + x3 + 3*x4^2, 4*x0^2 - 6*x0*x2 + 8*x0*x3 - 2*x0*x4 - x1^2 + 3*x1*x2 + x1*x3 - 3*x1*x4 + 1/2*x1 + 6*x2^2 - 2*x2*x3 + 5*x2*x4 - 2*x2 + 2*x3^2 + 4*x3*x4 - x3 - 4*x4^2 + x4]
G     : [x0 - 1284066670753960016830786909529263154900081255537108874460163210398338106817171019196708372680397129008837672772502809895534968490191817576985124416621069988957545013927965069706075352598415450024902451109709759661854790351092670460335790891656378752/4325566653955284146809134365359842873113711262600589971047639849328962653975695358700216317106918742104

In [13]:
field = 'F7'
for n in range(2, 6):
    print(f' field = {field}, n = {n}')
    # load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
    load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field=F7'
    experiement(field, n, load_dir, num_beams=1)
    print('\n')

 field = F7, n = 2


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 - 1, x0^2 - x0 + 2*x1^2]
[x0 + 2*x1 - 1, x1^2 + 2*x1]
[x0 + 2*x1 - 1, x1^2 - 2*x1]


 field = F7, n = 3


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2, 2*x0*x1 + 2*x1*x2 - x1]
[x0 + 2*x1 + 2*x2 - 1, x1^2 - 3*x1, x1*x2 + 2*x1, x2^2 + 2*x2]
[x0 + 2*x2 - 1, x1, x2^2]


 field = F7, n = 4


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3, 2*x0*x2 + x1^2 + 2*x1*x3 - x2]
[x0 - 2*x3^6 + x3^5 - 2*x3^4 + 3*x3^3 + 3*x3^2 - 2*x3 - 1, x1 - 3*x3^6 + 2*x3^4 + x3^3 + 2*x3^2 - 3*x3, x2 - 3*x3^6 + 3*x3^5 - x3^4 + x3^3 - 2*x3, x3^7 + 3*x3^6 - x3^5 + 3*x3^4 + x3^3 - x3^2 + 3*x3]
[x0 + 2*x3 - 1, x1, x2, x3^2]


 field = F7, n = 5


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 + 2*x4 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2 + 2*x4^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3 + 2*x3*x4, 2*x0*x2 + x1^2 + 2*x1*x3 + 2*x2*x4 - x2, 2*x0*x3 + 2*x1*x2 + 2*x1*x4 - x3]
[x0 + 2*x2 - 2*x4^13 + 2*x4^9 + x4^8 + 2*x4^7 - x4^6 - x4^4 - x4^3 + 3*x4^2 + x4 - 1, x1 - 2*x4^13 + 2*x4^12 - x4^11 + 3*x4^8 + x4^6 + 2*x4^5 - x4^4 + 2*x4^3 + 2*x4^2 - x4, x2^2 - 3*x2 - x4^13 + 2*x4^11 - 2*x4^10 - x4^9 - 3*x4^6 - x4^5 - 2*x4^4 + 2*x4^3 + 2*x4^2 - 3*x4, x2*x4 + 2*x2 + x4^12 + x4^11 - x4^10 + x4^9 + x4^8 + 2*x4^7 - x4^6 - x4^5 + 3*x4^4 + 2*x4^3 + 3*x4^2 + 3*x4, x3 + 3*x4^13 - 2*x4^12 + x4^11 - x4^9 - x4^7 + 3*x4^6 - 2*x4^5 - 2*x4^4 + 2*x4^3 - 2*x4, x4^14 - x4^13 - 2*x4^12 + x4^11 - 3*x4^10 - x4^9 + x4^8 - 2*x4^7 - x4^6 + x4^5 - 3*x4^4 + 3*x4^3 - 3*x4^2 - x4]
[x0 + 2*x4^2, x1, x2 + 2*x4, x3, x4^4]




# Katsura-n

In [6]:
def mylatex(f):
    return latex(f)

def load_katsura(field, n, return_ring=False):
    if field == 'QQ':
        field_ = QQ 
    if field[0] == 'F' and field[1:].isdigit():
        field_ = GF(int(field[1:]))
        
    ring = PolynomialRing(field_, n, 'x', order='lex')
    I = sage.rings.ideal.Katsura(ring, n)  
    F = I.basis
    G = I.groebner_basis()
    return (F, G, ring) if return_ring else (F, G)


def experiement(field, n, load_dir, num_beams=1):
    bag = load_trained_bag(load_dir, from_checkpoint=True)
    model = bag['model'] 
    tokenizer = bag['tokenizer']
    params = bag['params']
    
    F, G = load_katsura(field, n)
    F_prefix = [poly_to_prefix(f) for f in F]
    G_prefix = [poly_to_prefix(g) for g in G]
    x_text = ' [SEP] '.join(F_prefix)
    y_text = ' [SEP] '.join(G_prefix)

    x = tokenizer(x_text, return_tensors='pt')['input_ids'].cuda()
    # y = tokenizer(y_text, return_tensors='pt')['input_ids'].cuda()
    output_ids = model.generate(x, max_length=1000, num_beams=num_beams, do_sample=False)
    z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    G_pred = [prefix_to_poly(zt, F.ring()) for zt in z_text[0].split('[SEP]')]
    
    print(F)
    print(G)
    print(G_pred)

        

In [42]:
field = 'QQ'
for n in range(2, 6):
    print(f' field = {field}, n = {n}')
    load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
    experiement(field, n, load_dir)
    print('\n')

 field = QQ, n = 2


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 - 1, x0^2 - x0 + 2*x1^2]
[x0 + 2*x1 - 1, x1^2 - 1/3*x1]
[x0 + 2*x1 - 1, x1^2]


 field = QQ, n = 3


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2, 2*x0*x1 + 2*x1*x2 - x1]
[x0 - 60*x2^3 + 158/7*x2^2 + 8/7*x2 - 1, x1 + 30*x2^3 - 79/7*x2^2 + 3/7*x2, x2^4 - 10/21*x2^3 + 1/84*x2^2 + 1/84*x2]
[x0 - 1, x1, x2]


 field = QQ, n = 4


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3, 2*x0*x2 + x1^2 + 2*x1*x3 - x2]
[x0 - 53230079232/1971025*x3^7 + 10415423232/1971025*x3^6 + 9146536848/1971025*x3^5 - 2158574456/1971025*x3^4 - 838935856/5913075*x3^3 + 275119624/5913075*x3^2 + 4884038/5913075*x3 - 1, x1 - 97197721632/1971025*x3^7 + 73975630752/1971025*x3^6 - 12121915032/1971025*x3^5 - 2760941496/1971025*x3^4 + 814792828/1971025*x3^3 - 1678512/1971025*x3^2 - 9158924/1971025*x3, x2 + 123812761248/1971025*x3^7 - 79183342368/1971025*x3^6 + 7548646608/1971025*x3^5 + 3840228724/1971025*x3^4 - 2024910556/5913075*x3^3 - 132524276/5913075*x3^2 + 30947828/5913075*x3, x3^8 - 8/11*x3^7 + 4/33*x3^6 + 131/5346*x3^5 - 70/8019*x3^4 + 1/3564*x3^3 + 5/42768*x3^2 - 1/128304*x3]
[x0 - 1, x1, x2 - 1, x3]


 field = QQ, n = 5


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 + 2*x4 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2 + 2*x4^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3 + 2*x3*x4, 2*x0*x2 + x1^2 + 2*x1*x3 + 2*x2*x4 - x2, 2*x0*x3 + 2*x1*x2 + 2*x1*x4 - x3]
[x0 - 308110355965692339884186470634147686371505891532807871811902272247524425728/2300416163855081226283764659441213559936734646783690132119987*x4^15 + 2745916744964843340073319972931317434499748758333498363131468420213845884928/16102913146985568583986352616088494919557142527485830924839909*x4^14 - 1331408292613478234836082577998991556829776751466175794713581464105517547520/16102913146985568583986352616088494919557142527485830924839909*x4^13 + 257402374905027595104460066253088484947405403489320457627961780169280561152/16102913146985568583986352616088494919557142527485830924839909*x4^12 + 19049599372039000321335526895674219707181251837096131891595208561971849216/16102913146985568583986352616088494919557142527485830924839909*x4^11 - 20585599662255306935074784913735873725428112366305860178

In [39]:
field = 'F7'
for n in range(2, 6):
    print(f' field = {field}, n = {n}')
    # load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
    load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field=F7'
    experiement(field, n, load_dir, num_beams=1)
    print('\n')

 field = F7, n = 2


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 - 1, x0^2 - x0 + 2*x1^2]
[x0 + 2*x1 - 1, x1^2 + 2*x1]
[x0 + 2*x1 - 1, x1^2 - 2*x1]


 field = F7, n = 3


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2, 2*x0*x1 + 2*x1*x2 - x1]
[x0 + 2*x1 + 2*x2 - 1, x1^2 - 3*x1, x1*x2 + 2*x1, x2^2 + 2*x2]
[x0 + 2*x2 - 1, x1, x2^2]


 field = F7, n = 4


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3, 2*x0*x2 + x1^2 + 2*x1*x3 - x2]
[x0 - 2*x3^6 + x3^5 - 2*x3^4 + 3*x3^3 + 3*x3^2 - 2*x3 - 1, x1 - 3*x3^6 + 2*x3^4 + x3^3 + 2*x3^2 - 3*x3, x2 - 3*x3^6 + 3*x3^5 - x3^4 + x3^3 - 2*x3, x3^7 + 3*x3^6 - x3^5 + 3*x3^4 + x3^3 - x3^2 + 3*x3]
[x0 + 2*x3 - 1, x1, x2, x3^2]


 field = F7, n = 5


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 + 2*x4 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2 + 2*x4^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3 + 2*x3*x4, 2*x0*x2 + x1^2 + 2*x1*x3 + 2*x2*x4 - x2, 2*x0*x3 + 2*x1*x2 + 2*x1*x4 - x3]
[x0 + 2*x2 - 2*x4^13 + 2*x4^9 + x4^8 + 2*x4^7 - x4^6 - x4^4 - x4^3 + 3*x4^2 + x4 - 1, x1 - 2*x4^13 + 2*x4^12 - x4^11 + 3*x4^8 + x4^6 + 2*x4^5 - x4^4 + 2*x4^3 + 2*x4^2 - x4, x2^2 - 3*x2 - x4^13 + 2*x4^11 - 2*x4^10 - x4^9 - 3*x4^6 - x4^5 - 2*x4^4 + 2*x4^3 + 2*x4^2 - 3*x4, x2*x4 + 2*x2 + x4^12 + x4^11 - x4^10 + x4^9 + x4^8 + 2*x4^7 - x4^6 - x4^5 + 3*x4^4 + 2*x4^3 + 3*x4^2 + 3*x4, x3 + 3*x4^13 - 2*x4^12 + x4^11 - x4^9 - x4^7 + 3*x4^6 - 2*x4^5 - 2*x4^4 + 2*x4^3 - 2*x4, x4^14 - x4^13 - 2*x4^12 + x4^11 - 3*x4^10 - x4^9 + x4^8 - 2*x4^7 - x4^6 + x4^5 - 3*x4^4 + 3*x4^3 - 3*x4^2 - x4]
[x0 + 2*x4^2, x1, x2 + 2*x4, x3, x4^4]




In [41]:
field = 'F31'
char = int(field[1:])
for n in range(2,6):
    print(f' field = {field}, n = {n}')
    # load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
    load_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
    experiement(field, n, load_dir, num_beams=1)
    print('\n')

 field = F31, n = 2


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 - 1, x0^2 - x0 + 2*x1^2]
[x0 + 2*x1 - 1, x1^2 + 10*x1]
[x0 + 2*x1 - 1, x1^2 + 3*x1]


 field = F31, n = 3


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2, 2*x0*x1 + 2*x1*x2 - x1]
[x0 + 2*x2^3 - 4*x2^2 + 10*x2 - 1, x1 - x2^3 + 2*x2^2 - 4*x2, x2^4 + x2^3 - 7*x2^2 - 7*x2]
[x0 + 2*x2 - 1, x1, x2^2]


 field = F31, n = 4


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3, 2*x0*x2 + x1^2 + 2*x1*x3 - x2]
[x0 - 9*x3^7 - 12*x3^6 - 5*x3^5 - 9*x3^4 - 14*x3^3 - 4*x3^2 + 13*x3 - 1, x1 + 15*x3^7 + 12*x3^6 - 7*x3^5 - 2*x3^4 + 8*x3^3 + x3^2 - 7*x3, x2 + 5*x3^7 - 6*x3^6 - 6*x3^5 - 9*x3^4 - x3^3 + x3^2 - 14*x3, x3^8 - 12*x3^7 + 2*x3^6 - 15*x3^5 + 7*x3^4 - x3^3 - 3*x3^2 - 6*x3]
[x0 + 2*x3, x1 + 2*x3, x2, x3^2]


 field = F31, n = 5


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[x0 + 2*x1 + 2*x2 + 2*x3 + 2*x4 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2 + 2*x4^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3 + 2*x3*x4, 2*x0*x2 + x1^2 + 2*x1*x3 + 2*x2*x4 - x2, 2*x0*x3 + 2*x1*x2 + 2*x1*x4 - x3]
[x0 + 13*x4^15 + x4^14 + 12*x4^13 - x4^12 + 4*x4^11 - 10*x4^10 - 2*x4^9 - 5*x4^8 + 5*x4^7 + x4^6 - 13*x4^5 + 2*x4^4 + 9*x4^3 - 9*x4^2 - 2*x4 - 1, x1 - 9*x4^15 + 12*x4^14 + 11*x4^13 + 14*x4^12 - 8*x4^11 + 11*x4^10 - 7*x4^9 + 12*x4^8 - 2*x4^7 - 7*x4^6 - 14*x4^5 + 10*x4^4 + 8*x4^3 + 8*x4^2 - 12*x4, x2 - 15*x4^15 + 5*x4^14 + 12*x4^13 - 2*x4^12 - 6*x4^11 + 15*x4^10 + 15*x4^9 + 2*x4^8 - 4*x4^7 - 12*x4^6 - 15*x4^5 - 2*x4^4 + 6*x4^3 + 8*x4^2 + 14*x4, x3 + 2*x4^15 - 2*x4^14 + 2*x4^13 + 4*x4^12 + 12*x4^11 + 10*x4^10 - 7*x4^9 + 4*x4^8 - 12*x4^7 + 3*x4^6 - 11*x4^5 - 9*x4^4 - 3*x4^3 + 4*x4^2, x4^16 - 10*x4^15 - 14*x4^14 + 13*x4^13 - 11*x4^12 - x4^11 - 7*x4^10 - 4*x4^9 - 9*x4^8 - 12*x4^7 - 3*x4^6 - 4*x4^5 + 12*x4^4 + 2*x4^3 + 14*x4^2 - x4]
[x0 + 2*x4^2, x1 + 2*x4, x2, x3 + 2*x4, x4^4]




In [15]:
for field in ['QQ']:
    for n in range(2, 6):
        F, G, ring = load_katsura(field, n, return_ring=True)
        
        for i in range(max(len(F), len(G))):
            f_text = f'f_{i+1} = {latex(F[i])}' if i < len(F) else ''
            g_text = f'g_{i+1} = {latex(G[i])}' if i < len(G) else ''
            
            print(f'{f_text:80} & {g_text} \\\\')

        print('')


f_1 = x_{0} + 2 x_{1} - 1                                                        & g_1 = x_{0} + 2 x_{1} - 1 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2}                                            & g_2 = x_{1}^{2} - \frac{1}{3} x_{1} \\

f_1 = x_{0} + 2 x_{1} + 2 x_{2} - 1                                              & g_1 = x_{0} - 60 x_{2}^{3} + \frac{158}{7} x_{2}^{2} + \frac{8}{7} x_{2} - 1 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2} + 2 x_{2}^{2}                              & g_2 = x_{1} + 30 x_{2}^{3} - \frac{79}{7} x_{2}^{2} + \frac{3}{7} x_{2} \\
f_3 = 2 x_{0} x_{1} + 2 x_{1} x_{2} - x_{1}                                      & g_3 = x_{2}^{4} - \frac{10}{21} x_{2}^{3} + \frac{1}{84} x_{2}^{2} + \frac{1}{84} x_{2} \\

f_1 = x_{0} + 2 x_{1} + 2 x_{2} + 2 x_{3} - 1                                    & g_1 = x_{0} - \frac{53230079232}{1971025} x_{3}^{7} + \frac{10415423232}{1971025} x_{3}^{6} + \frac{9146536848}{1971025} x_{3}^{5} - \frac{2158574456}{1971025} x_{3}^{4} - \frac{83893585

In [58]:
p = ring.random_element()
p

-x0^2 - 3*x0 + 2*x1^2 - 1

In [60]:
latex(p)

-x_{0}^{2} + 4 x_{0} + 2 x_{1}^{2} + 6

In [46]:
F, G, ring = load_katsura('F7', 2, return_ring=True)
F

[x0 + 2*x1 - 1, x0^2 - x0 + 2*x1^2]

In [54]:
latex(F)

\left[x_{0} + 2 x_{1} + 6, x_{0}^{2} - x_{0} + 2 x_{1}^{2}\right]

In [52]:
latex(F[0])

x_{0} + 2 x_{1} + 6

In [53]:
latex(F[1])

x_{0}^{2} - x_{0} + 2 x_{1}^{2}

In [44]:
for field in ['F7']:
    for n in range(2, 6):
        F, G, ring = load_katsura(field, n, return_ring=True)
        
        for i in range(max(len(F), len(G))):
            f_text = f'f_{i+1} = {latex(F[i])}' if i < len(F) else ''
            g_text = f'g_{i+1} = {latex(G[i])}' if i < len(G) else ''
            
            print(f'{f_text:80} & {g_text} \\\\')

        print('')


f_1 = x_{0} + 2 x_{1} + 6                                                        & g_1 = x_{0} + 2 x_{1} + 6 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2}                                            & g_2 = x_{1}^{2} + 2 x_{1} \\

f_1 = x_{0} + 2 x_{1} + 2 x_{2} + 6                                              & g_1 = x_{0} + 2 x_{1} + 2 x_{2} + 6 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2} + 2 x_{2}^{2}                              & g_2 = x_{1}^{2} + 4 x_{1} \\
f_3 = 2 x_{0} x_{1} + 2 x_{1} x_{2} - x_{1}                                      & g_3 = x_{1} x_{2} + 2 x_{1} \\
                                                                                 & g_4 = x_{2}^{2} + 2 x_{2} \\

f_1 = x_{0} + 2 x_{1} + 2 x_{2} + 2 x_{3} + 6                                    & g_1 = x_{0} + 5 x_{3}^{6} + x_{3}^{5} + 5 x_{3}^{4} + 3 x_{3}^{3} + 3 x_{3}^{2} + 5 x_{3} + 6 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2} + 2 x_{2}^{2} + 2 x_{3}^{2}                & g_2 = x_{1} + 4 x_{3}^{6} + 2 x_{3}^{4} + x_{3}^{3}

In [17]:
for field in ['F31']:
    for n in range(2, 6):
        F, G = load_katsura(field, n)
        
        for i in range(max(len(F), len(G))):
            f_text = f'f_{i+1} = {latex(F[i])}' if i < len(F) else ''
            g_text = f'g_{i+1} = {latex(G[i])}' if i < len(G) else ''
            
            print(f'{f_text:80} & {g_text} \\\\')

        print('')


f_1 = x_{0} + 2 x_{1} + 30                                                       & g_1 = x_{0} + 2 x_{1} + 30 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2}                                            & g_2 = x_{1}^{2} + 10 x_{1} \\

f_1 = x_{0} + 2 x_{1} + 2 x_{2} + 30                                             & g_1 = x_{0} + 2 x_{2}^{3} + 27 x_{2}^{2} + 10 x_{2} + 30 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2} + 2 x_{2}^{2}                              & g_2 = x_{1} - x_{2}^{3} + 2 x_{2}^{2} + 27 x_{2} \\
f_3 = 2 x_{0} x_{1} + 2 x_{1} x_{2} - x_{1}                                      & g_3 = x_{2}^{4} + x_{2}^{3} + 24 x_{2}^{2} + 24 x_{2} \\

f_1 = x_{0} + 2 x_{1} + 2 x_{2} + 2 x_{3} + 30                                   & g_1 = x_{0} + 22 x_{3}^{7} + 19 x_{3}^{6} + 26 x_{3}^{5} + 22 x_{3}^{4} + 17 x_{3}^{3} + 27 x_{3}^{2} + 13 x_{3} + 30 \\
f_2 = x_{0}^{2} - x_{0} + 2 x_{1}^{2} + 2 x_{2}^{2} + 2 x_{3}^{2}                & g_2 = x_{1} + 15 x_{3}^{7} + 12 x_{3}^{6} + 24 x_{3}^{5} + 29 x_{3}

In [6]:
a = PolynomialRing(QQ, 2, 'x', order='lex')


In [9]:
b = ideal(a).basis

In [12]:
b.ring()

Multivariate Polynomial Ring in x0, x1 over Rational Field

In [4]:
train_dataset, test_dataset = load_data('data/gb_dataset_n=2_field=QQ/data', encoding='lex.prefix', return_dataloader=False)

loading ... data/gb_dataset_n=2_field=QQ/data.train
loading ... data/gb_dataset_n=2_field=QQ/data.test


In [23]:
for batch in test_dataset:
    x_text, y_text = batch['input'], batch['target']
    break

In [18]:
tokenizer(x, return_tensors='pt')

{'input_ids': tensor([[210, 205,   0, 208, 105, 109, 213, 205, 206, 208,  99, 108, 206, 207,
           0, 107,   1, 205, 206, 208, 103, 108, 206, 207,   0, 106,   1, 205,
         206, 103, 206,   0, 207,   1, 107, 206, 208, 108, 109, 207,   1, 107,
         211]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [29]:
x = tokenizer(x_text, return_tensors='pt')['input_ids'].cuda()
y = tokenizer(y_text, return_tensors='pt')['input_ids'].cuda()
output_ids = model.generate(x, max_length=y.shape[-1], num_beams=1, do_sample=False)
z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
z_text

['+ x0 / 1 5 [SEP] x1 [SEP] x2']

In [48]:
field = "QQ"

if field == 'QQ':
    field_ = QQ 
if field[0] == 'F' and field[1:].isdigit():
    field_ = GF(int(field[1:]))

n = 4
ring = PolynomialRing(field_, n, 'x', order='lex')
I = sage.rings.ideal.Katsura(ring,n)  
F = I.basis
G = I.groebner_basis()
print(F)
G

[x0 + 2*x1 + 2*x2 + 2*x3 - 1, x0^2 - x0 + 2*x1^2 + 2*x2^2 + 2*x3^2, 2*x0*x1 + 2*x1*x2 - x1 + 2*x2*x3, 2*x0*x2 + x1^2 + 2*x1*x3 - x2]


[x0 - 53230079232/1971025*x3^7 + 10415423232/1971025*x3^6 + 9146536848/1971025*x3^5 - 2158574456/1971025*x3^4 - 838935856/5913075*x3^3 + 275119624/5913075*x3^2 + 4884038/5913075*x3 - 1, x1 - 97197721632/1971025*x3^7 + 73975630752/1971025*x3^6 - 12121915032/1971025*x3^5 - 2760941496/1971025*x3^4 + 814792828/1971025*x3^3 - 1678512/1971025*x3^2 - 9158924/1971025*x3, x2 + 123812761248/1971025*x3^7 - 79183342368/1971025*x3^6 + 7548646608/1971025*x3^5 + 3840228724/1971025*x3^4 - 2024910556/5913075*x3^3 - 132524276/5913075*x3^2 + 30947828/5913075*x3, x3^8 - 8/11*x3^7 + 4/33*x3^6 + 131/5346*x3^5 - 70/8019*x3^4 + 1/3564*x3^3 + 5/42768*x3^2 - 1/128304*x3]

In [49]:
save_dir = f'results/shape_gb_lex/gb_dataset_n={n}_field={field}'
bag = load_trained_bag(save_dir, from_checkpoint=True)
model = bag['model'] 
tokenizer = bag['tokenizer']
params = bag['params']

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [50]:
F_prefix = [poly_to_prefix(f) for f in F]
G_prefix = [poly_to_prefix(g) for g in G]
intext = ' [SEP] '.join(F_prefix)
outtext = ' [SEP] '.join(G_prefix)

In [51]:
inseq = tokenizer.encode(intext)
inseq = torch.tensor(inseq).reshape(1, -1).cuda()
inseq

tensor([[210, 205,   0, 205, 206, 106,   1, 205, 206, 106,   2, 205, 206, 106,
           3, 103, 213, 205, 207,   0, 106, 205, 206, 103,   0, 205, 206, 106,
         207,   1, 106, 205, 206, 106, 207,   2, 106, 206, 106, 207,   3, 106,
         213, 205, 206, 106, 206,   0,   1, 205, 206, 106, 206,   1,   2, 205,
         206, 103,   1, 206, 106, 206,   2,   3, 213, 205, 206, 106, 206,   0,
           2, 205, 207,   1, 106, 205, 206, 106, 206,   1,   3, 206, 103,   2,
         211]], device='cuda:0')

In [52]:
output_ids = model.generate(inseq, max_length=100, num_beams=1, do_sample=False)
z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
[prefix_to_poly(zt,ring) for zt in z_text[0].split('[SEP]')]

[x0 - 1, x1, x2 - 1, x3]

In [111]:
predseq = output.logits.argmax(dim=int(-1))
pred_prefix = tokenizer.decode(predseq[0])
print(pred_prefix)
pred = [prefix_to_poly(prefix, ring) for prefix in pred_prefix.split('[SEP]')]
pred

+ + x0 [SEP] 1 5 [SEP] x1 x1 / 4 3 ^ / x3 [SEP] ^ [SEP] x2 / 1 [SEP] ^ / x3 [SEP] ^ [SEP] x2 / ^ / ^ x3 / [SEP] x2 1 5 [SEP] x3 [SEP] ^ [PAD]


[None, 5, None, None, None, None, None, None, 5, x3, None]

In [61]:
pred

[x2, x2, x2, x2, x2, -1, -1, -1, 2, -1]

In [15]:
# save_dir = 'results/shape_gb/gb_n=2_r=5_d=3_m=2_Gd=5_Gm=None_F=4/'
save_dir = 'results/shape_gb2/gb_dataset_n=3_char=7'
# save_dir = 'results/shape_gb/gb_dataset_n=5_char=7'
bag = load_trained_bag(save_dir, from_checkpoint=True)
model = bag['model'] 
tokenizer = bag['tokenizer']
params = bag['params']

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [16]:
from src.loader.data import load_data
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

batch_size = 4
encoding = 'prefix'
data_path = 'data3/gb_dataset_n=3_char=7/data'
testset = load_data(data_path, encoding=encoding, extensions=['test'], do_shuffle=[False], return_dataloader=False)
testloader = load_data(data_path, encoding=encoding, batch_sizes=[batch_size], extensions=['test'], do_shuffle=[False], return_dataloader=True, tokenizer=tokenizer)
# batch = next(iter(testloader))

In [17]:
results = eval_prediction(model, testloader, use_tqdm=True, num_beams=1)
results

100%|██████████| 250/250 [01:28<00:00,  2.83it/s]


{'acc': 0.505,
 'hits': [tensor([ True,  True, False, False]),
  tensor([False,  True,  True, False]),
  tensor([ True, False, False,  True]),
  tensor([ True,  True,  True, False]),
  tensor([False, False, False, False]),
  tensor([False, False,  True,  True]),
  tensor([ True, False,  True, False]),
  tensor([False,  True, False,  True]),
  tensor([False,  True, False,  True]),
  tensor([True, True, True, True]),
  tensor([ True, False, False,  True]),
  tensor([ True, False, False, False]),
  tensor([ True,  True,  True, False]),
  tensor([False,  True, False, False]),
  tensor([False,  True,  True, False]),
  tensor([ True,  True, False, False]),
  tensor([False,  True, False, False]),
  tensor([ True,  True, False,  True]),
  tensor([False,  True, False, False]),
  tensor([False,  True, False, False]),
  tensor([ True, False, False, False]),
  tensor([False, False, False, False]),
  tensor([ True, False, False, False]),
  tensor([False,  True,  True, False]),
  tensor([False,  Tru

In [7]:
testset

TypeError: 'NoneType' object is not subscriptable

In [47]:

for hits, batch in zip(results['hits'], testloader):
    x, y = batch['input_ids'], batch['decoder_input_ids']
    x, y = x.cuda(), y.cuda()
    
    output_ids = model.generate(x, max_length=y.shape[-1], num_beams=1, do_sample=False)
    
    x_text = tokenizer.batch_decode(x, skip_special_tokens=True)
    y_text = tokenizer.batch_decode(y, skip_special_tokens=True)
    z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    
    for hit, yt, zt in zip(hits, y_text, z_text):
        print(f"{'   '}", yt)
        print(f"{'[!]' if hit else '   '}", zt)
        print('')


    + x0 ** x1 3 [SEP] * 3 ** x1 3
[!] + x0 ** x1 3 [SEP] * 3 ** x1 3

    + 3 + x0 + * -1 ** x1 3 * -3 ** x1 5 [SEP] + ** x1 5 * -3 x1
    + 3 + x0 + * -1 ** x1 3 * -3 ** x1 5 [SEP] + ** x1 5 * 2 x1

    + 1 + x0 + * -1 ** x1 4 * -3 ** x1 5 [SEP] + 3 + * -1 ** x1 4 + * -2 x1 * 2 ** x1 5
    + 3 + x0 + * -1 ** x1 4 + * -3 ** x1 5 * 2 x1 [SEP] + 2 + * -1 ** x1 4 + * -3 ** x1 5 * 2 ** x1 3

    + -3 + x0 + x1 + ** x1 5 * 2 ** x1 4 [SEP] + ** x1 5 + * -3 ** x1 2 + * -2 ** x1 4 * 2 x1
    + -3 + x0 + x1 + ** x1 5 * 2 ** x1 4 [SEP] + * -3 ** x1 2 + * -3 ** x1 4 * 2 ** x1 5

    + -2 + x0 + * -1 ** x1 3 + * -2 x1 + * 3 ** x1 4 * 3 ** x1 5 [SEP] + -1 + * -2 ** x1 4 + * 2 ** x1 2 * 3 ** x1 5
    + -2 + x0 + x1 + * -1 ** x1 3 + * 3 ** x1 4 * 3 ** x1 5 [SEP] + -1 + * -2 ** x1 4 + * 2 ** x1 2 * 3 ** x1 5

    + 3 x0 [SEP] * -1 ** x1 4
[!] + 3 x0 [SEP] * -1 ** x1 4

    + -2 + x0 + ** x1 3 + * -2 ** x1 4 * 2 ** x1 5 [SEP] + * -1 ** x1 5 + * -3 x1 + * -3 ** x1 2 * 2 ** x1 3
    + -2 + x0 + ** x1 3 

In [7]:
from tqdm import tqdm
from time import time 

In [8]:
%%time

from tqdm import tqdm
tot, acc = 0, 0
hits_list = []
z_text_list = []
with torch.no_grad():
    for batch in tqdm(testloader):
        x, y = batch['input_ids'].cuda(), batch['decoder_input_ids'].cuda()
        output_ids = model.generate(x, max_length=y.shape[-1], num_beams=1, do_sample=False)
        z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        z_text_list.append(z_text)
        
        l = min(y.shape[-1], output_ids.shape[-1])
        hits = torch.all(y[:, :l] == output_ids[:, :l], dim=1)
        hits_list.append(hits)
        acc += sum(hits).item()
        tot += len(hits)

    acc /= tot
    print(f'acc = {acc:.4f}')

  hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask)
100%|██████████| 125/125 [00:39<00:00,  3.20it/s]

acc = 0.3430
CPU times: user 38.8 s, sys: 532 ms, total: 39.4 s
Wall time: 39.1 s





In [30]:
for batch, hits in zip(testloader, hits_list):
    x, y = batch['input_ids'].cuda(), batch['decoder_input_ids'].cuda()
    x_text = tokenizer.batch_decode(x, skip_special_tokens=True)
    y_text = tokenizer.batch_decode(y, skip_special_tokens=True)
    
    for i, hit in enumerate(hits):
        if hit:
            print(x_text[i])
            print('--> ', y_text[i])
            
    break

+ * -3 ** x0 3 + * 3 ** x1 3 * -3 * ** x0 2 ** x1 3 [SEP] + x0 + ** x1 3 + * 2 ** x0 4 + * -2 * x0 ** x1 3 * 2 * ** x0 3 ** x1 3
-->  + x0 ** x1 3 [SEP] * 3 ** x1 3
+ * -3 ** x1 7 + * 2 ** x1 3 + * x1 ** x0 2 + * -1 * ** x0 2 ** x1 6 + * -2 * x1 ** x0 3 * 2 * ** x0 2 ** x1 4 [SEP] + 3 + x0 + * -1 ** x1 3 + * -3 ** x1 5 + * x0 ** x1 9 + * -3 * x0 ** x1 5 + * -3 * ** x0 3 ** x1 6 + * -2 * ** x0 3 ** x1 8 + * 2 * ** x0 3 ** x1 3 * 3 * ** x0 4 ** x1 3 [SEP] + ** x1 5 + * -1 ** x1 4 + * -3 ** x1 6 + * x0 x1 + * ** x0 4 ** x1 3 + * -1 * ** x0 2 ** x1 2 + * -1 * ** x0 3 ** x1 6 + * -3 * x0 ** x1 2 + * -3 * ** x0 3 ** x1 8 + * -2 * x0 ** x1 9 + * 3 * x0 ** x1 7 * 3 * ** x0 3 ** x1 3
-->  + 3 + x0 + * -1 ** x1 3 * -3 ** x1 5 [SEP] + ** x1 5 * -3 x1
+ * -1 ** x1 4 + * ** x0 2 ** x1 2 * 3 * x0 ** x1 2 [SEP] + 3 + x0 + * -1 ** x1 7 + * ** x0 2 ** x1 5 * 3 * x0 ** x1 5
-->  + 3 x0 [SEP] * -1 ** x1 4
+ * -1 ** x1 4 + * -1 ** x1 6 * x0 ** x1 2 [SEP] + x0 + * -1 ** x1 4 + * 3 ** x1 5 + * 3 ** x1 7 * -

In [34]:
for batch, hits, z_text in zip(testloader, hits_list, z_text_list):
    x, y = batch['input_ids'].cuda(), batch['decoder_input_ids'].cuda()
    x_text = tokenizer.batch_decode(x, skip_special_tokens=True)
    y_text = tokenizer.batch_decode(y, skip_special_tokens=True)
    
    for i, hit in enumerate(hits):
        if not hit:
            print(x_text[i])
            print('--> ', z_text[i])
            print('GT: ', y_text[i])
            print('')
            
    break

+ * -1 ** x1 3 + * -1 ** x1 5 + * -3 ** x1 8 + * -2 x1 + * -2 ** x1 7 + * 3 ** x1 2 + * 3 ** x1 4 + * x0 ** x1 6 + * -1 * x0 x1 + * -1 * x0 ** x1 2 + * -1 * ** x0 2 ** x1 2 * 3 * x0 ** x1 7 [SEP] + 2 + x1 + * -1 ** x1 2 + * -3 ** x1 7 + * -3 ** x1 9 + * -2 ** x1 6 + * -2 ** x1 8 + * 2 ** x1 5 + * 3 ** x1 3 + * 3 ** x1 4 + * x0 ** x1 6 + * -1 * x0 ** x1 2 + * -1 * x0 ** x1 3 + * -1 * ** x0 2 ** x1 3 + * -3 * x0 ** x1 5 + * -2 * ** x0 2 ** x1 2 + * 2 * x0 x1 * 3 * x0 ** x1 8 [SEP] + -2 + ** x1 8 + * -3 ** x0 2 + * -3 ** x0 3 + * -3 ** x1 4 + * -3 ** x1 5 + * -2 ** x1 6 + * -2 ** x1 7 + * 2 x0 + * 2 x1 + * 2 ** x1 2 + * 2 ** x1 3 + * ** x0 2 ** x1 6 + * ** x0 3 ** x1 2 + * ** x0 3 ** x1 3 + * ** x0 3 ** x1 4 + * ** x0 4 ** x1 3 + * -1 * x0 ** x1 6 + * -1 * x0 ** x1 7 + * -1 * ** x0 2 ** x1 3 + * -1 * ** x0 2 ** x1 5 + * -1 * ** x0 3 ** x1 7 + * -3 * ** x0 2 ** x1 4 + * -3 * ** x0 3 ** x1 8 + * -2 * x0 ** x1 9 + * -2 * ** x0 2 ** x1 7 + * -2 * ** x0 2 ** x1 8 + * 2 * x0 ** x1 8 + * 2 * ** 

: 

In [11]:
for c in [7]:
    for n in range(2, 6):
        save_dir = f'results/shape_gb/gb_dataset_n={n}_char={c}'
        bag = load_trained_bag(save_dir, from_checkpoint=False)
        model = bag['model'] 
        tokenizer = bag['tokenizer']
        params = bag['params']
        
        data_path = f'data/gb_dataset_n={n}_char={c}/data'
        # testset = load_data(data_path, extensions=['test'], do_shuffle=[False], return_dataloader=False)
        testloader = load_data(data_path, batch_sizes=[64], extensions=['test'], do_shuffle=[False], return_dataloader=True, tokenizer=tokenizer)
        
        tot, acc = 0, 0
        with torch.no_grad():
            start_time = time()
            for batch in tqdm(testloader):
                x, y = batch['input_ids'].cuda(), batch['decoder_input_ids'].cuda()
                output_ids = model.generate(x, max_length=y.shape[-1], num_beams=1, do_sample=False)
                # z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
                
                l = min(y.shape[-1], output_ids.shape[-1])
                hits = torch.all(y[:, :l] == output_ids[:, :l], dim=1)
                acc += hits.sum().item()
                tot += len(hits)

            acc /= tot
            runtime = time() - start_time 
            print(f'(n={n}, char={c}): acc = {acc:.4f}, runtime = {runtime:.2f} [sec]')
        

100%|██████████| 16/16 [00:08<00:00,  1.85it/s]


(n=2, char=7): acc = 0.4140, runtime = 8.63 [sec]


100%|██████████| 16/16 [00:14<00:00,  1.08it/s]


(n=3, char=7): acc = 0.6200, runtime = 14.88 [sec]


100%|██████████| 16/16 [00:19<00:00,  1.22s/it]


(n=4, char=7): acc = 0.7480, runtime = 19.54 [sec]


100%|██████████| 16/16 [00:22<00:00,  1.40s/it]

(n=5, char=7): acc = 0.7960, runtime = 22.46 [sec]





In [22]:
n, c = 2, 31
save_dir = f'results/shape_gb/gb_dataset_n={n}_char={c}'
bag = load_trained_bag(save_dir, from_checkpoint=False)
model = bag['model'] 
tokenizer = bag['tokenizer']
params = bag['params']

data_path = f'data/gb_dataset_n={n}_char={c}/data'
# testset = load_data(data_path, extensions=['test'], do_shuffle=[False], return_dataloader=False)
testloader = load_data(data_path, batch_sizes=[64], extensions=['test'], do_shuffle=[False], return_dataloader=True, tokenizer=tokenizer)

tot, acc = 0, 0
with torch.no_grad():
    start_time = time()
    for batch in tqdm(testloader):
        x, y = batch['input_ids'].cuda(), batch['decoder_input_ids'].cuda()
        output_ids = model.generate(x, max_length=y.shape[-1], num_beams=1, do_sample=False)
        # z_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        
        l = min(y.shape[-1], output_ids.shape[-1])
        hits = torch.all(y[:, :l] == output_ids[:, :l], dim=1)
        acc += hits.sum().item()
        tot += len(hits)

    acc /= tot
    runtime = time() - start_time 
    print(f'(n={n}, char={c}): acc = {acc:.4f}, runtime = {runtime:.2f} [sec]')


'+ x0 ** x1 3 [SEP] * 3 ** x1 3'

In [21]:
z_text

['+ x0 ** x1 3 [SEP] * 3 ** x1 3']

In [6]:
input_text, output_text = batch['input'], batch['target']

KeyError: 'input'

In [17]:
eval_prediction(model, testset, tokenizer, max_samples=10, use_tqdm=False, num_beams=5)

+ x0 ** x1 3 [SEP] * 3 ** x1 3
+
------------

+ 3 + x0 + * -1 ** x1 3 * -3 ** x1 5 [SEP] + ** x1 5 * -3 x1
+
------------

+ 2 + x0 + * -3 x1 + * 2 ** x1 4 * 3 ** x1 5 [SEP] + 2 + * -1 ** x1 4 + * -3 ** x1
+
------------

+ -3 + x0 + x1 + * -3 ** x1 4 * -3 ** x1 5 [SEP] + * -3 ** x1 2 + * -2 ** x1 4 * 2 x1
+
------------

+ -2 + x0 + ** x1 4 + * -1 ** x1 3 + * -2 x1 * -2 ** x1 5 [SEP] + -1 + * -2 ** x1 4 + * 2 ** x1 2 * 3 ** x1 5
+
------------

+ 3 x0 [SEP] * -1 ** x1 4
+
------------

+ -2 + x0 + ** x1 3 + * -2 ** x1 4 * 2 ** x1 5 [SEP] + * -3 ** x1 2 + * -3 ** x1 5 + * -2 ** x1 4 * 2 **
+
------------

+ -2 + x0 + * -1 ** x1 4 + * -3 ** x1 2 + * -3 ** x1 5 * 3 ** x1 3 [SEP] + -2 + x1 + * -1 ** x1 4 + * -3 ** x1
+
------------

+ x0 + * -1 ** x1 2 + * -1 ** x1 3 + * -1 ** x1 5 + * -3 ** x1 4 * -2 x1 [SEP] + 3 + * -1 ** x1 3 + * -1 ** x1 5 + * 2
+
------------

+ -2 + x0 + * -3 x1 + * -3 ** x1 2 * 2 ** x1 4 [SEP] + x1 + * -1 ** x1 5 + * -2 ** x1 4 + * 3 ** x1 2 * 3 ** x1 3
+
--------

{'acc': 0.0,
 'hits': [False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False],
 'num_beams': 5}

In [6]:
x = tokenizer(input_text, padding=True, return_tensors='pt').input_ids
y = tokenizer(output_text, padding=True, return_tensors='pt').input_ids
x, y = x.cuda(), y.cuda()

In [7]:
max_length=x.shape[-1]
output_ids = model.generate(x, max_length=max_length, num_beams=1, do_sample=False)

In [33]:
pred_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
pred_text

['+ x0 + ** x1 3 + * -1 x1 + * -1 ** x1 2 * -2 ** x1 4 [SEP] + -1 + * -1 ** x1 4 + * 2 ** x1 2 * 2 ** x1 5',
 '+ 1 + x0 + ** x1 5 + * -1 x1 + * 2 ** x1 2 * 2 ** x1 4 [SEP] + ** x1 2 + * -1 x1 + * -2 ** x1 3 + * 2 ** x1 4 * 2 ** x1 5',
 '+ 1 + x0 + ** x1 5 + * -1 x1 + * 2 ** x1 2 * 2 ** x1 4 [SEP] + ** x1 2 + * -1 x1 + * -2 x1 + * -2 ** x1 3 + * -1 ** x1 4 * -2 ** x1 5',
 '+ -1 + x0 + ** x1 5 + * -1 x1 + * 2 ** x1 2 * 2 ** x1 4 [SEP] + ** x1 2 + * -1 x1 + * -1 ** x1 3 + * 2 ** x1 4 * 2 ** x1 5']

In [36]:
len(trainloader)

1000000

In [9]:
batch = next(iter(testloader))
batch

{'data': ['+ x0 + * -1 ** x1 3 + * -1 ** x1 4 + * -1 ** x1 5 + * -2 ** x1 2 * 2 x1 [SEP] + -1 + * -1 ** x1 3 + * -1 ** x1 5 + * -2 ** x1 6 + * -2 ** x1 7 + * -2 ** x1 8 + * -1 * x0 ** x1 4 + * -2 * x0 ** x1 3 + * -2 * ** x0 2 ** x1 2 + * 2 * x0 ** x1 5 + * 2 * x0 ** x1 6 * 2 * x0 ** x1 7'],
 'target': ['+ x0 + * -1 ** x1 3 + * -1 ** x1 4 + * -1 ** x1 5 + * -2 ** x1 2 * 2 x1 [SEP] + -1 + ** x1 4 + * -1 ** x1 3 * -2 ** x1 5']}

In [4]:
## ==============================
##       Overlap test 
## ==============================

from tqdm import tqdm
from src.loader.data import load_data
data_path = 'data/gb_n=2_r=5_d=3_m=2_Gd=5_Gm=None_F=4'
trainloader, testloader = load_data(data_path, batch_sizes=[1,1000])

count = 0
count_c = 0
total = 100
for i, batch in enumerate(tqdm(trainloader)):
    if i > total: break
    x, y = batch['data'], batch['target']
    x, y = x[0], y[0]
    for batch_ in testloader:
        x_, y_ = batch_['data'], batch_['target']
        
        if x in x_ or y in y_: 
            count += 1
            # count_c += int(x == x_ and y == y_)
            break 

print(f'{count} / {total} overlap') # 0 / 100 overlap
# print(f'{count_c} / {total} complete overlap')

  0%|          | 101/1000000 [01:08<188:49:07,  1.47it/s]

0 / 100 overlap





In [5]:
x

'+ x0 + ** x1 2 + ** x1 5 + * -1 x1 + * 2 ** x1 3 * 2 ** x1 4 [SEP] + -1 + x0 + ** x0 3 + ** x1 2 + ** x1 5 + * -1 x1 + * -2 ** x1 4 + * 2 ** x1 3 + * x0 ** x1 5 + * x0 ** x1 6 + * ** x0 2 ** x1 5 + * -1 * x1 ** x0 2 + * -1 * ** x0 2 ** x1 2 + * -2 * x0 ** x1 4 + * -2 * x0 ** x1 7 + * 2 * x0 ** x1 3 + * 2 * ** x0 2 ** x1 3 * 2 * ** x0 2 ** x1 4'

In [34]:
for pred, target in zip(pred_text, output_text):
    print(pred == target)

False
False
False
False


In [None]:
pred_text[0]

In [None]:
output_text[0]

In [None]:
i = 3
pred_text[i][:len(output_text[i])] == output_text[i]

In [3]:
from src.data.tokenizers import set_vocab, set_tokenizer

In [None]:
tokenizer.vocab

In [None]:
data_path = 'data/gb_n=2_r=5_d=3_m=2_Gd=5_Gm=None_F=4'
trainloader, testloader = load_data(data_path)


In [None]:
batch = next(iter(testloader))
x, y = batch['data'], batch['target']

In [None]:
vocab = set_vocab('none', 3, max_int=100)
tok = set_tokenizer(vocab)

In [None]:
vocab

In [None]:
tok(['-1 1'])

In [None]:
a = tok(x, y, return_tensors='pt', padding='longest')

In [None]:
a = tok(x, y, return_tensors='pt', padding='longest', verbose=True)
b = tok(list(zip(x, y)), return_tensors='pt', padding='longest', verbose=True)

In [None]:
len(a.input_ids)

In [None]:
len(b.input_ids[0])

In [None]:
a.token_type_ids

In [None]:
len(b.input_ids[0])

In [None]:
b = tok(x[0], y[0], return_tensors='pt', padding='longest')

In [None]:
b.token_type_ids

In [None]:
from src.data.symbolic_utils import * 
from src.utils.utils import *


In [None]:
import torch
import yaml 
import os 
import argparse
import re 

save_dir = 'results/shape_gb/_gb_n=2_r=5_d=3_m=2_Gd=5_Gm=None_F=4/'

def load_args(save_dir):
    config_file = os.path.join(save_dir, 'params.yaml')
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)
    args = argparse.Namespace(**config)
    return args 

def get_checkpoint_id(save_dir):
    cpt_file = [f for f in os.listdir(save_dir) if 'checkpoint' in f][0]
    cpid = int(re.search(r'checkpoint-(\d+)', cpt_file).group(1))
    return cpid 

from transformers import PreTrainedTokenizerFast
def load_tokenizer(save_dir):
    cpid = get_checkpoint_id(save_dir)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(os.path.join(save_dir, f'tokenizer.json'))
    return tokenizer

def load_pretrained_model(save_dir, params, tokenizer):
    # from src.loader._models import Transformer
    model = Transformer(tokenizer, 
                        encoder_layers = params.encoder_layers, 
                        decoder_layers = params.decoder_layers, 
                        attention_heads = params.attention_heads, 
                        embedding_dimension=params.embedding_dimension, 
                        max_sequence_length=params.max_seq_length, 
                        feedforward_dimension=params.feedforward_dimension,
                        dropout=params.dropout
                        ) 

    # print(model.generate)

    model_path = os.path.join(save_dir, 'pytorch_model.bin')
    model_config = torch.load(model_path, map_location='cpu')
    model.load_state_dict(model_config)
    
    # model.from_pretrained(model_path)
    return model 

def load_trained_bag(save_dir):
    cpid = get_checkpoint_id(save_dir)
    checkpoint_path = os.path.join(save_dir, f'checkpoint-{cpid}')
    
    params = load_args(save_dir)
    tokenizer = load_tokenizer(save_dir)
    model = load_pretrained_model(checkpoint_path, params, tokenizer)
    
    bag = {'model': model, 'params': params, 'tokenizer': model.tokenizer}
    
    return bag


In [None]:
bag = load_trained_bag(save_dir)
model = bag['model'].cuda() 
tokenizer = bag['tokenizer']
params = bag['params']

In [None]:
train_dataset, test_dataset = load_data(params.data_path, return_dataloader=False)

In [None]:
model.return_loss=False

In [None]:
x = tokenizer(input_text, return_tensors='pt').input_ids.cuda()
y = tokenizer(out_text, return_tensors='pt').input_ids.cuda()

In [None]:
mean_acc = 0
model.return_loss=False
for i in range(100):
    input_text = test_dataset[0]['data']
    out_text = test_dataset[0]['target']

    x = tokenizer(input_text, return_tensors='pt').input_ids.cuda()
    y = tokenizer(out_text, return_tensors='pt').input_ids.cuda()

    attention_mask = ~tokenizer(input_text, return_tensors='pt')['attention_mask'].bool().cuda()
    decoder_attention_mask = ~tokenizer(out_text, return_tensors='pt')['attention_mask'].bool().cuda()

    z = model(x, y, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask)

    acc = (z.argmax(dim=-1) == y).float().mean().item()
    mean_acc += acc 
    # print(acc)
    # decoder_attention_mask=None
    
print(mean_acc)

In [None]:
from pytorch_beam_search.seq2seq import greedy_search

In [None]:
from transformers.generation_utils import greedy_search

In [None]:
tokenizer.batch_decode(predictions)

In [None]:
predictions

In [None]:
log_probabilities.shape

In [None]:
tokenizer = bag['tokenizer']

In [None]:
len(tokenizer.vocab)

In [None]:
tokenizer.special_tokens_map

In [12]:
vocab = set_vocab(2, field_char=0, max_int=20)
tokernizer = set_tokenizer(vocab)

In [8]:
len(tok.vocab)

52

In [9]:
tok.vocab_size

52

In [11]:
len(tok.get_vocab())

52

In [23]:
from src.data.tokenizers import set_tokenizer, set_vocab

In [25]:
vocab = set_vocab(3)
tok = set_tokenizer(vocab, max_seq_length=1024)

In [26]:
tok.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]'}

In [None]:
print(len(tok.vocab))
tok.vocab.keys()

In [None]:
for k in tokenizer.vocab:
    if k not in tok.vocab:
        print(k)

In [None]:
tokenizer.vocab_size

In [None]:
tok.vocab_size

In [None]:
tokenizer.special_tokens_map

In [None]:
tok.special_tokens_map

In [None]:
params = load_args(save_dir)
tokenizer = load_tokenizer(save_dir)
cpid = get_checkpoint_id(save_dir)

In [None]:
# from src.loader.model import load_model
from src.loader._models import Transformer

model = Transformer(tokenizer, 
                    encoder_layers = params.encoder_layers, 
                    decoder_layers = params.decoder_layers, 
                    attention_heads = params.attention_heads, 
                    embedding_dimension=params.embedding_dimension, 
                    max_sequence_length=params.max_seq_length, 
                    feedforward_dimension=params.feedforward_dimension,
                    dropout=params.dropout
                    ) 
    

In [None]:
import torch
# model_path = os.path.join(save_dir, 'pytorch_model.bin')
model_path = os.path.join(save_dir, f'checkpoint-{cpid}/pytorch_model.bin')
model_config = torch.load(model_path, map_location='cpu')
model.load_state_dict(model_config)

In [None]:
# set up tokenizer
cpid = get_checkpoint_id(save_dir)
tokenizer = PreTrainedTokenizerFast.from_pretrained(os.path.join(save_dir, f'checkpoint-{cpid}'))
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'cls_token': '[CLS]', 'bos_token': '<s>', 'eos_token': '</s>'})
from tokenizers.processors import TemplateProcessing
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[("<s>", tokenizer.bos_token_id), ("</s>", tokenizer.eos_token_id)],
)

# load model 
config = BartConfig.from_pretrained(os.path.join(save_dir, f'checkpoint-{cpid}/config.json'))
model = BartForConditionalGeneration.from_pretrained(os.path.join(save_dir, f'checkpoint-{cpid}/pytorch_model.bin'), config=config)
model.eval().cuda()


In [None]:
def reload_model(path, args):
    with open(path) as file:
        params = yaml.safe_load(file)
    corpus = read_file(data_path + '.corpus')

    model = Transformer(env, env, 
                        encoder_layers = params['encoder_layers'], 
                        decoder_layers = params['decoder_layers'], 
                        attention_heads = params['attention_heads'], 
                        embedding_dimension=params['embedding_dimension'], 
                        max_sequence_length=params['max_seq_length'], 
                        feedforward_dimension=params['feedforward_dimension'],
                        dropout=params['dropout'])

    model.print_architecture()
    sd = torch.load(os.path.join(params['save_path'], 'best_model.pth'), map_location='cpu')
    model.load_state_dict(sd)
    
    return model 

In [None]:
import torch 
import torch.nn as nn 
import math 
from torch import Tensor 
from transformers import GenerationMixin, PreTrainedModel
from attrdict import AttrDict

class Transformer(nn.Module):
        
    def __init__(self, 
                tokenizer,
                max_sequence_length = 1024,
                embedding_dimension = 512,
                feedforward_dimension = 2048,
                encoder_layers = 6,
                decoder_layers = 6,
                attention_heads = 8,
                activation = "gelu",
                dropout = 0.1,
                ):

        super().__init__()
        
        max_sequence_length = 1024*2
        
        self.return_loss = True
        self.batch_first = True
        # # float16 / distributed (AMP)
        # if params.amp >= 0:
        #     self.init_amp()
        #     if params.multi_gpu:
        #         for k in self.modules.keys():
        #             self.modules[k] = apex.parallel.DistributedDataParallel(self.modules[k], delay_allreduce=True)
        self.tokenizer = tokenizer
        self.vocab_size = 51 # len(tokenizer.vocab)  # tokenizer.vocab_size does not include special tokens
        self.generation_config = AttrDict({'bos_token_id': tokenizer.cls_token_id, 
                                           'eos_token_id': tokenizer.sep_token_id})
        self.pad_token_id = tokenizer.pad_token_id
            
        self.source_embeddings = nn.Embedding(self.vocab_size, embedding_dimension)
        self.target_embeddings = self.source_embeddings
        self.positional_embeddings =  PositionalEncoding(d_model=embedding_dimension, dropout=dropout, max_len=max_sequence_length)

        self.transformer = nn.Transformer(d_model = embedding_dimension, 
                                          dim_feedforward = feedforward_dimension,
                                          nhead = attention_heads, 
                                          num_encoder_layers = encoder_layers, 
                                          num_decoder_layers = decoder_layers,
                                          activation = activation,
                                          dropout = dropout,
                                          batch_first=True)
        
        self.output_layer = nn.Linear(embedding_dimension, self.vocab_size)
        
        self.architecture = dict(model = "Seq2Seq Transformer",
                                 vocab = self.tokenizer.vocab,
                                 max_sequence_length = max_sequence_length,
                                 embedding_dimension = embedding_dimension,
                                 feedforward_dimension = feedforward_dimension,
                                 encoder_layers = encoder_layers,
                                 decoder_layers = decoder_layers,
                                 attention_heads = attention_heads,
                                 activation = activation,
                                 dropout = dropout)

    def criterion(self, z, y):
        loss_fn = nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        return loss_fn(z.reshape(-1, z.shape[-1]), y.flatten())

    def print_architecture(self):
        """
        Displays the information about the model in standard output. 
        """
        for k in self.architecture.keys():
            print(f"{k.replace('_', ' ').capitalize()}: {self.architecture[k]}")
        print(f"Trainable parameters: {sum([p.numel() for p in self.parameters()]):,}")
        print(flush=True)
        
    def tokenize(self, x):
        return self.tokenizer(x, padding=True, truncation=False, return_tensors='pt').input_ids

    def encode(self, X, mask):
        '''
        X: batch_size x seq_length
        '''
        assert(X.shape[0] <= self.architecture["max_sequence_length"])
        src_pad_mask = (X == self.pad_token_id)
        
        X = self.source_embeddings(X) #* math.sqrt(self.architecture['embedding_dimension'])
        X = self.positional_embeddings(X)

        return self.transformer.encoder(X, src_key_padding_mask=src_pad_mask)
    
    def decode(self, encoder_outputs, Y): 
        assert Y.shape[1] <= self.architecture["max_sequence_length"]

        mask = self.transformer.generate_square_subsequent_mask(Y.shape[1]).to(Y.device)
        tgt_pad_mask = (Y == self.pad_token_id)
        
        Y = self.target_embeddings(Y) #* math.sqrt(self.architecture['embedding_dimension'])
        Y = self.positional_embeddings(Y)
                
        return self.transformer.decoder(Y, encoder_outputs, tgt_mask=mask, tgt_key_padding_mask=tgt_pad_mask)

    def forward(self, input_ids=None, decoder_input_ids=None, attention_mask=None, decoder_attention_mask=None, labels=None, return_loss=None):
        X, Y = input_ids, decoder_input_ids
        """
        Forward method of the model.
        
        Parameters
        ----------
        X: LongTensor of shape (batch_size, input_length)
            Tensor of integers containing the inputs for the model.
            
        Y: LongTensor of shape (batch_size, output_length)
            Tensor of integers containing the output produced so far.
            
        Returns
        -------
        output: FloatTensor of shape (batch_size, output_length, len(out_vocabulary))
            Tensor of floats containing the inputs for the final Softmax layer (usually integrated in the loss function).
        """
        # assert X.shape[1] <= self.architecture["max_sequence_length"]
        # assert Y.shape[1] <= self.architecture["max_sequence_length"]
        
        src_pad_mask = (X == self.pad_token_id)
        tgt_pad_mask = (Y == self.pad_token_id)

        X_embd = self.source_embeddings(X) / math.sqrt(self.architecture['embedding_dimension'])
        X_embd = self.positional_embeddings(X_embd)
        
        Y_embd = self.target_embeddings(Y) / math.sqrt(self.architecture['embedding_dimension'])
        Y_embd = self.positional_embeddings(Y_embd)

        mask = self.transformer.generate_square_subsequent_mask(Y_embd.shape[1]).to(Y_embd.device)
        transformer_output = self.transformer(src=X_embd, tgt=Y_embd, 
                                              tgt_mask=mask, 
                                              src_key_padding_mask=attention_mask, 
                                              tgt_key_padding_mask=decoder_attention_mask)
        outputs = self.output_layer(transformer_output) # (batch_size, seq_len, vocab_size)
        
        return_loss = self.return_loss if return_loss is None else return_loss
        loss = self.criterion(outputs, Y) if return_loss else None
        outputs = outputs if self.batch_first else outputs.permute(1, 0, 2)
        
        return (loss, outputs) if return_loss else outputs
        # return outputs
    
    def generate(self, X, prediction_length=30, method='greedy_search'):
        '''
        X: batch_size x length
        '''
        batch_size = X.shape[0]
        device = next(model.parameters()).device
        bos = self.generation_config.bos_token_id
        eos = self.generation_config.eos_token_id

        print(bos)

        with torch.no_grad():
            Y = bos * torch.ones(batch_size, 1).long().to(device)
            probabilities = torch.zeros(batch_size).to(device)
        
            iterator = range(prediction_length)
            for i in iterator:
                next_probabilities = model.forward(X, Y)[:, -1].log_softmax(-1)
                max_next_probabilities, next_chars = next_probabilities.max(-1)
                next_chars = next_chars.unsqueeze(-1)
                Y = torch.cat((Y, next_chars), axis = 1)
                probabilities += max_next_probabilities
        return Y, probabilities 
        


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 1024):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        pe = pe.permute(1, 0, 2)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)
    

In [None]:
bag = load_trained_bag(save_dir)
model = bag['model'].cuda() 
tokenizer = bag['tokenizer']
params = bag['params']
model.return_loss = False

In [None]:
X = torch.ones(1, 2000).long().cuda()
model.max_sequence_length = 2000
out = model(X, X)

In [None]:
x

In [None]:
predictions, log_probabilities = model.generate(x[:, 1:])
tokenizer.batch_decode(predictions)

In [None]:
model(x, y).argmax(dim=-1)

In [None]:
y

In [None]:
def greedy_search(
    model, 
    X, 
    predictions = 20,
    progress_bar = False
):
    """
    Implements Greedy Search to compute the output with the sequences given in X. The method can compute 
    several outputs in parallel with the first dimension of X.
    Parameters
    ----------    
    X: LongTensor of shape (examples, length)
        The sequences to start the decoding process.
    predictions: int
        The number of tokens to append to X.
    progress_bar: bool
        Shows a tqdm progress bar, useful for tracking progress with large tensors.
    Returns
    -------
    Y: LongTensor of shape (examples, length + predictions)
        The output sequences.
    probabilities: FloatTensor of length examples
        The estimated log-probabilities for the output sequences. They are computed by iteratively adding the 
        probability of the next token at every step.
    """
    # print('input:  ', X[0])
    with torch.no_grad():
        Y = torch.ones(X.shape[0], 1).long().to(next(model.parameters()).device)
        probabilities = torch.zeros(X.shape[0]).to(next(model.parameters()).device)
    
        iterator = range(predictions)
        if progress_bar:
            iterator = tqdm(iterator)
        for i in iterator:
            # print('output: ', Y)
            next_probabilities = model.forward(X, Y)[:, -1].log_softmax(-1)
            max_next_probabilities, next_chars = next_probabilities.max(-1)
            next_chars = next_chars.unsqueeze(-1)
            Y = torch.cat((Y, next_chars), axis = 1)
            probabilities += max_next_probabilities
    return Y, probabilities

In [None]:
from transformers import BartForConditionalGeneration, BartConfig
import torch 


In [None]:
config = BartConfig(
    encoder_layers=params.encoder_layers,
    encoder_attention_heads=params.attention_heads,
    decoder_layers=params.decoder_layers,
    decoder_attention_heads=params.attention_heads,
    vocab_size=51, 
    d_model=params.embedding_dimension, 
    encoder_ffn_dim=params.feedforward_dimension,
    decoder_ffn_dim=params.feedforward_dimension,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_position_embeddnigs=params.max_seq_length
)
model = BartForConditionalGeneration(config).cuda()


In [None]:
X = torch.cat([x for _ in range(10)], dim=1).cuda()
X.shape

In [None]:
model(X)