In [1]:
import dataclasses
import os
import itertools
import re
import sys

import numpy as np
import rich.pretty
from rich.jupyter import print
import tensorflow as tf
import transformers

# Put the parent of the current working directory on the import path before
# importing `task_specific`; presumably that module lives one level above the
# directory the notebook is run from — confirm if the import fails.
sys.path.append(os.path.dirname(os.getcwd()))
import task_specific

# Turn on rich's pretty-printing for values displayed by the session.
rich.pretty.install()

In [2]:
# Directory whose entries are listed and filtered for `.tfr` files below.
TARGET = "../../data/cached_pretok/"
# Number of records taken from the dataset by the inspection loop.
SIZE_SUBSET = 10
# Forwarded to `task_specific.make_parse_fn`; presumably the fixed token length
# of each parsed sequence — confirm against `task_specific`.
CONTEXT_WINDOW_SIZE = 1024

In [3]:
# Tokenizer for the "gpt2-xl" checkpoint; used below to decode token ids back
# into text for display.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2-xl")

In [4]:
_ACCEPTABLE_SPLITS = frozenset({"train", "eval", "test"})
def _tfr_sort_key(path):
    name = os.path.basename(path)
    split, num, ext = re.split(r"[_\.]", name)
    assert split in _ACCEPTABLE_SPLITS, split
    num = int(num)
    return ext, split, num
    
def path_generator_fn():
    """Return a lazy iterator over the absolute paths of all entries in ``TARGET``."""
    return (
        os.path.abspath(os.path.join(TARGET, entry))
        for entry in os.listdir(TARGET)
    )


def tfr_path_generator_fn():
    """Return a lazy iterator over the ``TARGET`` paths that end in ``.tfr``."""
    return (
        candidate
        for candidate in path_generator_fn()
        if candidate.endswith(".tfr")
    )


# Materialise the shard list once, ordered by (extension, split, shard number).
paths = sorted(tfr_path_generator_fn(), key=_tfr_sort_key)

In [5]:
# One dataset over all sorted shard paths; each element it yields is passed
# through `parse` in the inspection loop before its fields are used.
reader = tf.data.TFRecordDataset(paths)

In [22]:
@dataclasses.dataclass
class Sample:
    """One parsed record, built by unpacking the dict returned by ``parse``.

    No field has a default, so the keys of that dict must match these four
    field names exactly.
    """
    # Not read in the visible cells; presumably the retrieval distances of the
    # retrieved passages — confirm against `task_specific`.
    distances: tf.Tensor
    # Token ids of the answer; -1 entries are removed before decoding.
    gpt2_answer_ids: tf.Tensor
    # Token ids of the question; -1 entries are removed before decoding.
    gpt2_question_ids: tf.Tensor
    # Token ids of the retrieved contexts, indexed along the first axis with
    # one row decoded per context; -1 entries are removed before decoding.
    gpt2_retrieved_ids: tf.Tensor

def prep_for_decoding(tensor, pad_value=-1):
    """Remove padding entries from a tensor of token ids.

    Args:
        tensor: Tensor of token ids in which padded positions hold
            ``pad_value``.
        pad_value: Sentinel that marks padding. Defaults to ``-1``, the value
            that was previously hard-coded, so existing calls are unaffected.

    Returns:
        The output of ``tf.ragged.boolean_mask``: the entries that differ from
        ``pad_value``. For 2-D input the rows may end up with different
        lengths, so the result is ragged.
    """
    return tf.ragged.boolean_mask(tensor, tensor != pad_value)
        
def h1(text):
    """Wrap ``text`` in rich markup that renders it as a bold blue heading."""
    opening = "[bold blue] -- "
    closing = " -- [/]"
    return "".join((opening, text, closing))
    
# Record-parsing function: called on each dataset element in the loop below,
# and its result is unpacked into `Sample`. The "eval" argument presumably
# selects the eval-mode parsing in `task_specific` — confirm there.
parse = task_specific.make_parse_fn("eval", CONTEXT_WINDOW_SIZE)

In [24]:
# Show the first SIZE_SUBSET records: question, every retrieved context, answer.
#
# `print` here is rich's print. Headings built with `h1` rely on rich markup,
# but decoded dataset text is printed with `markup=False`: otherwise bracketed
# text in the data (e.g. "[edit]") is swallowed as a style tag and an unmatched
# "[/...]" raises a MarkupError.
for sample in itertools.islice(reader, SIZE_SUBSET):
    parsed = Sample(**parse(sample))
    print("#" * 300)

    print(h1("Question"))
    question_text = tokenizer.decode(prep_for_decoding(parsed.gpt2_question_ids))
    print(question_text, markup=False)
    print("")

    preped = prep_for_decoding(parsed.gpt2_retrieved_ids)
    # The shape does not change between contexts, so report it once per sample
    # rather than once per context.
    print(f"{preped.shape = }")
    for i in range(preped.shape[0]):
        print(h1(f"Context #{i}"))
        print(tokenizer.decode(preped[i]), markup=False)
        print("")
    print("")

    print(h1("Answer"))
    answer_text = tokenizer.decode(prep_for_decoding(parsed.gpt2_answer_ids))
    print(answer_text, markup=False)
    print("")
    

################################################################################
################################################################################
################################################################################
############################################################
 -- Question -- 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
(1024,)
(1024,)
0
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
In Trading Places (1983, Akroyd/Murphy) how does the scheme at the end of the 
movie work? Why would buying a lot of OJ at a high price ruin the Duke Brothers?

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
(10, 1024)
(10, 1024)
1
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
preped.shape = TensorShape([10, None])
 -- Context #0 -- 
Trading Places

Trading Places is a 1983 American comedy film directed by John Landis and 
starring Dan Aykroyd and Eddie Murphy. It tells the story of an upper-class 
commodities broker and a homeless street hustler whose lives cross paths when 
they are unknowingly made part of an elaborate 