In [26]:
import collections
import dataclasses
import itertools
import json
import numpy as np
import os
import pathlib
import socket
import subprocess
import sys
import transformers
from typing import *

!pip install markdown_strings
import markdown_strings
from IPython.display import display, Markdown, Latex

import rich
import rich.console
import rich.markdown
import rich.table

import tensorflow.python.framework.ops as ops
import tensorflow as tf
import tensorflow.python.distribute.values as values
import tqdm.notebook as tqdm

_PROJECT_DIRECTORY = pathlib.Path().resolve().parent
sys.path.append(str(_PROJECT_DIRECTORY))
import constants
import utils
import tf_utils



In [27]:
def normal(text, escape=False):
    if escape:
        text = markdown_strings.esc_format(text)
    display(Markdown(text))

def h1(text, escape=False):
    if escape:
        text = markdown_strings.esc_format(text)
    display(Markdown(f"# {text}"))
    
def h2(text, escape=False):
    if escape:
        text = markdown_strings.esc_format(text)
    display(Markdown(f"#### {text}"))
    
def quote(text, escape=True):
    if escape:
        text = markdown_strings.esc_format(text)
    display(Markdown(markdown_strings.blockquote(text)))

In [28]:
def build_per_split(num_paths_display):
    h1("Getting filenames.")
    h2("Loading json config.")
    config_path = _PROJECT_DIRECTORY/"configs"/"train_configs"/"tpu_gpt2_eli5_kilt.json"
    config = utils.from_json_file(config_path)
    h2("Calling `gsutil ls` on the dataset repo.")
    ds_path = config["tfr_prefix"]
    filenames = subprocess.check_output(f"gsutil ls {ds_path}", shell=True).decode().strip().split("\n")
    h2("A few paths:")
    normal(f"There are actually {len(filenames)}.")
    normal(" - " + "\n - ".join(filenames[:num_paths_display]))
    

    h1("Building the `per_split` Path dict.")
    per_split = collections.defaultdict(list)
    for path in tqdm.tqdm(filenames, desc="Building `per_split` dict."):
        split = pathlib.Path(path).name.split("_")[0]
        per_split[split].append(path)

    normal("Sorting the `per_split` lists.")
    for split in per_split:
        # Ad-hoc split per file index
        per_split[split].sort(key=lambda p: int(pathlib.Path(p).name.split("_")[1].split(".")[0]))

    normal("Len per split for the per_split dict:")
    print({split: len(per_split[split]) for split in per_split})
    return per_split

In [29]:
def build_dataset(paths, context_window_size, split):
    ds = tf.data.TFRecordDataset(paths)


    description = {
      constants.CTH5Fields.distances:
          tf.io.FixedLenFeature((), tf.string),
      constants.CTH5Fields.gpt2_retrieved_ids:
          tf.io.FixedLenFeature((), tf.string),
      constants.CTH5Fields.gpt2_question_ids_inputs:
          tf.io.FixedLenFeature((), tf.string),
    }
    if split != constants.SplitChoices.test:
        description[
            constants.CTH5Fields.gpt2_answer_ids_inputs
        ] = tf.io.FixedLenFeature((), tf.string)

    feature_dtypes = {
      constants.CTH5Fields.distances:
          tf.float32,
      constants.CTH5Fields.gpt2_retrieved_ids:
          tf.int32,
      constants.CTH5Fields.gpt2_question_ids_inputs:
          tf.int32,
    }
    if split != constants.SplitChoices.test:
        feature_dtypes[
            constants.CTH5Fields.gpt2_answer_ids_inputs
        ] = tf.int32

    feature_shape = {
      constants.CTH5Fields.distances:
          (10,),
      constants.CTH5Fields.gpt2_retrieved_ids:
          (10, context_window_size,),
      constants.CTH5Fields.gpt2_question_ids_inputs:
          (context_window_size,),
    }
    if split != constants.SplitChoices.test:
        feature_shape[constants.CTH5Fields.gpt2_answer_ids_inputs] = (
            context_window_size
        )

    @tf.function
    def parse(sample):
        example = tf.io.parse_single_example(sample, description)
        output = {}
        for k, v in example.items():
            output[k] = tf.io.parse_tensor(v, out_type=feature_dtypes[k])
            output[k].set_shape(feature_shape[k])
        return output

    ds = ds.map(
      parse,
      num_parallel_calls=tf.data.experimental.AUTOTUNE,
      deterministic=False
      )
    return ds


In [30]:
def decode_line(tokenizer, line):
    return tokenizer.decode([x for x in line if x >= 0])

def count(paths, context_window_size, split, tokenizer, min_length=7):
    ds = build_dataset(paths, context_window_size, split)
    i = 0    
    feature_lengths = collections.defaultdict(lambda: collections.defaultdict(int))
    
    for item in tqdm.tqdm(ds, desc=f"Counting items for split `{split}`"):
        i += 1
        for feature_key, feature in item.items():
            
            if feature_key in {
                constants.CTH5Fields.gpt2_question_ids_inputs, constants.CTH5Fields.gpt2_answer_ids_inputs
            }:
                length = np.sum(feature >= 0)
                feature_lengths[feature_key][length] += 1
#                 if length <= min_length:
#                     normal(f"{split} - `{feature_key}`: {decode_line(tokenizer, feature)}")

    for k, v in feature_lengths.items():
        print(f"{split} - {k}: ")
        sorted_ = sorted(v.items(), key=lambda x: x[0])
        print(sorted_)
    return i


def main_count():
    _MAX_QTY = None
    _MODEL_TYPE = "gpt2-xl"
    _MODEL_CONFIG = transformers.AutoConfig.from_pretrained("distilgpt2")
    _CONTEXT_WINDOW_SIZE = _MODEL_CONFIG.n_ctx
    _EXPECTED_SIZES = dict(train=272634, eval=1507, test=600)
    _NUM_PATHS_DISPLAY = 10
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(_MODEL_TYPE)
    per_split = build_per_split(_NUM_PATHS_DISPLAY)

    for split in ["eval", "test", "train"]:
        to_test = per_split[split][:_MAX_QTY]
        paths = tf.data.Dataset.from_tensor_slices(to_test)
        count(paths, _CONTEXT_WINDOW_SIZE, split, tokenizer)


In [31]:
def is_all_neg(tensor):
    if not isinstance(tensor, (np.ndarray, tf.Tensor, ops.EagerTensor)):
        return all([x < 0 for x in tensor])
    else:
        return np.all(tensor < 0)
    
def check_and_decode(feature_key, item, tokenizer):
    feature = item[feature_key]
    all_neg = is_all_neg(feature)
    assert not all_neg, feature_key
    return decode_line(tokenizer, feature)
    
def display_item(tokenizer, item, split):    
    item = vars(item)
    ##################################################################################################################
    # Produce information
    ##################################################################################################################
    question = check_and_decode(
        constants.CTH5Fields.gpt2_question_ids_inputs,
        item,
        tokenizer
    )
    
    answer = None
    if split != "test":
        feature_key = constants.CTH5Fields.gpt2_answer_ids_inputs
        feature = item[feature_key]
        answer = check_and_decode(feature_key, item, tokenizer)

    retrieved_segments = []
    for line in item[constants.CTH5Fields.gpt2_retrieved_ids]:
        retrieved_segments.append(decode_line(tokenizer, line))

    ##################################################################################################################
    # Display
    ##################################################################################################################
    console = rich.console.Console()
    table = rich.table.Table(title=f"Item from split `{split}`", show_lines=True)
    table.add_column("Field", style="bold")
    table.add_column("Value")
    table.add_row("Question:", question)
    for i, segment in enumerate(retrieved_segments):
        table.add_row(f"Retrieved segment {i}:", segment)

    if answer:
        table.add_row("Answer:", answer)
    
    console.print(table)

    
@dataclasses.dataclass
class Sample:
    distances: tf.Tensor
    gpt2_answer_ids: tf.Tensor
    gpt2_question_ids: tf.Tensor
    gpt2_retrieved_ids: tf.Tensor

In [32]:
# Init TPUs
tpu_setup = tf_utils.init_tpus(socket.gethostname())
utils.check_equal(tf_utils.devices_to_use()[0].device_type, "TPU")

# Prep dataset
_MAX_QTY = None
_MODEL_TYPE = "gpt2-xl"
_MODEL_CONFIG = transformers.AutoConfig.from_pretrained("distilgpt2")
_CONTEXT_WINDOW_SIZE = _MODEL_CONFIG.n_ctx
_EXPECTED_SIZES = dict(train=272634, eval=1507, test=600)
_NUM_PATHS_DISPLAY = 10
_NUM_ITEMS_TO_SHOW = 1
_NUM_REPLICAS = 8

tokenizer = transformers.AutoTokenizer.from_pretrained(_MODEL_TYPE)
per_split = build_per_split(_NUM_PATHS_DISPLAY)

strategy = tf.distribute.TPUStrategy(tpu_setup.resolver)
console = rich.console.Console()





INFO:tensorflow:Initializing the TPU system: jules


INFO:tensorflow:Initializing the TPU system: jules


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


# Getting filenames.

#### Loading json config.

#### Calling `gsutil ls` on the dataset repo.

#### A few paths:

There are actually 8192.

 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_0.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_1.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_10.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_100.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_1000.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_1001.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_1002.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_1003.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_1004.tfr
 - gs://julesgm-research-v3/tfrecord_query_cache/20210225-191356/eval_1005.tfr

# Building the `per_split` Path dict.

HBox(children=(FloatProgress(value=0.0, description='Building `per_split` dict.', max=8192.0, style=ProgressSt…




Sorting the `per_split` lists.

Len per split for the per_split dict:

{'eval': 2048, 'test': 2048, 'train': 2048, 'validation': 2048}
INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [33]:
for split in [
# "eval", 
# "test", 
    "train"
]:
    # Load the DS
    paths_to_test = per_split[split][:_MAX_QTY]
    paths_ds = tf.data.Dataset.from_tensor_slices(paths_to_test)
    ds = build_dataset(paths_ds, _CONTEXT_WINDOW_SIZE, split)
    # Distribute
    dds = strategy.experimental_distribute_dataset(ds)
    for dist_items in itertools.islice(dds, 0, _NUM_ITEMS_TO_SHOW):
        turned_on_itself = [Sample(None, None, None, None) 
                            for _ in range(_NUM_REPLICAS)]

        for feature_key, v in dist_items.items():
            console.print(feature_key, style="green")
            utils.check_isinstance(v, values.PerReplica)
            for i, vv in enumerate(v.values):
                setattr(turned_on_itself[i], feature_key, vv)

        for item in turned_on_itself:
            if split == "train":
                for k, v in vars(item).items():
                    display_item(tokenizer, item, split)



AssertionError: gpt2_question_ids