In [1]:

import math
import os
import sys

from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Optional

import datasets
import torch
import transformers
from datasets import interleave_datasets, load_dataset, load_from_disk

from transformers import HfArgumentParser, TrainingArguments, ViTFeatureExtractor
os.environ["TRANSFORMERS_OFFLINE"] = "1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pixel import (
    PIXELConfig,
    PIXELEmbeddings,
    PIXELForPreTraining,
    PIXELTrainerForPretraining,
    SpanMaskingGenerator,
    PyGameTextRenderer,
    get_attention_mask,
    get_transforms,
    get_2d_sincos_pos_embed
)

In [6]:
pwd

'/mnt/lustre/indy2lfs/work/sc118/sc118/xliao11/PIXEL/notebooks'

In [57]:

config = PIXELConfig.from_pretrained(
    "../../cache/models/pixel-base",
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,

)

In [4]:
config.update(
    {
        "mask_ratio": 0.25,
        "norm_pix_loss": True,
        "architectures": [PIXELForPreTraining.__name__]
    })

In [5]:
model = PIXELForPreTraining.from_pretrained(
    "../../cache/models/pixel-base",
    from_tf=False,
    config=config,
        )

In [21]:
train_dataset = load_from_disk("../../cache/datasets/rendered_bookcorpus_8x8")

In [7]:
text_renderer = PyGameTextRenderer.from_pretrained("../new_configs")

In [8]:
feature_extractor = ViTFeatureExtractor.from_pretrained("../../cache/models/pixel-base")

In [53]:
# Adjust image size
image_height = text_renderer.pixels_per_patch
image_width = text_renderer.pixels_per_patch * text_renderer.max_seq_length
model.config.image_size = (image_height, image_width)
model.config.patch_size = image_height
model.image_size = (image_height, image_width)
feature_extractor.size = (image_height, image_width)

In [58]:
config

PIXELConfig {
  "_name_or_path": "Team-PIXEL/pixel-base",
  "architectures": [
    "PIXELForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "decoder_hidden_size": 512,
  "decoder_intermediate_size": 2048,
  "decoder_num_attention_heads": 16,
  "decoder_num_hidden_layers": 8,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "image_size": [
    16,
    8464
  ],
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_ratio": 0.25,
  "model_type": "pixel",
  "norm_pix_loss": true,
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0"
}

In [51]:
model.config

PIXELConfig {
  "_name_or_path": "../../cache/models/pixel-base",
  "architectures": [
    "PIXELForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "decoder_hidden_size": 512,
  "decoder_intermediate_size": 2048,
  "decoder_num_attention_heads": 16,
  "decoder_num_hidden_layers": 8,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "image_size": [
    8,
    4232
  ],
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_ratio": 0.25,
  "model_type": "pixel",
  "norm_pix_loss": true,
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": [
    8,
    8
  ],
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0"
}

In [54]:
# Reinitialize embeddings
model.vit.embeddings = PIXELEmbeddings(model.config)
model.decoder.decoder_pos_embed = torch.nn.Parameter(
    torch.zeros((1, text_renderer.max_seq_length + 1, 512)), requires_grad=False
)
decoder_pos_embed = get_2d_sincos_pos_embed(
    model.decoder.decoder_pos_embed.shape[-1], int(text_renderer.max_seq_length ** 0.5), add_cls_token=True
)
model.decoder.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

#
model.decoder.decoder_pred = nn.Linear(
            model.config.decoder_hidden_size, model.config.patch_size ** 2 * model.config.num_channels, bias=True
        )  # encoder to decoder

NameError: name 'nn' is not defined

In [55]:
model.config.patch_size ** 2 * model.config.num_channels

192

In [11]:
patch_mask_generator = SpanMaskingGenerator(
    num_patches=text_renderer.max_seq_length,
    num_masking_patches=math.ceil(0.25 * text_renderer.max_seq_length),
    max_span_length=6,
    spacing="span",
    cumulative_span_weights="0.2,0.4,0.6,0.8,0.9,1",
)

column_names = ["pixel_values","text", "num_patches"]
image_column_name = column_names[0]

In [12]:
image_mean, image_std = (None, None)
feature_extractor.do_normalize = False

In [13]:
transforms = get_transforms(
    do_resize=True,
    size=(image_height, image_width),
    do_normalize=False,
    image_mean=image_mean,
    image_std=image_std,
)

In [14]:
def preprocess_images(examples):
    """Preprocess a batch of images by applying transforms."""

    examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
    examples["attention_mask"] = [get_attention_mask(num_patches) for num_patches in examples["num_patches"]]
    if True:
        examples["patch_mask"] = [
            torch.tensor(patch_mask_generator(num_patches + 1), dtype=torch.float32)
            for num_patches in examples["num_patches"]
        ]

    return examples

In [15]:
if True:
    if True:
        # turn the dataset into the iterable dataset and make the use of the speeding in the shuffling and mapping
        train_dataset = train_dataset.to_iterable_dataset()
        train_dataset = train_dataset.with_format("torch")
        train_dataset = train_dataset.shuffle(42, buffer_size=1000)
    # Filter out examples that are less than one row long in the squared input image/ for test purpose
    #train_dataset = train_dataset.filter(lambda x: (x["num_patches"] >= 22))
    # if data_args.max_train_samples is not None:
        # train_dataset = train_dataset.shuffle(seed=training_args.seed).select(
            # range(data_args.max_train_samples)
        # 
    # Set training transforms
    if True:
        train_dataset = train_dataset.map(preprocess_images, batched=True, batch_size=1000)
    else:
        train_dataset.set_transform(preprocess_images)

In [19]:
train_dataset.set_transform(preprocess_images)

In [16]:
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    attention_mask = torch.stack([example["attention_mask"] for example in examples])
    inputs = {"pixel_values": pixel_values, "attention_mask": attention_mask}
    if "patch_mask" in examples[0]:
        patch_mask = torch.stack([example["patch_mask"] for example in examples])
        inputs.update({"patch_mask": patch_mask})
    return inputs

In [None]:
trainer = PIXELTrainerForPretraining(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=validation_dataset if training_args.do_eval else None,
    tokenizer=text_renderer,
    data_collator=collate_fn,
)

In [39]:
data = train_dataset[:1]

In [29]:
patch_mask_generator = SpanMaskingGenerator(
    num_patches=text_renderer.max_seq_length,
    num_masking_patches=math.ceil(0.25 * text_renderer.max_seq_length),
    max_span_length=6,
    spacing="span",
    cumulative_span_weights=[0.2,0.4,0.6,0.8,0.9,1],
)

In [34]:
def preprocess_images(examples):
    """Preprocess a batch of images by applying transforms."""

    examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
    examples["attention_mask"] = [get_attention_mask(num_patches) for num_patches in examples["num_patches"]]
    if True:
        examples["patch_mask"] = [
            torch.tensor(patch_mask_generator(num_patches + 1), dtype=torch.float32)
            for num_patches in examples["num_patches"]
        ]

    return examples

AttributeError: 'dict' object has no attribute 'unsqueeze'

In [41]:
preprocess_images(data)


{'pixel_values': [tensor([[[1.0000, 1.0000, 0.2471,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 0.1490,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 0.2471,  ..., 1.0000, 1.0000, 1.0000],
           ...,
           [1.0000, 1.0000, 0.2471,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
  
          [[1.0000, 1.0000, 0.2471,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 0.1490,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 0.2471,  ..., 1.0000, 1.0000, 1.0000],
           ...,
           [1.0000, 1.0000, 0.2471,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
  
          [[1.0000, 1.0000, 0.2471,  ..., 1.0000, 1.0000, 1.0000],
           [1.0000, 1.0000, 0.1490,  ..., 1.0000, 1.0000, 1.0000],
      

In [49]:
data['pixel_values'][0].shape

torch.Size([3, 8, 4232])

In [50]:
target= model.patchify(data['pixel_values'])

AttributeError: 'list' object has no attribute 'shape'

### Token classification

In [1]:
import argparse
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import transformers
import submitit
import wandb
from pixel import (
    AutoConfig,
    AutoModelForTokenClassification,
    UPOS_LABELS,
    Modality,
    PangoCairoTextRenderer,
    PIXELTrainer,
    PIXELTrainingArguments,
    POSDataset,
    Split,
    PyGameTextRenderer,
    get_transforms,
    resize_model_embeddings,
)
from seqeval.metrics import accuracy_score
from torch import nn
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    EvalPrediction,
    HfArgumentParser,
    PreTrainedTokenizerFast,
    default_data_collator,
    set_seed, PretrainedConfig,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_processor(model_args: argparse.Namespace, modality: Modality):
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": model_args.use_auth_token if model_args.use_auth_token else None,
    }

    if modality == Modality.TEXT:
        processor = AutoTokenizer.from_pretrained(
            model_args.processor_name if model_args.processor_name else model_args.model_name_or_path,
            use_fast=True,
            add_prefix_space=True if model_args.model_name_or_path == "roberta-base" else False,
            **config_kwargs,
        )
    elif modality == Modality.IMAGE:
        renderer_cls = PyGameTextRenderer if model_args.rendering_backend == "pygame" else PangoCairoTextRenderer
        processor = renderer_cls.from_pretrained(
            model_args.processor_name if model_args.processor_name else model_args.model_name_or_path,
            fallback_fonts_dir=model_args.fallback_fonts_dir,
            rgb=model_args.render_rgb,
            **config_kwargs,
        )
    else:
        raise ValueError(f"Modality {modality} not supported.")

    return processor

In [3]:
def get_dataset(
    config: PretrainedConfig,
    data_args: argparse.Namespace,
    processor: Union[Union[PyGameTextRenderer, PangoCairoTextRenderer], PreTrainedTokenizerFast],
    modality: Modality,
    split: Split,
):
    kwargs = {}
    if modality == Modality.IMAGE:
        transforms = get_transforms(
            do_resize=True,
            size=(processor.pixels_per_patch, processor.pixels_per_patch * processor.max_seq_length),
        )
    else:
        transforms = None
        kwargs.update({
            "sep_token_extra": bool(config.model_type in ["roberta"]),
            "cls_token": processor.cls_token,
            "sep_token": processor.sep_token,
            "pad_token": processor.convert_tokens_to_ids([processor.pad_token])[0]
        })

    return POSDataset(
        data_dir=data_args.data_dir,
        processor=processor,
        transforms=transforms,
        modality=modality,
        labels=UPOS_LABELS,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=split,
        **kwargs
    )

In [8]:
transforms = get_transforms(
    do_resize=True,
    size=(processor.pixels_per_patch, processor.pixels_per_patch * processor.max_seq_length),
)
POSDataset(
        data_dir="../../cache/datasets/pos/ud-treebanks-v2.10/UD_English-EWT",
        processor=processor,
        transforms=transforms,
        modality=modality,
        labels=UPOS_LABELS,
        max_seq_length=256,
        overwrite_cache=True,
        mode=Split.TEST,
    )

In [4]:
# Set modality
modality = Modality.IMAGE

# Load text renderer when using image modality and tokenizer when using text modality
processor = PyGameTextRenderer.from_pretrained(
            "../../cache/models/pixel-8x8/outputs",
        )

In [5]:
if modality == Modality.IMAGE:
    if processor.max_seq_length != data_args.max_seq_length:
        processor.max_seq_length = data_args.max_seq_length

    resize_model_embeddings(model, processor.max_seq_length)

NameError: name 'data_args' is not defined

In [None]:
train_dataset = get_dataset(config, data_args, processor, modality, Split.TRAIN)

datasets

In [6]:
from datasets import interleave_datasets, load_dataset, train_test_split
ds= load_dataset(
    "Groosezzz/rendered-bookcorpus-8x8-withText",
    split="train",
    use_auth_token="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",
    cache_dir="../cache/downloads",
    streaming=False
)

Downloading and preparing dataset None/None (download: 29.30 GiB, generated: 31.77 GiB, post-processed: Unknown size, total: 61.07 GiB) to /mnt/lustre/indy2lfs/work/sc118/sc118/xliao11/PIXEL/../cache/downloads/Groosezzz___parquet/Groosezzz--rendered-bookcorpus-8x8-initial-948219afb85c2c51/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...





[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



Downloading data: 100%|██████████| 233M/233M [00:03<00:00, 60.1MB/s]




[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A




KeyboardInterrupt: 

In [3]:
!pip install ./datasets

Processing ./datasets
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: datasets
  Building wheel for datasets (setup.py) ... [?25ldone
[?25h  Created wheel for datasets: filename=datasets-2.1.1.dev0-py3-none-any.whl size=348728 sha256=271fa11905970105316a86d9cbacd159060cc52d6abdfde86cf3284d986ae0e0
  Stored in directory: /tmp/pip-ephem-wheel-cache-hd28p4rm/wheels/e5/8f/2d/a1365ef16547af3c84e7a05594aeae0bd26668bcdd2491a8e5
Successfully built datasets
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.12.0
    Uninstalling datasets-2.12.0:
      Successfully uninstalled datasets-2.12.0
Successfully installed datasets-2.1.1.dev0


In [38]:
ds.save_to_disk("../cache/datasets/rendered_wikipedia_8x8")

In [4]:
from datasets import interleave_datasets, load_dataset, load_from_disk
ds_test=load_from_disk("../../cache/datasets/rendered_bookcorpus_16x16")

In [8]:
train_dataset,validation_dataset = ds_test.train_test_split(test_size=0.0001).values()

In [10]:
len(train_dataset)

5399460

In [9]:
len(validation_dataset)

540

In [5]:
ds = ds_test.to_iterable()

AttributeError: 'Dataset' object has no attribute 'to_iterable'

In [4]:
ds_test[0]

{'pixel_values': <PIL.PngImagePlugin.PngImageFile image mode=L size=8464x16>,
 'num_patches': 498}

In [1]:
from datasets import load_from_disk
ds_test=load_from_disk("../cache/datasets/rendered_bookcorpus_16x16")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = ds_test.to_iterable_dataset()

In [1]:
from pixel import (
    PIXELConfig,
    PIXELEmbeddings,
    PIXELForPreTraining,
    PIXELTrainerForPretraining,
    SpanMaskingGenerator,
    PyGameTextRenderer,
    get_attention_mask,
    get_transforms,
    get_2d_sincos_pos_embed
)

  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'BertTokenizerFast' from 'transformers.models.bert' (/work/sc118/sc118/xliao11/miniconda3/envs/pixel-test/lib/python3.9/site-packages/transformers/models/bert/__init__.py)

In [1]:
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
transformers.__version__

'4.17.0'

In [2]:
x = '12'
x[::-1]


'21'

In [3]:
x = []
y = []
y not in x

True