Based on the HuggingFace CLM example https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm.py 

# Initial Transformers prereqs

In [1]:
%%capture
! pip install transformers datasets

In [2]:
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from datasets import load_dataset
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

In [3]:
logger = logging.getLogger(__name__)


MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization."
            "Don't set if you want to train a model from scratch."
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": "Override some existing default config settings when a model is trained from scratch. Example: "
            "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )

    def __post_init__(self):
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )

In [4]:
training_args = TrainingArguments(
    output_dir="./output",
    do_train=True,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    save_strategy="no"
)

In [5]:
model_args = ModelArguments("gpt2", "gpt")

In [6]:
! wget https://raw.githubusercontent.com/MonsoonNLP/gpt-nyc/main/combined.csv

import csv
op = open('./csvout.csv', 'w')
wrt = csv.writer(op)

with open('./combined.csv', 'r') as srcfile:
  rdr = csv.reader(srcfile)
  for row in rdr:
    wrt.writerow(['. '.join(row), '1'])

--2021-11-15 23:35:35--  https://raw.githubusercontent.com/MonsoonNLP/gpt-nyc/main/combined.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9675046 (9.2M) [text/plain]
Saving to: ‘combined.csv’


2021-11-15 23:35:36 (83.8 MB/s) - ‘combined.csv’ saved [9675046/9675046]



In [7]:
raw_datasets = {}
raw_datasets = load_dataset("csv", data_files="./csvout.csv", column_names=['text', 'extra'])

Using custom data configuration default-fcd7d1bf2102ce0c


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-fcd7d1bf2102ce0c/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-fcd7d1bf2102ce0c/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
raw_datasets["train"].column_names

['text', 'extra']

In [8]:
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )
    elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )

# Set seed before initializing model.
set_seed(training_args.seed)

config_kwargs = {
    "cache_dir": model_args.cache_dir,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}

# Track a single true-token match in GPT-2 over AskNYC Qs dataset

In [None]:
if model_args.config_name:
    config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
elif model_args.model_name_or_path:
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
else:
    config = CONFIG_MAPPING[model_args.model_type]()
    logger.warning("You are instantiating a new config instance from scratch.")
    if model_args.config_overrides is not None:
        logger.info(f"Overriding config: {model_args.config_overrides}")
        config.update_from_string(model_args.config_overrides)

tokenizer_kwargs = {
    "cache_dir": model_args.cache_dir,
    "use_fast": model_args.use_fast_tokenizer,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
elif model_args.model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )

if model_args.model_name_or_path:
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
else:
    model = AutoModelForCausalLM.from_config(config)
    n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
    logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

model.resize_token_embeddings(len(tokenizer))

# Preprocessing the datasets.
# First we tokenize all the texts.
if training_args.do_train:
    column_names = raw_datasets["train"].column_names
else:
    column_names = raw_datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        output = tokenizer(examples[text_column_name])
    # clm input could be much much longer than block_size
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model."
        )
    return output

with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        desc="Running tokenizer on dataset",
    )

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= tokenizer.model_max_length:
        total_length = (total_length // tokenizer.model_max_length) * tokenizer.model_max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + tokenizer.model_max_length] for i in range(0, total_length, tokenizer.model_max_length)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

with training_args.main_process_first(desc="grouping texts together"):
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        desc=f"Grouping texts in chunks of {tokenizer.model_max_length}",
    )

if training_args.do_train:
    if "train" not in tokenized_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = lm_datasets["train"]

KeyboardInterrupt: ignored

In [None]:
# this is a token I decided to measure loss on in the Croatian dataset
# written it is "aj" (suffix token)
MONITOR_TOKEN_ID = 1228
tokenizer.decode([MONITOR_TOKEN_ID])

'aj'

In [None]:
# this is the token I decided to monitor in GPT-NYC dataset
print(tokenizer([' train']))
MONITOR_TOKEN_ID = 4512

{'input_ids': [[4512]], 'attention_mask': [[1]]}


## Overriding Trainer.compute_loss to log

In [None]:
# subclass of Trainer https://huggingface.co/transformers/main_classes/trainer.html
class TokenTrackingTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    #print(inputs['input_ids'])
    outputs = model(**inputs)['logits']
    for batch_num, batch_output in enumerate(outputs):
      for token_num, token_output in enumerate(batch_output):
        correct_token = inputs['input_ids'][batch_num][token_num]
        #print("correct token:")
        #print(correct_token)
        if correct_token.item() == MONITOR_TOKEN_ID:
          #print("predictions:")
          #print(token_output)
          if correct_token < len(token_output):
            #print("local prob:")
            print(token_output[inputs['input_ids'][batch_num][token_num]].item())

    return super().compute_loss(model, inputs, return_outputs)

trainer = TokenTrackingTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
)

In [None]:
# this attempt to track tokens from a callback was flawed because the callback does not see inputs
"""
from transformers import TrainerCallback, TrainerState, TrainerControl

class TokenLossCallback(TrainerCallback):
  #def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
  #  print("substep")
  
  def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
    print("step")
    print(kwargs.keys())

trainer.add_callback(TokenLossCallback())
"""

'\nfrom transformers import TrainerCallback, TrainerState, TrainerControl\n\nclass TokenLossCallback(TrainerCallback):\n  #def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):\n  #  print("substep")\n  \n  def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):\n    print("step")\n    print(kwargs.keys())\n\ntrainer.add_callback(TokenLossCallback())\n'

## Track loss of model and individual token during training

I cleared the logs on these during training, but it started all -150 to -60, by 0.28 epochs we had some positive scores :)

In [None]:
checkpoint = None
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
    checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()

# Add NYC tokens to GPT-2 and track them all during fine-tuning

## Tokenizer / model setup

In [9]:
# load original tokenizer / model
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModel.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [None]:
# original token length
len(tokenizer)

50257

In [10]:
! wget https://raw.githubusercontent.com/MonsoonNLP/gpt-nyc/main/newnyc.txt

def unique(list1):
    unique_list = []     
    for x in list1:
        if x not in unique_list:
            unique_list.append(x)
    return unique_list

with open('newnyc.txt', 'r') as vfile:
  new_nyc = unique(vfile.read().strip().split("\n"))
new_nyc[:10]

--2021-11-15 23:36:12--  https://raw.githubusercontent.com/MonsoonNLP/gpt-nyc/main/newnyc.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7204 (7.0K) [text/plain]
Saving to: ‘newnyc.txt’


2021-11-15 23:36:12 (63.5 MB/s) - ‘newnyc.txt’ saved [7204/7204]



['nyc',
 'Astoria',
 'Williamsburg',
 'midtown',
 'Bushwick',
 'UWS',
 'UES',
 'yelp',
 'manhattan',
 'Midtown']

In [11]:
tokenizer.add_tokens(new_nyc)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

Embedding(51004, 768)

In [12]:
tokenizer.save_pretrained("gpt-nyc")
model.save_pretrained("gpt-nyc")
del tokenizer
del model

### If you already created GPT-NYC tokens model you can jump to here

In [13]:
model_args = ModelArguments("./gpt-nyc", "gpt")

In [14]:
if model_args.config_name:
    config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
elif model_args.model_name_or_path:
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
else:
    config = CONFIG_MAPPING[model_args.model_type]()
    logger.warning("You are instantiating a new config instance from scratch.")
    if model_args.config_overrides is not None:
        logger.info(f"Overriding config: {model_args.config_overrides}")
        config.update_from_string(model_args.config_overrides)

tokenizer_kwargs = {
    "cache_dir": model_args.cache_dir,
    "use_fast": model_args.use_fast_tokenizer,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
elif model_args.model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )

if model_args.model_name_or_path:
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
else:
    model = AutoModelForCausalLM.from_config(config)
    n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
    logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

model.resize_token_embeddings(len(tokenizer))

# Preprocessing the datasets.
# First we tokenize all the texts.
if training_args.do_train:
    column_names = raw_datasets["train"].column_names
else:
    column_names = raw_datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        output = tokenizer(examples[text_column_name])
    # clm input could be much much longer than block_size
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model."
        )
    return output

with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        desc="Running tokenizer on dataset",
    )

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= tokenizer.model_max_length:
        total_length = (total_length // tokenizer.model_max_length) * tokenizer.model_max_length
    result = {
        k: [t[i : i + tokenizer.model_max_length] for i in range(0, total_length, tokenizer.model_max_length)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

with training_args.main_process_first(desc="grouping texts together"):
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        desc=f"Grouping texts in chunks of {tokenizer.model_max_length}",
    )

if training_args.do_train:
    if "train" not in tokenized_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = lm_datasets["train"]

Running tokenizer on dataset:   0%|          | 0/14 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1229 > 1024). Running this sequence through the model will result in indexing errors


Grouping texts in chunks of 1024:   0%|          | 0/14 [00:00<?, ?ba/s]

In [15]:
print(tokenizer.decode([50257]))
NEW_TOKEN_START = 50257

nyc


## Tracking only true matches at first

In [None]:
# subclass of Trainer https://huggingface.co/transformers/main_classes/trainer.html
class TokensTrackingTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    #print(inputs['input_ids'])
    outputs = model(**inputs)['logits']
    avg_fit = 0
    fit_tokens = 0
    for batch_num, batch_output in enumerate(outputs):
      for token_num, token_output in enumerate(batch_output):
        correct_token = inputs['input_ids'][batch_num][token_num]
        #print("correct token:")
        #print(correct_token)
        if correct_token.item() >= NEW_TOKEN_START:
          #print("predictions:")
          #print(token_output)
          if correct_token < len(token_output):
            individual_fit = token_output[inputs['input_ids'][batch_num][token_num]].item()
            avg_fit += individual_fit
            fit_tokens += 1
    if fit_tokens > 0:
      print(avg_fit / fit_tokens)

    return super().compute_loss(model, inputs, return_outputs)

trainer = TokensTrackingTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

In [None]:
trainer.train()

## Loss calculation (i.e. true/false x positives/negatives)

In [16]:
tokenizer([' bagel', ' bagels', ' Sunnyside', ' DUMBO'])

{'input_ids': [[220, 50289], [220, 50278], [220, 50299], [220, 50326]], 'attention_mask': [[1, 1], [1, 1], [1, 1], [1, 1]]}

In [17]:
TRACK_TOKENS = [50278, 50289]

In [18]:
# loss function chosen based on GPT2 implementation https://github.com/huggingface/transformers/blob/391db836ab7ed2ca61c51a7cf1b135b6ab92be58/transformers/modeling_gpt2.py#L539
from torch.nn import CrossEntropyLoss
loss_appearances = []
op = []

In [19]:
class TokensLossTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    outputs = model(**inputs)

    out_logits = outputs['logits']
    out_labels = inputs['input_ids']
    focus_tokens = [[], []]
    match_tokens = 0

    for batch_num, batch_output in enumerate(outputs['logits']):
        for token_num, token_output in enumerate(batch_output):
          correct_token = inputs['input_ids'][batch_num][token_num]
          #if correct_token.item() >= NEW_TOKEN_START:
          if correct_token.item() in TRACK_TOKENS:
            focus_tokens[0].append(batch_num)
            focus_tokens[1].append(token_num)
            match_tokens += 1
    
    op.append(out_logits)
    shift_logits = out_logits[focus_tokens].contiguous()
    shift_labels = out_labels[focus_tokens].contiguous()
    
    loss_fct = CrossEntropyLoss(ignore_index=-1)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))
    if match_tokens > 0:
      loss = loss.item()
      loss_appearances.append([outputs['loss'].item(), loss])
      print(loss)

    return super().compute_loss(model, inputs, return_outputs)

trainer = TokensLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

In [None]:
trainer.train()

In [None]:
import json
open('loss_list.json', 'w').write(json.dumps(loss_appearances))

5317

# Compare loss on multiple-token phrases and probability using ecco

In [None]:
%%capture
! pip install ecco transformers

In [None]:
import ecco
lm = ecco.from_pretrained('gpt2')

In [None]:
output_0 = lm.generate("I ate an", generate=1, do_sample=False)
output_0.layer_predictions(position=3, layer=11)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
output_1 = lm.generate("I ate an apple", generate=1, do_sample=False)
output_1.layer_predictions(position=4, layer=11)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
from torch import tensor

def tokenized(inp):
  inputs = tokenizer([inp])
  inputs['input_ids'] = tensor(inputs['input_ids'])
  inputs['attention_mask'] = tensor(inputs['attention_mask'])
  return inputs

In [None]:
from torch.nn import CrossEntropyLoss

input1 = tokenized('I ate an apple')
print(input1)
output1 = model(**input1)
output1.keys()

{'input_ids': tensor([[   40, 15063,   281, 17180]]), 'attention_mask': tensor([[1, 1, 1, 1]])}


odict_keys(['logits', 'past_key_values'])

In [None]:
out_logits = output1['logits']
out_labels = input1['input_ids']

In [None]:
focus_tokens = [[0, 0], [2, 3]]

In [None]:
out_logits[focus_tokens][..., :-1, :].contiguous()

tensor([[-100.0585,  -98.2642, -102.5634,  ..., -104.1698, -107.1987,
         -102.1312]], grad_fn=<SliceBackward0>)

In [None]:
shift_logits = out_logits[focus_tokens][..., :-1, :].contiguous()
shift_labels = out_labels[focus_tokens][..., 1:].contiguous()

In [None]:
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))
loss

tensor(2.2496, grad_fn=<NllLossBackward0>)

In [None]:
input2 = tokenized('I ate an apple pie')
output2 = model(**input2)

out_logits = output2['logits']
out_labels = input2['input_ids']
focus_tokens = [[0, 0, 0], [2, 3, 4]]

shift_logits = out_logits[focus_tokens][..., :-1, :].contiguous()
shift_labels = out_labels[focus_tokens][..., 1:].contiguous()

loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))
loss

tensor(2.5004, grad_fn=<NllLossBackward0>)

In [None]:
input3 = tokenized('I ate an apple tree')
output3 = model(**input3)

out_logits = output3['logits']
out_labels = input3['input_ids']
focus_tokens = [[0, 0, 0], [2, 3, 4]]

shift_logits = out_logits[focus_tokens][..., :-1, :].contiguous()
shift_labels = out_labels[focus_tokens][..., 1:].contiguous()

loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))
loss

tensor(4.1849, grad_fn=<NllLossBackward0>)

In [None]:
input4 = tokenized('I ate an avocado')
output4 = model(**input4)

out_logits = output4['logits']
out_labels = input4['input_ids']
focus_tokens = [[0, 0], [2, 3]]

shift_logits = out_logits[focus_tokens][..., :-1, :].contiguous()
shift_labels = out_labels[focus_tokens][..., 1:].contiguous()

loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))
loss

tensor(4.2713, grad_fn=<NllLossBackward0>)