OK so I have extracted replies from Reddit which "one-up", "clapback", or improve on the parent comment, to the point that they score much higher than the parent.

There are two datasets:
- repetitive meme-y replies (such as "nope") - this is a classification problem
- non-meme, freeform replies - these are a seq2seq problem

This is a sample notebook for the seq2seq dataset (see https://huggingface.co/datasets/georeactor/reddit_one_ups_seq2seq_2014 for more info)

I already have some filters on this:
- comments are selected from each month of 2014
- the parent comment must also have a positive score (i.e. it is not replying to a terrible parent comment)
- no replies from /r/IAmA, where the replies were generally the OP's answers to questions.

In [1]:
# install prereqs
! pip install transformers datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Seq2seq backbone is T5; the "t5-small" checkpoint is used so it fits on a small Colab runtime.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "t5-small"

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# download seq2seq reddit dataset from HuggingFace
from datasets import load_dataset

# custom prefix for T5 prompting; no idea what would be best here
prefix = "Respond to: "

# based on https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py
def preprocess_function(examples, max_input_length=1024, max_target_length=1024):
    """Tokenize a batch of (parent comment -> reply) pairs for seq2seq training.

    Args:
        examples: batch dict as passed by `Dataset.map(..., batched=True)`.
            Reads the "parent_body" column (model input) and the "body"
            column (target reply).
        max_input_length: length the prefixed inputs are truncated/padded to.
            Defaults to 1024, the value previously hard-coded here.
        max_target_length: length the targets are truncated/padded to.
            Defaults to 1024, the value previously hard-coded here.

    Returns:
        The tokenizer output for the prefixed inputs, with an extra "labels"
        entry holding the target token ids (pad positions replaced by -100).

    Note:
        `padding="max_length"` pads every example to the full length, so these
        two limits directly set the tensor size of every batch. Smaller values
        can be supplied through `Dataset.map(..., fn_kwargs={...})`.
    """
    # define input and target: parent_body -> body
    # sadly this does not allow us to use scores here
    inputs = [prefix + parent for parent in examples["parent_body"]]
    targets = list(examples["body"])

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, padding="max_length", truncation=True
    )
    labels = tokenizer(
        text_target=targets, max_length=max_target_length, padding="max_length", truncation=True
    )

    # Swap pad ids for -100 in the labels, as in the run_translation example this is
    # based on, so that padded positions are not counted by the training loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


def _load_tokenized_split(split, desc):
    """Load one slice of the Reddit one-ups seq2seq dataset and tokenize it.

    Args:
        split: split expression handed to `load_dataset`, e.g. 'train[:80%]'.
        desc: progress-bar label shown while `preprocess_function` is mapped.

    Returns:
        The tokenized dataset. The metadata columns listed below are dropped;
        "parent_body" and "body" are kept alongside the tokenizer outputs.
    """
    return load_dataset(
        "georeactor/reddit_one_ups_seq2seq_2014",
        split=split,
    ).map(
        preprocess_function,
        batched=True,
        remove_columns=['id', 'score', 'parent_id', 'author', 'subreddit', 'parent_score', 'tstamp'],
        desc=desc,
    )


# 80-20 split on train-eval (the dataset only ships a 'train' split, so both slices come from it)
train_dataset = _load_tokenized_split('train[:80%]', "Running tokenizer on train dataset")
eval_dataset = _load_tokenized_split('train[80%:]', "Running tokenizer on eval dataset")

Downloading readme:   0%|          | 0.00/2.43k [00:00<?, ?B/s]



Downloading and preparing dataset csv/georeactor--reddit_one_ups_seq2seq_2014 to /root/.cache/huggingface/datasets/georeactor___csv/georeactor--reddit_one_ups_seq2seq_2014-66c077902f41c6b1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.07M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/georeactor___csv/georeactor--reddit_one_ups_seq2seq_2014-66c077902f41c6b1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


Running tokenizer on train dataset:   0%|          | 0/16 [00:00<?, ?ba/s]



Running tokenizer on eval dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

In [4]:
from transformers import DataCollatorForSeq2Seq, HfArgumentParser, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Batches had to be kept very small to run on Colab (the TrainingArguments default is 8 per device).
# NOTE(review): the original note also mentioned the collator's pad_to_multiple_of — confirm what was meant.
small_batch = 4

training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    save_steps=4000,
    num_train_epochs=1,
    per_device_train_batch_size=small_batch,
    per_device_eval_batch_size=small_batch,
)

# No data_collator or compute_metrics is passed, so the Trainer falls back to its defaults.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [5]:
# Fine-tune for the single configured epoch.
# Expect roughly 50-60 minutes per epoch (the earlier batch-size-2 run logged train_runtime ~3550 s).
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: parent_body, body. If parent_body, body are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15994
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3999
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,4.5519


KeyboardInterrupt: ignored

In [8]:
# output below is kept from a previous run that used 2 examples per batch

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: body, parent_body. If body, parent_body are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15994
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 7997
  Number of trainable parameters = 60506624


Step,Training Loss
500,4.2918
1000,4.3289
1500,4.1951
2000,4.1622
2500,4.126
3000,4.1196
3500,4.1072
4000,4.1263
4500,4.1029
5000,4.1502


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Model weights saved in ./checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./checkpoint-1500/special_tokens_map.json
Saving model checkpoint to ./checkpoint-2000
Configuration saved in ./checkpoint-2000/config.json
Model weig

TrainOutput(global_step=7997, training_loss=4.13200546518421, metrics={'train_runtime': 3550.1764, 'train_samples_per_second': 4.505, 'train_steps_per_second': 2.253, 'total_flos': 4329313545486336.0, 'train_loss': 4.13200546518421, 'epoch': 1.0})

In [15]:
# Save the fine-tuned model under the Drive folder so it outlives the Colab session.
# NOTE(review): assumes Google Drive is already mounted at ./drive — no mount cell is visible here; confirm.
trainer.save_model("./drive/MyDrive/mlin/clapback-t5")

Saving model checkpoint to ./drive/MyDrive/mlin/clapback-t5
Configuration saved in ./drive/MyDrive/mlin/clapback-t5/config.json
Model weights saved in ./drive/MyDrive/mlin/clapback-t5/pytorch_model.bin
tokenizer config file saved in ./drive/MyDrive/mlin/clapback-t5/tokenizer_config.json
Special tokens file saved in ./drive/MyDrive/mlin/clapback-t5/special_tokens_map.json


In [16]:
# Sanity check: list the files that were written to the save directory.
! ls drive/MyDrive/mlin/clapback-t5

config.json	   special_tokens_map.json  tokenizer.json
pytorch_model.bin  tokenizer_config.json    training_args.bin
