### Init libraries

In [None]:
!pip install transformers
!pip install wandb

In [None]:
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional
import pickle

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    GPT2LMHeadModel,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)
import random
import wandb
import pickle
import random

# 2022 importing and formatting clean data from GYAFC:


In [None]:
# NOTE(review): every `gdown --id` call below is missing its Google Drive file ID,
# so none of them can download anything as written. Presumably one of them
# fetched the pickle loaded right after this — TODO restore the IDs.
!gdown --id 
!gdown --id 
!gdown --id 
!gdown --id 


# Load the cleaned ACL table used to build the training sentences.
# NOTE: unpickling can execute arbitrary code — only load a file you trust.
with open('/content/3_all_acl_cleaned.pkl', 'rb') as pickle_file:
    all_acl_cleaned = pickle.load(pickle_file)

In [None]:
# Marker strings wrapped around every training pair: sequence start,
# sequence end, and the separator placed between the two halves of a pair.
bos, eos, sg = "<BOS>", "<EOS>", "<SCI_GEN>"

In [None]:
#sents_gyafc = [bos + row["formal"] + sg + row["informal"] + eos for index, row in cleaned_gyafc.iterrows()]
#sents_train_gyafc = [bos + row["formal"] + sg + row["informal"] + eos for index, row in cleaned_train_gyafc.iterrows()]
#sents_paranmt = [bos + row["t1"] + sg + row["t2"] + eos for index, row in cleaned_paranmt.iterrows()]
# One training string per table row, laid out as
# <BOS> + "output" column + <SCI_GEN> + "input" column + <EOS>.
sents_acl = [
    bos + output_text + sg + input_text + eos
    for output_text, input_text in zip(
        all_acl_cleaned["output"], all_acl_cleaned["input"]
    )
]

In [None]:
# Save data
# Shuffle in place, then hold out the first 1000 sentences for evaluation and
# write everything after them as the training set (one sentence per line).
random.shuffle(sents_acl)

# `with` guarantees each file is flushed and closed even if a write fails.
# UTF-8 is explicit so non-ASCII text is written the same way regardless of
# the platform's default encoding.
with open("/content/train.txt", "w", encoding="utf-8") as textfile:
    textfile.writelines(element + "\n" for element in sents_acl[1000:])

with open("/content/eval.txt", "w", encoding="utf-8") as textfile:
    textfile.writelines(element + "\n" for element in sents_acl[:1000])

# Old code continued:

### Model arguments

In [None]:

# Module-level logger shared by the training code below.
logger = logging.getLogger(__name__)

# Config classes registered in the LM-head mapping, and the `model_type`
# string of each one (listed in the ModelArguments help text).
MODEL_CONFIG_CLASSES = [*MODEL_WITH_LM_HEAD_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)

In [None]:
@dataclass
class ModelArguments:
    """Which model, config and tokenizer to fine-tune or train from scratch.

    All three fields are optional strings that default to ``None``.
    """

    # Checkpoint name or path used to initialise the weights.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata=dict(
            help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
        ),
    )
    # Architecture name; the help text lists every entry of MODEL_TYPES.
    model_type: Optional[str] = field(
        default=None,
        metadata=dict(
            help=f"If training from scratch, pass a model type from the list: {', '.join(MODEL_TYPES)}"
        ),
    )
    # Directory for downloaded pretrained files.
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Where do you want to store the pretrained models downloaded from s3"
        ),
    )


In [None]:
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Every field has a default, so only the values being overridden need to be
    passed as keyword arguments.
    """

    # Path of the text file used for training.
    train_data_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )
    # Path of the text file used for evaluation.
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."
        },
    )
    # Selects the line-by-line dataset in get_dataset().
    line_by_line: bool = field(
        default=False,
        metadata={
            "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."
        },
    )

    # Forwarded to the data collator's `mlm` flag.
    mlm: bool = field(
        default=False,
        metadata={
            "help": "Train with masked-language modeling loss instead of language modeling."
        },
    )

    # Forwarded to the dataset constructors as `block_size`.
    block_size: int = field(
        default=-1,
        metadata={
            # Adjacent literals are concatenated verbatim, so every sentence
            # except the last carries its own trailing space.
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )

    # Forwarded to TextDataset as `overwrite_cache`.
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )



In [None]:
# Build the train or eval dataset from the text files named in the data arguments
def get_dataset(
    args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False
):
    """Return the dataset for training, or for evaluation when *evaluate* is true.

    Args:
        args: supplies the file paths, ``line_by_line``, ``block_size`` and
            ``overwrite_cache``.
        tokenizer: tokenizer handed to the dataset constructor.
        evaluate: read ``args.eval_data_file`` instead of ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is set,
        otherwise a ``TextDataset``.
    """
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file

    if not args.line_by_line:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=source_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )

    return LineByLineTextDataset(
        tokenizer=tokenizer, file_path=source_path, block_size=args.block_size
    )

In [None]:
# Extra special tokens that main() registers with the tokenizer under
# "additional_special_tokens"; "<SCI_GEN>" is the same string as the `sg` separator.
customTokenList = ["<SCI_GEN>"]

### Wandb

In [None]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mdaniel_ml[0m (use `wandb login --relogin` to force relogin)


### Run training

In [None]:
def main():
    """Fine-tune GPT-2 medium on the prepared train/eval text files.

    Builds the model, data and training arguments, loads the pretrained
    config, tokenizer and model, registers the ``<BOS>``/``<EOS>``/``<PAD>``
    and custom special tokens, trains with the Hugging Face ``Trainer`` and
    finally evaluates, writing the perplexity to ``eval_results_lm.txt`` in
    the output directory.

    Returns:
        ``{"perplexity": float}`` when evaluation ran, an empty dict when
        ``do_eval`` is off, or ``None`` when training was interrupted with a
        ``KeyboardInterrupt`` (the model and tokenizer are still saved first).
    """
    model_args = ModelArguments(
        model_name_or_path="gpt2-medium", model_type="gpt2-medium"
    )
    data_args = DataTrainingArguments(
        train_data_file="/content/train.txt",
        eval_data_file="/content/eval.txt",
        line_by_line=True,
        block_size=512,
        overwrite_cache=True,
    )
    training_args = TrainingArguments(
        output_dir="content/",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        logging_steps=500,
        per_device_train_batch_size=8,
        num_train_epochs=8,
        save_total_limit=8,
        save_strategy="epoch",
    )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed for deterministic training runs
    set_seed(training_args.seed)

    # Load everything from the checkpoint named in model_args so the three
    # objects cannot drift apart if the checkpoint is changed above.
    pretrained_name = model_args.model_name_or_path
    config = AutoConfig.from_pretrained(pretrained_name, cache_dir=model_args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_name, cache_dir=model_args.cache_dir
    )
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_name,
        from_tf=".ckpt" in pretrained_name,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Ask the Trainer's W&B callback to log gradients and parameters.
    # Calling wandb.watch() here directly fails because no run exists yet:
    # the callback only creates the run when training starts, and it issues
    # its own watch() call on the model at that point.
    os.environ.setdefault("WANDB_WATCH", "all")

    special_tokens_dict = {
        "bos_token": "<BOS>",
        "eos_token": "<EOS>",
        "pad_token": "<PAD>",
        "additional_special_tokens": customTokenList,
    }
    tokenizer.add_special_tokens(special_tokens_dict)
    # The embedding matrix must grow to cover the newly added token ids.
    model.resize_token_embeddings(len(tokenizer))
    # Update the models understanding of the bos and eos tokens
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    )
    if train_dataset is not None:
        print('train_dataset: \n' + str(len(train_dataset)))
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
        if training_args.do_eval
        else None
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    try:
        if training_args.do_train:
            # Resume only when model_name_or_path is a local checkpoint
            # directory; a hub name such as "gpt2-medium" starts fresh.
            checkpoint_dir = (
                model_args.model_name_or_path
                if model_args.model_name_or_path is not None
                and os.path.isdir(model_args.model_name_or_path)
                else None
            )
            # `resume_from_checkpoint` replaces the deprecated `model_path` keyword.
            trainer.train(resume_from_checkpoint=checkpoint_dir)
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)
    except KeyboardInterrupt:
        # Interrupting the cell is the supported way to stop early: keep
        # whatever has been learned so far and skip evaluation.
        print("Saving model that was in the middle of training")
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
        return

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        # Perplexity is the exponential of the mean evaluation loss.
        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results

In [None]:
# Press the Run Cell button to the left to start training
# (the guard below only calls main() when this code runs as the top-level module).
if __name__ == "__main__":
  main()

# To stop training and save model, press the same Run Cell button (now, it is the Interrupt Execution button)

### Inference

In [None]:
# Reload fine-tuned weights from a saved checkpoint directory.
# NOTE(review): the step number in the path (29474) belongs to one particular
# run — adjust it to a checkpoint that exists in your own output directory.
model = GPT2LMHeadModel.from_pretrained("/content/content/checkpoint-29474")

loading configuration file /content/content/checkpoint-29474/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50257,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_v

In [None]:
import torch

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The result is bound to `_` — presumably so the notebook does not echo the model repr.
_ = model.to(device)

In [None]:
# Sentence to feed the model; it is wrapped below as <BOS> + sentence + <SCI_GEN>
# so the model continues after the separator.
input_string = "We trained this model for the task of NER."

# NOTE(review): up to this point in the file `tokenizer` only exists as a local
# inside main() — this cell presumably relies on a tokenizer loaded in another
# cell/session. Confirm it is the one saved with the added special tokens.
encoded_sent = tokenizer.encode("<BOS>"+ input_string + "<SCI_GEN>", return_tensors = "pt", return_attention_mask= True).to(device)
# Generation is capped with max_length = 200; the tokenizer's pad id is passed explicitly.
output = model.generate(inputs = encoded_sent , pad_token_id= tokenizer.pad_token_id, max_length = 200)

# Special tokens are kept in the decoded text so the markers stay visible.
tokenizer.decode(output[0], skip_special_tokens= False)

'<BOS>We trained this model for the task of NER.<SCI_GEN>we trained this model on the ner task<EOS>'

In [None]:
# Mount Google Drive at /content/drive so checkpoints can be copied there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Write the in-memory model to Google Drive.
# NOTE(review): only the model is saved by this cell; the tokenizer is not
# written to this folder here — confirm it is stored somewhere it can be reloaded from.
model.save_pretrained("/content/drive/MyDrive/MRP2_model_checkpoints/GPT2_model2_no_rewards/gpt2_model/epoch2")

Configuration saved in /content/drive/MyDrive/MRP2_model_checkpoints/GPT2_model2_no_rewards/gpt2_model/epoch2/config.json
Model weights saved in /content/drive/MyDrive/MRP2_model_checkpoints/GPT2_model2_no_rewards/gpt2_model/epoch2/pytorch_model.bin


# Questions:
>What do the tags do?
>Is capitalisation not an issue for the model's understanding of tokens?
>

In [None]:
# Styles Google Colab's cell output so long lines wrap (just run this cell once).
from IPython.display import HTML, display


def set_css():
    """Display a <style> rule that sets ``white-space: pre-wrap`` on <pre> elements."""
    wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
    display(HTML(wrap_rule))


# Registered on 'pre_run_cell', so the rule is re-applied before every cell runs.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Run these cells for story generation
# NOTE(review): `pipeline` is imported here but not used in this cell.
from transformers import pipeline, TextGenerationPipeline, GPT2LMHeadModel, AutoTokenizer
""" 
Below, my model checkpoint is commented out. You can replace your checkpoint 
with that to test if your checkpoint didn't train for long enough
"""
# NOTE(review): these paths live under /content/gdrive, while the mount cell in
# this notebook mounts Drive at /content/drive — confirm the mount point.
checkpoint = "/content/gdrive/MyDrive/startup/data/checkpoint-150000/"
tokenizer_path = "/content/gdrive/MyDrive/startup/data/"

# Rebinds the global `model`, defines the global `tokenizer`, and wraps both in
# a text-generation pipeline.
model = GPT2LMHeadModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
story_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)

In [None]:
# Prompt to continue: type the text between the triple quotes.
input_prompt = """
"""

input_ids = tokenizer.encode(input_prompt, return_tensors='pt')

# Sampling configuration for generation. A beam-search variant (num_beams=5)
# and a call through `story_generator` with repetition_penalty=1.1 were tried
# earlier as alternatives and are not used here.
sampling_options = {
    "max_length": 100,
    "top_p": 0.90,
    "temperature": 0.90,
    "top_k": 50,
    "do_sample": True,
    "early_stopping": True,
}
conv_output = model.generate(input_ids, **sampling_options)

# Show the first generated sequence with special tokens stripped.
first_sequence = conv_output[0]
print(tokenizer.decode(first_sequence, skip_special_tokens=True))
# print('\n')

In [None]:
# Length of the generate() result along its first axis — presumably the number
# of returned sequences; TODO confirm.
print(len(conv_output))

In [None]:
def score(tokens_tensor):
    """Return the perplexity the loaded language model assigns to *tokens_tensor*.

    Args:
        tokens_tensor: token ids as returned by
            ``tokenizer.encode(..., return_tensors="pt")``.

    Returns:
        float: ``exp(loss)``, where the loss comes from feeding the tokens to
        the model as both input and labels. The value is ``nan`` when the
        model's loss is ``nan`` (for example when there are too few tokens to
        predict anything).
    """
    import torch  # local import so this cell does not depend on an earlier one

    # The pipeline object is not callable with `labels`; the loss comes from
    # the underlying model, whose first output is the loss when labels are given.
    lm = story_generator.model
    tokens_tensor = tokens_tensor.to(lm.device)
    with torch.no_grad():  # scoring only, no gradients needed
        loss = lm(tokens_tensor, labels=tokens_tensor)[0]
    return math.exp(loss.item())


# Score the prompt defined above (`input` is the Python builtin, not the prompt).
tokens_tensor = tokenizer.encode(input_prompt, add_special_tokens=False, return_tensors="pt")
print(input_prompt, score(tokens_tensor))