In [2]:
from google.colab import drive
drive.mount('/content/drive')

DATASET_PATH = "/content/drive/MyDrive/TWM/DataEngineering/FinalDataset/for_salah/"
MODEL_OUTPUT_DIR = "/content/drive/MyDrive/TWM/FineTuning/BERT2GPT/"
MODEL_NAME = "BERT2GPT_v3"

!ls $DATASET_PATH
!ls $MODEL_OUTPUT_DIR

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
_dev.csv  _test.csv  _train.csv
 BERT2GPT_v1	   checkpoint-2000   checkpoint-5000		     runs
 BERT2GPT_v2	   checkpoint-3000   checkpoint-6000
 checkpoint-1000   checkpoint-4000  'Fine tunning BERT2GPT2.ipynb'


In [4]:

%%capture
!pip install -q datasets transformers rouge_score rouge_score

import datasets
import transformers

In [5]:
%%capture
!pip install nlp

In [6]:
#!/usr/bin/env python3
import nlp
import logging
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel, Trainer, TrainingArguments

logging.basicConfig(level=logging.INFO)

model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
# cache is currently not supported by EncoderDecoder framework
model.decoder.config.use_cache = False
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# CLS token will work as BOS token
bert_tokenizer.bos_token = bert_tokenizer.cls_token

# SEP token will work as EOS token
bert_tokenizer.eos_token = bert_tokenizer.sep_token


# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs


GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token


# set decoding params
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.2.ln_cross_attn.weight', 'h.8.crossattention.bias', 'h.7.crossattention.bias', 'h.5.crossattention.bias', 'h.2.crossattention.q_attn.weight', 'h.1.ln_cross_attn.weight', 'h.9.crossattention.bias', 'h.3.ln_cross_attn.weight', 'h.8.ln_cross_attn.weight', 'h.2.crossattention.c_proj.weight', 'h.5.crossattention.masked_bias', 'h.9.crossattention.masked_bias', 'h.6.crossattention.masked_bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.weight', 'h.4.crossattention.bias', 'h.6.crossattention.c_proj.bias', 'h.4.crossattention.c_proj.bias', 'h.11.crossattention.masked_bias', 'h.4.crossattention.c_attn.weight', 'h.9.crossattention.q_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.1.crossattention.q_attn.weight', 'h.10.ln_cross_attn.weight', 'h.0.crossattention.bias', 'h.6.crossattention.c_proj.weight', 'h.5.crossattention.c_attn.weight', 'h.3.crossattent

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [7]:
from datasets import load_dataset
import os

train_path = os.path.join(DATASET_PATH, '_train.csv')
test_path = os.path.join(DATASET_PATH, '_test.csv')
dev_path = os.path.join(DATASET_PATH, '_dev.csv')

dataset = load_dataset(
    'csv', 
    data_files={
        'train': train_path,
        'test': test_path,
        'dev': dev_path,
    })

train_dataset = dataset["train"]
val_dataset =  dataset["dev"]

dataset["train"]



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-f22dcc840168c988/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-f22dcc840168c988/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['Unnamed: 0', 'text', 'input', 'output'],
    num_rows: 8635
})

In [15]:
def format_batch(batch, key, sep):
  return [" ".join(eval(row)) for row in batch[key]]

In [17]:
rouge = nlp.load_metric("rouge", experiment_id=2)

encoder_length = 512
decoder_length = 128
batch_size = 4


# map data correctly
def map_to_encoder_decoder_inputs(batch):    # Tokenizer will automatically set [BOS] <text> [EOS] 
    # use bert tokenizer here for encoder
    inputs = bert_tokenizer(format_batch(batch, "input", " "), padding="max_length", truncation=True, max_length=encoder_length)
    # force summarization <= 128
    outputs = gpt2_tokenizer(format_batch(batch, "output", ' '), padding="max_length", truncation=True, max_length=decoder_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    batch["decoder_attention_mask"] = outputs.attention_mask

    # complicated list comprehension here because pad_token_id alone is not good enough to know whether label should be excluded or not
    batch["labels"] = [
        [-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(batch["decoder_attention_mask"], batch["labels"])]
    ]

    assert all([len(x) == encoder_length for x in inputs.input_ids])
    assert all([len(x) == decoder_length for x in outputs.input_ids])

    return batch


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    
   
    # all unnecessary tokens are removed
    pred_str = gpt2_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = gpt2_tokenizer.eos_token_id
    label_str = gpt2_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

# train_dataset = train_dataset.select(range(32))
# val_dataset = val_dataset.select(range(12))

# make train dataset ready
train_dataset = train_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=['Unnamed: 0', 'text', 'input', 'output'],
)
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# same for validation dataset
val_dataset = val_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=['Unnamed: 0', 'text', 'input', 'output'],
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

INFO:nlp.load:Checking /root/.cache/huggingface/datasets/5ecb6e4b474317b41ae1fe5d702d1af8d86d452f0b1d70f77a12f6f014ded6ac.35bc2c477aa456d2f589656477ccb0b463c21cdfb83a9de86d63de8560a96d1b.py for additional imports.
INFO:nlp.load:Found main folder for metric https://s3.amazonaws.com/datasets.huggingface.co/nlp/metrics/rouge/rouge.py at /usr/local/lib/python3.7/dist-packages/nlp/metrics/rouge
INFO:nlp.load:Found specific version folder for metric https://s3.amazonaws.com/datasets.huggingface.co/nlp/metrics/rouge/rouge.py at /usr/local/lib/python3.7/dist-packages/nlp/metrics/rouge/06783dbed5f6b6a5413f84d2a5f0d9dc9cb871f1aeb3787f2c90a8e3fe60b1c1
INFO:nlp.load:Found script file from https://s3.amazonaws.com/datasets.huggingface.co/nlp/metrics/rouge/rouge.py to /usr/local/lib/python3.7/dist-packages/nlp/metrics/rouge/06783dbed5f6b6a5413f84d2a5f0d9dc9cb871f1aeb3787f2c90a8e3fe60b1c1/rouge.py
INFO:nlp.load:Couldn't find dataset infos file at https://s3.amazonaws.com/datasets.huggingface.co/nlp/m

  0%|          | 0/2159 [00:00<?, ?ba/s]

  0%|          | 0/463 [00:00<?, ?ba/s]

In [20]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments


training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=1000,
    save_steps=1000,
    eval_steps=1000,
    overwrite_output_dir=True,
    warmup_steps=2000,
    save_total_limit=10,
    fp16=True,
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# start training
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 8635
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 6477


Step,Training Loss
1000,0.735
2000,0.0056
3000,0.0021
4000,0.0012
5000,0.0015
6000,0.001


Saving model checkpoint to /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-1000
Configuration saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-2000
Configuration saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-3000
Configuration saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-3000/config.json
Model weights saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/checkpoint-4000
Configuration saved in /content/dr

TrainOutput(global_step=6477, training_loss=0.11523996523204862, metrics={'train_runtime': 3203.7586, 'train_samples_per_second': 8.086, 'train_steps_per_second': 2.022, 'total_flos': 1.58419778863104e+16, 'train_loss': 0.11523996523204862, 'epoch': 3.0})

In [18]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Jun 25 00:59:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [21]:
trainer.save_model(MODEL_OUTPUT_DIR + "BERT2GPT_v3")

Saving model checkpoint to /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/BERT2GPT_v3
Configuration saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/BERT2GPT_v3/config.json
Model weights saved in /content/drive/MyDrive/TWM/FineTuning/BERT2GPT/BERT2GPT_v3/pytorch_model.bin


In [33]:
from transformers import EncoderDecoderModel
# model = EncoderDecoderModel.from_pretrained(MODEL_OUTPUT_DIR + "BERT2GPT_v3")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
model.to("cuda")

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-base-cased/resolve/

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [34]:
model.decoder.config.use_cache = False
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# CLS token will work as BOS token
bert_tokenizer.bos_token = bert_tokenizer.cls_token

# SEP token will work as EOS token
bert_tokenizer.eos_token = bert_tokenizer.sep_token


# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs


GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token


# set decoding params
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.max_length = 142
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.early_stopping = True
model.length_penalty = 2.0
model.num_beams = 4

loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217

In [35]:
input = "Book about history"
input_ids = bert_tokenizer(input, return_tensors="pt").input_ids.to("cuda")

In [36]:
output_ids = model.generate(input_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [37]:
output_ids

tensor([[50256,   198,   464,   471,    13,    50,    13,  2732,   286,  4796,
           468,  5047,   257,  1966,  5349,  5797,   351, 10086,   284,  4589,
          6503,  7394,   290,  1637, 27194,    13,   198,   198,   464,  4796,
          2732,   338,  4452,   286,   262, 24625,  3611,   357,    46,  3528,
             8,   468,  5047,  1966,  5349, 15906,  3700,   371,    13,   371,
            13,  4176,   351, 10086,   290,  1637,   300, 23496,    13,   198,
           357,  6191,    25,   471,    13,    45,    13,  4452,   286, 24625,
          3611,     8,   198,   198, 17919,    11,   508,   373,   257,   471,
            13,    42,    13,  4430,  3818,    11,   373,  5169,   287,  1737,
          1853,   706,   339,   373,  4978,   319,   257,  6503,  7394,  3877,
            13,   198,    13,   198,    11,   257,  1966,   471,    13,    34,
            13,    50,  7874,  3106,  5349,  5797,    11,   373,  5047,   351,
         10086,    11,  1637, 27194,   290,  1637,  

In [38]:
print(gpt2_tokenizer.decode(output_ids[0]))

<|endoftext|>
The U.S. Department of Justice has charged a former FBI agent with conspiracy to commit wire fraud and money laundering.

The Justice Department's Office of the Inspector General (OIG) has charged former FBI Agent James R. R. Smith with conspiracy and money laundering.
 (Photo: U.N. Office of Inspector General)

Smith, who was a U.K. intelligence officer, was arrested in May 2015 after he was caught on a wire fraud charge.
.
, a former U.C.S.-based FBI agent, was charged with conspiracy, money laundering and money-laundering. (Photo by U.U.S Department of State)
