<a href="https://colab.research.google.com/github/Lukehsu1999/Academic-and-Formality-Rewriter/blob/main/model/GYAFC_Bert2Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets==1.0.2

import datasets
import transformers

In [None]:
from transformers import BertTokenizerFast

# Load the cased BERT tokenizer (this notebook was switched from the uncased variant).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
# Alias BOS/EOS to the tokenizer's CLS/SEP tokens; bos_token_id and eos_token_id are read
# later in this notebook when the encoder-decoder model's special tokens are configured.
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

Get dataset

Set the paths to your formal/informal training and validation files below.

In [None]:
# Directory that holds the four corpus files; change this one line for your own setup.
corpus_dir = "/content/drive/MyDrive/CityU/FYP/Formal_Classifier_Training_Corpus"

# Training split (formal and informal files are paired by line index later on).
path_to_formal_train_split = corpus_dir + "/EMFR_formal_train"
path_to_informal_train_split = corpus_dir + "/EMFR_informal_train"

# Validation split.
path_to_formal_val_split = corpus_dir + "/EMFR_formal_eval"
path_to_informal_val_split = corpus_dir + "/EMFR_informal_eval"

In [None]:
def _read_lines(path):
    """Return all lines of the text file at ``path`` (trailing newlines kept).

    The file is opened inside a ``with`` block so the handle is closed as soon as
    the lines have been read; the previous version left four handles open for the
    lifetime of the kernel.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


# Formal / informal sentences of the training split, one sentence per line.
formal_train_lines = _read_lines(path_to_formal_train_split)
informal_train_lines = _read_lines(path_to_informal_train_split)

# Formal / informal sentences of the validation split.
formal_val_lines = _read_lines(path_to_formal_val_split)
informal_val_lines = _read_lines(path_to_informal_val_split)

In [None]:
# Sanity check: number of formal sentences in the training split, then the validation split.
for split_lines in (formal_train_lines, formal_val_lines):
    print(len(split_lines))

95056
9506


In [None]:
from datasets import Dataset


def _build_parallel_dataset(formal_lines, informal_lines):
    """Build a ``Dataset`` with ``id``, ``formal`` and ``informal`` columns.

    Args:
        formal_lines: list of formal sentences.
        informal_lines: list of informal sentences; entry ``i`` is paired with
            ``formal_lines[i]``.

    Returns:
        A ``Dataset`` whose ``id`` column is the running index of each pair.

    Raises:
        ValueError: if the two lists differ in length. The previous index loop
            raised an IndexError when the informal list was shorter and silently
            ignored the extra informal lines when it was longer.
    """
    if len(formal_lines) != len(informal_lines):
        raise ValueError(
            "formal and informal files must have the same number of lines, got "
            + str(len(formal_lines)) + " and " + str(len(informal_lines))
        )
    return Dataset.from_dict(
        {
            "id": list(range(len(formal_lines))),
            "formal": list(formal_lines),
            "informal": list(informal_lines),
        }
    )


train_data = _build_parallel_dataset(formal_train_lines, informal_train_lines)
val_data = _build_parallel_dataset(formal_val_lines, informal_val_lines)

Process data to model inputs

In [None]:
# Batch size; reused in this notebook for the tokenisation map calls and for the
# per-device train/eval batch sizes. The original setup used batch size 4 with max
# length 512 (noted as the most Colab could handle).
batch_size=16
# Maximum token length used when padding/truncating the encoder inputs and decoder targets.
encoder_max_length=128
decoder_max_length=128

In [None]:
def process_data_to_model_inputs(batch):
  """Tokenize a batch of sentence pairs into encoder inputs, decoder inputs and labels.

  The ``informal`` sentences are tokenized as the encoder input and the ``formal``
  sentences as the decoder target; both are padded/truncated to
  ``encoder_max_length`` / ``decoder_max_length`` respectively.

  Args:
    batch: mapping with an ``"informal"`` and a ``"formal"`` list of strings.

  Returns:
    The same ``batch`` with ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels`` added.
  """
  source = tokenizer(batch["informal"], padding="max_length", truncation=True, max_length=encoder_max_length)
  target = tokenizer(batch["formal"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = source.input_ids
  batch["attention_mask"] = source.attention_mask
  batch["decoder_input_ids"] = target.input_ids
  batch["decoder_attention_mask"] = target.attention_mask

  # Because BERT automatically shifts the labels, the labels start out identical to
  # `decoder_input_ids`; PAD positions are replaced by -100 so that they are ignored.
  pad_id = tokenizer.pad_token_id
  batch["labels"] = [
      [-100 if token_id == pad_id else token_id for token_id in sequence]
      for sequence in target.input_ids
  ]

  return batch

In [None]:
from transformers import BertTokenizerFast

# NOTE(review): this cell repeats the tokenizer setup from the earlier tokenizer cell
# (same checkpoint, same BOS/EOS aliasing); it looks redundant when cells run in order.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")  # switched from uncased to cased
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

In [None]:

def _to_model_inputs(split):
    """Tokenize one split and expose the model-input columns as torch tensors.

    Runs ``process_data_to_model_inputs`` over ``split`` in batches of ``batch_size``,
    drops the raw ``id`` / ``formal`` / ``informal`` columns, sets the torch format on
    the result and returns it.
    """
    encoded = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["id", "formal", "informal"],
    )
    encoded.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    return encoded


train_data = _to_model_inputs(train_data)
print(train_data)

val_data = _to_model_inputs(val_data)

Warm-starting the Encoder-Decoder Model

In [None]:
from transformers import EncoderDecoderModel
# Option 1 - resume from a previously saved checkpoint (uncomment and adjust the path):
#bert2bert=EncoderDecoderModel.from_pretrained("/content/drive/MyDrive/CityU/FYP/Models/Formal_Rewriter/Epoch_3")
# Option 2 - build a fresh encoder-decoder, using "bert-base-cased" for both the encoder and the decoder:
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased") #uncased->cased?

In [None]:
# set special tokens (the tokenizer's BOS/EOS were aliased to its CLS/SEP tokens earlier)
bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
bert2bert.config.eos_token_id = tokenizer.eos_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id

# copy the decoder's vocabulary size onto the top-level config
bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size
# sensible parameters for beam search; max_length equals decoder_max_length (128)
bert2bert.config.max_length = 128
bert2bert.config.min_length = 2 
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

Fine-Tuning Warm-Started Encoder-Decoder Models

In [None]:
%%capture
#!rm seq2seq_trainer.py
#!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_trainer.py

!pip install git-python==1.0.3
!pip install sacrebleu==1.4.12
!pip install rouge_score

In [None]:
#from seq2seq_trainer import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer #newly added
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional

In [None]:
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    # Extra sequence-to-sequence options layered on top of ``TrainingArguments``.
    # NOTE(review): the import cell above also imports a ``Seq2SeqTrainingArguments``
    # from ``transformers``; this definition rebinds that name, so cells executed
    # after this one use the class below.

    # --- loss / sampling / evaluation behaviour ---
    label_smoothing: Optional[float] = field(
        default=0.0,
        metadata=dict(help="The label smoothing epsilon to apply (if not zero)."),
    )
    sortish_sampler: bool = field(
        default=False,
        metadata=dict(help="Whether to SortishSamler or not."),
    )
    predict_with_generate: bool = field(
        default=False,
        metadata=dict(help="Whether to use generate to calculate generative metrics (ROUGE, BLEU)."),
    )
    adafactor: bool = field(
        default=False,
        metadata=dict(help="whether to use adafactor"),
    )

    # --- dropout overrides (``None`` by default) ---
    encoder_layerdrop: Optional[float] = field(
        default=None,
        metadata=dict(help="Encoder layer dropout probability. Goes into model.config."),
    )
    decoder_layerdrop: Optional[float] = field(
        default=None,
        metadata=dict(help="Decoder layer dropout probability. Goes into model.config."),
    )
    dropout: Optional[float] = field(
        default=None,
        metadata=dict(help="Dropout probability. Goes into model.config."),
    )
    attention_dropout: Optional[float] = field(
        default=None,
        metadata=dict(help="Attention dropout probability. Goes into model.config."),
    )

    # --- learning-rate schedule ---
    lr_scheduler: Optional[str] = field(
        default="linear",
        metadata=dict(help="Which lr scheduler to use."),
    )

In [None]:
# load rouge for validation
rouge = datasets.load_metric("rouge")


def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for a batch of predictions.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        Dict with ``rouge2_precision``, ``rouge2_recall`` and ``rouge2_fmeasure``,
        each rounded to 4 decimals (taken from the ``mid`` aggregate).
    """
    pred_ids = pred.predictions

    # Work on a copy: the previous version wrote the pad id into ``pred.label_ids``
    # in place, so the caller's array lost its -100 markers after the first call.
    # NOTE(review): assumes ``label_ids`` is a NumPy array (has ``.copy()``) - confirm.
    labels_ids = pred.label_ids.copy()
    # -100 is not a real token id, so swap it for PAD before decoding.
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    # all unnecessary (special) tokens are removed while decoding
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1656.0, style=ProgressStyle(description…




Training <br>
Note that downloading a checkpoint from Colab to a local machine takes roughly 40 minutes.

In [None]:
# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir="./",#maybe ./ is better, cause directly linking to cloud will cause trash overflow
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps", #"no", "steps", "epochs"
    logging_steps=5941,  #2, set to 1000 for full training
    save_steps=5941,  #16 set to 500 for full training
    eval_steps=5941,  #4 set to 8000 for full training
    warmup_steps=200,  #1 set to 2000 for full training
    #max_steps=16, # delete for full training
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True, 
    num_train_epochs=2 #################################################### 2 Dec 30
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)
trainer.train()

In [None]:
# Quick qualitative check: rewrite one informal sentence with the freshly trained model.
demo_sentence="how's the movieeee?"
demo_input=tokenizer(demo_sentence, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# Send the inputs to whatever device the model lives on instead of hard-coding "cuda",
# so the cell also works when no GPU is available.
demo_ids=demo_input.input_ids.to(bert2bert.device)
demo_attention_mask=demo_input.attention_mask.to(bert2bert.device)

# The input is padded to max_length, so pass the attention mask explicitly; it was
# previously computed but never handed to generate().
outputs=bert2bert.generate(demo_ids, attention_mask=demo_attention_mask)
output_str=tokenizer.decode(outputs[0],skip_special_tokens=True)

print(output_str)

<h1>Tryout</h1>

In [None]:
!pip install transformers
!pip install datasets==1.0.2

import datasets
import transformers
import torch

In [None]:
from transformers import BertTokenizerFast

# Tokenizer for the Tryout section: same "bert-base-cased" checkpoint and the same
# BOS/EOS aliasing (to the CLS/SEP tokens) as the training part of the notebook.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
loading file https://huggingface.co/bert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f


In [None]:
# Checkpoint directory loaded by the next cell; edit this path for your own setup.
path_to_demo_model="/content/drive/MyDrive/CityU/FYP/Mar29Split/EM/checkpoint-8964"

In [None]:
from transformers import EncoderDecoderModel
# Load the encoder-decoder model stored at path_to_demo_model.
demo_model=EncoderDecoderModel.from_pretrained(path_to_demo_model)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the loaded model to the selected device.
demo_model.to(device)

In [None]:
# Rewrite one informal sentence with the loaded checkpoint.
demo_sentence='Lets watcha movie together lol'
demo_input=tokenizer(demo_sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Use the `device` selected in the previous cell; the hard-coded "cuda" crashed on
# CPU-only runtimes even though the model itself had been moved with a CPU fallback.
demo_ids=demo_input.input_ids.to(device)
demo_attention_mask=demo_input.attention_mask.to(device)

# The input is padded to max_length, so pass the attention mask explicitly; it was
# previously computed but never handed to generate().
outputs=demo_model.generate(demo_ids, attention_mask=demo_attention_mask)
output_str=tokenizer.decode(outputs[0],skip_special_tokens=True)

print(output_str)

Not so sure whether this is a good movie. -> <br>
Lets watcha movie together lol-> <br>
U r a bad guy. -> <br>
U r a badd people

In [None]:
# NOTE(review): no visible cell defines a variable named `model` (the models in this
# notebook are `bert2bert` and `demo_model`), so this call presumably fails with a
# NameError on a fresh kernel. If the intent is to persist a model, Hugging Face models
# are normally saved with `save_pretrained(<directory>)` - confirm the intended object
# and destination before fixing.
model.save()

In [None]:
import bisect

# Scratch experiment: where does bisect_left place a probe value in a sorted list of
# event start times?
events_start = [2, 3, 6, 7]
events_end = [3, 4, 7, 8]

# (label, probe) pairs: before the head, on the head, in the middle, on the tail, past the tail.
start_probes = (
    ("Head case: ", 1),
    ("On Head case: ", 2),
    ("Middle case: ", 4),
    ("On Tail case: ", 7),
    ("Tail case: ", 8),
)
for label, probe in start_probes:
    print(label + str(bisect.bisect_left(events_start, probe)))

In [None]:
import bisect

# Scratch experiment: bisect_left on a sorted list of [start, end] intervals.
# Python compares lists element by element, so the start decides first and the end
# only breaks ties.
events = [[2, 3], [3, 4], [6, 7], [7, 8]]

# (label, probe interval) pairs covering the boundary positions plus one overlapping interval.
interval_probes = (
    ("Head case: ", [1, 2]),
    ("On Head case: ", [2, 3]),
    ("Middle case: ", [4, 5]),
    ("On Tail case: ", [7, 8]),
    ("Tail case: ", [8, 9]),
    ("Overlap case: ", [4, 9]),
)
for label, interval in interval_probes:
    print(label + str(bisect.bisect_left(events, interval)))