In [None]:
# !pip install pip install jupyter
# !pip install ipywidgets widgetsnbextension pandas-profiling
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter nbextension enable --py widgetsnbextension --sys-prefix

# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
# !pip install --quiet  datasets # to access squad dataset
# !pip install --quiet pyarrow   # to deal with parquet files for saving dataset if required
# !pip install --quiet  tqdm     # for progress bars
# !pip install --quiet transformers # for t5 model
# !pip install --quiet tokenizers  # tokenizers from HuggingFace
# !pip install --quiet sentencepiece # subword tokenizer used by T5
# !pip install --quiet pytorch-lightning # pytorch wrapper 
# !pip install --quiet torchtext # text utilities
# !pip install pandas datasets pyarrow tqdm transformers tokenizers sentencepiece torchtext

# # watch -n 1 free -h
# # watch -n 1 nvidia-smi

# # torch.cuda.empty_cache() 

# Fetching Datasets

In [3]:
import os, ipdb
import random

import numpy as np
import torch

import pandas as pd
from tqdm import tqdm
from dataclasses import dataclass, field
from typing import Optional

from datasets import DatasetDict, Dataset, load_from_disk
from tokenizers import AddedToken
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, HfArgumentParser
from transformers.optimization import Adafactor, AdafactorSchedule

import random, evaluate

# from ../evaluation_metrics import Metrics


seed = 42
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

device = 'cuda' if torch.cuda.is_available() else "cpu"
device

2023-09-06 23:55:33.041649: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

In [4]:
# import ipywidgets as widget
# widget.IntSlider()

In [5]:
# from huggingface_hub import notebook_login

# notebook_login()

# Check the data 


In [6]:
# # df = pd.read_parquet('train_tdm_f1_v1.parquet')#.iloc[:2000,:]
# # df = pd.read_parquet('train_squad.parquet')

# # mode = "tdm"
# mode = "tdms"
# train_path = f'../data/train_{mode}_f1_v2_short.parquet' 
# validation_path = f'../data/dev_{mode}_f1_v2_short.parquet'

# df = pd.read_parquet(f'{train_path}')

# df

In [7]:
# str(df.at[5, 'answer'])

# Creating a Pytorch DataSet for T5 Training and Validation

In [8]:
@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default="google/flan-t5", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})

    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})

    max_steps: Optional[int] = field(default=500, metadata={"help": "the maximum number of sgd steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=10, metadata={"help": "the saving frequency"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "the per device train batch size"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "the per device eval batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=2, metadata={"help": "the gradient accumulation steps"})
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    group_by_length: Optional[bool] = field(default=False, metadata={"help": "whether to group by length"})
    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})

    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "the learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    num_warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses([])[0]

if script_args.group_by_length and script_args.packing:
    raise ValueError("Cannot use both packing and group by length")

# script_args.per_device_train_batch_size,
script_args.gradient_accumulation_steps,
# script_args.per_device_eval_batch_size,

script_args.seq_length

# script_args.dataset_name = "./data/LLLM_TDMS_ALL_TEMPLATE/fold1"
# script_args.output_dir = "./model_ckpt/tdms_all_template_v2"
# script_args.run_name = "sft_llama2_tdms_all_Template_v2"

# script_args.dataset_name = "./data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2"
# script_args.output_dir = "./model_ckpt/docteat_tdm_f2_all_template"
# script_args.run_name = "sft_llama2_docteat_tdm_f2_all_Template"

script_args.model_name = "google/flan-t5"
script_args.size = "large"
script_args.dataset_name = "../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2"
script_args.output_dir = f"../model_ckpt/docteat_flan_t5_{script_args.size}_tdm_f2_all_template"
script_args.run_name = f"sft_docteat_flan_t5_{script_args.size}_tdm_f2_all_Template"
script_args.seq_length = 512
script_args.per_device_train_batch_size = 2
script_args.gradient_accumulation_steps = 2
script_args.per_device_eval_batch_size = 2
script_args.max_source_length = 512
script_args.max_target_length = 512
script_args.label_pad_token_id = -100
script_args.pad_to_multiple_of = 8
script_args.model_max_length = 512

# # multi GPU
# script_args.per_device_train_batch_size = 4

# script_args.dataset_name = "./data/LLLM_LONG_TDM_ALL_TEMPLATE/fold1"
# script_args.output_dir = "./model_ckpt/long_tdm_f1_all_template"
# script_args.run_name = "sft_llama2_long_tdm_f1_all_Template"
# script_args.seq_length = 2400
# script_args.per_device_train_batch_size = 2
# script_args.gradient_accumulation_steps = 2

script_args.save_steps = 2
script_args.logging_steps = 2
script_args.streaming = False
script_args.num_train_epochs = 5
script_args.save_total_limit = 10

In [9]:
# Number of CPU cores
workers = os.cpu_count()
workers

128

In [10]:
tokenizer = AutoTokenizer.from_pretrained(f"{script_args.model_name}-{script_args.size}")

tokenizer.add_tokens(AddedToken("\n", normalized=False))
tokenizer.add_tokens(AddedToken("{", normalized=False))
tokenizer.add_tokens(AddedToken("}", normalized=False))

model = AutoModelForSeq2SeqLM.from_pretrained(f"{script_args.model_name}-{script_args.size}")

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=script_args.label_pad_token_id,
    pad_to_multiple_of=script_args.label_pad_token_id
)

print(f"Max token lenght: {tokenizer.model_max_length}")

Max token lenght: 512


In [11]:
# model_name = ["google/flan-t5", "google/long-t5"]
# size = ["-base", "-large", "-xl"]
# model_attention = ["","-local", "-tglobal"]


# bs = 32
# epochs = 5

# # gpus = 1
# # workers = os.cpu_count()

# gpus = -1
# workers = os.cpu_count()
# # workers = 0

# model_idx = 0
# size_idx = 1
# model_attent_idx = 0


# # # # # model_max_length = None
# # # model_max_length = 6000
# # # max_len_inp = 5750
# # # model_max_len_out = 250

# # model_max_length = 512
# # max_len_inp = 512
# # # max_len_inp = 500
# # # max_len_inp = 512
# # # model_max_len_out = 130
# # model_max_len_out = 512

# # model_id = "google/flan-t5-large"
# model_id = model_name[model_idx]+size[size_idx]
# max_source_length = 512
# max_target_length = 512
# label_pad_token_id = -100
# model_max_length = 512

# # dataset_path = "../data/LLLM_TDMS_ALL_TEMPLATE/"
# dataset_path = "../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/"
# metric_name = "partial_f1s_overall"

# # fine_tuned_model_repository = f"{dataset_path}"
# # fine_tuned_model_repository = f"{path_prefix}/models/text_based/{model_is_from_train_set}"
# # tokenizer_repository = f"{path_prefix}/models/text_based/{model_is_from_train_set}/tokenizer"

# tokenizer = AutoTokenizer.from_pretrained(model_id)

# tokenizer.add_tokens(AddedToken("\n", normalized=False))
# tokenizer.add_tokens(AddedToken("{", normalized=False))
# tokenizer.add_tokens(AddedToken("}", normalized=False))

# model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# data_collator = DataCollatorForSeq2Seq(
#     tokenizer,
#     model=model,
#     label_pad_token_id=label_pad_token_id,
#     pad_to_multiple_of=8
# )


# # t5_tokenizer = AutoTokenizer.from_pretrained(
# #     f"{model_name[model_idx]}{model_attention[model_idx]}{size[size_idx]}", model_max_length=model_max_length)
# # t5_model = T5ForConditionalGeneration.from_pretrained(f"{model_name[model_idx]}{model_attention[model_idx]}{size[size_idx]}")

# # t5_tokenizer.add_tokens(AddedToken("{", normalized=False))
# # t5_tokenizer.add_tokens(AddedToken("}", normalized=False))

# # t5_model = LongT5Model.from_pretrained(f"{model[model_idx]}{model_attention[model_idx]}{size[size_idx]}")
# # t5_model = LongT5ForConditionalGeneration.from_pretrained(f"{model[model_idx]}{model_attention[model_idx]}{size[size_idx]}")

# print(f"Max token lenght: {tokenizer.model_max_length}")

In [12]:
num_gpus = torch.cuda.device_count()

print(f"Max token lenght: {tokenizer.model_max_length}")
print(f"Batch size: {script_args.per_device_train_batch_size * script_args.gradient_accumulation_steps * num_gpus }")
print(f"Number of GPUs available: {num_gpus}")

Max token lenght: 512
Batch size: 4
Number of GPUs available: 1


In [13]:
# model_save_path = f'../model_ckpt/{model_name[model_idx].replace("/","_").replace("-","_")}{model_attention[model_attent_idx].replace("-","_")}{size[size_idx].replace("-","_")}_{bs}_{epochs}_{model_max_length}_{max_source_length}_{max_target_length}_gpu_{num_gpus}'
# tokenizer_repository = f'../model_ckpt/{model_name[model_idx].replace("/","_").replace("-","_")}{model_attention[model_attent_idx].replace("-","_")}{size[size_idx].replace("-","_")}_tokenizer_{bs}_{epochs}_{model_max_length}_{max_source_length}_{max_target_length}_gpu_{num_gpus}'

In [14]:
script_args.dataset_name

'../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2'

In [15]:
# dataset = DatasetDict.load_from_disk(f"{dataset_path}/fold2")
dataset = DatasetDict.load_from_disk(f"{script_args.dataset_name}")

dataset = dataset.shuffle(seed=seed)

train_dataset = dataset["train"].shard(num_shards=50, index=0)
eval_dataset = dataset["validation"].shard(num_shards=300, index=0)

# train_dataset = dataset["train"]
# eval_dataset = dataset["validation"]

In [16]:
def compute_metrics(eval_preds):

    preds, labels = eval_preds
    
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)  # type: ignore
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # type: ignore
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    results = clf_metrics.compute(predictions=[1 if "unanswerable" == x.replace("</s>", "") else 0 for x in decoded_preds], 
                        references=[1 if "unanswerable" == x else 0 for x in decoded_labels]
    )

    rouge = evaluate.load('rouge')
    
    rouge_results = rouge.compute(
        predictions=[pred.replace("</s>", "") for pred in decoded_preds],
        references=decoded_labels
    )
    results.update(rouge_results) 

    ipdb.set_trace()
    
    # result = Metrics.evaluate_property_wise_text_based(label_list=decoded_labels, prediction_list=decoded_preds)
    # result.update(Metrics.evaluate_rouge(label_list=decoded_labels, prediction_list=decoded_preds))
    return results

In [17]:
def tokenize_function(sample):
    # tokenize inputs
    model_inputs = tokenizer(sample["prompt"], max_length=script_args.max_source_length, 
                             padding="max_length", truncation=True,
                             return_tensors="pt")

    
    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], max_length=script_args.max_target_length, padding="max_length",
                       truncation=True, return_tensors="pt")
    # labels = tokenizer(text_target=sample["answer"], padding="max_length",
    #                    truncation=False, return_tensors="pt")

    # # Check if the length of labels is >= 512
    # if any(len(label) >= 512 for label in labels["input_ids"]):
    #     return {}  # Return empty dict to skip this example

    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]  # type: ignore
    ]
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [18]:
# def tokenize_and_filter_function(examples):
#     # Tokenize source and target
#     tokenized_sources = tokenizer(examples["source"], truncation=True, max_length=512, padding=False)
#     tokenized_targets = tokenizer(examples["target"], truncation=True, max_length=512, padding=False)
    
#     # Filter out pairs where either source or target is longer than 512 tokens
#     valid_indices = [idx for idx in range(len(tokenized_sources["input_ids"])) 
#                      if len(tokenized_sources["input_ids"][idx]) <= 512 and len(tokenized_targets["input_ids"][idx]) <= 512]
    
#     tokenized_sources = {key: [value[idx] for idx in valid_indices] for key, value in tokenized_sources.items()}
#     tokenized_targets = {key: [value[idx] for idx in valid_indices] for key, value in tokenized_targets.items()}

#     # Merge tokenized sources and targets
#     tokenized = {f'source_{key}': value for key, value in tokenized_sources.items()}
#     tokenized.update({f'target_{key}': value for key, value in tokenized_targets.items()})

#     return tokenized

# # Apply the function using map
# tokenized_dataset = dataset.map(tokenize_and_filter_function, batched=True)


In [19]:
train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True,
                                            # remove_columns=dataset_columns_to_remove
                                            )
eval_tokenized_dataset = eval_dataset.map(tokenize_function, batched=True,
                                        #   remove_columns=dataset_columns_to_remove
                                          )
print(f"Keys of tokenized dataset: {list(train_tokenized_dataset.features)}")

optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=10, early_stopping_threshold=0.001)

Map:   0%|          | 0/1654 [00:00<?, ? examples/s]

Map:   0%|          | 0/118 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['prompt', 'answer', '__index_level_0__', 'input_ids', 'attention_mask', 'labels']


In [20]:
script_args.optimizer_type

'paged_adamw_32bit'

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    learning_rate=script_args.learning_rate,
    logging_steps=script_args.logging_steps,
    # report_to=script_args.log_with,
    save_steps=script_args.save_steps,
    save_total_limit=script_args.save_total_limit,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_steps=script_args.num_warmup_steps,
    # optim=script_args.optimizer_type
    num_train_epochs=script_args.num_train_epochs,
    run_name=script_args.run_name,
    
    predict_with_generate=True,
    generation_max_length=script_args.max_target_length,
    
    load_best_model_at_end=True,
    # metric_for_best_model=metric_name,
    # greater_is_better=True,
    
    # logging_dir=f"{model_save_path}/logs",
    # eval_steps=500,  # Evaluate the model every 500 steps,
    evaluation_strategy="steps",
    # logging_strategy="steps",
    save_strategy="steps",
    
    # push_to_hub=False,    
    # seed=seed
)


# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
    # max_seq_length=script_args.seq_length,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
    callbacks=([early_stopping_callback])

)

trainer.train()

# trainer.evaluate()

[34m[1mwandb[0m: Currently logged in as: [33mskabongo[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Rouge1,Rouge2,Rougel,Rougelsum
2,2.9777,3.329874,0.694915,0.571429,0.705882,0.48,0.244166,0.009701,0.242554,0.24466


> [0;32m/tmp/ipykernel_1382267/2500649464.py[0m(31)[0;36mcompute_metrics[0;34m()[0m
[0;32m     29 [0;31m    [0;31m# result = Metrics.evaluate_property_wise_text_based(label_list=decoded_labels, prediction_list=decoded_preds)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     30 [0;31m    [0;31m# result.update(Metrics.evaluate_rouge(label_list=decoded_labels, prediction_list=decoded_preds))[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 31 [0;31m    [0;32mreturn[0m [0mresults[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


In [None]:
# !pip install transformers[torch]

In [None]:
tokenizer.save_pretrained(f"{script_args.output_dir}_tokenize")
best_ckpt_path = trainer.state.best_model_checkpoint
print(f"best epoch: {best_ckpt_path}")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# 1. Load the model and tokenizer
model_name = "t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Tokenize the input sequence
input_text = "translate English to French: I love programming"
input_tokens = tokenizer.encode(input_text, return_tensors="pt")

# 3. Generate the output sequence
output_tokens = model.generate(input_tokens)

# 4. Decode the generated output
output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

print(output_text)


In [None]:
decoded_preds

In [None]:
#imports
import pandas as pd
import torch, os
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import numpy as np

from collections import defaultdict
import ipdb
import random

import pytorch_lightning as pl
from torch.optim import AdamW
import argparse
from transformers import (
    get_linear_schedule_with_warmup
  )

from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

from tokenizers import AddedToken

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
    AutoTokenizer, 
    LongT5Model,
    LongT5ForConditionalGeneration
) 




In [None]:
# df["Lenght answer"] = df.answer.apply(lambda x: len(str(x)))
# df["Lenght answer"] = df.answer.apply(lambda x: len(str(x).split()))
# df["Lenght question"] = df.template_question.apply(lambda x: len(x.split()))

# # df["Lenght answer"] = df.answer.apply(lambda x: len(x))
# # df["Lenght question"] = df.template_question.apply(lambda x: len(x))

# df["Lenght answer"] = df.answer.apply(lambda x: len(t5_tokenizer.batch_encode_plus(
#                 [str(x)], 
#                 truncation = False,
#                 return_tensors="pt"
#             )['input_ids'][0]))
# df["Lenght question"] = df.template_question.apply(lambda x: len(t5_tokenizer.batch_encode_plus(
#                 [str(x)], 
#                 truncation = False,
#                 return_tensors="pt"
#             )['input_ids'][0]))

# df.describe()

In [None]:
class QuestionGenerationDataset(Dataset):
    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=96):
        self.path = filepath

        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"
        self.template_question = "template_question"

        # self.data = pd.read_csv(self.path)
        self.data = pd.read_parquet(self.path)

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  #squeeze to get rid of the batch dimension
        target_mask = self.targets[index]["attention_mask"].squeeze()  # convert [batch,dim] to [dim] 

        labels = copy.deepcopy(target_ids)
        labels [labels==0] = -100

        return {
            "source_ids": source_ids, "source_mask": src_mask, 
            "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        for rownum,val in tqdm(self.data.iterrows(), total=len(self.data)): # Iterating over the dataframe
#             passage,answer,target = val[self.passage_column],val[self.answer],val[self.question]
            template_question, answer = val[self.template_question], val[self.answer]

#             input_ = f"context: {passage}  answer: {answer}" # T5 Input format for question answering tasks 
#             target = f"question: {str(target)}" # Output format we require
            
#             input_ = f"context: {passage} question: {str(target)}" # T5 Input format for question answering tasks 
#             target = f"answer: {answer}" # Output format we require
            
#             input_ = f"{str(template_question)} </s>" # T5 Input format for question answering tasks 
#             target = f"{answer} </s>" # Output format we require
            input_ = f"{str(template_question)}" # T5 Input format for question answering tasks 
            target = f"{answer}" # Output format we require
            
            # TODO: Not sure if this is needed as the tokenizer can truncate the output. 
            encoded = t5_tokenizer.batch_encode_plus(
                [input_], 
                truncation = False,
                return_tensors="pt"
            )
            
            if len(encoded['input_ids'][0]) > self.max_len_output:
                continue 
                
#             question_plus = f"answer_me: {str(question)}"
#             question_plus += f" context: {str(context)} </s>"

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], 
                max_length = self.max_len_input,
                padding = 'max_length',
                truncation = True,
                return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], 
                max_length = self.max_len_output,
#                 padding = 'longest',
                padding = 'max_length',
                truncation = True,
#                 truncation = False,
                return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [None]:
# # train_path = 'train_squad.parquet' # change this accordingly
# # validation_path = 'validation_squad.parquet'

# # train_path = 'train_tdm_f1_v1.parquet' # change this accordingly
# # train_path = 'dev_tdm_f1_v1.parquet' # change this accordingly
# # validation_path = 'dev_tdm_f1_v1.parquet'

# # train_path = 'train_tdm_f1_v1_short.parquet'
# train_path = 'data/train_tdm_f1_v2_short.parquet'
# validation_path = 'data/dev_tdm_f1_v2_short.parquet'


# train_dataset = QuestionGenerationDataset(t5_tokenizer, train_path, 
#                                           max_len_inp=max_len_inp, max_len_out=model_max_len_out)
# # validation_dataset = QuestionGenerationDataset(t5_tokenizer, validation_path, 
# #                                                max_len_inp=max_len_inp, max_len_out=model_max_len_out)

# validation_dataset = train_dataset


train_dataset = QuestionGenerationDataset(t5_tokenizer, train_path, 
                                          max_len_inp = max_len_inp, 
                                          max_len_out = model_max_len_out)
validation_dataset = QuestionGenerationDataset(t5_tokenizer, validation_path, 
                                               max_len_inp = max_len_inp, 
                                               max_len_out = model_max_len_out)


In [None]:
# Data Sample

train_sample = train_dataset[100] # thanks to __getitem__
decoded_train_input = t5_tokenizer.decode(train_sample['source_ids'])
decoded_train_output = t5_tokenizer.decode(train_sample['target_ids'])

print(decoded_train_input)
print(decoded_train_output)

In [None]:
# Data Sample

train_sample = train_dataset[100] # thanks to __getitem__
decoded_train_input = t5_tokenizer.decode(train_sample['source_ids'])
decoded_train_output = t5_tokenizer.decode(train_sample['target_ids'])

print(decoded_train_input)
print(decoded_train_output)

In [None]:
print(f"Length training {len(train_dataset)}")
print(f"Length validation {len(validation_dataset)}")

In [None]:
len("[{ 'leaderboard': { 'Dataset': 'Human3.6M', 'Metric': 'Average MPJPE (mm)', 'Task</s>")

# Fine Tuning T5

In [None]:
import pytorch_lightning as pl
from torch.optim import AdamW
import argparse
from transformers import (
    get_linear_schedule_with_warmup
  )

# from transformers import get_linear_schedule_with_warmup, \
# T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

class T5Tuner(pl.LightningModule):
    # 3e-4
    def __init__(self, t5model, t5tokenizer, batchsize=4):
#     def __init__(self, t5model, t5tokenizer, 
#                  lr=5e-5, num_train_epochs=15, warmup_steps=1000, batchsize=4):
#     super(T5Tuner, self).__init__()    
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.batch_size = batchsize
#         self.save_hyperparameters()

    def forward(self, input_ids, attention_mask=None, 
                decoder_attention_mask=None, 
                lm_labels=None):
      
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )
        
        return outputs

    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.batch_size,
                          num_workers=workers)

    def val_dataloader(self):
        return DataLoader(validation_dataset, 
                          batch_size=self.batch_size,
                          num_workers=workers)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer
        
#     def configure_optimizers(self):
#         # create optimizer
#         optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
#         # create learning rate scheduler
#         num_train_optimization_steps = self.hparams.num_train_epochs * len(\
# self.train_dataloader())
#         lr_scheduler = {
# 'scheduler': get_linear_schedule_with_warmup(optimizer,
#                         num_warmup_steps=self.hparams.warmup_steps,
#                         num_training_steps=num_train_optimization_steps),
#                         'name': 'learning_rate',
#                         'interval':'step',
#                         'frequency': 1}
        
#         return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}
    


In [None]:
# device

In [None]:
gpus

In [None]:
early_stop_callback = EarlyStopping(
   monitor='val_loss',
   min_delta=0.00,
   patience=5,
   strict=False,
   verbose=False,
   mode='min'
)

bs = 1


# # for early stopping, 
# # see https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping\
# # .html?highlight=early%20stopping
# early_stop_callback = EarlyStopping(
#     monitor='validation_loss',
#     patience=3,
#     strict=False,
#     verbose=False,
#     mode='min'
# )

# lr_monitor = LearningRateMonitor(logging_interval='step')

model = T5Tuner(t5_model, t5_tokenizer, batchsize=bs)

trainer = pl.Trainer(max_epochs = epochs,
                     accelerator="auto", 
                     devices="auto",
                     # gpus=gpus,
                     strategy='dp',
                     # accelerator='cuda', 
                     # devices=gpus,
                     callbacks=[
                         early_stop_callback, 
#                          lr_monitor
                     ]
                    )

trainer.fit(model)

In [None]:
# torch.cuda.empty_cache() 

In [None]:
# # saving the model
# !mkdir "t5_tokenizer"
# !mkdir "t5_trained_model"
# model.model.save_pretrained('t5_trained_model_QA')
# model.model.save_pretrained('t5_trained_model_QA__')

# t5_tokenizer.save_pretrained('t5_tokenizer_QA')

In [None]:
# model[model_idx].replace("/","_").replace("-","_")

In [None]:
# model.model.save_pretrained(\
# f'../model_ckpt/{model[model_idx].replace("/","_").replace("-","_")}{model_attention[model_idx].replace("-","_")}{size[size_idx].replace("-","_")}_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}_gpu_{num_gpus}')
# f'../model_ckpt/{model_name[model_idx].replace("/","_").replace("-","_")}{model_attention[model_idx].replace("-","_")}{size[size_idx].replace("-","_")}_tokenizer_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}_gpu_{num_gpus}'

# Inference / Predictions

In [None]:
!pwd

In [None]:
# model.model.save_pretrained(f'model_ckpt/{mode}_{model_name[model_idx].replace("/","_").replace("-","_")}{model_attention[model_idx].replace("-","_")}{size[size_idx].replace("-","_")}_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}_gpu_{num_gpus}')
# # t5_tokenizer.save_pretrained(f'{model[model_idx]}{model_attention[model_idx]}{size[size_idx]}_tokenizer_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}')
# t5_tokenizer.save_pretrained(f'model_ckpt/{mode}_{model_name[model_idx].replace("/","_").replace("-","_")}{model_attention[model_idx].replace("-","_")}{size[size_idx].replace("-","_")}_tokenizer_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}_gpu_{num_gpus}')


In [None]:
# trained_model_path = f'Longt5_trained_tdms_model_QA_bs_{bs}_epochs_{epochs}_model_max_length_{model_max_length}_max_len_inp_{max_len_inp}_model_max_len_out_{model_max_len_out}'
# trained_tokenizer = f'Longt5_tokenizer_tdms_QA_bs_{bs}_epochs_{epochs}_model_max_length_{model_max_length}_max_len_inp_{max_len_inp}_model_max_len_out_{model_max_len_out}'

# trained_model_path = f'./Longt5_trained_tdms_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}'
# trained_tokenizer = f'./Longt5_tokenizer_tdms_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}'

trained_model_path = f'../model_ckpt/{mode}_{model_name[model_idx].replace("/","_").replace("-","_")}{model_attention[model_idx].replace("-","_")}{size[size_idx].replace("-","_")}_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}_gpu_{num_gpus}'
trained_tokenizer = f'../model_ckpt/{mode}_{model_name[model_idx].replace("/","_").replace("-","_")}{model_attention[model_idx].replace("-","_")}{size[size_idx].replace("-","_")}_tokenizer_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}_gpu_{num_gpus}'

# trained_model_path = f'../model_ckpt/{model[model_idx].replace("/","_").replace("-","_")}{model_attention[model_idx].replace("-","_")}{size[size_idx].replace("-","_")}_{bs}_{epochs}_{model_max_length}_{max_len_inp}_{model_max_len_out}_gpu_{num_gpus}'
# trained_tokenizer = f'./Longt5_tokenizer_tdms_6_5_512_400_112'

# device  = 'cuda' if torch.cuda.is_available() else "cpu"
device = "cpu"

In [None]:
model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer)

In [None]:
# context ="President Donald Trump said and predicted that some states would reopen this month."
# # answer = "Donald Trump"
# question = "Who is the pre"
# text = "context: "+context + " " + "answer: " + answer
# print(text)

# Data Sample Validation 

# dev_sample = validation_dataset[10] # thanks to __getitem__
dev_sample = train_dataset[100] # thanks to __getitem__


decoded_dev_input = t5_tokenizer.decode(dev_sample['source_ids'])
decoded_dev_output = t5_tokenizer.decode(dev_sample['target_ids'])

print(decoded_dev_input)
print(decoded_dev_output)

In [None]:
# context ="Since its topping out in 2013, One World Trade Center in New York City has been the tallest skyscraper in the United States."
# # answer = "World Trade Center"
# # question = "When did the One World Trade Center became the tallest skyscraper"
# question = "Where is the world trade center ?"
# text = "context: "+context + " " + "answer: " + question
# # text = "context: "+context + " " + "answer: " + answer

# print(text)

In [None]:
encoding = tokenizer.encode_plus(decoded_dev_input,max_length =512,padding='max_length', 
                                 truncation = True,
                                 return_tensors="pt")#.to(device)
print (encoding.keys())
# input_ids,attention_mask  = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
input_ids,attention_mask  = encoding["input_ids"], encoding["attention_mask"]

In [None]:
model.eval()
beam_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=model_max_len_out, # How long the generated questions should be
    early_stopping=True,
    num_beams=10,
    num_return_sequences=1
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print(sent)