In [None]:
import os
import time
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import  Adafactor
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training pairs; the first CSV column becomes the row index.
train_df = pd.read_csv('../data/training_paths.csv', index_col=[0])

In [None]:
# Preview the first five rows of the loaded frame.
train_df.head(n=5)

In [None]:
# Training configuration.
batch_size = 8
num_of_epochs = 50

# Only the first 2000 rows (or fewer, if the CSV is smaller) are used for training.
train_set = train_df.iloc[:2000, :]

# Number of full batches per epoch. This is derived from the rows that are
# actually trained on (train_set) rather than from the whole of train_df, so
# the per-epoch loss average and the progress display are not skewed when the
# CSV holds more than 2000 rows. A trailing partial batch is not counted.
num_of_batches = len(train_set) // batch_size

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if dev.type == "cuda" else "Running on the CPU")

In [None]:
%%capture
tokenizer = T5Tokenizer.from_pretrained('t5-base') # t5-base
model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True) # t5-base
# moving the model to device(GPU/CPU)
model.to(dev)

In [None]:
# Adafactor over all model parameters with a hand-set learning rate.
optimizer = Adafactor(
    model.parameters(),
    # Explicit step size; the three switches that follow are all turned off.
    lr=1e-3,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
    # Remaining hyperparameters.
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
)

In [None]:
# Fine-tuning loop: for every epoch, walk over `train_set` in consecutive
# mini-batches, tokenize the (input_text, target) pairs and take one optimizer
# step per batch.
model.train()  # training mode; text_inference() switches the model to eval mode

# Batch loss sampled on every 10th step, accumulated across all epochs.
loss_per_10_steps = []

# Batch boundaries are taken from the rows actually trained on, so no empty
# slices are visited and the epoch average is divided by the true batch count.
batch_starts = range(0, len(train_set), batch_size)
total_batches = len(batch_starts)

for epoch in range(1, num_of_epochs + 1):
    print("Running epoch: {}".format(epoch))

    running_loss = 0.0

    # NOTE(review): `progress` is not defined in the cells visible here; it is
    # assumed to be a helper defined elsewhere that returns a displayable
    # object -- confirm it exists before running on a fresh kernel.
    out = display(progress(1, total_batches + 1), display_id=True)
    for step, start in enumerate(batch_starts):
        batch_df = train_set.iloc[start : start + batch_size]
        source_texts = [text + "</s>" for text in batch_df["input_text"]]
        target_texts = [text + "</s>" for text in batch_df["target"]]

        # truncation=True is required for max_length to take effect.
        encoded_inputs = tokenizer(
            source_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        encoded_targets = tokenizer(
            target_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

        input_ids = encoded_inputs["input_ids"].to(dev)
        # Lets the encoder ignore the padding positions of shorter inputs.
        attention_mask = encoded_inputs["attention_mask"].to(dev)

        # Padding positions in the targets are set to -100 so that the loss
        # ignores them instead of training the model to emit padding.
        labels = encoded_targets["input_ids"]
        labels[labels == tokenizer.pad_token_id] = -100
        labels = labels.to(dev)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass; the model computes the loss itself when given labels.
        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = outputs.loss
        loss_num = loss.item()
        running_loss += loss_num
        if step % 10 == 0:
            loss_per_10_steps.append(loss_num)
        out.update(progress(loss_num, step, total_batches + 1))

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    # Mean batch loss over the batches that were actually run this epoch.
    running_loss = running_loss / total_batches if total_batches else float("nan")
    print("Epoch: {} , Running loss: {}".format(epoch, running_loss))

In [None]:
def text_inference(model, keywords):
    """Generate text for ``keywords`` with the given sequence-to-sequence model.

    Uses the notebook-level ``tokenizer`` and ``dev`` globals.

    Args:
        model: Model exposing ``eval()`` and ``generate()``. It is switched to
            eval mode and left there; call ``model.train()`` before resuming
            training.
        keywords: Input string. ``"</s>"`` is appended before encoding and the
            encoded input is truncated to at most 512 tokens.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    model.eval()
    input_ids = tokenizer.encode(
        keywords + "</s>", max_length=512, truncation=True, return_tensors="pt"
    ).to(dev)
    outputs = model.generate(input_ids, max_length=1024)
    # Let the tokenizer drop the special tokens rather than slicing off a fixed
    # number of characters: a fixed slice removes real text whenever the output
    # does not end in "</s>" (e.g. generation stopped at max_length).
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# torch.save(model, 't5-model-finetuned-50-epochs.bin')