In [2]:
import json
import torch
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [3]:
# Read JSON data into pandas dataframe
with open('data.json', 'r') as f:
    data = json.load(f)

# Convert the list of definitions into a dataframe
definitions = data['definitions']
dataframe = pd.DataFrame(definitions)

In [4]:
# EDA: Check the structure and summary of the dataframe
print(dataframe.info())
print(dataframe.describe())
print(dataframe.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1962 entries, 0 to 1961
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   term        1962 non-null   object
 1   definition  1962 non-null   object
 2   imgURL      20 non-null     object
 3   altText     20 non-null     object
dtypes: object(4)
memory usage: 61.4+ KB
None
                    term                       definition  \
count               1962                             1962   
unique              1962                             1935   
top     24-hour coverage  To get better after being sick.   
freq                   1                                2   

                                imgURL  \
count                               20   
unique                              19   
top     ./images/2308a_The_Trachea.jpg   
freq                                 2   

                                                  altText  
count                          

In [5]:
train_df, val_df = train_test_split(dataframe, test_size=0.1)

In [6]:
trainDS = Dataset.from_pandas(train_df)
valDS = Dataset.from_pandas(val_df)

In [7]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
def tokenize_function(examples):
    term = " ".join(examples['term']) if isinstance(examples['term'], list) else examples['term']
    inputTexts = [term for term in examples['term']]
    targetTexts = examples['definition']

    modelInputs = tokenizer(inputTexts, padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(targetTexts, padding="max_length", truncation=True, max_length=128)

    modelInputs['labels'] = labels['input_ids']
    return modelInputs

In [9]:
trainDS = trainDS.map(tokenize_function, batched=True)
valDS = valDS.map(tokenize_function, batched=True)

Map: 100%|██████████| 1765/1765 [00:00<00:00, 3373.62 examples/s]
Map: 100%|██████████| 197/197 [00:00<00:00, 6298.95 examples/s]


In [10]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [11]:
trainingArgs = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [12]:
trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDS,
    eval_dataset=valDS
)


In [13]:
trainer.train()

  0%|          | 0/1110 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  1%|          | 10/1110 [06:43<7:56:02, 25.97s/it]

{'loss': 12.4278, 'grad_norm': 103.59442901611328, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.09}


  2%|▏         | 20/1110 [13:50<8:24:08, 27.75s/it] 

{'loss': 12.8668, 'grad_norm': 52.654869079589844, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.18}


  3%|▎         | 30/1110 [15:38<3:45:19, 12.52s/it]

{'loss': 12.644, 'grad_norm': 65.02728271484375, 'learning_rate': 3e-06, 'epoch': 0.27}


  4%|▎         | 40/1110 [18:12<4:11:40, 14.11s/it]

{'loss': 12.304, 'grad_norm': 55.18180465698242, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.36}


  5%|▍         | 50/1110 [19:51<2:32:36,  8.64s/it]

{'loss': 12.0799, 'grad_norm': 45.56892776489258, 'learning_rate': 5e-06, 'epoch': 0.45}


  5%|▌         | 60/1110 [31:45<14:29:14, 49.67s/it] 

{'loss': 11.7848, 'grad_norm': 56.29859924316406, 'learning_rate': 6e-06, 'epoch': 0.54}


In [None]:
results = trainer.evaluate()
print(results)

In [None]:
def generate_model(term):
    inputText = f"define: {term}"
    inputID = tokenizer.encode(input_text, return_tensors="pt")

    model.eval()
    with torch.no_grad():
        output = model.generate(input_ids, max=128)

        predictedDef = tokenizer.decode(output[0], skip_special_tokens=True)
        return predictedDef

In [None]:
term_to_define = "coronary artery bypass graft"
predictedDef = generate_model(term_to_define)
print(f"Definition of '{term_to_define}': {predictedDef}")