In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, EarlyStoppingCallback
import pandas as pd
import torch
import csv
import datetime
from transformers.integrations import TensorBoardCallback

# Timestamp for this run, rendered as YYYYMMDD_HHMMSS
now = datetime.datetime.now()
timestamp = f"{now:%Y%m%d_%H%M%S}"

# Pre-trained T5 checkpoint: the tokenizer and the model are loaded from the same name
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load the custom vocabulary from the CSV file: one token per row, taken from
# the first column, with the first row treated as a header.
vocab_path = '../../data/custom_vocab.csv'
# newline='' is the csv module's documented way to open files; the explicit
# encoding keeps the read independent of the platform's default locale.
with open(vocab_path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header (a completely empty file is tolerated)
    # Blank lines come back from csv.reader as [] and would raise IndexError on
    # row[0]; an empty first cell is not a usable token, so both are skipped.
    custom_vocab = [row[0] for row in reader if row and row[0]]

# Add essential tokens to ensure tokenizer functionality
essential_tokens = ['[PAD]', '[EOS]', '[SEP]', '[CLS]', '[MASK]']

# Remove duplicates while keeping first-seen order. A set would order the
# tokens by string hash, which is randomized per interpreter run, so the IDs
# assigned to the added tokens would differ from one run to the next.
full_vocab = list(dict.fromkeys(essential_tokens + custom_vocab))

# Extend the pre-trained tokenizer's vocabulary with the custom tokens
# (the existing vocabulary is kept; these tokens are added on top of it).
tokenizer.add_tokens(full_vocab)

# Register '[PAD]' and '[EOS]' as the tokenizer's pad/eos special tokens
special_tokens_dict = {'pad_token': '[PAD]', 'eos_token': '[EOS]'}
tokenizer.add_special_tokens(special_tokens_dict)

# Resize the embedding matrix only after *all* tokens have been added, so the
# model's embedding table always matches the final tokenizer length.
model.resize_token_embeddings(len(tokenizer))

# Load the training data
df = pd.read_csv('../../data/full_dataset.csv')

# Filter out rows with empty 'dsl' values
df = df[df['dsl'].notna()]

train_texts = df['text'].tolist()
train_labels = df['dsl'].tolist()
# Model input is "<label> <text>". Built column-wise with zip rather than a
# row-wise DataFrame.apply: the formatting is the same f-string, but it avoids
# creating a Series per row and still yields a plain (possibly empty) list
# when no rows survive the filter above.
train_texts_with_concat = [
    f"{label} {text}" for label, text in zip(df['label'], df['text'])
]

# Preview up to the first 10 examples; the bound is clamped so a dataset with
# fewer than 10 rows does not raise IndexError.
for i in range(min(10, len(train_texts))):
    print(f"Text: {train_texts[i]}")
    print(f"Label: {train_labels[i]}")
    print(f"Text with label: {train_texts_with_concat[i]}")
    print()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Text: Do the laundry, but first wash the car
Label: car.wash(), laundry_room.clean(laundry)
Text with label: REVERSE Do the laundry, but first wash the car

Text: Clean the kitchen, then empty the trash
Label: kitchen.clean(), trash.empty()
Text with label: FIRST Clean the kitchen, then empty the trash

Text: Water the plants and then vacuum the living room
Label: garden.water(plants), living_room.vacuum()
Text with label: FIRST Water the plants and then vacuum the living room

Text: Take out the trash, but first clean the kitchen sink
Label: kitchen.clean(sink), trash_can.empty()
Text with label: REVERSE Take out the trash, but first clean the kitchen sink

Text: Clean the kitchen. No, do the laundry instead
Label: laundry_room.clean(laundry)
Text with label: REPLACE Clean the kitchen. No, do the laundry instead

Text: Start the dishwasher. Nevermind
Label: EMPTY
Text with label: CLEAR Start the dishwasher. Nevermind

Text: Cook dinner, then set the table, and finally clean the dishes