In [2]:
from dataset import GOOGLE, GOODBYE, CHAT, VISION
# Concatenate the four per-intent example collections into one dataset.
# NOTE(review): a later cell reads each element as item[0] (text) and
# item[1] (label name), so every entry is presumably a (text, label) pair
# -- confirm against dataset.py.
data = GOOGLE + GOODBYE + CHAT + VISION

# Sanity check: total number of examples across all four sources.
print(len(data))

402


In [14]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

class CustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    `encodings` is a mapping from field name to a per-example sequence
    (anything exposing `.items()` whose values can be indexed by example
    position); `labels` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        # Stored as-is; tensors are only built lazily in __getitem__.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Load the tokenizer and the model
# num_labels=4 gives the classification head one output per entry in
# label_map below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Prepare the dataset
# `data` is built in an earlier cell; each element is read as
# (text, label_name) -- presumably a 2-tuple, confirm against dataset.py.
texts = [item[0] for item in data]
labels = [item[1] for item in data]
# Label name -> class index. A KeyError on the next line means `data` holds a
# label name outside these four. The inference cell hard-codes the inverse of
# this mapping inside command_filter, so the two must be kept in sync.
label_map = {'vision': 0, 'chat': 1, 'goodbye': 2, 'google': 3}
labels = [label_map[label] for label in labels]

# Split the dataset into training and validation sets
# 20% of the examples are held out; random_state pins the shuffle so the split
# is repeatable. NOTE(review): no `stratify=` is passed, so per-class
# proportions in the two splits are not enforced.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize the text
# Each split is tokenized in its own call, so padding=True pads each split to
# its own longest sequence; the two splits may end up with different lengths.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Create the custom dataset
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# Create the Trainer
# Evaluation and checkpointing both run once per epoch; save_total_limit=1
# caps how many checkpoints are retained under output_dir.
training_args = TrainingArguments(
    output_dir='../models',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    learning_rate=5e-5,
    save_total_limit=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save a checkpoint at the end of each epoch
)


# NOTE(review): no compute_metrics callback is supplied, so per-epoch
# evaluation presumably reports loss only -- confirm if accuracy is wanted.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()


In [15]:
from transformers import pipeline

# Load the fine-tuned model
# NOTE(review): this checkpoint directory is hard-coded. The training cell only
# sets output_dir='../models', so 'cd_CKPT_IV' was presumably created or renamed
# by hand -- confirm it exists before running this cell on a fresh checkout.
model_path = '../models/cd_CKPT_IV'
# The tokenizer comes from the base 'bert-base-uncased' name (the same one used
# to encode the training data), not from the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# `classifier` is read as a module-level global by command_filter below.
classifier = pipeline('text-classification', model=model_path, tokenizer=tokenizer)

# Class index -> command name; the inverse of the label_map used for training.
_COMMAND_BY_ID = {0: 'vision', 1: 'chat', 2: 'goodbye', 3: 'google'}


def command_filter(prompt):
    """Classify `prompt` and return the command it should be routed to.

    Relies on the module-level `classifier` text-classification pipeline.

    Args:
        prompt: The user utterance to classify.

    Returns:
        One of 'vision', 'chat', 'goodbye' or 'google'.

    Raises:
        ValueError: If the predicted label is neither a known command name
            nor of the form ``<prefix>_<int>``.
        KeyError: If the parsed class index is not one of the four known ids.
    """
    # Classify the input prompt; the top prediction is the first result entry.
    result = classifier(prompt)
    label = result[0]['label']

    # A checkpoint whose config carries human-readable id2label names emits
    # the command name directly; accept it instead of failing in int().
    if label in _COMMAND_BY_ID.values():
        return label

    # Otherwise expect the generic "LABEL_<index>" form and map the index.
    command_id = int(label.split('_')[-1])
    return _COMMAND_BY_ID[command_id]
    
# Example prompts: two per intent, used as a quick smoke test of the classifier.
prompts = [
    "Hello there!",
    "I'd like you to tell me about powerlifting",
    "Can you see me?",
    "What do you see in this image?",
    "See you tomorrow!",
    "Goodbye GPT",
    "What is a compiled programing language?",
    "How many calories does Ultra White Monster Energy have?",
]

# Show each prompt next to the command it gets routed to.
for prompt in prompts:
    print(f'{prompt} : {command_filter(prompt)}')


Hello there! : chat
I'd like you to tell me about powerlifting : chat
Can you see me? : vision
What do you see in this image? : vision
See you tomorrow! : goodbye
Goodbye GPT : goodbye
What is a compiled programing language? : google
How many calories does Ultra White Monster Energy have? : google
