In [1]:
# Class index -> destination name. This is the single source of truth for the
# label set; the integer keys are the class ids the classifier is trained on.
id2label = {
    0: "Bathroom",
    1: "Elevator",
    2: "Classroom",
    3: "Dean's office",
    4: "Library",
    5: "Starbucks",
    6: "Dose",
    7: "Coffee shop",
    8: "Jamoka",
    9: "Segafredo",
    10: "Robotics Lab",
    11: "Prince Turki Center",
    12: "Sports Center",
}

# Destination name -> class index, derived from id2label so the two mappings
# cannot drift apart when a destination is added or renamed.
label2id = {label: idx for idx, label in id2label.items()}

# A duplicated name would silently collapse two classes into one id.
assert len(label2id) == len(id2label), "duplicate label names in id2label"

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd

# Read the labelled destination sentences from CSV.
# NOTE(review): this is an absolute, machine-specific path; consider pointing
# it at a configurable data directory before sharing the notebook.
your_dataset = pd.read_csv('/Users/lara/Desktop/Dash-NLP/destinations.csv')

# Hold out 20% of the rows for validation. The fixed random_state keeps the
# split identical from run to run.
train_data, val_data = train_test_split(
    your_dataset,
    test_size=0.2,
    random_state=42,
)

# Define a custom dataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes one sentence per access and maps its string label to an id.

    Args:
        texts: Sentences, indexed positionally through ``.iloc`` (e.g. a
            pandas Series).
        labels: String class names aligned position-by-position with
            ``texts``; also accessed through ``.iloc``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is truncated/padded to.
        label_map: Optional mapping from class name to integer id. When
            omitted, the module-level ``label2id`` is used, which matches the
            previous behaviour of this class.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, label_map=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Resolve the mapping once, up front, so a missing global is reported
        # at construction time instead of on the first batch.
        self.label_map = label2id if label_map is None else label_map

    def __len__(self):
        """Return the number of examples (one per entry in ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``torch.long`` tensor under ``'labels'``.

        Raises:
            KeyError: if the label at ``idx`` is not present in the label map.
        """
        text = str(self.texts.iloc[idx])
        label_str = str(self.labels.iloc[idx])  # string label as stored in the data

        try:
            label = self.label_map[label_str]
        except KeyError:
            # Same exception type as a plain dict lookup, but say which row
            # and which label caused it.
            raise KeyError(
                f"Unknown class label {label_str!r} at position {idx}; "
                f"known labels: {sorted(self.label_map)}"
            ) from None

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension of size 1;
        # flatten() drops it so each field is a 1-D tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load pre-trained BERT model and tokenizer.
# id2label/label2id are stored in the model config, so the checkpoint written
# later with save_pretrained carries the destination names with it instead of
# generic LABEL_<n> placeholders.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the training and validation splits: sentences come from the 'sentence'
# column, string class names from the 'class' column.
train_dataset = CustomDataset(
    texts=train_data['sentence'],
    labels=train_data['class'],
    tokenizer=tokenizer,
)
val_dataset = CustomDataset(
    texts=val_data['sentence'],
    labels=val_data['class'],
    tokenizer=tokenizer,
)

# Hyperparameters and output locations for the fine-tuning run
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # schedule
    num_train_epochs=10,
    warmup_steps=500,
    # batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # regularization
    weight_decay=0.01,
)

# Bind the model, the hyperparameters and both datasets into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning
trainer.train()

# Score the fine-tuned model on the validation set and show the metrics dict
results = trainer.evaluate()
print(results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.8814


{'eval_loss': 0.0029572637286037207, 'eval_runtime': 2.8666, 'eval_samples_per_second': 43.606, 'eval_steps_per_second': 5.582, 'epoch': 10.0}


In [4]:
# Persist the fine-tuned weights/config, then the tokenizer in a 'tokenizer'
# subdirectory of the same folder.
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model/tokenizer')

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Restore the tokenizer and the fine-tuned classifier from disk.
# The tokenizer sits in the 'tokenizer' subdirectory of the model directory.
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model/tokenizer')
model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')

# Function to get predictions for a list of sentences
def predict_class(sentences):
    """Predict a class index for each input sentence.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentences: A single string or a list of strings.

    Returns:
        list[int]: One predicted class index per sentence. A single string
        yields a one-element list; an empty list/tuple yields ``[]``.
    """
    # Nothing to encode; return early instead of handing the tokenizer an
    # empty batch.
    if isinstance(sentences, (list, tuple)) and not sentences:
        return []

    # Make sure dropout is disabled, whatever mode the model was left in.
    model.eval()

    inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors='pt')
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax is monotonic, so the argmax of the logits is the argmax of the
    # probabilities; no need to normalise first.
    return torch.argmax(logits, dim=1).tolist()

# Try the classifier on a handful of requests
new_sentences = [
    "take me to the dean's office please.",
    "can you please take me to the bathroom?",
    "where's the elevator?",
    "can you take me to classroom.",
]
predicted_classes = predict_class(new_sentences)

# Translate the predicted class indices back into class names via id2label
predicted_class_labels = []
for class_idx in predicted_classes:
    predicted_class_labels.append(id2label[class_idx])

# Show each sentence next to its predicted class
for sentence, predicted_label in zip(new_sentences, predicted_class_labels):
    print(f"Sentence: {sentence}\t Predicted Class: {predicted_label}")


Sentence: take me to the dean's office please.	 Predicted Class: Dean's office
Sentence: can you please take me to the bathroom?	 Predicted Class: Bathroom
Sentence: where's the elevator?	 Predicted Class: Elevator
Sentence: can you take me to classroom.	 Predicted Class: Classroom


In [8]:
# Classify a single sentence; predict_class still returns a list (one entry here)
new_sentence = "where's the closest coffee shop from here?"
pclass = predict_class(new_sentence)

# Map the predicted index to its class name
pclasslabel = []
for class_idx in pclass:
    pclasslabel.append(id2label[class_idx])
print(pclasslabel)

['Coffee shop']
