In [55]:
import torch
import pandas as pd
import joblib
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [56]:
merged_df = pd.read_csv(r"data/merged_dataset.csv")

In [57]:
# Step 2: Tokenization
print("Initializing tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained("bert_chatbot_tokenization")  # Save tokenizer for reuse

Initializing tokenizer...


('bert_chatbot_tokenization\\tokenizer_config.json',
 'bert_chatbot_tokenization\\special_tokens_map.json',
 'bert_chatbot_tokenization\\vocab.txt',
 'bert_chatbot_tokenization\\added_tokens.json')

In [58]:
# Step 3: Encode text and labels
# Ensure all values in X are strings and handle missing values
X = merged_df['Pattern'].astype(str).fillna("").tolist()

y = merged_df['Response'].tolist()

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encode.pkl")

['label_encode.pkl']

In [18]:
X_encoded = tokenizer(X, truncation=True, padding='max_length', max_length=128, return_tensors="pt")


In [19]:
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded["input_ids"], y_encoded, test_size=test_size, random_state=42, shuffle=True
)


In [20]:
# Convert to PyTorch tensors
train_encodings = {"input_ids": X_train}
test_encodings = {"input_ids": X_test}
y_train_tensor = torch.tensor(y_train)
y_test_tensor = torch.tensor(y_test)

In [21]:
# Custom dataset class
class ChatDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = ChatDataset(train_encodings, y_train_tensor)
test_dataset = ChatDataset(test_encodings, y_test_tensor)

In [37]:
# Load pre-trained BERT model for classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(y_encoded)))

# Training arguments with optimizations
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,  # Increased batch size for efficiency
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # More epochs for better training
    logging_dir="./logs",
    learning_rate=3e-5,  # Fine-tuned learning rate
    weight_decay=0.01,
    load_best_model_at_end=True,  # Load best model for better performance
)

# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss
1,No log,7.664407
2,No log,7.548858
3,7.557700,7.643505
4,7.557700,7.668167
5,7.557700,7.664006


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


TrainOutput(global_step=855, training_loss=7.42657592059576, metrics={'train_runtime': 4723.8465, 'train_samples_per_second': 2.89, 'train_steps_per_second': 0.181, 'total_flos': 915416537817600.0, 'train_loss': 7.42657592059576, 'epoch': 5.0})

In [38]:
# Save model and tokenizer
model.save_pretrained("bert_chatbot")
tokenizer.save_pretrained("bert_chatbot")

# Evaluate model
trainer.evaluate()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


{'eval_loss': 7.548858165740967,
 'eval_runtime': 49.7609,
 'eval_samples_per_second': 13.726,
 'eval_steps_per_second': 0.864,
 'epoch': 5.0}