In [1]:
!pip install transformers datasets torch



In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load the dataset (replace with your local dataset path)
df = pd.read_csv("Query_Classification.csv")

# Rename the columns if necessary to match the required format
df = df.rename(columns={"query": "text", "intent": "label"})

# Map string labels to integers
label_mapping = {label: idx for idx, label in enumerate(df['label'].unique())}
df['label'] = df['label'].map(label_mapping)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into train and test sets
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_mapping))

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unused columns
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])

# Set the format for PyTorch
tokenized_train_dataset.set_format("torch")
tokenized_test_dataset.set_format("torch")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("fine_tuned_bert")
tokenizer.save_pretrained("fine_tuned_bert")





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/852 [00:00<?, ? examples/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.700637
2,No log,0.121171
3,No log,0.062751


('fine_tuned_bert\\tokenizer_config.json',
 'fine_tuned_bert\\special_tokens_map.json',
 'fine_tuned_bert\\vocab.txt',
 'fine_tuned_bert\\added_tokens.json')