# Fine-tuning a BERT (Bidirectional Encoder Representations from Transformers) model for downstream tasks like intent detection and Named Entity Recognition (NER)

In [1]:
pip install transformers datasets torch accelerate scikit-learn seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB

Lets Start finetuning BERT for Intent Detection

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForTokenClassification

# For Intent Detection
model_name_intent = "bert-base-uncased"  # Choose your desired BERT variant
tokenizer_intent = AutoTokenizer.from_pretrained(model_name_intent)

# Define the number of unique intents
# Based on the global variables, it seems num_labels might be intended for this.
# Replace with your actual number of intents if different.
num_intents = num_labels # Using the existing global variable if it represents the number of intents

model_intent = AutoModelForSequenceClassification.from_pretrained(model_name_intent, num_labels=num_intents) # num_labels is the number of unique intents

# For NER
model_name_ner = "bert-base-uncased"  # Choose your desired BERT variant
tokenizer_ner = AutoTokenizer.from_pretrained(model_name_ner)

# Define the number of unique NER tags
# You need to define this based on your specific NER dataset
num_ner_tags = 10 # Replace with the actual number of unique NER tags

model_ner = AutoModelForTokenClassification.from_pretrained(model_name_ner, num_labels=num_ner_tags) # num_ner_tags is the number of unique NER tags

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def preprocess_intent_data(examples):
    return tokenizer_intent(examples["text"], truncation=True, padding="max_length")

def preprocess_ner_data(examples):
    tokenized_inputs = tokenizer_ner(examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length")
    labels = []
    for i, tokens in enumerate(examples["tokens"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[examples["ner_tags"][i][word_idx]])
            else:
                label_ids.append(-100)  # Continuation of the same word
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from seqeval.metrics import classification_report as ner_classification_report
from seqeval.metrics import f1_score as ner_f1_score
import numpy as np # Import numpy

# For Intent Detection
model_name_intent = "bert-base-uncased"  # Choose your desired BERT variant
tokenizer_intent = AutoTokenizer.from_pretrained(model_name_intent)

# Define the number of unique intents
# Based on the global variables, it seems num_labels might be intended for this.
# Replace with your actual number of intents if different.
# Ensure num_labels is defined before this section if it's not a global variable you control
# For example, if you have a dataset object `dataset`, you might get it from there:
# num_intents = dataset['train'].features['label'].num_classes
# Assuming `num_labels` is available from a previous cell as per global variables
num_intents = num_labels # Using the existing global variable if it represents the number of intents

model_intent = AutoModelForSequenceClassification.from_pretrained(model_name_intent, num_labels=num_intents) # num_labels is the number of unique intents

# For NER
model_name_ner = "bert-base-uncased"  # Choose your desired BERT variant
tokenizer_ner = AutoTokenizer.from_pretrained(model_name_ner)

# Define the number of unique NER tags
# You need to define this based on your specific NER dataset
# Assuming `label_names` is available from a previous cell as per global variables
num_ner_tags = len(label_names) # Using the length of the existing label_names for the number of NER tags

model_ner = AutoModelForTokenClassification.from_pretrained(model_name_ner, num_labels=num_ner_tags) # num_ner_tags is the number of unique NER tags


def preprocess_intent_data(examples):
    return tokenizer_intent(examples["text"], truncation=True, padding="max_length")

def preprocess_ner_data(examples):
    tokenized_inputs = tokenizer_ner(examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length")
    labels = []
    for i, tokens in enumerate(examples["tokens"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                # Assuming label_to_id is defined and maps tag strings to integer IDs
                label_ids.append(label_to_id[examples["ner_tags"][i][word_idx]])
            else:
                label_ids.append(-100)  # Continuation of the same word
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# For Intent Detection
def compute_metrics_intent(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

training_args_intent = TrainingArguments(
    output_dir="./intent_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    eval_strategy="epoch", # Changed from evaluation_strategy
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

# Ensure tokenized_train_dataset_intent and tokenized_val_dataset_intent are defined
# For example:
# from datasets import load_dataset
# raw_dataset_intent = load_dataset(...) # Load your intent dataset
# tokenized_dataset_intent = raw_dataset_intent.map(preprocess_intent_data, batched=True)
# tokenized_train_dataset_intent = tokenized_dataset_intent["train"]
# tokenized_val_dataset_intent = tokenized_dataset_intent["validation"]


# trainer_intent = Trainer(
#     model=model_intent,
#     args=training_args_intent,
#     train_dataset=tokenized_train_dataset_intent,
#     eval_dataset=tokenized_val_dataset_intent,
#     compute_metrics=compute_metrics_intent,
#     tokenizer=tokenizer_intent,
# )

# trainer_intent.train()
# trainer_intent.evaluate()
# trainer_intent.save_model("./best_intent_model")

# For NER
def compute_metrics_ner(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Ensure label_names is a list of string labels
    # For example:
    # label_names = ["O", "B-PER", "I-PER", ...]

    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = ner_classification_report(true_labels, true_predictions, digits=4)
    return {"f1": ner_f1_score(true_labels, true_predictions), "report": results}

training_args_ner = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    eval_strategy="epoch", # Changed from evaluation_strategy
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

# Ensure tokenized_train_dataset_ner and tokenized_val_dataset_ner are defined
# For example:
# from datasets import load_dataset
# raw_dataset_ner = load_dataset(...) # Load your NER dataset
# tokenized_dataset_ner = raw_dataset_ner.map(preprocess_ner_data, batched=True)
# tokenized_train_dataset_ner = tokenized_dataset_ner["train"]
# tokenized_val_dataset_ner = tokenized_dataset_ner["validation"]


# trainer_ner = Trainer(
#     model=model_ner,
#     args=training_args_ner,
#     train_dataset=tokenized_train_dataset_ner,
#     eval_dataset=tokenized_val_dataset_ner,
#     compute_metrics=compute_metrics_ner,
#     tokenizer=tokenizer_ner,
#     data_collator=DataCollatorForTokenClassification(tokenizer_ner), # Important for NER
# )

# trainer_ner.train()
# trainer_ner.evaluate()
# trainer_ner.save_model("./best_ner_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForTokenClassification

# For Intent Detection - Loading from local directory
model_path_intent = "./best_intent_model"  # Make sure this path is correct
try:
    tokenizer_intent = AutoTokenizer.from_pretrained(model_path_intent, local_files_only=True)
    model_intent = AutoModelForSequenceClassification.from_pretrained(model_path_intent, local_files_only=True)
    intent_pipeline = pipeline(
        task="text-classification",
        model=model_intent,
        tokenizer=tokenizer_intent
    )
    result_intent = intent_pipeline("What's the weather like in Bangalore?")
    print("Intent Detection Result:", result_intent)
except Exception as e:
    print(f"Error loading intent model: {e}")

# For NER - Loading from local directory
model_path_ner = "./best_ner_model"  # Make sure this path is correct
try:
    tokenizer_ner = AutoTokenizer.from_pretrained(model_path_ner, local_files_only=True)
    model_ner = AutoModelForTokenClassification.from_pretrained(model_path_ner, local_files_only=True)
    ner_pipeline = pipeline(
        task="ner",
        model=model_ner,
        tokenizer=tokenizer_ner,
        aggregation_strategy="simple"
    )
    result_ner = ner_pipeline("Sundar Pichai is the CEO of Google based in Mountain View, California.")
    print("NER Result:", result_ner)
except Exception as e:
    print(f"Error loading NER model: {e}")

Error loading intent model: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './best_intent_model'.
Error loading NER model: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './best_ner_model'.
