# **Finetune BERT Model For Car Sales NER Classification**

In [16]:
pip install huggingface_hub



In [18]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16165 sha256=952e83e9066a4f7fd6cfa40d9198d0496d7fb443114de8adf3372424741bc45f
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [8]:
!pip install transformers torch



In [1]:
import os
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, AdamW, Trainer, TrainingArguments

In [3]:
import os

def clean_data(file_path, output_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    filtered_lines = []
    for line in lines:
        if not line.strip().startswith("```") and line.strip() and not line.startswith("Sample Sentences"):
            filtered_lines.append(line)

    with open(output_path, 'w') as file:
        file.writelines(filtered_lines)

def process_directory(input_directory, output_directory):
    # Ensure output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for filename in os.listdir(input_directory):
        if filename.endswith(".txt"):  # Adjust if you have other file types
            input_file_path = os.path.join(input_directory, filename)
            output_file_path = os.path.join(output_directory, f"cleaned_{filename}")

            clean_data(input_file_path, output_file_path)
            print(f"Processed file: {filename}")

# Use the function
input_dir = '/'  # Replace with your input directory path
output_dir = 'cleaned_files'  # Replace with your desired output directory path

process_directory(input_dir, output_dir)


In [5]:
def read_data_from_files(file_paths):
    sentences = []
    labels = []

    for file_path in file_paths:
        with open(file_path, 'r') as file:
            lines = file.read().strip().split('\n')

        sentence = []
        label = []

        for line in lines:
            # Remove any extra whitespace
            line = line.strip()
            if not line:
                continue

            parts = line.split()
            if len(parts) == 2:
                token, tag = parts
                sentence.append(token)
                label.append(tag)
            elif len(parts) > 2:
                # Handle cases where extra information might be present
                token_tag = " ".join(parts[:-1])  # Join all but the last part
                tag = parts[-1]  # The last part is assumed to be the tag
                sentence.append(token_tag)
                label.append(tag)
            else:
                print(f"Skipping line: '{line}'. Expected format: 'token tag', found: {len(parts)} parts.")

        if sentence:  # Add sentence and labels to lists
            sentences.append(sentence)
            labels.append(label)

    return sentences, labels

# Example usage
file_paths = [
    'conv1.txt', 'conv2.txt'
]  # List of your files

sentences, labels = read_data_from_files(file_paths)

# Print out the first few sentences and labels to verify
print("Sample Sentences:", sentences[:2])
print("Sample Labels:", labels[:2])


Sample Sentences: [['Salesperson:', 'We', 'have', 'a', 'few', 'options', 'in', 'terms', 'of', 'colors', ',', 'sir', '.', 'What', 'color', 'are', 'you', 'interested', 'in', '?', 'So', 'far', 'we', "'ve", 'got', 'only', 'blue', 'and', 'white', '.', 'Customer:', 'Okay', ',', 'white', 'is', 'okay', '.', 'No', 'special', 'preference', ',', 'and', 'what', 'about', 'the', 'budget', '?', 'How', 'much', 'can', 'you', 'offer', 'for', 'that', '?', 'No', 'details', 'mentioned', '.', 'Salesperson:', 'Yes', ',', 'let', 'me', 'confirm', '.', 'The', 'starting', 'price', 'is', 'around', 'Rs', '.', '10', ',', '50', ',', '000', '.', 'That', '’s', 'for', 'the', '2021', 'model', ',', 'diesel', 'SUV', '.', 'Customer:', 'The', 'car', 'without', 'any', 'label', '?', 'You', 'remove', 'the', 'labels', 'for', 'the', 'sold', 'ones', ',', 'or', 'what', '?', 'Salesperson:', 'Correct', ',', 'sir', '.', 'Sold', 'cars', 'don', '’t', 'usually', 'have', 'labels', 'on', 'them', '.', 'Customer:', 'Right', ',', 'so', 'my',

In [6]:
from transformers import AutoTokenizer
import numpy as np

def tokenize_and_align_labels(sentences, labels, tokenizer, label_list):
    label_map = {label: i for i, label in enumerate(label_list)}
    tokenized_inputs = []
    aligned_labels = []

    for sentence, label_seq in zip(sentences, labels):
        # Tokenize the input sentence
        tokenized_input = tokenizer(sentence, is_split_into_words=True, truncation=True, padding='max_length', return_tensors="pt")
        token_ids = tokenized_input['input_ids'].squeeze().tolist()
        tokens = tokenizer.convert_ids_to_tokens(token_ids)

        # Align labels with tokenized words
        label_ids = [label_map.get(label, label_map['O']) for label in label_seq]
        label_ids += [label_map.get('O')] * (len(tokens) - len(label_ids))  # Padding for alignment

        tokenized_inputs.append(tokenized_input)
        aligned_labels.append(label_ids)

    return tokenized_inputs, aligned_labels, label_map

# Example usage
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Example data
sentences = [["hello", "world"], ["how", "are", "you"]]
labels = [["O", "O"], ["O", "O", "O"]]
label_list = ["O"]  # Define your label list

tokenized_inputs, aligned_labels, label_map = tokenize_and_align_labels(sentences, labels, tokenizer, label_list)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
from transformers import BertTokenizerFast

def tokenize_and_align_labels(sentences, labels, tokenizer, label_list):

    # Map labels to IDs
    label_map = {label: i for i, label in enumerate(label_list)}

    # Tokenize the sentences with return_offsets_mapping=True
    encodings = tokenizer(sentences, truncation=True, padding=True, is_split_into_words=True, return_offsets_mapping=True)

    # Align the labels with the tokenized outputs
    aligned_labels = []
    for i, labels_seq in enumerate(labels):
        word_ids = encodings.word_ids(i)  # Word IDs for sentence i
        prev_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token, so ignore the label
            elif word_idx != prev_word_idx:
                label_ids.append(label_map.get(labels_seq[word_idx], -100))  # Map label to ID
            else:
                label_ids.append(-100)  # Avoid duplicate label for the same word
            prev_word_idx = word_idx
        aligned_labels.append(label_ids)

    return encodings, aligned_labels, label_map

# Example usage
sentences = [["Hello", "world"], ["This", "is", "a", "test"]]
labels = [["O", "O"], ["O", "O", "O", "O"]]
label_list = ["O"]

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenized_inputs, aligned_labels, label_map = tokenize_and_align_labels(sentences, labels, tokenizer, label_list)


In [8]:
def create_dataloader(tokenized_inputs, aligned_labels, batch_size=16):
    input_ids = torch.tensor(tokenized_inputs['input_ids'])
    attention_mask = torch.tensor(tokenized_inputs['attention_mask'])
    labels = torch.tensor(aligned_labels)

    dataset = TensorDataset(input_ids, attention_mask, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    return dataloader

# Example usage
dataloader = create_dataloader(tokenized_inputs, aligned_labels)


In [10]:
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from datasets import load_dataset, load_metric

# Step 1: Define paths and settings
model_name = "bert-base-uncased"
num_labels = 12  # Adjust this to match your number of labels

# Step 2: Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Define function to preprocess data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', return_offsets_mapping=False)

# List of dataset paths
dataset_paths = [
    "conv1.txt", "conv2.txt", "conv3.txt", "conv4.txt",
    "conv5.txt", "conv6.txt", "conv7.txt", "conv8.txt"
]

# Define function to load and preprocess dataset
def load_and_preprocess_dataset(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset '{path}' does not exist.")

    # Load dataset from file
    dataset = load_dataset('text', data_files={'train': path}, split='train')
    dataset = dataset.map(preprocess_function, batched=True)

    # Split dataset into train and eval
    split = dataset.train_test_split(test_size=0.1)
    train_dataset = split['train']
    eval_dataset = split['test']

    return train_dataset, eval_dataset

# Step 4: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Step 5: Define data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Step 6: Define compute_metrics function
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=2)
    results = metric.compute(predictions=predictions, references=labels)
    return results

# Step 7: Train and evaluate model for each dataset
for path in dataset_paths:
    try:
        train_dataset, eval_dataset = load_and_preprocess_dataset(path)

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        # Train the model
        trainer.train()

        # Evaluate the model
        results = trainer.evaluate()

        # Print evaluation results
        print(f"Evaluation results for dataset {path}:")
        for key, value in results.items():
            print(f"{key}: {value}")

    except Exception as e:
        print(f"An error occurred while processing dataset {path}: {e}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


An error occurred while processing dataset conv1.txt: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.


Map:   0%|          | 0/620 [00:00<?, ? examples/s]

An error occurred while processing dataset conv2.txt: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.
An error occurred while processing dataset conv3.txt: Dataset 'conv3.txt' does not exist.
An error occurred while processing dataset conv4.txt: Dataset 'conv4.txt' does not exist.
An error occurred while processing dataset conv5.txt: Dataset 'conv5.txt' does not exist.
An error occurred while processing dataset conv6.txt: Dataset 'conv6.txt' does not exist.
An error occurred while processing dataset conv7.txt: Dataset 'conv7.txt' does not exist.
An error occurred while processing dataset conv8.txt: Dataset 'conv8.txt' does not exist.


In [11]:
model.save_pretrained('./bert-ner-model')
tokenizer.save_pretrained('./bert-ner-model')


('./bert-ner-model/tokenizer_config.json',
 './bert-ner-model/special_tokens_map.json',
 './bert-ner-model/vocab.txt',
 './bert-ner-model/added_tokens.json',
 './bert-ner-model/tokenizer.json')

In [12]:
from transformers import pipeline

# Load the trained model
model = BertForTokenClassification.from_pretrained('./bert-ner-model')
tokenizer = BertTokenizer.from_pretrained('./bert-ner-model')

# Create NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Test with a sample text
text = "The car is a red sedan with an automatic transmission, 50,000 km driven, and was manufactured in 2018."
ner_results = nlp(text)
print(ner_results)


[{'entity': 'LABEL_10', 'score': 0.13237429, 'index': 1, 'word': 'the', 'start': None, 'end': None}, {'entity': 'LABEL_6', 'score': 0.15134872, 'index': 2, 'word': 'car', 'start': None, 'end': None}, {'entity': 'LABEL_2', 'score': 0.114651434, 'index': 3, 'word': 'is', 'start': None, 'end': None}, {'entity': 'LABEL_10', 'score': 0.11921799, 'index': 4, 'word': 'a', 'start': None, 'end': None}, {'entity': 'LABEL_0', 'score': 0.13467012, 'index': 5, 'word': 'red', 'start': None, 'end': None}, {'entity': 'LABEL_6', 'score': 0.14524344, 'index': 6, 'word': 'sedan', 'start': None, 'end': None}, {'entity': 'LABEL_2', 'score': 0.12074226, 'index': 7, 'word': 'with', 'start': None, 'end': None}, {'entity': 'LABEL_9', 'score': 0.12584478, 'index': 8, 'word': 'an', 'start': None, 'end': None}, {'entity': 'LABEL_2', 'score': 0.12192302, 'index': 9, 'word': 'automatic', 'start': None, 'end': None}, {'entity': 'LABEL_2', 'score': 0.12994146, 'index': 10, 'word': 'transmission', 'start': None, 'end'

In [15]:
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset, load_metric

# Step 1: Define paths and settings
model_name = "bert-base-uncased"  # Replace with your model name or path
num_labels = 12  # Replace with the number of labels in your classification task

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Define function to preprocess data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', return_offsets_mapping=False)

# Define function to load and preprocess test dataset
def load_and_preprocess_dataset(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset '{path}' does not exist.")

    # Load dataset from file
    dataset = load_dataset('text', data_files={'test': path}, split='test')
    dataset = dataset.map(preprocess_function, batched=True)

    return dataset

# Define the compute_metrics function
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=2)
    results = metric.compute(predictions=predictions, references=labels)
    return results

# Load the test dataset
test_dataset_path = "conv3.txt"  # Replace with your test dataset path
test_dataset = load_and_preprocess_dataset(test_dataset_path)

# Define training arguments (can be minimal for evaluation)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,
)

# Initialize Trainer for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the model
results = trainer.evaluate()

# Print evaluation results
print("Evaluation results:")
for key, value in results.items():
    print(f"{key}: {value}")

# Calculate and print accuracy
accuracy = results.get("eval_accuracy", "Accuracy metric not found")
print(f"Accuracy: {accuracy}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [17]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
trainer.model.push_to_hub("alstonpeter/finetuned-bert-car-sales-ner")
tokenizer.push_to_hub("alstonpeter/finetuned-bert-car-sales-ner")

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alstonpeter/finetuned-bert-car-sales-ner/commit/3d51e31f16217eba5de17ad5082371bd69286898', commit_message='Upload tokenizer', commit_description='', oid='3d51e31f16217eba5de17ad5082371bd69286898', pr_url=None, pr_revision=None, pr_num=None)