# **Finetune BERT Model for Customer Objections**


**Install Dependencies**

In [1]:
pip install torch transformers datasets accelerate -q

In [5]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import os

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.push_to_hub("haz3-jolt/finetuned-bert-cst-obj-classifier")


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/haz3-jolt/finetuned-bert-cst-obj-classifier/commit/61df9994c01599d82fdaacf2a00eb5fcf00756c8', commit_message='Upload tokenizer', commit_description='', oid='61df9994c01599d82fdaacf2a00eb5fcf00756c8', pr_url=None, pr_revision=None, pr_num=None)

**Preprocess the data in CSV files and Prep the Labels for Tokenizer**

In [12]:
# Load the labeled data CSV
df = pd.read_csv('labeled_data.csv')

# Ensure the labels are numeric
categories = [
    'Refurbishment Quality',
    'Car Issues',
    'Price Issues',
    'Customer Experience Issues',
    'No Label'
]
label_mapping = {label: idx for idx, label in enumerate(categories)}
df['label'] = df['label'].map(label_mapping)

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df)


In [13]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, return_tensors='pt')

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Display the tokenized dataset format
print(tokenized_dataset)


Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 282
})


**Setting Reserve Parameters for evaluating the Finetuned model**

In [14]:
# Split the dataset into training and evaluation sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.25)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Save the tokenized datasets
train_dataset.save_to_disk('train_dataset')
eval_dataset.save_to_disk('eval_dataset')

# Display the sizes of the splits
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")


Saving the dataset (0/1 shards):   0%|          | 0/211 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/71 [00:00<?, ? examples/s]

Training dataset size: 211
Evaluation dataset size: 71


In [15]:
from datasets import load_from_disk

# Load the datasets
train_dataset = load_from_disk('train_dataset')
eval_dataset = load_from_disk('eval_dataset')

# Display dataset info
print(train_dataset)
print(eval_dataset)


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 211
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 71
})


**Preperaing Comupte Metrics**

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    accuracy = accuracy_score(p.label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Load a pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(categories))

# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./results',
    num_train_epochs=3,
    evaluation_strategy='epoch',
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,      # Log every 10 steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Pass the compute_metrics function here
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5294,0.503076,0.887324,0.787344,0.887324,0.834349
2,0.6106,0.465104,0.887324,0.787344,0.887324,0.834349
3,0.3625,0.440197,0.887324,0.787344,0.887324,0.834349


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=81, training_loss=0.5502715671871915, metrics={'train_runtime': 74.3573, 'train_samples_per_second': 8.513, 'train_steps_per_second': 1.089, 'total_flos': 166553784179712.0, 'train_loss': 0.5502715671871915, 'epoch': 3.0})

In [18]:
# Save the trained model
model.save_pretrained('./trained_model')

# Evaluate the model
results = trainer.evaluate()
print(results)


{'eval_loss': 0.4401974081993103, 'eval_accuracy': 0.8873239436619719, 'eval_precision': 0.7873437809958342, 'eval_recall': 0.8873239436619719, 'eval_f1': 0.834349379861257, 'eval_runtime': 2.0214, 'eval_samples_per_second': 35.125, 'eval_steps_per_second': 4.452, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
pip install huggingface_hub[cli]

Collecting InquirerPy==0.3.4 (from huggingface_hub[cli])
  Downloading InquirerPy-0.3.4-py3-none-any.whl.metadata (8.1 kB)
Collecting pfzy<0.4.0,>=0.3.1 (from InquirerPy==0.3.4->huggingface_hub[cli])
  Downloading pfzy-0.3.4-py3-none-any.whl.metadata (4.9 kB)
Downloading InquirerPy-0.3.4-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pfzy-0.3.4-py3-none-any.whl (8.5 kB)
Installing collected packages: pfzy, InquirerPy
Successfully installed InquirerPy-0.3.4 pfzy-0.3.4


# **Publishing Model upstream to HuggingFace**

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer.model.push_to_hub("haz3-jolt/finetuned-bert-cst-obj-classifier")
tokenizer.push_to_hub("haz3-jolt/finetuned-bert-cst-obj-classifier")