# **Finetune BERT Model for Policy Classification**

### Install dependencies

In [12]:
%pip install torch transformers datasets accelerate -q

Note: you may need to restart the kernel to use updated packages.


### Load the BERT Tokenizer

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')




In [16]:
# Load the labeled data CSV
df = pd.read_csv('./labeled_data.csv')

# Ensure the labels are numeric
categories = [
    'free_rc_transfer',
    '5_day_money_back_guarantee',
    'free_rsa_for_one_year',
    'return_policy',
    'No Label'
]
label_mapping = {label: idx for idx, label in enumerate(categories)}
df['label'] = df['label'].map(label_mapping)

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df)


### Tokenize the dataset

In [17]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, return_tensors='pt')

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Display the tokenized dataset format
print(tokenized_dataset)


Map: 100%|██████████| 283/283 [00:00<00:00, 1264.91 examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 283
})





### Split the training dataset

In [18]:
# Split the dataset into training and evaluation sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.25)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Save the tokenized datasets
train_dataset.save_to_disk('train_dataset')
eval_dataset.save_to_disk('eval_dataset')

# Display the sizes of the splits
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")


Saving the dataset (1/1 shards): 100%|██████████| 212/212 [00:00<00:00, 10179.30 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 71/71 [00:00<00:00, 9312.22 examples/s] 

Training dataset size: 212
Evaluation dataset size: 71





In [19]:
from datasets import load_from_disk

# Load the datasets
train_dataset = load_from_disk('train_dataset')
eval_dataset = load_from_disk('eval_dataset')

# Display dataset info
print(train_dataset)
print(eval_dataset)


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 212
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 71
})


### Load the base model and trainer

In [20]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    accuracy = accuracy_score(p.label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Load a pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(categories))

# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./results',
    num_train_epochs=3,
    evaluation_strategy='epoch',
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,      # Log every 10 steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Pass the compute_metrics function here
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now, let's train the model!

In [21]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3201,0.301011,0.929577,0.864114,0.929577,0.895651
2,0.3048,0.260784,0.929577,0.864114,0.929577,0.895651
3,0.1781,0.279025,0.929577,0.864114,0.929577,0.895651


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=81, training_loss=0.331223684673508, metrics={'train_runtime': 52.0043, 'train_samples_per_second': 12.23, 'train_steps_per_second': 1.558, 'total_flos': 167343138607104.0, 'train_loss': 0.331223684673508, 'epoch': 3.0})

In [22]:
# Save the trained model
model.save_pretrained('./trained_model')

# Evaluate the model
results = trainer.evaluate()
results


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2790246903896332,
 'eval_accuracy': 0.9295774647887324,
 'eval_precision': 0.864114263043047,
 'eval_recall': 0.9295774647887324,
 'eval_f1': 0.8956512799424283,
 'eval_runtime': 0.9982,
 'eval_samples_per_second': 71.125,
 'eval_steps_per_second': 9.016,
 'epoch': 3.0}

### Upload the model to HuggingFace Hub

In [None]:
%pip install huggingface_hub[cli]

zsh:1: no matches found: huggingface_hub[cli]
Note: you may need to restart the kernel to use updated packages.


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
model = BertForSequenceClassification.from_pretrained('./trained_model')

model.push_to_hub("aryaniyaps/finetuned-bert-policy-classifier")
tokenizer.push_to_hub("aryaniyaps/finetuned-bert-policy-classifier")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/aryaniyaps/finetuned-bert-policy-classifier/commit/6e2a26b54f7cace2d8d94624035bf2faf4813136', commit_message='Upload tokenizer', commit_description='', oid='6e2a26b54f7cace2d8d94624035bf2faf4813136', pr_url=None, pr_revision=None, pr_num=None)