# Fine-Tuning BERT for CHIME Framework DREAMS


Installing compatible package versions

In [None]:
!pip uninstall -y numpy datasets
!pip install numpy==1.26.4 datasets --no-cache-dir


In [15]:
import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset

## Reading the Dataset

In [69]:
# Load the labelled captions; later cells read the 'CAPTIONS' and 'labels' columns.
dataset_path = "/content/dataset.csv"
df = pd.read_csv(dataset_path)

In [70]:
# Class distribution: number of rows per label, most frequent first.
label_counts = df['labels'].value_counts()
label_counts

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
Meaning,112
Empowerment,109
Identity,106
Hope,98
Connectedness,79


## Train Test Split

In [71]:
# Hold out 10% of the rows for validation, stratified on the label column so each
# class keeps its proportion in both splits. A fixed random_state makes the split
# (and therefore the reported metrics) reproducible across kernel restarts.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['labels'],
    random_state=42,
)

### Converting the pandas DataFrames to Hugging Face Datasets

In [72]:
# Convert both splits to Hugging Face Datasets; the index is reset first so the
# shuffled pandas index left over from the split is discarded.
train_dataset, val_dataset = (
    Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_df in (train_df, val_df)
)

## Model Selection
### We use Google's `bert-base-uncased` checkpoint, loaded through Hugging Face Transformers, for both the tokenizer and the classification model

In [73]:
model_name = "bert-base-uncased"

# One classifier output per distinct value in the label column.
num_classes = len(df['labels'].unique())

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show the first training record to check which fields the conversion produced.
first_record = train_dataset[0]
print(first_record)

### Defining the Tokenizer Function
The `labels` column holds the class names as text, but the model needs integer class ids, so the tokenizer function also maps each label name to its index.

In [None]:
# Label names in order of first appearance in the dataframe; a label's position in
# this list is its integer class id.
unique_labels = df['labels'].unique().tolist()

# Name -> id lookup, built once here instead of on every batch inside
# tokenize_function.
label_map = {label: i for i, label in enumerate(unique_labels)}


def tokenize_function(examples):
    """Tokenize a batch of captions and attach integer class ids.

    Args:
        examples: Batch mapping that provides a 'CAPTIONS' sequence of texts and a
            'labels' sequence of label names.

    Returns:
        The tokenizer output, padded/truncated to 128 tokens, with its 'labels'
        entry set to the ids looked up in `label_map`.

    Raises:
        KeyError: If a label name is not present in `label_map`.
    """
    tokenized_inputs = tokenizer(examples['CAPTIONS'], padding='max_length', truncation=True, max_length=128)
    tokenized_inputs['labels'] = [label_map[label] for label in examples['labels']]
    return tokenized_inputs

#### Mapping the Dataset to the Tokenizer Function

In [None]:
# Tokenize both splits in batches and drop the raw caption text afterwards.
map_options = dict(batched=True, remove_columns=["CAPTIONS"])
train_dataset = train_dataset.map(tokenize_function, **map_options)
val_dataset = val_dataset.map(tokenize_function, **map_options)


#### Setting the Format for the Model


In [None]:
# Expose only these columns, as torch tensors, on both splits.
model_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=model_columns)


### Defining the Trainer Parameters

In [None]:
# Fresh model with a classification head sized to the label set. id2label/label2id
# are stored in the model config so a saved checkpoint carries the label names
# instead of relying on the in-memory `unique_labels` list.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=dict(enumerate(unique_labels)),
    label2id={label: idx for idx, label in enumerate(unique_labels)},
)

# Training hyperparameters. `training_args` was referenced by the Trainer but never
# defined, which raises NameError on a fresh kernel.
# NOTE(review): values are chosen to match the recorded run (3 epochs, loss logged
# every 10 steps, 87 optimizer steps in total); the batch size of 16 is inferred
# from that step count - confirm against the original settings.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
)

# The Trainer itself is built in the fine-tuning cell further down, after
# `compute_metrics` has been defined; constructing it here would fail on a fresh
# kernel because that function does not exist yet.

Importing the Evaluation Metrics

In [75]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [76]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (logits, labels); the predicted class is
            the argmax of the logits along axis 1.

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=1)

    # The fourth element of the result (support) is not reported.
    weighted_scores = precision_recall_fscore_support(labels, predicted_ids, average='weighted')
    precision, recall, f1 = weighted_scores[:3]
    acc = accuracy_score(labels, predicted_ids)

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)


## Fine-Tuning the Model

In [None]:
# Wire the model, hyperparameters, both splits and the metric function into a Trainer.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

| Step | Training Loss |
|------|---------------|
| 10   | 1.588700      |
| 20   | 1.418100      |
| 30   | 1.310200      |
| 40   | 1.106600      |
| 50   | 0.910300      |
| 60   | 0.744400      |
| 70   | 0.632500      |
| 80   | 0.583700      |





TrainOutput(global_step=87, training_loss=0.9958692802779976, metrics={'train_runtime': 1815.2712, 'train_samples_per_second': 0.749, 'train_steps_per_second': 0.048, 'total_flos': 89394388902144.0, 'train_loss': 0.9958692802779976, 'epoch': 3.0})

## Evaluating on Evaluation Set
{'eval_loss': 0.6967468857765198, 'eval_accuracy': 0.8431372549019608, 'eval_f1': 0.8416544596031978, 'eval_precision': 0.8469498910675383, 'eval_recall': 0.8431372549019608, 'eval_runtime': 30.1742, 'eval_samples_per_second': 1.69, 'eval_steps_per_second': 0.133, 'epoch': 3.0}

In [None]:
# Run the validation pass, keep the metrics dict in `eval_results`, and print it.
print(eval_results := trainer.evaluate())

### Testing custom captions

In [94]:
custom_text = "I am not my past but i am my future"

# Switch to inference mode so dropout is disabled and the prediction is deterministic,
# even if this cell runs directly after training.
model.eval()

# Tokenize with the same settings used for training, then move the tensors to the
# model's device: the Trainer may have placed the model on an accelerator, and
# inputs left on the CPU would cause a device-mismatch error.
tokenized_input = tokenizer(
    custom_text,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors="pt",
).to(model.device)

# No gradients are needed for a forward-only prediction.
with torch.no_grad():
    outputs = model(**tokenized_input)
    logits = outputs.logits

# Index of the highest-scoring class, mapped back to its label name.
predicted_class_index = torch.argmax(logits, dim=1).item()
predicted_label = unique_labels[predicted_class_index]

print(f"The predicted label for the custom text is: {predicted_label}")

The predicted label for the custom text is: Empowerment


Saving the fine-tuned model and tokenizer for future reuse

In [95]:
output_dir = "./my_bert_model"

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to ./my_bert_model
