# Libraries

In [2]:
import numpy as np

from datasets import load_from_disk

import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

import evaluate
import time
import datetime

import shutil
import os

# Configurations

In [4]:
# Filesystem locations and the pretrained checkpoint name used by the cells below.
INPUT_DIR = "../Data/mams_processed_bert"  # dataset directory read by load_from_disk
OUTPUT_DIR = "../model/absa_bert_model"  # folder that is zipped in the "Save Model" section
LOG_DIR = "../model/working/logs"  # passed to TrainingArguments as logging_dir
MODEL_NAME = "bert-base-uncased"  # checkpoint used for both the model and the tokenizer

In [5]:
# Fine-tuning hyperparameters; consumed by TrainingArguments and the model constructor.
BATCH_SIZE = 16  # used for both the per-device train and eval batch size
EPOCHS = 3  # passed as num_train_epochs
LEARNING_RATE = 2e-5  # passed as learning_rate
NUM_LABELS = 3  # Negative, Neutral, Positive

# Load Data

In [6]:
# Load the dataset stored under INPUT_DIR; later cells use its
# "train" and "validation" splits.
dataset = load_from_disk(INPUT_DIR)

In [7]:
# Report how many examples each split holds before training.
train_size = len(dataset["train"])
validation_size = len(dataset["validation"])
print("Train dataset size:", train_size)
print("Validation dataset size:", validation_size)

Train dataset size: 17045
Validation dataset size: 2220


In [8]:
# Peek at the first training example to check its fields (label, input_ids, ...).
dataset['train'][0]

{'label': tensor(2),
 'input_ids': tensor([  101,  2009,  2453,  2022,  1996,  2190,  4133,  2091,  2833,  1045,
          1005,  2310,  2018,  1999,  1996,  2181,  1010,  2061,  2065,  2017,
          2024,  2183,  2000,  1996, 10051,  6926,  4250,  1010,  2030,  1996,
          3871,  1010,  2009,  2071,  2022,  2074,  1996,  2173,  2005,  2017,
          1012,   102,  2833,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

# Model Setup

In [9]:
# Pretrained MODEL_NAME encoder with a NUM_LABELS-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tokenizer for MODEL_NAME; handed to the Trainer and to the padding collator below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Metrics

In [11]:
# Accuracy metric from the `evaluate` library; compute_metrics calls it on each evaluation pass.
metric = evaluate.load('accuracy')

In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        The result of ``metric.compute`` for the predicted versus reference labels.
    """
    scores, references = eval_pred
    # The index of the largest score along axis 1 is taken as the predicted class.
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

# Model Train

In [13]:
# Training configuration: evaluate and checkpoint once per epoch, and select
# the best checkpoint by validation accuracy when training ends.
training_args = TrainingArguments(
    output_dir="../model/checkpoints",
    # optimisation hyperparameters (values come from the configuration cells)
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # evaluation / checkpoint cadence and best-model selection
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # logging: every 100 steps into LOG_DIR, no external experiment tracker
    logging_steps=100,
    logging_dir=LOG_DIR,
    report_to="none",
)

In [14]:
# Wire the model, data splits, padding collator and metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated in recent transformers releases and emits a
# FutureWarning when used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
print("\n" + "="*30)
print("🚀 Starting Training...")
print("="*30)

# perf_counter is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during a long training run.
start_time = time.perf_counter()

trainer.train()

end_time = time.perf_counter()
training_time = end_time - start_time
# Drop fractional seconds so the duration prints as H:MM:SS.
formatted_time = str(datetime.timedelta(seconds=int(training_time)))

print("\n" + "="*30)
print(f"✅ Training Complete in {formatted_time}")
print("="*30)


🚀 Starting Training...




Epoch,Training Loss,Validation Loss,Accuracy
1,0.5348,0.496877,0.804505
2,0.3841,0.435137,0.835135
3,0.2878,0.43978,0.846847





✅ Training Complete in 0:13:22


# Evaluation

In [16]:
# Evaluate the trainer's current model on its eval_dataset and report the headline numbers.
print("\nRunning Final Evaluation on Validation Set...")
eval_results = trainer.evaluate()
final_accuracy = eval_results['eval_accuracy']
final_loss = eval_results['eval_loss']
print(f"Final Validation Accuracy: {final_accuracy:.4f}")
print(f"Final Validation Loss:     {final_loss:.4f}")


Running Final Evaluation on Validation Set...




Final Validation Accuracy: 0.8468
Final Validation Loss:     0.4398


# Save Model

In [None]:
# Persist the fine-tuned model and tokenizer before archiving: no earlier cell
# writes anything to OUTPUT_DIR, so without this step there is no saved model
# in the folder to zip.
print(f"Saving model and tokenizer to: {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Zipping the model folder...")

# base_name and root_dir are both OUTPUT_DIR: the archive is written as
# OUTPUT_DIR + ".zip" and contains the contents of the OUTPUT_DIR folder.
zip_path = shutil.make_archive(OUTPUT_DIR, 'zip', OUTPUT_DIR)

print(f"✅ Model zipped successfully at: {zip_path}")