## Packages Import

In [1]:
# Uncomment on a runtime where optuna is not installed (otherwise `import optuna` below fails):
# %pip install optuna

import pandas as pd
import torch
import logging
import time
import csv
import optuna
from torch.utils.data import Dataset
from transformers import AutoTokenizer, TrainingArguments,Trainer,DataCollatorWithPadding,TrainerCallback,BertModel
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import matplotlib.pyplot as plt

"""
Colab setting
83.5 GB system ram
Nvidia A100 is used for training
40 GB gpu ram required


reference:

bert lstm
https://www.kaggle.com/code/christofhenkel/bert-embeddings-lstm
https://arxiv.org/abs/2405.05136
https://huggingface.co/google-bert/bert-base-uncased
https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
https://huggingface.co/docs/transformers/en/main_classes/trainer


training config
https://huggingface.co/learn/nlp-course/en/chapter3/3

"""


ModuleNotFoundError: No module named 'optuna'

## Dataset Preparation and Utils

In [None]:
# stop words preparation
# Fetch the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
# English stopwords kept as a set; remove_stopwords() tests membership against it.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with every stopword removed.

    The text is split on whitespace and each token is compared
    case-insensitively against the module-level `stop_words` set. Surviving
    tokens keep their original casing and are re-joined with single spaces.
    """
    kept = [token for token in text.split() if token.lower() not in stop_words]
    return ' '.join(kept)


## Dataset Preparation

In [None]:
class ReviewDataset(Dataset):
    """Torch dataset that tokenizes review texts lazily, one item per access.

    Reads the 'reviewText' column (lower-cased on construction) and the
    'class' column (used as labels) of the given dataframe.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        lowered_reviews = dataframe['reviewText'].str.lower()
        self.texts = lowered_reviews.tolist()
        self.labels = dataframe['class'].tolist()

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example `idx`, padded/truncated to `max_length`.

        Returns a dict with 'input_ids', 'attention_mask' and 'labels'.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops the leading axis the 'pt' tensors come back with.
        item = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item


## Model Construction

In [None]:

# base model
class BaseBertModel(torch.nn.Module):
    """Thin wrapper around the pretrained 'bert-base-uncased' encoder.

    The class derives from `torch.nn.Module` via the fully qualified name
    because only `torch` is imported in this notebook; a bare `nn` is not
    in scope and would raise NameError when the class is defined.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run the encoder and return its output object unchanged.

        Args:
            input_ids: token id tensor passed straight to the encoder.
            attention_mask: optional mask forwarded to the encoder.
            token_type_ids: optional segment ids forwarded to the encoder.

        Returns:
            Whatever `self.bert(...)` returns for these inputs.
        """
        return self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )




# construct the model architecture, inspired by https://arxiv.org/abs/2405.05136
class BertLSTMClassifier(torch.nn.Module):
    """BERT encoder followed by an LSTM and a linear classification head.

    Args:
        model_name: checkpoint name/path given to `BertModel.from_pretrained`.
        num_labels: number of output classes.
        hidden_size: feature size of the encoder output fed into the LSTM.
        lstm_hidden_size: hidden size of the LSTM.
        num_lstm_layers: number of stacked LSTM layers.
    """

    def __init__(self, model_name, num_labels=2, hidden_size=768, lstm_hidden_size=256, num_lstm_layers=1):
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(model_name)
        self.lstm = torch.nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=num_lstm_layers,
            batch_first=True
        )
        self.classifier = torch.nn.Linear(lstm_hidden_size, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Classify a batch of token sequences.

        Returns:
            `(loss, logits)` when `labels` is given, otherwise just `logits`.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)

        if attention_mask is None:
            # No mask available: every position is treated as real.
            pooled = lstm_output[:, -1, :]
        else:
            # Read the LSTM state at the last *attended* token of each row.
            # Always taking index -1 would read a padding position whenever
            # the batch is padded, making the logits depend on the amount of
            # padding rather than only on the text.
            batch_size, seq_len = lstm_output.shape[:2]
            positions = torch.arange(seq_len, device=lstm_output.device)
            mask = attention_mask.to(device=lstm_output.device, dtype=torch.long)
            # Largest position whose mask is 1 (0 if the row is fully masked).
            last_real = (mask * positions).max(dim=1).values
            rows = torch.arange(batch_size, device=lstm_output.device)
            pooled = lstm_output[rows, last_real]

        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return (loss, logits) if loss is not None else logits


## Training

In [None]:

# Load your data
# Relative path; presumably resolved against a mounted Google Drive in Colab — confirm.
data_dir = 'drive/MyDrive/CS6220Model/dataset_finalized.json'


data = pd.read_json(data_dir)

print(f"Data length: {len(data)}")

# process data
# Strip stopwords from every review. This overwrites the 'reviewText' column,
# so re-running the cell filters already-filtered text again.
data['reviewText'] = data['reviewText'].apply(remove_stopwords)
# 80/20 train/test split with a fixed seed.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both datasets use ReviewDataset's default max_length (512).
train_dataset = ReviewDataset(train_data, tokenizer)
test_dataset = ReviewDataset(test_data, tokenizer)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Passed as `fp16=` to TrainingArguments below: enabled whenever CUDA is available.
fp16_supported = torch.cuda.is_available()

# NOTE(review): ReviewDataset already pads each item to max_length, so this
# collator presumably has nothing left to pad — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



## Metric computation and training configuration, adapted from the Hugging Face Trainer documentation: https://huggingface.co/docs/transformers/en/main_classes/trainer

def compute_metrics(eval_pred):
    """Turn an evaluation `(logits, labels)` pair into a metrics dict.

    Predictions are the argmax over the last logits axis. Precision, recall
    and F1 use `average='binary'`; accuracy is computed over all examples.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_pred
    # When the predictions arrive as a tuple, only its first element is used.
    logits = logits[0] if isinstance(logits, tuple) else logits
    predictions = np.argmax(logits, axis=-1)

    prf_scores = precision_recall_fscore_support(labels, predictions, average='binary')
    metrics = {'accuracy': accuracy_score(labels, predictions)}
    # The fourth returned value is discarded, as before.
    metrics['precision'], metrics['recall'], metrics['f1'] = prf_scores[:3]
    return metrics


def model_init():
    """Build a fresh two-label BertLSTMClassifier with default LSTM settings on `device`."""
    classifier = BertLSTMClassifier(model_name=model_name, num_labels=2)
    return classifier.to(device)


# optuna configurations
def objective(trial):
    """Optuna objective: train one sampled configuration and return its accuracy.

    Samples the learning rate, epoch count, train batch size, LSTM hidden
    size and LSTM depth from `trial`, trains a BertLSTMClassifier with them
    and returns the 'eval_accuracy' reported by `trainer.evaluate()`.

    NOTE(review): trials are scored on `test_dataset`, the same split the
    final run evaluates on — consider a separate validation split.
    """
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])
    per_device_eval_batch_size = per_device_train_batch_size * 2
    lstm_hidden_size = trial.suggest_int('lstm_hidden_size', 128, 512, step=128)
    num_lstm_layers = trial.suggest_int('num_lstm_layers', 1, 2)

    # set the parameters
    training_args = TrainingArguments(
        output_dir='./bert-lstm-finetuned',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        evaluation_strategy='epoch',
        save_strategy='no',
        logging_steps=100,
        learning_rate=learning_rate,
        fp16=fp16_supported,
        report_to='none',
        load_best_model_at_end=False,
    )

    def trial_model_init():
        """Factory for this trial's model; Trainer calls it to create the model."""
        return BertLSTMClassifier(
            model_name=model_name,
            num_labels=2,
            lstm_hidden_size=lstm_hidden_size,
            num_lstm_layers=num_lstm_layers,
        ).to(device)

    trainer = Trainer(
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        tokenizer=tokenizer,
        # `model_init` must be a callable returning a model, not a model
        # instance; an already-built model would go in `model=` instead.
        model_init=trial_model_init,
    )

    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result['eval_accuracy']

# optuna task init
# Maximise the value returned by objective() (its 'eval_accuracy') over 5 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)
best_trial = study.best_trial


# Print the hyperparameters sampled by the best trial.
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")


# Write the per-trial records to a CSV file in the working directory.
study.trials_dataframe().to_csv('optuna_study_results.csv')


# Final training run: same settings as the search trials, but with the best
# trial's hyperparameters and with per-epoch checkpointing turned on.
best_training_args = TrainingArguments(
    output_dir='./bert-lstm-finetuned',
    num_train_epochs=best_trial.params['num_train_epochs'],
    per_device_train_batch_size=best_trial.params['per_device_train_batch_size'],
    # Eval batch size is twice the train batch size, mirroring objective().
    per_device_eval_batch_size=best_trial.params['per_device_train_batch_size'] * 2,
    evaluation_strategy='epoch',
    # Unlike the search (save_strategy='no'), checkpoints are written every epoch.
    save_strategy='epoch',
    logging_steps=100,
    learning_rate=best_trial.params['learning_rate'],
    fp16=fp16_supported,
    report_to='none',
    # Reload the best checkpoint at the end, ranked by the 'accuracy' metric
    # produced by compute_metrics().
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)


def best_model_init():
    """Build a two-label BertLSTMClassifier using the best trial's LSTM settings.

    Reads 'lstm_hidden_size' and 'num_lstm_layers' from `best_trial.params`
    and returns the model moved to `device`.
    """
    tuned = best_trial.params
    classifier = BertLSTMClassifier(
        model_name=model_name,
        num_labels=2,
        lstm_hidden_size=tuned['lstm_hidden_size'],
        num_lstm_layers=tuned['num_lstm_layers'],
    )
    return classifier.to(device)


# Trainer for the final run; the model is created through best_model_init.
trainer = Trainer(
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
    model_init=best_model_init,
)

# train starts
trainer.train()

# save model
# Model and tokenizer are written to the same directory used as output_dir above.
trainer.save_model('./bert-lstm-finetuned')
tokenizer.save_pretrained('./bert-lstm-finetuned')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Total data points loaded: 100000


Predicting: 100%|██████████| 3125/3125 [05:59<00:00,  8.70it/s]


Predictions saved to predictions.csv
Classification Report on the first 80,000 entries:
              precision    recall  f1-score   support

           0     0.9288    0.9063    0.9174     50000
           1     0.9085    0.9305    0.9194     50000

    accuracy                         0.9184    100000
   macro avg     0.9186    0.9184    0.9184    100000
weighted avg     0.9186    0.9184    0.9184    100000

