In [None]:
%pip install -r "requirements_bert.txt"

In [None]:
import torch
import transformers
from peft import LoraConfig, get_peft_model, TaskType
import bitsandbytes
import accelerate
import datasets
#import scikit-learn
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import Conv1D, AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, AutoModelForCausalLM
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [None]:
model_name = "jhu-clsp/mmBERT-base"

quantization_config = BitsAndBytesConfig(
                                        load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16,
                                         bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True,
                                         )

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["Wqkv"],  # Fine-tuning the attention layer specifically
)

lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

In [None]:
'''
Consider the batch size, could be increased for efficiency purposes.
ADD WEIGHTED LOSS FUNCTION FOR HITS ON 1 FOR POSITIVE LABELLING AS THIS IS WAY MORE RARE = HIGHER "REWARD"
Have a look at learning rate and gradient norm clipping which I need to read up on.
    In addition to this, look at the implications of gradient accumulation steps
    Much of this pipeline was constrained due to computational restrictions which I think was caused by errors and not actual training process.
Early stopping: load_best_model_at_end=True
'''

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    auto_find_batch_size=True, # Allows for auto adjusting of batch to avoid OOM
    gradient_accumulation_steps=12,  # Simulate larger batch size

    logging_steps=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,  # Enable mixed precision
    dataloader_pin_memory=False,
    remove_unused_columns=True, # Avoiding manual handling of residual text columns
    max_grad_norm=1.0,

    disable_tqdm=False,
)

In [None]:
'''
Dataset generation chunk
We need to pass it through the BERT tokenizer here, make a train / test / val split and pass that to the model

Below is the structure which worked for the Pol_NLI dataset, we should strive to do the same
'''
dataframe = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/training_data.json")


In [None]:
dataset = dataframe[0:5000]

random_dataset = dataset.sample(n=2000, axis=0, random_state=40)

X = dataset["text"]
y = dataset["label"]

X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [9]:
# From dataframe to dataset for mapping tokenizer function 
random_dataset = random_dataset[['preceding_sentence', 'text', 'succeeding_sent', 'label']]
dataset = Dataset.from_pandas(random_dataset)


# ADD CONTEXT
'''
def tokenize_function(example):
    return tokenizer(example["preceding_sentence"],
                     example["text"],
                     example["succeeding_sent"],
                     padding="max_length",
                     truncation=True)
'''

def tokenize_function(example):
    result = tokenizer(
        example["preceding_sentence"],
        example["text"],
        example["succeeding_sent"],
        padding="max_length",
        truncation=True
    )
    #print("Keys returned by tokenizer:", result.keys())
    return result


tokenized_dataset = dataset.map(tokenize_function)

Map: 100%|██████████| 2000/2000 [00:06<00:00, 314.57 examples/s]


In [10]:
test = tokenize_function(dataset[0])
print(test.keys())

KeysView({'input_ids': [2, 2028, 14108, 43767, 791, 496, 17924, 3355, 84956, 235273, 235269, 3355, 916, 14108, 235248, 12160, 101995, 80294, 103000, 604, 134049, 42134, 10009, 235269, 159864, 2407, 696, 235248, 916, 152864, 2303, 3879, 235269, 696, 916, 34096, 2726, 604, 19128, 3355, 6191, 235251, 235248, 89246, 45718, 235265, 1, 59902, 1123, 604, 35886, 13191, 1567, 23844, 655, 226184, 32821, 207251, 235269, 696, 235248, 916, 48676, 638, 2326, 85212, 672, 235269, 3355, 496, 2239, 134924, 18998, 13191, 3410, 2709, 48676, 235269, 696, 235248, 12883, 12252, 12160, 14108, 27515, 2407, 5011, 696, 1586, 14700, 554, 4091, 13357, 604, 235248, 168279, 6979, 220675, 14412, 235269, 3393, 23383, 8729, 5011, 177725, 3355, 709, 1522, 49064, 80919, 235269, 604, 235248, 3554, 2623, 3410, 2709, 194485, 1223, 59967, 641, 1070, 496, 42815, 2828, 564, 24854, 496, 659, 235248, 90577, 115302, 235265, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
tokenized_dataset.column_names

In [None]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)

In [None]:
split_dataset["train"]["label"]
split_dataset["train"]["labels"]

In [None]:
X, y = tokenized_dataset["text"], tokenized_dataset["label"]

In [None]:

# Preliminary 80/20 split for training
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# Further 50/50 split from the 20% test data so 10% of total data is test and 10% is validation
# For a final 80:10:10 split following convention
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='macro') # Macro is better suited for imbalanced data
    }

In [None]:
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    compute_metrics=compute_metrics,
)


trainer.train()

In [None]:
'''
We would expect to see a gradual decrease in both training and validation loss.
If either om them split too far from eachother that indicates issues with the training process.
The process itself should be pretty smooth with no dips either up or down.
'''

In [None]:
# This is where we should very much remember to save the finetuned model locally as this contains the new weights for use in analyzing new text
lora_model.save_pretrained(f"output/mmBERT/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")