In [1]:
import os

# Project root. Defaults to the original local checkout, but can be overridden
# per machine with the MINOR_PROJECT_ROOT environment variable so the notebook
# does not have to be edited to run somewhere else.
ROOT = os.environ.get('MINOR_PROJECT_ROOT', '/home/mav204/Documents/minor-project')

# Later cells use paths relative to the project root (./data, ./model),
# so the working directory is switched here once.
os.chdir(ROOT)
print(ROOT)

/home/mav204/Documents/minor-project


In [2]:
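# Colab-only setup: uncomment these lines when running on Google Colab to
# mount Google Drive and cd into the project folder.
# from google.colab import drive
# drive.mount('/content/drive')
# %cd drive/MyDrive/minor-project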

In [4]:
import numpy as np
import torch
from torch import nn
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from model.tokenizer import tokenize
from misc.dataset_modifier import separate, combine_data
from model.b_trainer import BertTrainer


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with ``"accuracy"`` and ``"f1_macro"`` entries.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        # Macro-averaged F1 matters here because both data files are skewed;
        # zero_division=0 scores a class with no predictions as 0.
        "f1_macro": f1_score(gold, predicted, average="macro", zero_division=0),
    }

In [None]:
# Locations of the split files on disk (relative to the project root).
split_paths = {
    "train": "./data/updated/combined/train.json",
    "validation": "./data/updated/curated/validation.json",
}

# Split each file into its 'correct' / 'incorrect' parts.
train = separate(split_paths["train"])
val = separate(split_paths["validation"])

# Recombine the two parts of every split; this mapping is what gets tokenized.
data_files = {
    split: combine_data(parts["correct"], parts["incorrect"])
    for split, parts in (("train", train), ("validation", val))
}

In [None]:
# Load and Tokenize
# Tokenize once and reuse the result (the same inputs used to be tokenized
# twice under two different names).
tokenized_data = tokenize(data_files=data_files, is_training=True)
dataset_dict = tokenized_data  # alias kept so existing references still work
full_dataset = tokenized_data["train"]

# Per-class example counts of the training split.
train_labels = [x["labels"].item() for x in tokenized_data["train"]]
total = len(train_labels)
if total == 0:
    raise ValueError("Training split is empty; cannot compute class weights.")

# minlength=2 keeps counts[1] defined even if label 1 never occurs.
counts = np.bincount(train_labels, minlength=2)
if len(counts) != 2 or (counts == 0).any():
    raise ValueError(
        f"Expected both labels 0 and 1 (and only those) in the training split, "
        f"got per-label counts {counts.tolist()}."
    )

# Inverse-frequency weights: weight_c = total / (2 * count_c), so the rarer
# class receives the larger weight.
class_weights = torch.tensor(
    [total / (2 * counts[0]), total / (2 * counts[1])],
    dtype=torch.float,
)



In [None]:
from transformers import BertTokenizerFast

# Directory (relative to the project root) that the tokenizer is loaded from.
TOKENIZER_DIR = "./model/bert_tokenizer"
tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_DIR)

def print_dataset_sample(dataset, name):
    """Print a human-readable view of the first example in ``dataset``.

    Shows a banner with ``name``, the example's label, its ``input_ids``
    decoded with the module-level ``tokenizer`` (special tokens kept), and,
    when present, the first 50 token type ids.

    Args:
        dataset: Indexable collection whose items expose ``"input_ids"``,
            ``"labels"`` and optionally ``"token_type_ids"``.
        name: Title shown in the banner.
    """
    banner = "=" * 30
    print(f"\n{banner} {name} {banner}")

    sample = dataset[0]
    text = tokenizer.decode(sample["input_ids"], skip_special_tokens=False)

    print(f"Label: {sample['labels'].item()} (1=correct, 0=incorrect)")
    print(f"\nDecoded Input Text:\n{text}")

    if "token_type_ids" in sample:
        # Only the first 50 ids are shown to keep the output short.
        segment_ids = sample["token_type_ids"].tolist()[:50]
        print(f"\nToken Type IDs: {segment_ids}...")
        
print('-------------------------------------------- Datasets --------------------------------------------')

# Show one example from every split that is present in the tokenized data.
for split, title in (("train", "TRAINING (COMBINED)"), ("validation", "VALIDATION")):
    if split in tokenized_data:
        print_dataset_sample(tokenized_data[split], title)

In [None]:
# Pretrained BERT encoder with a 2-way classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    # Where checkpoints and logs are written; no external experiment tracker.
    output_dir="./model/results",
    logging_dir="./logs",
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint every epoch, then reload the checkpoint that
    # scored best on the "f1_macro" value reported by compute_metrics.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

# BertTrainer is the project's Trainer variant; it receives the class weights
# (presumably to weight the loss -- confirm in model/b_trainer.py).
trainer = BertTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

trainer.train()