In [16]:
#load packages
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from typing import Union, List, Tuple
import json
from tqdm import tqdm 
from peft import PeftModel
from datasets import Dataset

In [None]:
#OBS need to be made usable
'''class BlameDetectorDa(object):

    def __init__(self, model_path, max_length, batch_size = None):

        self.model_path = model_path
        self.max_length = max_length
        self.batch_size = batch_size

        self.model_initialization()

        return

    def model_initialization(self):
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

        self.model.eval()
            
        # Move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

        print(f"Model loaded successfully on {self.device}")

        return

    def predict(self):
        """Make a prediction on a single text input."""
        # Tokenize input
        inputs = self.tokenizer(
            self.text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Make prediction
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][predicted_class].item()
        
        return predicted_class, confidence, probabilities[0].cpu().numpy()

    def run_prediction(self, text):

        self.text = text
        predicted_class, confidence, probs = self.predict()
            
        return predicted_class, confidence
'''


In [None]:
'''example = "Goddad min kære ven, jeg kan virkelig godt lide dig."

BD = BlameDetectorDa()
BD.run_prediction(text = example)'''

In [2]:


class BlameDetectorDa(object):
    """
    Danish Blame Detection classifier with support for single and batch predictions.
    """

    def __init__(self, model_path: str, max_length: int = 512, batch_size: int = None ):
        """
        Initialize the BlameDetectorDa classifier.
        
        Args:
            model_path: Path to the pretrained model
            max_length: Maximum sequence length for tokenization
            batch_size: Batch size for batch predictions (None for single predictions)
        """
        self.model_path = model_path
        self.max_length = max_length
        self.batch_size = batch_size

        #, num_labels = 2, base_model_name = ''
        #self.base_model_name = base_model_name
        #self.num_labels = num_labels

        self.model_initialization()

    def model_initialization(self):
        """Initialize the model, tokenizer, and device."""
        print(f"Loading model from {self.model_path}...")
        
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_path,
            trust_remote_code=True  # Add this for ModernBERT
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

        
            
        # Move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)  # Fixed: was 'device', should be 'self.device'
        self.model.eval()
        print(f"Model loaded successfully on {self.device}")

    def model_initialization_prelim(self):
        print(f"Loading model from {self.model_path}...")
    
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
        
        # Load base model with classification head
        print(f"Loading base model: {self.base_model_name}")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.base_model_name,
            num_labels=self.num_labels,
            trust_remote_code=True
        )
        
        # Load LoRA adapters
        print("Loading LoRA adapters...")
        self.model = PeftModel.from_pretrained(self.model, self.model_path)
        
        # Merge LoRA weights with base model for faster inference (optional)
        print("Merging LoRA weights...")
        self.model = self.model.merge_and_unload()
        
        # Set model to evaluation mode
        self.model.eval()
        
        # Move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        
        print(f"Model loaded successfully on {self.device}")
        return
        

    def predict_single(self, text: str) -> Tuple[int, float, np.ndarray]:
        """
        Make a prediction on a single text input.
        
        Args:
            text: Input text to classify
            
        Returns:
            Tuple of (predicted_class, confidence, probabilities)
        """
        # Tokenize input
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Make prediction
        #with torch.inference_mode():  # Faster than no_grad()
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][predicted_class].item()
        
        return predicted_class, confidence, probabilities[0].cpu().numpy()

    def predict_batch(self, texts: List[str]) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Make predictions on a batch of text inputs.
        
        Args:
            texts: List of input texts to classify
            
        Returns:
            Tuple of (predicted_classes, confidences, all_probabilities)
        """
        # Tokenize all inputs
        inputs = self.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Make predictions
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_classes = torch.argmax(probabilities, dim=1).cpu().numpy()
            confidences = probabilities.max(dim=1).values.cpu().numpy()
            all_probs = probabilities.cpu().numpy()
        
        return predicted_classes, confidences, all_probs

    def run_prediction(
        self, 
        text: Union[str, List[str]]
    ) -> Union[Tuple[int, float], Tuple[np.ndarray, np.ndarray]]:
        """
        Run prediction on single text or batch of texts.
        
        Args:
            text: Single text string or list of text strings
            
        Returns:
            For single text: (predicted_class, confidence)
            For batch: (predicted_classes, confidences)
        """
        # Check if input is a list (batch) or single text
        if isinstance(text, list):
            # Batch prediction
            if self.batch_size is not None and len(text) > self.batch_size:
                # Process in batches
                all_classes = []
                all_confidences = []
                
                for i in range(0, len(text), self.batch_size):
                    batch = text[i:i + self.batch_size]
                    classes, confidences, _ = self.predict_batch(batch)
                    all_classes.extend(classes)
                    all_confidences.extend(confidences)
                
                return np.array(all_classes), np.array(all_confidences)
            else:
                # Single batch prediction
                predicted_classes, confidences, _ = self.predict_batch(text)
                return predicted_classes, confidences
        else:
            # Single text prediction
            predicted_class, confidence, _ = self.predict_single(text)
            return predicted_class, confidence




In [3]:
# ---- 1. Load JSON dataset ----
import json
json_path = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/Model_data/validation_set.json"

with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)


In [36]:
detector = BlameDetectorDa(
        model_path="runetrust/mmBERT-blame-da-test",
        max_length=512,
        batch_size=224
    )

Loading model from runetrust/mmBERT-blame-da-test...
Model loaded successfully on cuda


In [10]:



detector = BlameDetectorDa(
        model_path="/work/MarkusLundsfrydJensen#1865/Bachelor_project/full_tune_results/checkpoint-3032",
        max_length=512,
        batch_size=128,
        num_labels = 2,
        base_model_name = "jhu-clsp/mmBERT-base"
        
    )

Loading model from /work/MarkusLundsfrydJensen#1865/Bachelor_project/full_tune_results/checkpoint-3032...
Loading base model: jhu-clsp/mmBERT-base


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading LoRA adapters...
Merging LoRA weights...
Model loaded successfully on cuda


In [37]:


# ---- 2. Extract texts ----
texts = [item["text"] for item in data if "text" in item]

# ---- 3. Run predictions in batches ----
pred_classes, confidences = detector.run_prediction(texts)


# ---- 4. Attach predictions back to data ----
for item, cls, conf in zip(data, pred_classes, confidences):
    item["predicted_class"] = int(cls)
    item["confidence"] = float(conf)


#save data



In [38]:
true_labels = 0
pred_true = 0
false_labels = 0
pred_false = 0

correct_pred = 0
incorrect_pred = 0

for entry in data:
    if entry["label"] == 1:
        true_labels += 1
    if entry["predicted_class"] == 1:
        pred_true +=1

    if entry["label"] == 0:
        false_labels += 1
    if entry["predicted_class"] == 0:
        pred_false +=1

    if entry["label"] == entry["predicted_class"]:
        correct_pred +=1

    if entry["label"] != entry["predicted_class"]:
        incorrect_pred +=1



In [39]:
correct_pred/len(data)



0.4844961240310077

In [None]:
def weighted_bincrossentropy(true, pred, weight_zero = 99.0, weight_one = 1):
    """
    Calculates weighted binary cross entropy. The weights are fixed to represent class imbalance in the dataset.
        
    For example if there are 10x as many positive classes as negative classes,
        if you adjust weight_zero = 1.0, weight_one = 0.1, then false positives 
        will be penalized 10 times as much as false negatives.

    """
  
    # calculate the binary cross entropy
    bin_crossentropy = binary_crossentropy(true, pred)
    
    # apply the weights
    weights = true * weight_one + (1. - true) * weight_zero
    #weights /= (weight_one + weight_zero) # Normalizing to be more consistent with regular BCE for comparison 
    weighted_bin_crossentropy = weights * bin_crossentropy 

    return np.mean(weighted_bin_crossentropy)



def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #From logits to probabilities
    probs_2d = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
    probs = probs_2d[:, 1]  # positive class extraction
    
    weigthted_bce = weighted_bincrossentropy(labels, probs)
    keras_bce = binary_crossentropy(labels, probs)
    keras_bce = float(np.mean(keras_bce.numpy()))  # Converting from keras eagertensor to float value
    
    # Wrapping all metrics to floats for json serialization during model eval
    return {
        'keras_BCE': keras_bce,
        'weighted BCE': weigthted_bce,
        'recall': float(recall_score(labels, probs.round())),
        'precision': float(average_precision_score(labels, probs)),
        'accuracy': float(accuracy_score(labels, probs.round())), # Need rounding for these two computations (integer required)
        'f1': float(f1_score(labels, probs.round(), average='macro')) # macro f1 is better for imbalanced dataset
    }

# Custom trainer class (weigthed)
from collections import Counter

labels = test_dataframe['label'].tolist()
class_counts = Counter(labels)
total = sum(class_counts.values())

# Higher weight = more emphasis
weights = [total/class_counts[0], total/class_counts[1]]
class_weights = torch.tensor(weights, dtype=torch.float)

#define custom trainer that uses weigted loss
import torch.nn as nn

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Define weighted loss
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss


'''
Look into learning rates, model is currently overfitting quite drastically ("small" test-set)
Normalizing weigthed BCE or no?
Look into regularization, dropout and early stopping to avoid overfitting
'''

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_test,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)


trainer.train()

eval_results = trainer.evaluate()
print(eval_results)
with open("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/Evalresult345templates.txt", "w") as f:
    f.write(str(eval_results))

# This is where we should very much remember to save the finetuned model locally as this contains the new weights for use in analyzing new text
lora_model.save_pretrained(f"output/mmBERT/template_3_4_5_model")

In [29]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #From logits to probabilities
    probs_2d = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
    probs = probs_2d[:, 1]  # positive class extraction
    
    weigthted_bce = weighted_bincrossentropy(labels, probs)
    keras_bce = binary_crossentropy(labels, probs)
    keras_bce = float(np.mean(keras_bce.numpy()))  # Converting from keras eagertensor to float value
    
    # Wrapping all metrics to floats for json serialization during model eval
    return {
        'keras_BCE': keras_bce,
        'weighted BCE': weigthted_bce,
        'recall': float(recall_score(labels, probs.round())),
        'precision': float(average_precision_score(labels, probs)),
        'accuracy': float(accuracy_score(labels, probs.round())), # Need rounding for these two computations (integer required)
        'f1': float(f1_score(labels, probs.round(), average='macro')) # macro f1 is better for imbalanced dataset
    }

def weighted_bincrossentropy(true, pred, weight_zero = 99.0, weight_one = 1):
    """
    Calculates weighted binary cross entropy. The weights are fixed to represent class imbalance in the dataset.
        
    For example if there are 10x as many positive classes as negative classes,
        if you adjust weight_zero = 1.0, weight_one = 0.1, then false positives 
        will be penalized 10 times as much as false negatives.

    """
  
    # calculate the binary cross entropy
    bin_crossentropy = binary_crossentropy(true, pred)
    
    # apply the weights
    weights = true * weight_one + (1. - true) * weight_zero
    #weights /= (weight_one + weight_zero) # Normalizing to be more consistent with regular BCE for comparison 
    weighted_bin_crossentropy = weights * bin_crossentropy 

    return np.mean(weighted_bin_crossentropy)

from keras.losses import binary_crossentropy

In [23]:
model_name = "jhu-clsp/mmBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
val_dataframe = pd.read_json("/work/MarkusLundsfrydJensen#1865/Bachelor_project/Model_data/validation_set.json")
val_dataframe = val_dataframe[['text', 'label']]
val_dataset = Dataset.from_pandas(val_dataframe)
def tokenize_function(examples):
    return tokenizer(examples["text"], 
    padding="max_length", 
    truncation=True,
    max_length=512, # Padding to 512 to massively cut down on computation compared to base 8,192 tokens. 
    )
tokenized_val = val_dataset.map(tokenize_function)

Map: 100%|██████████| 258/258 [00:00<00:00, 3970.84 examples/s]


In [18]:
correct = 0
for entry in data:
    if entry["label"] == entry["predicted_class"]:
        correct+=1


correct/len(data)

0.49224806201550386

In [None]:
#former was .484
#now 4922

In [31]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score

# Load model
model_name = "runetrust/mmBERT-blame-da-test"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

eval_loader = DataLoader(
    tokenized_val.remove_columns(["text"]),  # remove raw text if you don't need it
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator
)
# Collect predictions
all_logits = []
all_labels = []

with torch.no_grad():
    for batch in eval_loader:
        labels = batch["labels"].cpu().numpy()  # HF Dataset usually uses 'labels' or 'label'
        batch = {k: v.to(model.device) for k, v in batch.items()}  # move to GPU/CPU

        outputs = model(**batch)
        logits = outputs.logits.cpu().numpy()

        all_logits.append(logits)
        all_labels.append(labels)

# Convert to numpy arrays
all_logits = np.concatenate(all_logits, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Compute metrics with your function
results = compute_metrics((all_logits, all_labels))
print("Manual recomputed validation metrics:", results)


Manual recomputed validation metrics: {'keras_BCE': 0.8103038668632507, 'weighted BCE': np.float32(53.134575), 'recall': 0.8181818181818182, 'precision': 0.40772366052391074, 'accuracy': 0.4844961240310077, 'f1': 0.48168512000966723}
