In [1]:
#load packages
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd

In [None]:
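# NOTE: early draft of the detector class, disabled by wrapping it in a string literal; it still needs work before it is usable.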
'''class BlameDetectorDa(object):

    def __init__(self, model_path, max_length, batch_size = None):

        self.model_path = model_path
        self.max_length = max_length
        self.batch_size = batch_size

        self.model_initialization()

        return

    def model_initialization(self):
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

        self.model.eval()
            
        # Move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

        print(f"Model loaded successfully on {self.device}")

        return

    def predict(self):
        """Make a prediction on a single text input."""
        # Tokenize input
        inputs = self.tokenizer(
            self.text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Make prediction
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][predicted_class].item()
        
        return predicted_class, confidence, probabilities[0].cpu().numpy()

    def run_prediction(self, text):

        self.text = text
        predicted_class, confidence, probs = self.predict()
            
        return predicted_class, confidence
'''


In [None]:
# Smoke test: classify one Danish sentence.
# This cell must be run after the cell that defines BlameDetectorDa.
example = "Goddad min kære ven, jeg kan virkelig godt lide dig."

# `model_path` is a required constructor argument; calling BlameDetectorDa()
# with no arguments raises TypeError.
BD = BlameDetectorDa(
    model_path="/work/MarkusLundsfrydJensen#1865/Bachelor_project/output/mmBERT/template_3_4_5_model"
)
BD.run_prediction(text=example)

In [35]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from typing import Union, List, Tuple

class BlameDetectorDa(object):
    """
    Danish blame-detection classifier with single-text and batched inference.

    Constructing an instance loads the tokenizer and the model immediately.
    For now the constructor uses the preliminary loader
    (`model_initialization_prelim`), which builds the classifier from a base
    checkpoint plus LoRA adapter weights stored at `model_path`.
    """

    def __init__(
        self,
        model_path: str,
        max_length: int = 512,
        batch_size: Union[int, None] = None,
        base_model_name = "jhu-clsp/mmBERT-base",
        num_labels = 2,
    ):
        """
        Store the configuration and load tokenizer and model.

        Args:
            model_path: Path to the pretrained model (currently: the LoRA adapter directory)
            max_length: Maximum sequence length for tokenization
            batch_size: Number of texts per forward pass in `run_prediction`
                (None sends a whole list through in one pass)
            base_model_name: Base checkpoint the adapters are applied to
            num_labels: Number of output classes of the classification head

        Raises:
            ValueError: If `batch_size` is given but is not positive.
        """
        # A zero or negative step would make the chunking loop in
        # `run_prediction` fail or silently produce no predictions.
        if batch_size is not None and batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer or None, got {batch_size!r}"
            )

        self.model_path = model_path
        self.max_length = max_length
        self.batch_size = batch_size

        self.base_model_name = base_model_name #delete when model is ready
        self.num_labels = num_labels #delete when model is ready

        self.model_initialization_prelim() #change when model is ready

    def model_initialization(self):
        """
        Load a complete sequence-classification checkpoint from `model_path`.

        Sets `self.model`, `self.tokenizer` and `self.device`. Not called by
        `__init__` at present (the preliminary loader is used instead).
        """
        print(f"Loading model from {self.model_path}...")

        # trust_remote_code lets the checkpoint execute its own Python code,
        # so only point this at checkpoints you trust.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_path,
            trust_remote_code=True  # Add this for ModernBERT
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

        self.model.eval()

        # Move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        print(f"Model loaded successfully on {self.device}")

    def model_initialization_prelim(self):
        """
        Build the classifier from the base checkpoint plus LoRA adapters.

        Loads the tokenizer and a sequence-classification model from
        `self.base_model_name`, applies the adapters found at
        `self.model_path`, merges them into the base weights, switches to
        evaluation mode and moves the model to `self.device`.
        """
        # Imported locally so this loader does not depend on an import made
        # elsewhere in the kernel session.
        from peft import PeftModel

        print(f"Loading model from {self.model_path}...")

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)

        # Load base model with classification head
        # NOTE(review): the base checkpoint reports the classifier weights as
        # newly initialised; predictions are presumably only meaningful if the
        # adapter checkpoint also restores the trained head — confirm.
        print(f"Loading base model: {self.base_model_name}")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.base_model_name,
            num_labels=self.num_labels,
            trust_remote_code=True
        )

        # Load LoRA adapters
        print("Loading LoRA adapters...")
        self.model = PeftModel.from_pretrained(self.model, self.model_path)

        # Merge LoRA weights with base model for faster inference (optional)
        print("Merging LoRA weights...")
        self.model = self.model.merge_and_unload()

        # Set model to evaluation mode
        self.model.eval()

        # Move to GPU if available. Must use the attribute: a bare `device`
        # is not defined in this scope and raises NameError on a fresh kernel.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        print(f"Model loaded successfully on {self.device}")

    def _predict_probabilities(self, texts):
        """Tokenize a string or list of strings and return the softmax class probabilities (one row per text)."""
        encoded = self.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The inputs have to live on the same device as the model.
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = self.model(**encoded).logits
            return torch.softmax(logits, dim=1)

    def predict_single(self, text: str) -> Tuple[int, float, np.ndarray]:
        """
        Make a prediction on a single text input.

        Args:
            text: Input text to classify

        Returns:
            Tuple of (predicted_class, confidence, probabilities), where
            confidence is the probability of the predicted class.
        """
        probabilities = self._predict_probabilities(text)[0]
        predicted_class = int(torch.argmax(probabilities).item())
        confidence = probabilities[predicted_class].item()

        return predicted_class, confidence, probabilities.cpu().numpy()

    def predict_batch(self, texts: List[str]) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Make predictions on a batch of text inputs in one forward pass.

        Args:
            texts: List of input texts to classify

        Returns:
            Tuple of (predicted_classes, confidences, all_probabilities)
        """
        probabilities = self._predict_probabilities(texts)
        predicted_classes = torch.argmax(probabilities, dim=1).cpu().numpy()
        confidences = probabilities.max(dim=1).values.cpu().numpy()

        return predicted_classes, confidences, probabilities.cpu().numpy()

    def run_prediction(
        self,
        text: Union[str, List[str]]
    ) -> Union[Tuple[int, float], Tuple[np.ndarray, np.ndarray]]:
        """
        Run prediction on single text or batch of texts.

        Args:
            text: Single text string or list of text strings

        Returns:
            For single text: (predicted_class, confidence)
            For batch: (predicted_classes, confidences) as arrays with one
            entry per input text, in input order. An empty list yields two
            empty arrays.
        """
        # Anything that is not a list is treated as one text.
        if not isinstance(text, list):
            predicted_class, confidence, _ = self.predict_single(text)
            return predicted_class, confidence

        # Nothing to classify: skip the tokenizer and model entirely.
        if not text:
            return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

        # Without a batch size the whole list goes through in one pass.
        step = self.batch_size if self.batch_size is not None else len(text)

        class_chunks = []
        confidence_chunks = []
        for start in range(0, len(text), step):
            classes, confidences, _ = self.predict_batch(text[start:start + step])
            class_chunks.append(classes)
            confidence_chunks.append(confidences)

        return np.concatenate(class_chunks), np.concatenate(confidence_chunks)


'''# Example usage

# Initialize detector
MODEL_PATH = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/output/mmBERT/template_3_4_5_merged"
detector = BlameDetectorDa(
    model_path=MODEL_PATH,
    max_length=512,
    batch_size=8  # Process 8 texts at a time
)

print("Single Prediction Test")


# Single prediction
single_text = "Dette er en test sætning."
pred_class, confidence = detector.run_prediction(single_text)
print(f"\nText: {single_text}")
print(f"Predicted Class: {pred_class}")
print(f"Confidence: {confidence:.4f}")

print("Batch Prediction Test")

# Batch prediction
batch_texts = [
    "Første test sætning.",
    "Anden test sætning.",
    "Tredje test sætning.",
    "Fjerde test sætning.",
]

pred_classes, confidences = detector.run_prediction(batch_texts)

print("\nBatch Results:")
for i, (text, cls, conf) in enumerate(zip(batch_texts, pred_classes, confidences), 1):
    print(f"\n{i}. Text: {text}")
    print(f"   Class: {cls}, Confidence: {conf:.4f}")'''

'# Example usage\n\n# Initialize detector\nMODEL_PATH = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/output/mmBERT/template_3_4_5_merged"\ndetector = BlameDetectorDa(\n    model_path=MODEL_PATH,\n    max_length=512,\n    batch_size=8  # Process 8 texts at a time\n)\n\nprint("Single Prediction Test")\n\n\n# Single prediction\nsingle_text = "Dette er en test sætning."\npred_class, confidence = detector.run_prediction(single_text)\nprint(f"\nText: {single_text}")\nprint(f"Predicted Class: {pred_class}")\nprint(f"Confidence: {confidence:.4f}")\n\nprint("Batch Prediction Test")\n\n# Batch prediction\nbatch_texts = [\n    "Første test sætning.",\n    "Anden test sætning.",\n    "Tredje test sætning.",\n    "Fjerde test sætning.",\n]\n\npred_classes, confidences = detector.run_prediction(batch_texts)\n\nprint("\nBatch Results:")\nfor i, (text, cls, conf) in enumerate(zip(batch_texts, pred_classes, confidences), 1):\n    print(f"\n{i}. Text: {text}")\n    print(f"   Class: {cls}, Confide

In [55]:
import json
from tqdm import tqdm  # for progress bar (optional)

# Directory passed to the detector as `model_path`; the detector's preliminary
# loader reads the LoRA adapter weights from here.
MODEL_PATH = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/output/mmBERT/template_3_4_5_model"

# Inputs are tokenized to 1024 tokens and scored two texts per forward pass.
detector = BlameDetectorDa(model_path=MODEL_PATH, max_length=1024, batch_size=2)


Loading model from /work/MarkusLundsfrydJensen#1865/Bachelor_project/output/mmBERT/template_3_4_5_model...
Loading base model: jhu-clsp/mmBERT-base


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading LoRA adapters...
Merging LoRA weights...
Model loaded successfully on cpu


In [None]:
# ---- 1. Load JSON dataset ----
import json
json_path = "/work/MarkusLundsfrydJensen#1865/inferece_data/final_inference_data.json"

# Read the whole inference set into memory; the encoding is given explicitly
# so the result does not depend on the platform default.
with open(json_path, encoding='utf-8') as json_file:
    data = json.load(json_file)


In [None]:


# ---- 2. Extract texts ----
# Keep hold of the records that actually carry a "text" field, so predictions
# can be written back to exactly the records they were computed from.
records_with_text = [item for item in data if "text" in item]
texts = [item["text"] for item in records_with_text]

# ---- 3. Run predictions in batches ----
pred_classes, confidences = detector.run_prediction(texts)
assert len(pred_classes) == len(confidences) == len(records_with_text), \
    "number of predictions does not match number of texts"

# ---- 4. Attach predictions back to data ----
# Pair against the filtered list, not `data`: zipping with `data` would attach
# every prediction after the first record without "text" to the wrong record.
for item, cls, conf in zip(records_with_text, pred_classes, confidences):
    item["predicted_class"] = int(cls)
    item["confidence"] = float(conf)


# TODO: save the annotated data (not implemented yet)



In [None]:
# Print predicted class and confidence for every record whose "label" is 1.
# Records that never received a prediction (e.g. those without a "text"
# field) are skipped instead of raising KeyError.
for entry in data:
    if entry["label"] == 1 and "predicted_class" in entry:
        print(entry["predicted_class"])
        print(entry["confidence"])

0
0.5366607308387756
0
0.7602487802505493
