# Load BloomBERT Model

In [None]:
# Install the `transformers` package with pip; --quiet keeps pip's output short.
%pip install transformers --quiet

In [2]:
import torch
from src.model.bloombert import BloomBERT

# Inference settings. Only the target device is needed in this notebook; the
# training hyperparameters (learning_rate=1e-5, batch_size=128, epochs=50) are
# kept here purely as a reminder and are not used.
config = {
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

# Build the classifier with 6 output classes on the chosen device, restore the
# saved weights, then switch to evaluation mode.
# NOTE: torch.load unpickles the checkpoint -- only load files you trust.
best_model = BloomBERT(output_dim=6).to(config["device"])
best_model.load_state_dict(
    torch.load(
        "model/bloombert_model.pt",
        map_location=config["device"],
    )
)
best_model.eval()

BloomBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_fe

In [14]:
import pandas as pd

# Map the model's 0-based class index to our own 1-based label:
#   0 Remember -> 1, 1 Understand -> 2, 2 Apply -> 3,
#   3 Analyse -> 4,  4 Evaluate -> 5,   5 Create -> 6
category_map = {class_index: class_index + 1 for class_index in range(6)}

In [15]:
import re

def split(text):
    """Break ``text`` into trimmed, non-empty sentence fragments.

    A boundary is a ``.``, ``!`` or ``?`` followed by whitespace, or a run of
    one or more newlines. Boundary characters are dropped; a terminator at the
    very end of the text (with nothing after it) stays attached to the last
    fragment. Anything that is not a ``str`` yields an empty list.
    """
    if isinstance(text, str):
        trimmed = (piece.strip() for piece in re.split(r'[.!?]\s+|\n+', text))
        return [piece for piece in trimmed if piece]
    return []

def pred_blooms(text, model, tokenizer, device):
    """Predict the Bloom's-taxonomy label for a single piece of text.

    Args:
        text: Text to classify. Non-strings, empty or whitespace-only strings,
            and the literal "nan" (any case; what a missing pandas cell
            becomes after ``astype(str)``) are treated as blank.
        model: Callable invoked as ``model(**inputs)``. It must return either
            a logits tensor or a sequence whose first item is the logits.
        tokenizer: Callable that converts ``text`` into a mapping of tensors;
            called with ``return_tensors="pt"``, truncation, padding and
            ``max_length=256``.
        device: Device the tokenized tensors are moved to before inference.

    Returns:
        A ``(label, probabilities)`` tuple. ``label`` is the ``category_map``
        entry for the most probable class index (0 if the index is not
        mapped); ``probabilities`` is the softmax distribution of the first
        batch item as a list of floats. Blank input, or any error raised while
        tokenizing or running the model, yields ``(0, [0.0] * 6)``.
    """
    import logging  # function-scope import: the file has no logging import

    # Treat empty, whitespace, or "nan" (from pandas) as blank
    if not isinstance(text, str) or not text.strip() or text.strip().lower() == "nan":
        return 0, [0.0] * 6
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs if isinstance(outputs, torch.Tensor) else outputs[0]
            # torch.softmax avoids depending on an `F` alias that may not be
            # imported; a missing alias would otherwise be swallowed below and
            # every prediction would silently become 0.
            probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
        # Plain int so the dictionary lookup does not rely on NumPy scalar hashing.
        label_idx = int(probs.argmax())
        return category_map.get(label_idx, 0), probs.tolist()
    except Exception as exc:
        # Deliberate best-effort fallback, but make the failure visible instead
        # of silently producing a "blank" prediction.
        logging.getLogger(__name__).warning(
            "pred_blooms failed for text %r: %r", text[:80], exc
        )
        return 0, [0.0] * 6

# Read the syllabi data to classify.
df = pd.read_csv("\\THESIS2\\CuatroBloomBERT\\csv_files\\combined_apc_syllabi_data.csv")

# Free-text columns whose sentences get a Bloom's label.
target_columns = ["Learning Outcomes", "Deliverables", "Assessments"]

# For every target column add "<column>_Bloom": one list per row holding the
# predicted label of each sentence in that cell. Cells are cast to str first,
# so a missing value arrives as "nan", which pred_blooms labels as 0.
# NOTE(review): `tokenizer` is not defined in the visible cells -- confirm it
# is created earlier in the session before running this cell.
for col in target_columns:
    per_row_labels = []
    for cell_text in df[col].astype(str):
        per_row_labels.append(
            [
                pred_blooms(sentence, best_model, tokenizer, config["device"])[0]
                for sentence in split(cell_text)
            ]
        )
    df[f"{col}_Bloom"] = per_row_labels

# Write the augmented table next to the notebook and report completion.
df.to_csv("bloom_classified_output.csv", index=False)
print("✅ Bloom's classification (per sentence) completed and saved to bloom_classified_output.csv")

✅ Bloom's classification (per sentence) completed and saved to bloom_classified_output.csv
