In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the project's src.* helpers) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from transformers import DistilBertTokenizer
from src.dataset.process_data import load_dataset, calculate_class_weights
from src.training.train_bloombert import train_model_bloombert
from src.helper.plots_helper import plot_training_history

# CONFIGURE TORCH
# Fix the RNG seed and release any cached CUDA memory before picking the device.
torch.manual_seed(1111)
torch.cuda.empty_cache()

if torch.cuda.is_available():
    # Ask cuDNN for deterministic algorithms when running on a GPU.
    torch.backends.cudnn.deterministic = True
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f'-- Training on: {device}')

In [4]:
print("-- Loading dataset")

# Load the raw CSV (with the loader's `clean` option switched on) and order the
# rows by label, then by text, before persisting the result.
df = (
    load_dataset('data/blooms_dataset.csv', clean=True)
    .sort_values(by=['Label', 'Text'], ascending=[True, True])
)
df.to_csv('data/blooms_cleaned_dataset.csv', index=False)

# Per-label sample counts, listed in label order.
print(df['Label'].value_counts().sort_index())

# Class weights derived from the label column, moved to the active device.
class_weights = calculate_class_weights(df["Label"].to_numpy()).to(device)
print(class_weights)

-- Loading dataset
Label
0    1532
1    2348
2     671
3     560
4     634
5     430
Name: count, dtype: int64
tensor([0.6718, 0.4383, 1.5338, 1.8378, 1.6233, 2.3934])


In [None]:
# Tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Training hyper-parameters; the device entry is resolved at run time.
config = {"learning_rate": 1e-5, "batch_size": 128, "epochs": 50}
config["device"] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class weights computed from the label column and placed on the training device.
class_weights = calculate_class_weights(df["Label"].to_numpy()).to(config["device"])

# Run training: 20% of the data is passed as test_size, augmentation is off.
best_model, history, best_val_acc = train_model_bloombert(
    df, tokenizer, config,
    class_weights=class_weights, test_size=0.2, augment=False,
)

In [None]:
# Visualise the `history` object returned by train_model_bloombert in the training cell.
plot_training_history(history)

In [None]:
import os

# save best model
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; otherwise a finished training run could be lost
# to a save error.
os.makedirs("model", exist_ok=True)
torch.save(best_model.state_dict(), "model/bloombert_model.pt")

# Load trained model

In [6]:
from src.model.bloombert import BloomBERT

# Inference only needs to know which device to run on.
config = {
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

# load model
# output_dim=6 matches the six labels (0-5) present in the dataset.
best_model = BloomBERT(output_dim=6).to(config["device"])
# The checkpoint is a plain state_dict, so restrict unpickling to tensors and
# primitive containers (weights_only=True) instead of arbitrary Python objects.
best_model.load_state_dict(
    torch.load(
        "model/bloombert_model.pt",
        map_location=config["device"],
        weights_only=True,
    )
)
# Switch to evaluation mode; as the last expression this also displays the model.
best_model.eval()

BloomBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_fe

In [7]:
import numpy as np
# Print floats without scientific notation, rounded to 4 decimals.
np.set_printoptions(suppress=True, precision=4)

# Small sanity-check batch of example sentences.
texts = [
    "remember the main point of the lesson",
    "apply the concept to solve a problem",
    "develop new project ideas",
]

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

# Put the inputs on the same device the model was moved to (config["device"]),
# rather than relying on the separate `device` global from the training setup cell.
input_ids = encodings["input_ids"].to(config["device"])
attention_mask = encodings["attention_mask"].to(config["device"])

best_model.eval()

with torch.no_grad():
    # NOTE: these are the raw model outputs; no softmax is applied in this cell,
    # so despite the name they are not normalised probabilities.
    prob_ge = best_model(input_ids=input_ids, attention_mask=attention_mask)

print("prob_ge outputs:")
print(prob_ge)

# Same values as a NumPy array (copied to host memory first).
prob_ge_np = prob_ge.cpu().numpy()
print(prob_ge_np)

prob_ge outputs:
tensor([[ 3.0434, -1.8520,  0.1674, -0.0178, -0.4197,  0.1016],
        [-0.8945, -2.0402,  3.7073, -0.3888, -0.7278, -0.0404],
        [-1.5322, -2.0863, -1.1225, -0.2648, -0.8602,  4.0943]])
[[ 3.0434 -1.852   0.1674 -0.0178 -0.4197  0.1016]
 [-0.8945 -2.0402  3.7073 -0.3888 -0.7278 -0.0404]
 [-1.5322 -2.0863 -1.1225 -0.2648 -0.8602  4.0943]]


In [8]:
# Maps the model's class index to the Bloom's taxonomy level name.
category_dict = dict(enumerate([
    "Remember",
    "Understand",
    "Apply",
    "Analyse",
    "Evaluate",
    "Create",
]))


def predict_blooms(text, model, tokenizer, device):
    """Classify `text` and return its Bloom's level with class probabilities.

    Args:
        text: Input passed straight to `tokenizer`.
        model: Callable taking `input_ids` and `attention_mask` keyword
            arguments and returning a 2-D tensor of per-class scores.
        tokenizer: Callable producing a mapping with "input_ids" and
            "attention_mask" tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A tuple `(label, probs)`: `label` is the `category_dict` name of the
        highest-scoring class for the first row of the batch, and `probs` is a
        NumPy array with the softmax probabilities of that first row.
    """
    batch = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)

        # Only the first row of the batch is reported.
        probs = torch.nn.functional.softmax(logits, dim=1)[0].cpu().numpy()
        pred_class = int(torch.argmax(logits, dim=1)[0])

    return category_dict[pred_class], probs

In [9]:
# Single-sentence smoke test of predict_blooms with the loaded model.
test_text = "Brainstorm a new market strategy to expand in asia"
predicted_class, probabilities = predict_blooms(
    test_text, best_model, tokenizer, config["device"]
)

# Report the input, the predicted level and the per-class probabilities.
for caption, value in (
    ("Text:", test_text),
    ("Predicted Class:", predicted_class),
    ("Probabilities:", probabilities),
):
    print(caption, value)

Text: Brainstorm a new market strategy to expand in asia
Predicted Class: Create
Probabilities: [0.0034 0.0028 0.0097 0.0074 0.008  0.9687]
