## This was developed in Google Colab

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, f1_score, accuracy_score, mean_squared_error, mean_absolute_error, f1_score
import numpy as np

## Load the training data

In [8]:
# Read the training data from 'train.csv' in the current working directory.
# Judging by the preview further down, the frame holds a 'question' text
# column followed by numeric *_complexity score columns.
df = pd.read_csv('train.csv')

In [3]:
# Sanity check: number of rows loaded from the CSV.
len(df)

300

In [9]:
# Rescale every score column (everything after the first column) by 1/10.
score_cols = df.columns[1:]

# Guard so the cell is safe to re-run: once the scores have been divided by 10
# no value exceeds 1, and a second execution leaves them untouched instead of
# shrinking them again. pandas' max() skips NaN, so missing values do not
# disable the rescale.
if df[score_cols].max().max() > 1:
    df[score_cols] = df[score_cols] / 10

In [10]:
# Preview the last two rows after rescaling.
df.tail(2)

Unnamed: 0,question,remembering_complexity,creation_complexity,evaluation_complexity,analysis_complexity,synthesis_complexity,applying_complexity,hypothesis_complexity
298,The challenge of capturing the essence of huma...,0.6,0.9,0.8,0.8,0.9,0.7,0.8
299,The challenge of understanding consciousness p...,0.6,0.9,0.8,0.8,0.9,0.7,0.9


In [13]:
# Lower-case the question text before tokenisation. Text passed to the model
# at inference time should be lower-cased the same way.
df['question'] = df['question'].str.lower()
# Preview the last two rows to confirm the change.
df.tail(2)


Unnamed: 0,question,remembering_complexity,creation_complexity,evaluation_complexity,analysis_complexity,synthesis_complexity,applying_complexity,hypothesis_complexity
298,the challenge of capturing the essence of huma...,0.6,0.9,0.8,0.8,0.9,0.7,0.8
299,the challenge of understanding consciousness p...,0.6,0.9,0.8,0.8,0.9,0.7,0.9


## Model preparation

In [14]:
# Create a custom dataset class
class BloomsTaxonomyDataset(Dataset):
    """Map-style dataset pairing question texts with continuous label vectors.

    Args:
        texts: Sequence of question texts (list, NumPy array or pandas Series).
            Each element is converted with ``str``.
        labels: Sequence of per-text label vectors (list, NumPy array or
            pandas DataFrame), one row per text.
        tokenizer: Callable tokenizer; it is invoked with the text plus
            padding/truncation keyword arguments and must return a mapping
            holding ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise both inputs positionally. pandas objects coming out of
        # train_test_split keep their shuffled index, so `series[idx]` would
        # otherwise do a label-based lookup and return the wrong row (or fail).
        self.texts = [str(t) for t in texts]
        self.labels = np.asarray(labels, dtype=np.float32)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Calling the tokenizer directly replaces the older `encode_plus` entry point.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # The tokenizer returns a batch of one; drop the batch axis so the
            # default collate function can stack examples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float labels, since the targets are continuous scores
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }


In [None]:
# Target columns the model is trained to predict; their order fixes the order
# of the model outputs.
# NOTE(review): 'remembering_complexity' appears in the data preview but is not
# listed here - confirm that leaving it out is intentional.
label_cols = [
    'creation_complexity',
    'evaluation_complexity',
    'analysis_complexity',
    'synthesis_complexity',
    'applying_complexity',
    'hypothesis_complexity',
]

In [None]:
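# Alternative (disabled): load a DistilBERT tokenizer and model instead of the
# DistilRoBERTa checkpoint used in the next cell. Kept for reference only.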
# model_name = "distilbert-base-uncased"
# tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# model = DistilBertForSequenceClassification.from_pretrained(
#     model_name,
#     num_labels=len(label_cols),
#     problem_type="multi_label_classification"
# )

In [15]:
# Load the pretrained tokenizer and a sequence-classification model built on
# the same checkpoint. The classification head is newly initialised (see the
# warning printed below) and is trained later in this notebook.
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_cols),  # One output per target column
    # NOTE(review): this problem type is expected to train each output
    # independently through a sigmoid/BCE-style loss, which is why predictions
    # are passed through a sigmoid elsewhere - confirm against the transformers docs.
    problem_type="multi_label_classification"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Split the data into training and validation sets.
# Plain arrays (.values) are passed so the resulting splits are indexed
# positionally; 10% of the rows are held out and the seed is fixed so the
# split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['question'].values,
    df[label_cols].values,
    test_size=0.1,
    random_state=42
)

In [18]:
# Create the training dataset object
# Training split wrapped as a torch Dataset (default max_len is used)
train_dataset = BloomsTaxonomyDataset(train_texts, train_labels, tokenizer=tokenizer)

# Held-out split wrapped the same way, used for evaluation
eval_dataset = BloomsTaxonomyDataset(val_texts, val_labels, tokenizer=tokenizer)

In [19]:
# Define training arguments
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation settings
    num_train_epochs=5,  # Increase epochs for small datasets
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Logging / checkpointing / reporting
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

In [20]:
# Compute metrics for continuous multi-label classification
def compute_metrics(pred, threshold=0.5):
    """Compute regression and thresholded-classification metrics for an evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (raw logits, or a
            tuple/list whose first element is the logits) and ``label_ids``
            (continuous targets).
        threshold: Cut-off applied to both the sigmoid outputs and the labels
            before computing the binary F1 score. Defaults to 0.5.

    Returns:
        dict with keys ``'mse'``, ``'mae'`` and ``'f1_micro_binary'``.
    """
    logits = pred.predictions
    # Some models return extra outputs alongside the logits; the logits come first.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    # Sigmoid maps the raw logits onto [0, 1], the same scale as the labels
    preds = torch.sigmoid(torch.tensor(logits)).numpy()
    labels = pred.label_ids

    # Debug: Check label ranges
    print("Label range in validation set:", labels.min(), "to", labels.max())

    # Regression metrics on the continuous scores
    mse = mean_squared_error(labels, preds)
    mae = mean_absolute_error(labels, preds)

    # Classification view: binarise predictions and labels at the same threshold
    preds_binary = (preds > threshold).astype(int)
    labels_binary = (labels > threshold).astype(int)
    f1 = f1_score(labels_binary, preds_binary, average='micro', zero_division=0)

    return {
        'mse': mse,
        'mae': mae,
        'f1_micro_binary': f1  # Binary F1 after thresholding
    }

In [21]:
# Initialize the Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # Called on each evaluation pass
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [22]:
# Fine-tune the model on train_dataset using the arguments defined above.
# The returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
10,0.6889
20,0.596
30,0.5524
40,0.5417
50,0.5556
60,0.5141
70,0.5782
80,0.5606
90,0.5285
100,0.5236


TrainOutput(global_step=340, training_loss=0.5418119122000301, metrics={'train_runtime': 61.5066, 'train_samples_per_second': 21.949, 'train_steps_per_second': 5.528, 'total_flos': 44710936243200.0, 'train_loss': 0.5418119122000301, 'epoch': 5.0})

In [23]:
# Evaluate the fine-tuned model on eval_dataset; the result includes the
# evaluation loss plus the values returned by compute_metrics.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

Label range in validation set: 0.1 to 0.9
Evaluation results: {'eval_loss': 0.5368255376815796, 'eval_mse': 0.0037132089491933584, 'eval_mae': 0.04629255458712578, 'eval_f1_micro_binary': 0.9285714285714286, 'eval_runtime': 0.2264, 'eval_samples_per_second': 132.536, 'eval_steps_per_second': 17.671, 'epoch': 5.0}


In [24]:
# Create a function for prediction
def evaluate_text(text):
    """Score one text with the in-memory fine-tuned ``model``.

    Args:
        text: The text to score. The training questions were lower-cased
            before tokenisation, so callers should lower-case their input too.

    Returns:
        dict mapping each name in ``label_cols`` to a float score in [0, 1].

    Raises:
        ValueError: If the tokenised input holds more than one sequence.
    """
    # Make sure dropout is disabled; training code can leave the model in
    # train mode, which would make the scores non-deterministic.
    model.eval()

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index the single batch row explicitly; a dimension-less squeeze() would
    # also collapse the label axis if there were only one label.
    if logits.shape[0] != 1:
        raise ValueError(f"evaluate_text scores one text at a time, got a batch of {logits.shape[0]}")

    # Convert logits to 0-1 scores using sigmoid
    probs = torch.sigmoid(logits[0])

    # Pair every label name with its score
    return {label: prob.item() for label, prob in zip(label_cols, probs)}


In [25]:
# --- Example Usage ---
# Score a short text with the in-memory model and print one line per label.
new_text = "sports are a great way to stay active and connect with others. have you tried any new sports recently?"
scores = evaluate_text(new_text)

print(f"Evaluation for text: '{new_text}'")
for label, score in scores.items():
    print(f"- {label}: {score:.4f}")

Evaluation for text: 'sports are a great way to stay active and connect with others. have you tried any new sports recently?'
- creation_complexity: 0.1184
- evaluation_complexity: 0.2261
- analysis_complexity: 0.1897
- synthesis_complexity: 0.1051
- applying_complexity: 0.1588
- hypothesis_complexity: 0.1190


In [26]:
# Second example, scored with the in-memory model.
# NOTE(review): this text appears to match the last row of the training data
# preview, so these scores may reflect a sample the model has seen - confirm.
new_text_2 = "the challenge of understanding consciousness persists as one of the most complex puzzles in neuroscience. how can we craft a comprehensive model that accounts for subjective experience while integrating data from diverse neural processes?"
scores_2 = evaluate_text(new_text_2)

print(f"\nEvaluation for text: '{new_text_2}'")
for label, score in scores_2.items():
    print(f"- {label}: {score:.4f}")


Evaluation for text: 'the challenge of understanding consciousness persists as one of the most complex puzzles in neuroscience. how can we craft a comprehensive model that accounts for subjective experience while integrating data from diverse neural processes?'
- creation_complexity: 0.9048
- evaluation_complexity: 0.8058
- analysis_complexity: 0.8285
- synthesis_complexity: 0.9033
- applying_complexity: 0.7395
- hypothesis_complexity: 0.8550


## Export the model and tokenizer

In [27]:
# Directory the fine-tuned model and tokenizer are written to
# (relative to the current working directory).
output_dir = "./saved_model"

# Save the model and the tokenizer side by side so the directory can be
# loaded back with from_pretrained.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

Model saved to ./saved_model


In [32]:
# Colab only: uncomment to zip the saved model directory for download.
# !zip -r /content/saved_model.zip /content/saved_model

  adding: content/saved_model/ (stored 0%)
  adding: content/saved_model/model.safetensors (deflated 7%)
  adding: content/saved_model/vocab.json (deflated 59%)
  adding: content/saved_model/tokenizer_config.json (deflated 75%)
  adding: content/saved_model/config.json (deflated 55%)
  adding: content/saved_model/merges.txt (deflated 53%)
  adding: content/saved_model/tokenizer.json (deflated 82%)
  adding: content/saved_model/special_tokens_map.json (deflated 52%)


In [33]:
# Colab only: uncomment to check the size of the archive created above.
# !ls -lh /content/saved_model.zip

-rw-r--r-- 1 root root 292M May 27 13:42 /content/saved_model.zip


## Load the model back

In [28]:
# Path of the directory written by the export step above
saved_model_path = "./saved_model"

# Reload the model and tokenizer from disk into separate objects, so the
# exported artefacts can be checked independently of the in-memory model.
loaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(saved_model_path)

In [29]:
# Create a function for prediction using the loaded model
def evaluate_text_loaded(text):
    """Score one text with the model and tokenizer reloaded from disk.

    Args:
        text: The text to score. The training questions were lower-cased
            before tokenisation, so callers should lower-case their input too.

    Returns:
        dict mapping each name in ``label_cols`` to a float score in [0, 1].

    Raises:
        ValueError: If the tokenised input holds more than one sequence.
    """
    # Tokenize the input text using the loaded tokenizer
    inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Run on the GPU when one is available, otherwise on the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_model.to(device)
    # Ensure dropout is disabled so repeated calls give identical scores
    loaded_model.eval()
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = loaded_model(**inputs).logits

    # Index the single batch row explicitly; a dimension-less squeeze() would
    # also collapse the label axis if there were only one label.
    if logits.shape[0] != 1:
        raise ValueError(f"evaluate_text_loaded scores one text at a time, got a batch of {logits.shape[0]}")

    # Convert logits to 0-1 scores using sigmoid
    probs = torch.sigmoid(logits[0])

    # Pair every label name with its score
    return {label: prob.item() for label, prob in zip(label_cols, probs)}

In [30]:
# --- Example Usage with Loaded Model ---
# Same text as the earlier example; matching scores indicate the save/load
# round trip preserved the weights.
new_text = "sports are a great way to stay active and connect with others. have you tried any new sports recently?"
scores = evaluate_text_loaded(new_text)

print(f"Evaluation for text: '{new_text}'")
for label, score in scores.items():
    print(f"- {label}: {score:.4f}")

Evaluation for text: 'sports are a great way to stay active and connect with others. have you tried any new sports recently?'
- creation_complexity: 0.1184
- evaluation_complexity: 0.2261
- analysis_complexity: 0.1897
- synthesis_complexity: 0.1051
- applying_complexity: 0.1588
- hypothesis_complexity: 0.1190


In [31]:
new_text_2 = "the challenge of understanding consciousness persists as one of the most complex puzzles in neuroscience. how can we craft a comprehensive model that accounts for subjective experience while integrating data from diverse neural processes?"
# Score with the reloaded model. This cell previously called evaluate_text,
# which uses the in-memory model and therefore did not test the export.
scores_2 = evaluate_text_loaded(new_text_2)

print(f"\nEvaluation for text: '{new_text_2}'")
for label, score in scores_2.items():
    print(f"- {label}: {score:.4f}")


Evaluation for text: 'the challenge of understanding consciousness persists as one of the most complex puzzles in neuroscience. how can we craft a comprehensive model that accounts for subjective experience while integrating data from diverse neural processes?'
- creation_complexity: 0.9048
- evaluation_complexity: 0.8058
- analysis_complexity: 0.8285
- synthesis_complexity: 0.9033
- applying_complexity: 0.7395
- hypothesis_complexity: 0.8550
