In [1]:
import os
import urllib.request
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [None]:
# ==========================================
# 1. Data Download and Processing
# ==========================================
def download_probing_data(task_name="past_present", data_dir="./data"):
    """
    Download one probing-task file from the SentEval repository, unless cached.

    The file is fetched from ``data/probing/{task_name}.txt`` on the repository's
    ``main`` branch, so ``task_name`` must be the name of a file in that folder:
    'sentence_length', 'word_content', 'tree_depth', 'top_constituents',
    'bigram_shift', 'past_present', 'subj_number', 'obj_number',
    'odd_man_out', 'coordination_inversion'.

    Args:
        task_name: Base name (without ``.txt``) of the probing file to fetch.
        data_dir: Directory used as the local cache; created if missing.

    Returns:
        Path of the local ``{task_name}.txt`` file inside ``data_dir``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the download fails
        (for example ``urllib.error.URLError``). No file is left at the
        returned path in that case.
    """
    os.makedirs(data_dir, exist_ok=True)
    url = f"https://raw.githubusercontent.com/facebookresearch/SentEval/main/data/probing/{task_name}.txt"
    file_path = os.path.join(data_dir, f"{task_name}.txt")

    if not os.path.exists(file_path):
        print(f"Downloading {task_name}...")
        # Download next to the target and move it into place only on success.
        # Writing straight to file_path would leave a truncated file behind on
        # an interrupted transfer, and the exists() check above would then
        # treat that partial file as a valid cache on the next run.
        tmp_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, tmp_path)
            os.replace(tmp_path, file_path)
        finally:
            # After a successful replace the temp file is gone; this only
            # cleans up after a failed or interrupted download.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print("Download complete.")
    else:
        print(f"Data for {task_name} already exists.")
    return file_path

class ProbingDataset(Dataset):
    """
    One partition of a SentEval probing file, tokenized on access.

    Each line of the file is expected to be ``partition \\t label \\t sentence``.
    Lines with fewer than three tab-separated fields are skipped.

    Attributes:
        sentences: Sentences of the rows whose partition equals ``partition_filter``.
        labels: Raw label strings, aligned with ``sentences``.
        label_map: Label string -> integer id. Built from the labels of *every*
            row in the file (not only the selected partition), so all datasets
            created from the same file share the same ids.
        num_classes: Number of entries in ``label_map``.
    """

    def __init__(self, file_path, partition_filter, tokenizer, max_len=128):
        """
        Args:
            file_path: Path to the tab-separated probing file.
            partition_filter: Partition tag to keep (the notebook uses 'tr' and 'te').
            tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
                max_length=..., padding=..., truncation=..., return_tensors='pt')``
                that returns a mapping with 'input_ids' and 'attention_mask' tensors.
            max_len: Length every example is padded or truncated to.
        """
        self.sentences = []
        self.labels = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        # SentEval format: "partition \t label \t sentence content..."
        label_set = set()
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 3:
                    continue

                partition, label, text = parts[0], parts[1], parts[2]

                # Record the label regardless of partition: the mapping must not
                # depend on which split this instance happens to load, otherwise
                # train and test datasets can assign different ids to one class.
                label_set.add(label)

                if partition == partition_filter:
                    self.sentences.append(text)
                    self.labels.append(label)

        # Sorted so the mapping is deterministic for a given file.
        self.label_map = {l: i for i, l in enumerate(sorted(label_set))}
        self.num_classes = len(self.label_map)

    def __len__(self):
        """Number of sentences in the selected partition."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Tokenize sentence ``idx`` and return it with its integer label.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer's tensors,
            flattened to 1-D) and 'labels' (scalar ``torch.long`` tensor).
        """
        text = self.sentences[idx]
        label_str = self.labels[idx]
        label_id = self.label_map[label_str]

        # Pad/truncate to a fixed length so the default DataLoader collation
        # can stack examples into a batch.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

In [3]:
# ==========================================
# 2. Setup BERT Encoder (The Subject of the Probe)
# ==========================================
def get_bert_encoder():
    """
    Load the pretrained 'bert-base-uncased' tokenizer and encoder.

    All encoder parameters have ``requires_grad`` switched off: the encoder is
    the object being probed, not something to be trained.

    Returns:
        (tokenizer, model) tuple.
    """
    print("Loading BERT model...")
    checkpoint = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertModel.from_pretrained(checkpoint)

    # Freeze every weight in one call; gradients are never needed for the encoder.
    model.requires_grad_(False)

    return tokenizer, model

In [4]:
# ==========================================
# 3. Setup Probing Classifier
# ==========================================
class ProbingClassifier(nn.Module):
    """
    Small MLP probe: Linear -> Tanh -> Dropout(0.1) -> Linear.

    Args:
        input_dim: Size of the embedding fed to the probe.
        hidden_dim: Width of the single hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        # Layers are registered in the order they are applied in forward().
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map embeddings ``x`` to unnormalised class logits."""
        hidden = self.dropout(self.activation(self.fc1(x)))
        return self.fc2(hidden)

In [5]:
# ==========================================
# 4. Training and Evaluation Functions
# ==========================================
def train_probe(encoder, probe, train_loader, device, epochs=5, learning_rate=1e-3):
    """
    Train ``probe`` on [CLS] embeddings produced by ``encoder``.

    Only the probe's parameters are given to the optimizer; the encoder is run
    under ``torch.no_grad()`` and put into eval mode.

    Args:
        encoder: Model called as ``encoder(input_ids, attention_mask=mask)`` whose
            output exposes ``last_hidden_state``.
        probe: Classifier mapping the first-token embedding to class logits.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device both models and every batch are moved to.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Adam learning rate for the probe.

    Side effects:
        Moves both models to ``device``, leaves ``encoder`` in eval mode and
        ``probe`` in train mode, and prints the mean loss after each epoch.
    """
    encoder.to(device)
    probe.to(device)
    # The encoder is a fixed feature extractor here. Force eval mode so its
    # dropout layers cannot add noise to the embeddings if the caller passed
    # it in train mode.
    encoder.eval()

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(probe.parameters(), lr=learning_rate)

    print(f"Starting training on {device}...")

    for epoch in range(epochs):
        probe.train()
        total_loss = 0.0
        num_batches = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Step 1: Get Embeddings (No Gradients for Encoder)
            with torch.no_grad():
                outputs = encoder(input_ids, attention_mask=mask)
                # Use [CLS] token representation (first token)
                embeddings = outputs.last_hidden_state[:, 0, :]

            # Step 2: Train Probe
            optimizer.zero_grad()
            logits = probe(embeddings)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # An empty loader would otherwise end in a ZeroDivisionError below.
        if num_batches == 0:
            print(f"Epoch {epoch+1}: train_loader produced no batches, nothing to train on.")
            continue

        print(f"Epoch {epoch+1} Loss: {total_loss/num_batches:.4f}")


In [6]:
def evaluate_probe(encoder, probe, test_loader, device):
    """
    Compute the probe's accuracy on ``test_loader``.

    Args:
        encoder: Model called as ``encoder(input_ids, attention_mask=mask)`` whose
            output exposes ``last_hidden_state``.
        probe: Trained classifier applied to the first-token embedding.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device both models and every batch are moved to.

    Returns:
        Accuracy as computed by ``accuracy_score`` over all batches.

    Side effects:
        Moves both models to ``device``, puts both into eval mode and prints
        the accuracy as a percentage.
    """
    encoder.to(device)
    probe.to(device)
    # Both modules must be in eval mode, otherwise dropout makes the
    # predictions stochastic. The encoder's mode is otherwise whatever the
    # caller left it in.
    encoder.eval()
    probe.eval()

    predictions = []
    true_labels = []

    print("Evaluating...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluation"):
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = encoder(input_ids, attention_mask=mask)
            embeddings = outputs.last_hidden_state[:, 0, :] # [CLS] token

            logits = probe(embeddings)
            preds = torch.argmax(logits, dim=1)

            # Collect plain Python ints on the CPU for the metric.
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    acc = accuracy_score(true_labels, predictions)
    print(f"Probe Accuracy: {acc*100:.2f}%")
    return acc

In [8]:
# Settings
TASK = "past_present" # Try: 'sentence_length', 'tree_depth'
BATCH_SIZE = 128
HIDDEN_DIM = 512
EPOCHS = 3
SEED = 42
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch so probe initialisation, dropout and DataLoader shuffling are
# repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(SEED)

In [8]:
# Display the selected device to confirm whether a GPU is being used.
DEVICE

device(type='cuda')

In [9]:
# Fetch the probing file for TASK (skipped when it is already on disk) and keep its local path.
data_path = download_probing_data(TASK)

Downloading past_present...
Download complete.


In [10]:
# Load the pretrained tokenizer and the frozen BERT encoder that will be probed.
tokenizer, encoder = get_bert_encoder()

Loading BERT model...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# The label mapping must be identical for both splits, otherwise the same class
# could receive different integer ids at train and test time.
train_ds = ProbingDataset(data_path, 'tr', tokenizer)
test_ds = ProbingDataset(data_path, 'te', tokenizer)

# Fail fast if the test split contains a label the train split never saw:
# without this check the problem only surfaces as a KeyError in the middle of
# evaluation, after training has already finished.
unseen_labels = set(test_ds.labels) - set(train_ds.label_map)
assert not unseen_labels, f"Test split has labels missing from the train split: {sorted(unseen_labels)}"

# Use the train split's label map for the test split, and keep num_classes in
# step with it so the two attributes never disagree.
test_ds.label_map = train_ds.label_map
test_ds.num_classes = train_ds.num_classes

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [12]:
print(f"Task: {TASK}")
print(f"Classes: {train_ds.label_map}")

# 2. Setup Probe
# Read the embedding width from the encoder's config instead of hard-coding
# 768, so the probe still matches if a different BERT checkpoint is loaded.
probe = ProbingClassifier(encoder.config.hidden_size, HIDDEN_DIM, train_ds.num_classes)

Task: past_present
Classes: {'PAST': 0, 'PRES': 1}


In [13]:
# 3. Train
# Only the probe's parameters are optimised; the encoder just supplies [CLS] embeddings.
train_probe(encoder, probe, train_loader, DEVICE, epochs=EPOCHS)

Starting training on cuda...


Epoch 1/3: 100%|██████████| 782/782 [03:05<00:00,  4.21it/s]


Epoch 1 Loss: 0.3010


Epoch 2/3: 100%|██████████| 782/782 [03:04<00:00,  4.24it/s]


Epoch 2 Loss: 0.2766


Epoch 3/3: 100%|██████████| 782/782 [03:04<00:00,  4.24it/s]

Epoch 3 Loss: 0.2641





In [14]:
# 4. Evaluate
# Prints the probe's accuracy on the test split; the returned value is shown as the cell output.
evaluate_probe(encoder, probe, test_loader, DEVICE)

Evaluating...


Evaluation: 100%|██████████| 79/79 [00:18<00:00,  4.27it/s]

Probe Accuracy: 88.29%





0.8829

Test accuracy on the `past_present` task: 0.8829 (88.29%).

In [8]:
TASK = "sentence_length"

data_path = download_probing_data(TASK)
tokenizer, encoder = get_bert_encoder()

train_ds = ProbingDataset(data_path, 'tr', tokenizer)
test_ds = ProbingDataset(data_path, 'te', tokenizer)

# Fail fast if the test split contains a label the train split never saw,
# instead of hitting a KeyError in the middle of evaluation.
unseen_labels = set(test_ds.labels) - set(train_ds.label_map)
assert not unseen_labels, f"Test split has labels missing from the train split: {sorted(unseen_labels)}"

# Use the train split's label map for the test split, and keep num_classes in
# step with it.
test_ds.label_map = train_ds.label_map
test_ds.num_classes = train_ds.num_classes

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

print(f"Task: {TASK}")
print(f"Classes: {train_ds.label_map}")

# A fresh probe per task; its input width comes from the encoder's config
# rather than a hard-coded 768.
probe = ProbingClassifier(encoder.config.hidden_size, HIDDEN_DIM, train_ds.num_classes)

train_probe(encoder, probe, train_loader, DEVICE, epochs=EPOCHS)

evaluate_probe(encoder, probe, test_loader, DEVICE)

Data for sentence_length already exists.
Loading BERT model...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Task: sentence_length
Classes: {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
Starting training on cuda...


Epoch 1/3: 100%|██████████| 782/782 [04:08<00:00,  3.15it/s]


Epoch 1 Loss: 0.9147


Epoch 2/3: 100%|██████████| 782/782 [03:08<00:00,  4.14it/s]


Epoch 2 Loss: 0.7915


Epoch 3/3: 100%|██████████| 782/782 [03:09<00:00,  4.13it/s]


Epoch 3 Loss: 0.7648
Evaluating...


Evaluation: 100%|██████████| 79/79 [00:18<00:00,  4.19it/s]

Probe Accuracy: 64.92%





0.6491596638655462

Test accuracy on the `sentence_length` task: 0.6492 (64.92%).

In [9]:
TASK = "tree_depth"

data_path = download_probing_data(TASK)
tokenizer, encoder = get_bert_encoder()

train_ds = ProbingDataset(data_path, 'tr', tokenizer)
test_ds = ProbingDataset(data_path, 'te', tokenizer)

# Fail fast if the test split contains a label the train split never saw,
# instead of hitting a KeyError in the middle of evaluation.
unseen_labels = set(test_ds.labels) - set(train_ds.label_map)
assert not unseen_labels, f"Test split has labels missing from the train split: {sorted(unseen_labels)}"

# Use the train split's label map for the test split, and keep num_classes in
# step with it.
test_ds.label_map = train_ds.label_map
test_ds.num_classes = train_ds.num_classes

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

print(f"Task: {TASK}")
print(f"Classes: {train_ds.label_map}")

# A fresh probe per task; its input width comes from the encoder's config
# rather than a hard-coded 768.
probe = ProbingClassifier(encoder.config.hidden_size, HIDDEN_DIM, train_ds.num_classes)

train_probe(encoder, probe, train_loader, DEVICE, epochs=EPOCHS)

evaluate_probe(encoder, probe, test_loader, DEVICE)

Data for tree_depth already exists.
Loading BERT model...
Task: tree_depth
Classes: {'10': 0, '11': 1, '5': 2, '6': 3, '7': 4, '8': 5, '9': 6}
Starting training on cuda...


Epoch 1/3: 100%|██████████| 782/782 [03:06<00:00,  4.20it/s]


Epoch 1 Loss: 1.6926


Epoch 2/3: 100%|██████████| 782/782 [03:04<00:00,  4.24it/s]


Epoch 2 Loss: 1.6302


Epoch 3/3: 100%|██████████| 782/782 [03:04<00:00,  4.23it/s]


Epoch 3 Loss: 1.6092
Evaluating...


Evaluation: 100%|██████████| 79/79 [00:18<00:00,  4.28it/s]

Probe Accuracy: 29.90%





0.299