In [1]:
pip list

Package                           Version
--------------------------------- ---------------
absl-py                           2.1.0
aiofiles                          22.1.0
aiosqlite                         0.19.0
alembic                           1.13.1
altair                            5.2.0
anyio                             4.2.0
archspec                          0.2.2
argon2-cffi                       23.1.0
argon2-cffi-bindings              21.2.0
arrow                             1.3.0
asttokens                         2.4.1
astunparse                        1.6.3
async-generator                   1.10
async-lru                         2.0.4
attrs                             23.2.0
Babel                             2.14.0
beautifulsoup4                    4.12.3
bleach                            6.1.0
blinker                           1.7.0
bokeh                             3.3.3
boltons                           23.1.1
Bottleneck                        1.3.7
branca              

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
from small_text import TransformersDataset, TransformerModelArguments
from transformers import AutoTokenizer

# Synthetic toy corpus: 50 non-spam ("ham") documents followed by 50 spam
# documents, with integer labels 0 (ham) and 1 (spam) in the same order.
text = np.repeat(['this is ham', 'this is spam'], 50)
labels = np.repeat([0, 1], 50)

# Load the tokenizer that matches the pretrained checkpoint used in this notebook.
transformer_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

# Build the dataset from the raw texts and labels; max_length=10 is passed
# through to from_arrays for these very short toy sentences.
train = TransformersDataset.from_arrays(
    text,
    labels,
    tokenizer,
    target_labels=np.array([0, 1]),
    max_length=10,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



#### Demonstration code from the paper by Schröder et al., "Small-Text: Active Learning for Text Classification in Python"

https://aclanthology.org/2023.eacl-demo.11v2.pdf

In [3]:
import torch
from small_text import LeastConfidence, TransformerBasedClassificationFactory as TransformerFactory

# Binary task: ham (0) vs. spam (1).
num_classes = 2
model_args = TransformerModelArguments(transformer_model)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA instead of failing at training time.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
clf_factory = TransformerFactory(model_args, num_classes, kwargs={'device': device})

# Query strategy used by the active learner to pick the next samples to label.
query_strategy = LeastConfidence()

In [4]:
import torch
from small_text import PoolBasedActiveLearner, random_initialization_balanced as init

# Seed NumPy and PyTorch before anything stochastic happens so that
# Restart & Run All is intended to reproduce the same initial sample and
# training run (presumably both libraries' global RNGs are used downstream;
# bit-exact determinism on GPU is not guaranteed).
SEED = 2022
np.random.seed(SEED)
torch.manual_seed(SEED)

active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)
# Provide initial data: 10 indices drawn by the initialization helper,
# together with their known labels.
indices_initial = init(train.y, n_samples=10)
active_learner.initialize_data(indices_initial, train.y[indices_initial])

In [5]:
from sklearn.metrics import accuracy_score

num_queries = 5

for i in range(num_queries):
    # Query 10 samples per iteration.
    indices_queried = active_learner.query(num_samples=10)
    # Simulate user interaction here by looking up the known labels.
    # Replace this for real-world usage.
    y = train.y[indices_queried]
    # Provide labels for the queried indices.
    active_learner.update(y)
    # Evaluate accuracy on the train set.
    print(f'Iteration {i+1}')
    y_pred = active_learner.classifier.predict(train)
    # scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth first.
    train_accuracy = accuracy_score(train.y, y_pred)
    print(f'Train accuracy: {train_accuracy:.2f}')


Iteration 1
Train accuracy: 1.00
Iteration 2
Train accuracy: 1.00
Iteration 3
Train accuracy: 1.00
Iteration 4
Train accuracy: 1.00
Iteration 5
Train accuracy: 1.00
