In [None]:
%pip install small-text[transformers]  # use "small-text" without "[transformers]" if you want to work on the CPU only
%pip install datasets
%pip install matplotlib

In [None]:
import logging

import datasets
import numpy as np
import torch

In [None]:
# Disables the progress bar for notebooks by replacing the datasets library's
# verbosity getter with one that always returns logging.NOTSET.
# See: https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET
# Number of target classes; passed to the classifier factory further down.
num_classes = 2  # change to 6 when using the Ar_PuFi Multi dataset

In [None]:
import transformers
from transformers import AutoTokenizer

In [None]:
# Identifier of the pretrained checkpoint; reused later when the classifier
# factory is configured.
transformer_model_name = 'asafaya/bert-base-arabic'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)

In [None]:
from small_text.integrations.transformers.datasets import TransformersDataset
import pandas as pd
import numpy as np
def get_transformers_dataset(tokenizer, data, labels, max_length=100):
    """Tokenize documents and wrap them, with their labels, in a TransformersDataset.

    Args:
        tokenizer: Object exposing ``encode_plus`` (here the tokenizer loaded
            via ``AutoTokenizer``).
        data: Iterable of raw text documents.
        labels: Indexable collection where ``labels[i]`` is the label of the
            i-th document in ``data``.
        max_length: Length each document is padded or truncated to.

    Returns:
        A ``TransformersDataset`` built from one
        ``(input_ids, attention_mask, label)`` triple per document, in input
        order.
    """
    def _encode(text):
        # Fixed-length encoding, returned as PyTorch tensors ('pt').
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
            return_tensors='pt',
            truncation='longest_first'
        )

    # Lazy generator: each document is encoded right before its label is
    # looked up, exactly as in a plain loop.
    encodings = ((position, _encode(text)) for position, text in enumerate(data))
    samples = [
        (encoded['input_ids'], encoded['attention_mask'], labels[position])
        for position, encoded in encodings
    ]

    return TransformersDataset(samples)
from sklearn.model_selection import train_test_split
# Read the full corpus; the CSV provides a "Text" and a "Label" column.
df = pd.read_csv('/content/Ar_PuFi.csv')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_, test_ = train_test_split(df, test_size=0.2, random_state=42)

# Plain Python lists of documents and labels for each split.
train_text, train_labels = train_["Text"].tolist(), train_["Label"].tolist()
test_text, test_labels = test_["Text"].tolist(), test_["Label"].tolist()

# Tokenize both splits into datasets usable by the transformer classifier.
train = get_transformers_dataset(tokenizer, train_text, train_labels)
test = get_transformers_dataset(tokenizer, test_text, test_labels)

**Pool-Based Active Learner**

In [None]:
from small_text.active_learner import PoolBasedActiveLearner
from small_text.initialization import random_initialization_balanced
from small_text.integrations.transformers import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.query_strategies import PredictionEntropy
from small_text.integrations.transformers import TransformerModelArguments

In [None]:
# Simulates an initial labeling to warm-start the active learning process.
def initialize_active_learner(active_learner, y_train, n_samples=960):
    """Seed the active learner with a randomly drawn, class-balanced labeled set.

    Args:
        active_learner: Learner exposing ``initialize_data(indices, labels)``.
        y_train: Labels of the whole training pool. Must support indexing
            with the array of sampled indices.
        n_samples: Size of the initial labeled set. Defaults to 960, the
            value that used to be hard-coded here.

    Returns:
        The indices (into the training pool) of the initially labeled samples.
    """
    x_indices_initial = random_initialization_balanced(y_train, n_samples=n_samples)
    # Simulated annotation: the true labels are simply looked up.
    y_initial = y_train[x_indices_initial]

    active_learner.initialize_data(x_indices_initial, y_initial)

    return x_indices_initial

In [None]:
transformer_model = TransformerModelArguments(transformer_model_name)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook no longer fails on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

clf_factory = TransformerBasedClassificationFactory(
    transformer_model,
    num_classes,
    kwargs={
        'device': device,
        'mini_batch_size': 32,
        # NOTE(review): -1 presumably disables this early-stopping
        # criterion; confirm against the installed small-text version.
        'early_stopping_no_improvement': -1,
    },
)

# Entropy-based uncertainty sampling over the unlabeled pool.
query_strategy = PredictionEntropy()
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train)

# Warm-start the learner and remember which pool indices are already labeled.
labeled_indices = initialize_active_learner(active_learner, train.y)

**Active Learning Loop**

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score, f1_score


# Number of query rounds the active learning loop below runs after the
# initial evaluation.
num_queries = 24

# Evaluate function
def evaluate(active_learner, train, test, average='binary'):
    """Print and return test-set metrics for the learner's current classifier.

    Args:
        active_learner: Active learner whose ``classifier`` produces the
            predictions.
        train: Currently labeled training subset. Accepted for backward
            compatibility only; train metrics are not reported, so no
            predictions are computed on it.
        test: Held-out dataset exposing the ground-truth labels as ``test.y``.
        average: Averaging mode forwarded to precision, recall and F1.
            ``'binary'`` (the default) matches the two-class setup; pass
            e.g. ``'macro'`` when running with more than two classes.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed on ``test``.
    """
    y_true = test.y
    y_pred_test = active_learner.classifier.predict(test)

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round silently swaps precision and recall.
    test_acc = accuracy_score(y_true, y_pred_test)
    test_precision = precision_score(y_true, y_pred_test, average=average)
    test_recall = recall_score(y_true, y_pred_test, average=average)
    test_F_score = f1_score(y_true, y_pred_test, average=average)

    print('Test accuracy: {:.2f}'.format(test_acc))
    print('Test Precision: {:.2f}'.format(test_precision))
    print('Test Recall: {:.2f}'.format(test_recall))
    print('Test f_score: {:.2f}'.format(test_F_score))

    return test_acc, test_precision, test_recall, test_F_score


# Baseline entry: metrics of the classifier before any query round.
results = [evaluate(active_learner, train[labeled_indices], test)]

for i in range(num_queries):
    # Each round asks the query strategy for 960 more pool samples to label.
    q_indices = active_learner.query(num_samples=960)

    # Simulated annotator: read the true labels from the training set.
    # Replace this for real-world usage.
    y = train.y[q_indices]

    # Hand the labels for the current query back to the active learner.
    active_learner.update(y)

    # Track every index labeled so far (newest batch first).
    labeled_indices = np.concatenate((q_indices, labeled_indices))

    print('Iteration #{:d} ({} samples)'.format(i, len(labeled_indices)))
    results.append(evaluate(active_learner, train[labeled_indices], test))