# 1. Library Imports

In [1]:
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

from skactiveml.classifier import SklearnClassifier
from skactiveml.utils import unlabeled_indices, labeled_indices, MISSING_LABEL


from ActiveLearner import ActiveLearner # Custom Wrapper Class

import warnings
mlp.rcParams["figure.facecolor"] = "white"
warnings.filterwarnings("ignore")


RANDOM_STATE=42

In [12]:
import skactiveml as sk
sk.__version__

'0.5.2'

# 2. Load Dataset

In [2]:
X, y_true = make_classification(n_samples=20_000, n_classes=2, n_features=20, n_redundant=0, random_state=RANDOM_STATE)

In [3]:
X.shape, y_true.shape

((20000, 20), (20000,))

# 3. Model Selection

Models must implement the following functions to be valid :
- predict(X) -> y_pred (absolute values)
- predict_proba(X) -> y_pred (probabilities)
- fit(X, y) -> None
- score(X, y_true) -> accuracy_score

Datatypes :
- X, y, y_pred are all numpy arrays
- accuracy_score is a float

In [4]:
models = {
    'LogisticRegression' : LogisticRegression(random_state=RANDOM_STATE),
    'DecisionTree' : DecisionTreeClassifier(max_depth=6, min_samples_split=2,criterion='gini',random_state=RANDOM_STATE)
    
}

selected_model = models['DecisionTree']
selected_model = SklearnClassifier(selected_model, classes=np.unique(y_true))

In [5]:
selected_model.fit([[1],[2]], [0,1])

selected_model.predict_proba([[1], [2]])

array([[1., 0.],
       [0., 1.]])

In [6]:
import pandas as pd


In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin

class CustomClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.x = 12

    def fit(self, X, y):
        self.X_ = X
        self.y_ = y
        return self

    def predict(self, X):
        # Example prediction logic
        predictions = np.array([0] * 20_000)
        return pd.get_dummies(predictions).to_numpy().astype(int)

    def predict_proba(self, X):
        # Example prediction logic
        predictions = np.array([0.5] * 20_000)

        proba = np.zeros((len(X), 2))
        proba[:, 0] = (1 - predictions)
        proba[:, 1] = predictions
        
        return proba

    def score(self, X, y):
        predictions = self.predict(X)
        return np.mean(predictions == y)

selected_model = SklearnClassifier(CustomClassifier(), classes=np.unique(y_true))

selected_model.fit([], [])

# 4. Query Strategy

This object will pick samples from the unlabeled set to give to the oracle to mark at each step.
Note that these are all child classes of the skactiveml.base.QueryStrategy class.
The details and full list of strategies can be found in the [skactiveml documentation](https://scikit-activeml.github.io/scikit-activeml-docs/latest/generated/api/skactiveml.base.QueryStrategy.html)

In [8]:
strategies = ['UncertaintySampling', 'GreedySamplingX']

selection_strategy = strategies[0]

# 5. Experiment Parameters



In [9]:
ex_params = {
    'n_cycles' : 200,
    'batch_size' : 100,
    'USE_TQDM' : True,
    'PRINT_LOGS' : False,
    'log_every' : 10, # only used if PRINT_LOGS is true
    }

# 6. Skactiveml Wrapper Class

In [10]:
learner = ActiveLearner(X, y_true, selected_model, selection_strategy, random_state=RANDOM_STATE)

In [11]:
ex_results = learner.run_experiment(ex_params)

  0%|▋                                                                                                                                  | 1/200 [00:00<00:10, 19.42it/s]


ValueError: could not broadcast input array from shape (20000,) into shape (19900,)

In [None]:
plt.plot(range(ex_results['number_of_iterations']), ex_results['acc_scores'])

plt.title('Model accuracy vs Number of training rounds')
plt.xlabel('Round No.')
plt.ylabel('Acc Score')
plt.show()

In [None]:
plt.plot(ex_results['percentage_labeled'], ex_results['acc_scores'])

plt.title('Model accuracy vs Percentage of annotated examples')
plt.xlabel('Percentage Annotated')
plt.ylabel('Acc Score')
plt.show()

In [None]:
display_upto = 100
plt.plot(ex_results['percentage_labeled'][:display_upto], ex_results['acc_scores'][:display_upto])

plt.title('Model accuracy vs Percentage of annotated examples')
plt.xlabel('Percentage Annotated')
plt.ylabel('Acc Score')
plt.show()