Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QueryRandom strategy didn't work. #1

Closed
evanzhu2013 opened this issue Feb 15, 2019 · 3 comments
Closed

QueryRandom strategy didn't work. #1

evanzhu2013 opened this issue Feb 15, 2019 · 3 comments

Comments

@evanzhu2013
Copy link

Thanks for your work. The most comprehensive AL package I've seen.

Issue:

from sklearn.datasets import load_iris,load_breast_cancer
from alipy.experiment.al_experiment import AlExperiment

import warnings
warnings.filterwarnings('ignore')

import copy
from sklearn.datasets import make_classification
from alipy import ToolBox
from alipy.query_strategy.query_labels import QueryInstanceGraphDensity, QueryInstanceQBC, \
    QueryInstanceQUIRE, QueryRandom, QueryInstanceUncertainty, QureyExpectedErrorReduction, QueryInstanceLAL

# Synthetic two-class dataset used for the whole experiment.
X, y = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=None,
)

# The toolbox is the single entry point for splits, savers, models and metrics below.
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Prepare 10 train/test splits, each with 10% of the data initially labeled.
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Default classifier supplied by the toolbox (Logistic Regression).
model = alibox.get_default_model()

# Budget: stop each fold after 50 queries.
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)


def main_loop(alibox, strategy, round):
    """Run one fold of the active-learning loop with the given query strategy.

    Besides its arguments, this function reads the module-level names
    ``stopping_criterion``, ``model``, ``X`` and ``y``; they must be defined
    before it is called.

    Args:
        alibox: Toolbox object providing ``get_split``, ``get_stateio``,
            ``calc_performance_metric`` and ``State``.
        strategy: Query strategy whose ``select`` is called as
            ``select(label_ind, unlab_ind, batch_size=1)``. A strategy whose
            ``select`` does not take the labeled index as its first positional
            argument fails here with ``TypeError: select() got multiple
            values for argument 'batch_size'``.
        round: Index of the fold; passed to ``get_split`` and ``get_stateio``.

    Returns:
        The saver obtained from ``alibox.get_stateio(round)``, with one state
        added per query.
    """
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        # Passing model=None to use the default model for evaluating the committees' disagreement
        # NOTE(review): no model argument is passed in the call below; the line
        # above looks carried over from a committee-based example.
        select_ind = strategy.select(label_ind, unlab_ind, batch_size=1)
        # Move the queried indexes from the unlabeled set to the labeled set.
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update model and calc performance according to the model you are using
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                  y_pred=pred,
                                                  performance_metric='accuracy_score')

        # Save intermediate results to file
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)

        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in stopping criterion object
    # (it is a shared module-level object, so the next call starts from zero).
    stopping_criterion.reset()
    return saver

# One list of per-fold savers for each strategy being compared.
unc_result = []
qbc_result = []
random_result = []

for round in range(5):
    # NOTE(review): these four values are not used in this loop body;
    # main_loop() fetches the split for the fold itself.
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)

    # Use pre-defined strategy
    unc = QueryInstanceUncertainty(X, y)
    qbc = QueryInstanceQBC(X, y)
    rnd = QueryRandom(X,y)

    # Each saver is deep-copied before being stored in the result list.
    unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
    qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))
    # This is the call that raises the reported TypeError: main_loop passes
    # (label_ind, unlab_ind, batch_size=1) to rnd.select.
    random_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))

# Compare the strategies with the number of queries on the x axis.
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')

analyser.add_method(method_name='QBC', method_results=qbc_result)
analyser.add_method(method_name='Unc', method_results=unc_result)
analyser.add_method(method_name='RANDOM', method_results=random_result)

print(analyser)
analyser.plot_learning_curves(title='Example of alipy', std_area=False)

Error is below:

| round | initially labeled data | number of queries | cost | Performance: |
|   0   |   35 (10.00% of all)   |         50        |  0   | 0.846 ± 0.02 |
| round | initially labeled data | number of queries | cost | Performance: |
|   0   |   35 (10.00% of all)   |         50        |  0   | 0.841 ± 0.01 |
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-15-5d0ada1f815d> in <module>
     70     unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
     71     qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))
---> 72     random_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))
     73 
     74 analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')

<ipython-input-15-5d0ada1f815d> in main_loop(alibox, strategy, round)
     35         # Select a subset of Uind according to the query strategy
     36         # Passing model=None to use the default model for evaluating the committees' disagreement
---> 37         select_ind = strategy.select(label_ind, unlab_ind, batch_size=1)
     38         label_ind.update(select_ind)
     39         unlab_ind.difference_update(select_ind)

TypeError: select() got multiple values for argument 'batch_size'
@Lggggggx
Copy link
Collaborator

Sorry about that.
The QueryRandom API has not been unified with the other query strategies yet. Its signature is QueryRandom.select(self, unlabel_index, batch_size=1), so it does not take the labeled index as its first argument. We will fix this in the next version.
If you want to use QueryRandom in the meantime, you need to write a separate loop for it.
Here is an example for your reference.

# Stand-alone loop for QueryRandom. Its select() takes only the unlabeled
# index (plus batch_size), so it cannot share a loop that calls
# select(label_ind, unlab_ind, batch_size=...).
# Relies on alibox, model, stopping_criterion, X, y and copy being defined already.
rnd = QueryRandom(X, y)
random_result = []

for fold in range(5):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(fold)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(fold)

    # Record the initial point: performance with only the initially labeled
    # data, measured with the same metric helper as the loop below.
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
    pred = model.predict(X[test_idx, :])
    accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                              y_pred=pred,
                                              performance_metric='accuracy_score')
    saver.set_initial_point(accuracy)

    while not stopping_criterion.is_stop():
        # Pick one instance from the unlabeled pool; QueryRandom does not
        # take the labeled index.
        select_ind = rnd.select(unlab_ind, batch_size=1)
        # Move the queried indexes from the unlabeled set to the labeled set.
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update model and calc performance according to the model you are using
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                  y_pred=pred,
                                                  performance_metric='accuracy_score')

        # Add this query's state to the saver and save after every query
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)
        saver.save()

        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in stopping criterion object so the next fold
    # starts with a fresh query budget
    stopping_criterion.reset()
    random_result.append(copy.deepcopy(saver))

@evanzhu2013
Copy link
Author

Thanks for your quick answer. It works. I suggest adding this code to the examples folder.

@Lggggggx
Copy link
Collaborator

Thank you for your advice. It's a good idea.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants