In [1]:
from datasets import SyntheticDataset
from crowd_evaluation import OldEvaluator, ConfidenceEvaluatorC, MajorityEvaluator, VoteAggregator
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

from visualizations.utils import say

In [2]:
from joblib import Memory, Parallel, delayed
# Disk-backed memoization for the simulation function defined below. Results
# are stored under the 'accuracy_vs_num_workers' directory and looked up by the
# call arguments. Because the simulation draws from np.random, a cache hit
# returns the previously sampled results instead of drawing new ones.
memory = Memory('accuracy_vs_num_workers', verbose=0)

@memory.cache
def calc_fraction_of_wrong_interval_estimates(
        num_samples: int,
        num_workers: int,
        confidence: float,
        evaluator_name: str,
        iter_count: int,
        error_rates: list[float]
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Simulate ``iter_count`` synthetic datasets and score one worker evaluator.

    For every iteration a fresh ``SyntheticDataset`` is generated, the selected
    evaluator estimates a value ``ps`` and an interval half-width ``confs`` for
    every worker, and the results are compared with the ``p_true`` values the
    dataset was generated from.

    Args:
        num_samples: Number of samples in each generated dataset.
        num_workers: Number of workers in each generated dataset.
        confidence: Confidence level forwarded to
            ``evaluate_workers_with_confidence``.
        evaluator_name: ``'old'`` (OldEvaluator, exhaustive method),
            ``'old greedy'`` (OldEvaluator, greedy method), ``'majority'``
            (MajorityEvaluator, zero-width intervals); any other value selects
            ``ConfidenceEvaluatorC``.
        iter_count: Number of datasets to simulate.
        error_rates: Candidate values; each worker's ``p_true`` is drawn
            uniformly at random from this list.

    Returns:
        A 5-tuple of flat arrays:

        * per worker (length ``iter_count * num_workers``, float32):
          1 if ``ps - confs <= p_true <= ps + confs`` else 0;
        * per worker: the interval size ``2 * confs``;
        * per worker: the absolute estimation error ``|ps - p_true|``;
        * per sample (length ``iter_count * num_samples``): 1 if the majority
          vote equals the ground truth else 0;
        * per sample: 1 if the weighted vote equals the ground truth else 0.
    """
    worker_slots = iter_count * num_workers
    sample_slots = iter_count * num_samples

    correct_interval_estimates = np.zeros(worker_slots, dtype=np.float32)
    error_rate_estimation_errors = np.zeros(worker_slots, dtype=np.float32)
    int_sizes = np.zeros(worker_slots, dtype=np.float32)

    # Per generated sample: did the majority / weighted vote recover the label?
    majority_vote_estimations = np.zeros(sample_slots)
    weighted_vote_estimations = np.zeros(sample_slots)

    # Both "old" variants use OldEvaluator and differ only in the search method.
    old_evaluator_methods = {'old': 'exhaustive', 'old greedy': 'greedy'}

    for i in range(iter_count):
        p_true = np.random.choice(error_rates, size=num_workers)
        dataset = SyntheticDataset(num_samples=num_samples, num_workers=num_workers, p_true=p_true)

        if evaluator_name in old_evaluator_methods:
            evaluator = OldEvaluator(dataset)
            ps, confs = evaluator.evaluate_workers_with_confidence(
                dataset.workers,
                confidence=confidence,
                method=old_evaluator_methods[evaluator_name]
            )
        elif evaluator_name == 'majority':
            evaluator = MajorityEvaluator(dataset)
            ps = evaluator.evaluate_workers(dataset.workers)
            # The majority evaluator yields no interval; use zero half-widths.
            confs = np.zeros(num_workers)
        else:
            evaluator = ConfidenceEvaluatorC(dataset)
            ps, confs = evaluator.evaluate_workers_with_confidence(
                dataset.workers,
                confidence=confidence,
            )

        ground_truth = dataset.get_ground_truth_for_samples(dataset.samples)
        maj_vote = VoteAggregator.majority_vote(dataset, dataset.samples)
        weighted_vote = VoteAggregator.weighted_vote(dataset, dataset.samples, ps, 0.5)

        # Positions in the flat result arrays that belong to iteration i.
        worker_slice = slice(i * num_workers, (i + 1) * num_workers)
        sample_slice = slice(i * num_samples, (i + 1) * num_samples)

        min_limit, max_limit = ps - confs, ps + confs
        inside_interval = (min_limit <= p_true) & (p_true <= max_limit)
        correct_interval_estimates[worker_slice] = np.where(inside_interval, 1, 0)
        int_sizes[worker_slice] = 2 * confs
        error_rate_estimation_errors[worker_slice] = np.abs(ps - p_true)

        majority_vote_estimations[sample_slice] = (ground_truth == maj_vote)
        weighted_vote_estimations[sample_slice] = (ground_truth == weighted_vote)

    return (
        correct_interval_estimates,
        int_sizes,
        error_rate_estimation_errors,
        majority_vote_estimations,
        weighted_vote_estimations,
    )

In [None]:
from tqdm import tqdm
import time

start = time.time()

# num_workers/num_samples configurations to analyze
num_workers = np.array([3, 5, 7, 9, 11, 15, 21])
num_tasks_configs = [1000]
iteration_count = 500
evaluater_names = ['new', 'old greedy']
confidence_level = 0.9

# Values each simulated worker's p_true is drawn from.
candidate_error_rates = [0.1, 0.2, 0.3]

# All result dictionaries below are indexed as result[num_tasks][evaluator_name],
# and hold one entry per value in `num_workers`.

# Stores whether the interval contained the true error rate for every dataset.
correct_interval_estimates = dict()

# Stores the average accuracy for every configuration.
accuracies = dict()

# Stores all measured interval sizes for every dataset
interval_sizes = dict()

# Stores the average interval sizes for every configuration
average_interval_sizes = dict()

# Stores all estimation error for every dataset
estimation_errors = dict()

# Stores the average estimation error for every configuration
average_estimation_errors = dict()

# Stores the fraction of samples labelled correctly by the majority vote.
majority_vote_estimation_accuracies = dict()

# Stores the fraction of samples labelled correctly by the weighted vote.
weighted_vote_estimation_accuracies = dict()

for num_tasks in num_tasks_configs:
    # n: num_samples
    # m: num_workers
    correct_interval_estimates[num_tasks] = dict()
    accuracies[num_tasks] = dict()
    interval_sizes[num_tasks] = dict()
    average_interval_sizes[num_tasks] = dict()
    estimation_errors[num_tasks] = dict()
    average_estimation_errors[num_tasks] = dict()
    majority_vote_estimation_accuracies[num_tasks] = dict()
    weighted_vote_estimation_accuracies[num_tasks] = dict()

    for t in evaluater_names:
        print('num_workers: ' + str(num_workers) + ', evaluator: ' + t)
        # One parallel job per worker-count configuration.
        res = Parallel(n_jobs=7)(delayed(calc_fraction_of_wrong_interval_estimates)
                (num_tasks, n, confidence_level, t, iteration_count, candidate_error_rates) for n in tqdm(num_workers))

        # The per-worker results have iteration_count * n entries, so their
        # lengths differ between configurations. They are kept as lists of
        # arrays because NumPy >= 1.24 refuses to build a ragged ndarray.
        correct_interval_estimates_results = [x[0] for x in res]
        interval_sizes_results = [x[1] for x in res]
        est_errs_results = [x[2] for x in res]
        maj_vote_results = [x[3] for x in res]
        weighted_vote_results = [x[4] for x in res]

        # Aggregate per worker-count configuration (one value per entry of `num_workers`).
        accuracies[num_tasks][t] = np.array([np.sum(r) for r in correct_interval_estimates_results]) / (num_workers * iteration_count)
        correct_interval_estimates[num_tasks][t] = correct_interval_estimates_results
        interval_sizes[num_tasks][t] = interval_sizes_results
        average_interval_sizes[num_tasks][t] = np.array([np.mean(r) for r in interval_sizes_results])
        estimation_errors[num_tasks][t] = est_errs_results
        average_estimation_errors[num_tasks][t] = np.array([np.mean(r) for r in est_errs_results])
        majority_vote_estimation_accuracies[num_tasks][t] = np.array([np.mean(r) for r in maj_vote_results])
        weighted_vote_estimation_accuracies[num_tasks][t] = np.array([np.mean(r) for r in weighted_vote_results])

end = time.time()
print('Time elapsed:', end-start)
say("Accuracy and interval size vs num tasks completed")

num_workers: [ 3  5  7  9 11 15 21], evaluator: new


100%|██████████| 7/7 [00:00<00:00, 47.91it/s]


In [None]:
mpl.rcParams['figure.dpi'] = 300

# The result dictionaries are keyed by the number of tasks first and by the
# evaluator name second, so select the task configuration that was simulated
# (a worker count such as 7 is not a valid key).
plotted_num_tasks = num_tasks_configs[0]

fig, ax1 = plt.subplots()

s = slice(0, len(num_workers))

# Left axis: fraction of intervals that contain the true value.
color = 'tab:blue'
ax1.set_xlabel('number of workers')
ax1.set_ylabel('accuracy')
lns1 = ax1.plot(num_workers[s], accuracies[plotted_num_tasks]['new'][s], label='accuracy',
         color=color, marker='s', linestyle='--', linewidth=1, markersize=2)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

# Right axis: mean interval size for the same configurations.
color = 'tab:red'
ax2.set_ylabel('average interval size')  # we already handled the x-label with ax1
lns2 = ax2.plot(num_workers[s], average_interval_sizes[plotted_num_tasks]['new'][s], label='average interval size',
         color=color, marker='s', linestyle='--', linewidth=1, markersize=2)

# Merge the line handles of both axes into a single legend.
lns = lns1 + lns2
labs = [l.get_label() for l in lns]
ax1.legend(lns, labs, loc="center right");