In [None]:
from datasets import SyntheticDataset
from crowd_evaluation import OldEvaluator, ConfidenceEvaluatorC, MajorityEvaluator
import numpy as np

import os
def say(msg = "Finish", voice = "Victoria"):
    """Speak ``msg`` aloud through the ``say`` command-line tool.

    The command is started with an argument list instead of a shell string,
    so whitespace and shell metacharacters in ``msg`` or ``voice`` are passed
    to ``say`` verbatim and are never interpreted by a shell.

    Args:
        msg: Text to speak.
        voice: Name of the voice passed to ``say -v``.

    Returns:
        None. This is a best-effort notification: a non-zero exit status of
        ``say`` is ignored, and if the command cannot be started at all the
        message is printed instead.
    """
    import subprocess  # local import keeps this helper self-contained

    try:
        # check=False: a failing `say` must not abort the surrounding cell.
        subprocess.run(['say', '-v', str(voice), str(msg)], check=False)
    except OSError:
        # `say` is not installed or not executable on this machine; fall back
        # to a textual notification so the message is not lost.
        print(msg)

In [None]:
from joblib import Memory, Parallel, delayed
# On-disk joblib cache for the simulation results below.
memory = Memory('accuracy_interval_vs_confidence', verbose=0)

@memory.cache
def calc_fraction_of_wrong_interval_estimates(
        num_samples: int,
        num_workers: int,
        confidence: float,
        evaluator_name: str,
        iter_count: int):
    """Simulate ``iter_count`` synthetic datasets and score the interval estimates.

    For every iteration a true error rate per worker is drawn from
    ``[0.1, 0.2, 0.3]``, a ``SyntheticDataset`` is generated from it and the
    selected evaluator estimates the workers' error rates ``ps`` together
    with interval half-widths ``confs``.

    Args:
        num_samples: Number of samples in every synthetic dataset.
        num_workers: Number of workers in every synthetic dataset.
        confidence: Confidence level passed on to the evaluator.
        evaluator_name: ``'old'`` (OldEvaluator, exhaustive method),
            ``'old greedy'`` (OldEvaluator, greedy method), ``'majority'``
            (MajorityEvaluator, interval half-width fixed to 0); any other
            value selects ConfidenceEvaluatorC.
        iter_count: Number of datasets to simulate.

    Returns:
        Three float32 arrays of length ``iter_count * num_workers``, in
        iteration-major order:

        * 1.0 where ``ps - confs <= p_true <= ps + confs``, else 0.0
          (despite the function name, these flag the *correct* intervals),
        * the interval sizes ``2 * confs``,
        * the absolute estimation errors ``|ps - p_true|``.
    """
    total = iter_count * num_workers
    hits = np.zeros(total, dtype=np.float32)
    abs_errors = np.zeros(total, dtype=np.float32)
    widths = np.zeros(total, dtype=np.float32)

    # Evaluator names served by OldEvaluator, mapped to its `method` argument.
    old_methods = {'old': 'exhaustive', 'old greedy': 'greedy'}

    for run in range(iter_count):
        p_true = np.random.choice([0.1, 0.2, 0.3], size=num_workers)
        dataset = SyntheticDataset(num_samples=num_samples, num_workers=num_workers, p_true=p_true)

        if evaluator_name in old_methods:
            evaluator = OldEvaluator(dataset)
            ps, confs = evaluator.evaluate_workers_with_confidence(
                dataset.workers,
                confidence=confidence,
                method=old_methods[evaluator_name]
            )
        elif evaluator_name == 'majority':
            evaluator = MajorityEvaluator(dataset)
            ps = evaluator.evaluate_workers(dataset.workers)
            # No confidence information here: zero-width intervals.
            confs = np.zeros(num_workers)
        else:
            evaluator = ConfidenceEvaluatorC(dataset)
            ps, confs = evaluator.evaluate_workers_with_confidence(
                dataset.workers,
                confidence=confidence,
            )

        # Positions of this iteration's workers inside the flat result arrays.
        block = slice(run * num_workers, (run + 1) * num_workers)

        lower, upper = ps - confs, ps + confs
        covered = (lower <= p_true) & (p_true <= upper)
        hits[block] = covered.astype(np.float32)
        widths[block] = 2 * confs
        abs_errors[block] = np.abs(ps - p_true)

    return hits, widths, abs_errors

In [None]:
from tqdm import tqdm

# Confidence levels to sweep: 0.00, 0.05, ..., 0.95.
confidence_levels = np.arange(0, 1, 0.05)

# (num_workers, num_samples) configurations to analyze
configurations = [(3, 300), (7, 300), (3, 100), (7, 100)]
iteration_count = 500
evaluater_names = ['new', 'old greedy']

# All result containers are keyed by (num_samples, num_workers) first and by
# evaluator name second.

# Whether the interval contained the true error rate, for every dataset.
correct_interval_estimates = {}

# Average accuracy per confidence level.
accuracies = {}

# All measured interval sizes, for every dataset.
interval_sizes = {}

# Average interval size per confidence level.
average_interval_sizes = {}

# All estimation errors, for every dataset.
estimation_errors = {}

# Average estimation error per confidence level.
average_estimation_errors = {}

for num_workers, num_samples in configurations:
    config_key = (num_samples, num_workers)
    for store in (correct_interval_estimates, accuracies, interval_sizes,
                  average_interval_sizes, estimation_errors, average_estimation_errors):
        store[config_key] = {}

    for t in evaluater_names:
        print(f'num_workers: {num_workers}, num_tasks: {num_samples}, evaluator: {t}')

        # One (cached) simulation job per confidence level.
        jobs = (
            delayed(calc_fraction_of_wrong_interval_estimates)(num_samples, num_workers, c, t, iteration_count)
            for c in tqdm(confidence_levels)
        )
        res = Parallel(n_jobs=7)(jobs)

        # Row i of each matrix holds the results for confidence_levels[i].
        hits_per_level, sizes_per_level, errors_per_level = (
            np.array(column) for column in zip(*res)
        )

        correct_interval_estimates[config_key][t] = hits_per_level
        interval_sizes[config_key][t] = sizes_per_level
        estimation_errors[config_key][t] = errors_per_level

        accuracies[config_key][t] = (
            np.array([np.sum(row) for row in hits_per_level]) / (num_workers * iteration_count)
        )
        average_interval_sizes[config_key][t] = np.array([np.mean(row) for row in sizes_per_level])
        average_estimation_errors[config_key][t] = np.array([np.mean(row) for row in errors_per_level])

say("Accuracy and interval size vs confidence calculations completed")

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

mpl.rcParams['figure.dpi'] = 600

# Reference line of a perfectly calibrated evaluator: the interval contains
# the true error rate with a probability equal to the confidence level, so
# y = x. (Plotting np.linspace(0, 1, n) here would be wrong, because
# confidence_levels stops below 1 and the line would be steeper than y = x.)
plt.plot(confidence_levels, confidence_levels, label='Ideal interval-accuracy', linewidth=1)
for t in evaluater_names:
    for num_workers, num_samples in configurations:
        # The result dicts are keyed by (num_samples, num_workers).
        plt.plot(confidence_levels, accuracies[(num_samples, num_workers)][t], marker='s', linestyle='--',
                 label =t +', ' + str(num_workers) + ' workers, ' + str(num_samples) + ' tasks', linewidth=1, markersize=2)

plt.xlabel('Confidence level')
plt.ylabel('Accuracy')
plt.legend()
# To export the figure, save it *before* plt.show(); afterwards the figure is empty.
# plt.savefig('confidence_vs_accuracy.svg')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

mpl.rcParams['figure.dpi'] = 600

# Reference line of a perfectly calibrated evaluator: accuracy equals the
# confidence level (y = x). np.linspace(0, 1, n) must not be used for this,
# because confidence_levels stops below 1.
plt.plot(confidence_levels, confidence_levels, label='Ideal interval-accuracy', linewidth=1)

# Accuracy of both evaluators for 3 workers; the result dicts are keyed by
# (num_samples, num_workers) and then by evaluator name.
for n_tasks in (300, 100):
    for evaluator_key, short_name in (('new', 'new'), ('old greedy', 'old')):
        plt.plot(confidence_levels, accuracies[(n_tasks, 3)][evaluator_key], marker='s', linestyle='--',
                 label=f'3 workers, {n_tasks} tasks ({short_name})', linewidth=1, markersize=2)

plt.xlabel('Confidence level')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Reference line of a perfectly calibrated evaluator: accuracy equals the
# confidence level (y = x). np.linspace(0, 1, n) must not be used for this,
# because confidence_levels stops below 1.
plt.plot(confidence_levels, confidence_levels, label='Ideal interval-accuracy', linewidth=1)

# Accuracy of both evaluators for 7 workers; the result dicts are keyed by
# (num_samples, num_workers) and then by evaluator name.
for n_tasks in (300, 100):
    for evaluator_key, short_name in (('new', 'new'), ('old greedy', 'old')):
        plt.plot(confidence_levels, accuracies[(n_tasks, 7)][evaluator_key], marker='s', linestyle='--',
                 label=f'7 workers, {n_tasks} tasks ({short_name})', linewidth=1, markersize=2)

plt.xlabel('Confidence level')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Average interval size per confidence level for the two 100-task
# configurations, one curve per evaluator and worker count.
for t in evaluater_names:
    for num_workers, num_samples in [(3, 100), (7,100)]:
        # The result dicts are keyed by (num_samples, num_workers).
        mean_sizes = average_interval_sizes[(num_samples, num_workers)][t]
        curve_label = f'{t},{num_workers} workers, {num_samples} tasks'
        plt.plot(
            confidence_levels,
            mean_sizes,
            marker='s',
            linestyle='--',
            label=curve_label,
            linewidth=1,
            markersize=2,
        )

plt.xlabel('Confidence level')
plt.ylabel('Interval size')
plt.legend()
plt.show()
# plt.savefig('confidence_vs_interval_size.svg')

In [None]:
# Element-wise ratio of the interval sizes (new / old greedy) for 300 tasks
# and 3 workers. Row 0 belongs to the first confidence level (0) and is
# skipped -- presumably to avoid dividing by zero-width intervals; TODO confirm.
new_sizes = np.array(interval_sizes[(300, 3)]['new'][1:])
old_greedy_sizes = np.array(interval_sizes[(300, 3)]['old greedy'][1:])
result = new_sizes / old_greedy_sizes
np.mean(result)

In [None]:
# Average interval size per confidence level for 7 workers. Each entry is
# (num_samples, evaluator key, short evaluator name used in the legend); the
# order fixes the order of the curves and therefore their colors.
curves_7_workers = [
    (100, 'new', 'new'),
    (300, 'new', 'new'),
    (300, 'old greedy', 'old'),
    (100, 'old greedy', 'old'),
]
for n_tasks, evaluator_key, short_name in curves_7_workers:
    plt.plot(confidence_levels, average_interval_sizes[(n_tasks, 7)][evaluator_key], marker='s', linestyle='--',
             label=f'7 workers, {n_tasks} tasks ({short_name})', linewidth=1, markersize=2)

plt.xlabel('Confidence Level')
plt.ylabel('Average Interval Size')
plt.legend()
# Save before showing: after plt.show() the current figure is empty, so
# saving afterwards would write a blank image.
plt.savefig('confidence_vs_interval_size_7_workers.png')
plt.show()

In [None]:
# Average interval size per confidence level for 3 workers. Each entry is
# (num_samples, evaluator key, short evaluator name used in the legend); the
# order fixes the order of the curves and therefore their colors.
curves_3_workers = [
    (100, 'new', 'new'),
    (300, 'new', 'new'),
    (300, 'old greedy', 'old'),
    (100, 'old greedy', 'old'),
]
for n_tasks, evaluator_key, short_name in curves_3_workers:
    plt.plot(confidence_levels, average_interval_sizes[(n_tasks, 3)][evaluator_key], marker='s', linestyle='--',
             label=f'3 workers, {n_tasks} tasks ({short_name})', linewidth=1, markersize=2)

plt.xlabel('Confidence Level')
plt.ylabel('Average Interval Size')
plt.legend()
# Save before showing: after plt.show() the current figure is empty, so
# saving afterwards would write a blank image.
plt.savefig('confidence_vs_interval_size_3_workers.png')
plt.show()

In [None]:
# Mean absolute estimation error of the 'old greedy' evaluator for 100 tasks
# and 3 workers, taken over all confidence levels and datasets.
old_greedy_errors = estimation_errors[(100, 3)]['old greedy']
old_greedy_errors.mean()

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

mpl.rcParams['figure.dpi'] = 600

for t in evaluater_names:
    for num_workers, num_samples in configurations:
        # The result dicts are keyed by (num_samples, num_workers).
        # Plot the per-confidence-level averages: `estimation_errors` holds
        # one row per confidence level with an entry for every simulated
        # worker, so plotting it directly would draw one unrelated line per
        # column instead of a single curve.
        plt.plot(confidence_levels, average_estimation_errors[(num_samples, num_workers)][t], marker='s', linestyle='--',
                 label =t +', ' + str(num_workers) + ' workers, ' + str(num_samples) + ' tasks', linewidth=1, markersize=2)

plt.xlabel('Confidence level')
plt.ylabel('Average estimation error')
plt.legend()
plt.show()

In [None]:
# %%timeit -r 1 -n 1
#
# num_workers = 3
# p_true = np.random.choice([0.1, 0.08, 0.06], p=[0.3, 0.4, 0.3], size=num_workers)
# dataset = SyntheticDataset(num_samples=50, num_workers=num_workers, p_true=p_true)
# evaluator = OldEvaluator(dataset, debug=False)
#
# for i in range(0, 1000):
#     ps_exhaustive, confs_exhaustive = evaluator.evaluate_workers_with_confidence(
#         dataset.workers,
#         confidence=0.9,
#         method='exhaustive',
#     )
#
# ps_greedy, confs_greedy = evaluator.evaluate_workers_with_confidence(
#     dataset.workers,
#     confidence=0.9,
#     method='greedy',
# )
#
# from visualizations.utils import visualize_error_rates
#
# visualize_error_rates(
#     dataset=dataset,
#     workers=dataset.workers,
#     p_ests=[ps_exhaustive, ps_greedy],
#     confs=[confs_exhaustive, confs_greedy],
#     labels=[r'$p_{est, exhaustive}$', r'$p_{est, greedy}$'],
# )