In [None]:
%matplotlib inline

from pprint import pprint
from peer_review import *
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def evaluate_vancouver(num_assignments, num_reviews, num_truths, min_quality=1, max_quality=5, use_cover=True,
                       vancouver_steps=10):
    """Simulate one peer-review round and measure the error of Vancouver's output.

    Every submission is given the same constant ground-truth grade (0.5).
    ``num_truths`` of those grades are revealed to the algorithm; a second,
    "omniscient" run is given all of them and is used as the reference for
    the second component of each submission score.

    Args:
        num_assignments: number of submissions; each one gets a group of three
            members named ``<sub>1``, ``<sub>2``, ``<sub>3``.
        num_reviews: forwarded to ``peer_assignment``.
        num_truths: how many ground-truth grades the algorithm may see.
        min_quality: inclusive lower bound of the integer quality drawn for
            every key of the assignment mapping.
        max_quality: inclusive upper bound of that quality.
        use_cover: if True, reveal grades based on the cover returned by
            ``peer_assignment`` (topped up with random submissions if the
            cover is too small); otherwise sample submissions uniformly.
        vancouver_steps: forwarded to ``vancouver`` as its third argument.

    Returns:
        A tuple ``(sub_score_error, sub_var_error, grader_var_error)`` of lists:
        ``|scores[s][0] - 0.5|`` per submission, ``|scores[s][1] -
        omni_scores[s][1]|`` per submission, and ``|qualities[g] -
        true_qualities[g]|`` per grader.

    Raises:
        ValueError: if ``num_truths`` cannot be satisfied (more truths
            requested than there are submissions left to reveal, or a
            negative sample size).
    """
    true_grade = 0.5

    # generate random groups, assignments, qualities, and reviews
    submissions = [chr(ord('a') + z) for z in range(num_assignments)]
    groups = {sub: [sub + suffix for suffix in ['1', '2', '3']] for sub in submissions}
    assignments, cover = peer_assignment(groups, num_reviews)
    true_qualities = {i: random.randint(min_quality, max_quality) for i in assignments}
    reviews = random_reviews(assignments, true_qualities)

    # every submission shares the same constant ground-truth grade
    truths = {sub: true_grade for sub in groups}

    # make a truths_visible dictionary for the algorithm to have access to
    if use_cover:
        if len(cover) > num_truths:
            # NOTE(review): `i[0]` is kept from the original; it is a no-op if the cover is keyed by
            # single-character submission ids and maps e.g. 'a1' -> 'a' otherwise -- confirm against
            # peer_assignment.
            truths_visible = {i[0]: true_grade for i in random.sample(list(cover), num_truths)}
        else:
            # Top the cover up with distinct submissions it does not contain yet. Sampling without
            # replacement terminates even when the request cannot be met (the previous retry loop
            # called random.choice on a dict, which raises KeyError for string keys).
            truths_visible = dict(cover)
            candidates = [sub for sub in truths if sub not in truths_visible]
            missing = num_truths - len(truths_visible)
            if missing > len(candidates):
                raise ValueError('num_truths=%d exceeds the number of ground truths that can be revealed (%d)'
                                 % (num_truths, len(truths_visible) + len(candidates)))
            for sub in random.sample(candidates, missing):
                truths_visible[sub] = true_grade
    else:
        truths_visible = {sub: true_grade for sub in random.sample(list(truths), num_truths)}

    # run vancouver and omniscient vancouver
    scores, qualities = vancouver(reviews, truths_visible, vancouver_steps)
    omni_scores, omni_qualities = vancouver(reviews, truths, vancouver_steps)

    # generate statistics on the data
    sub_score_error = [abs(scores[submission][0] - true_grade) for submission in scores]
    sub_var_error = [abs(scores[submission][1] - omni_scores[submission][1]) for submission in scores]
    grader_var_error = [abs(qualities[grader] - true_qualities[grader]) for grader in qualities]

    return sub_score_error, sub_var_error, grader_var_error

    
def vancouver_statistics(num_assignments, num_reviews, num_truths, num_runs, min_quality=1, max_quality=5,
                         use_cover=True, vancouver_steps=10):
    """Repeat ``evaluate_vancouver`` ``num_runs`` times and average its per-run summaries.

    For every run, the mean, median and maximum of each of the three error
    lists returned by ``evaluate_vancouver`` are recorded; each of those
    summaries is then averaged over all runs.

    Returns:
        A dict with keys ``'mean'``, ``'median'`` and ``'max'``. Each value is
        a dict with keys ``'sub_grade'``, ``'sub_var'`` and ``'usr_var'``
        holding the run-averaged summary of the first, second and third error
        list respectively.
    """
    summarizers = (('mean', np.mean), ('median', np.median), ('max', max))
    error_names = ('sub_grade', 'sub_var', 'usr_var')

    # one row per trial for each kind of summary
    per_trial = {label: [] for label, _summarize in summarizers}
    for _trial in range(num_runs):
        errors = evaluate_vancouver(num_assignments, num_reviews, num_truths, min_quality, max_quality,
                                    use_cover, vancouver_steps)
        for label, summarize in summarizers:
            per_trial[label].append([summarize(series) for series in errors])

    # collapse the trials into a single averaged row per summary, then label the columns
    result = {}
    for label, _summarize in summarizers:
        averaged = np.mean(per_trial[label], axis=0)
        result[label] = {name: averaged[column] for column, name in enumerate(error_names)}

    return result

def print_stats(stats):
    """Print the mean, maximum and median error for each of the three error kinds.

    ``stats`` is the nested dict produced by ``vancouver_statistics``:
    ``stats[<'mean'|'max'|'median'>][<'sub_grade'|'sub_var'|'usr_var'>]``.
    """
    sections = (
        ('Assignment Grades:', 'sub_grade'),
        ('Assignment Variances:', 'sub_var'),
        ('Grader Variances:', 'usr_var'),
    )

    print('Expectation of the Error in a Given Trial', '\n')

    final_position = len(sections) - 1
    for position, (heading, key) in enumerate(sections):
        print(heading)
        print('Mean Error: ', stats['mean'][key])
        print('Maximum Error: ', stats['max'][key])
        # the last section is followed by one extra blank line
        spacing = ('\n', '\n') if position == final_position else ('\n',)
        print('Median Error: ', stats['median'][key], *spacing)

In [None]:
# Sanity check: num_assignments=20, num_reviews=3, num_truths=10, averaged over num_runs=3, using the cover.
print_stats(vancouver_statistics(20, 3, 10, 3, use_cover=True))

In [None]:
def plot_stats(stat_type, stat_variable, min_quality=1, max_quality=5, use_cover=True,
               vancouver_steps=10, num_subs=20, num_grades_per_sub=3, num_trials=10, step_size=1):
    """Plot one averaged error statistic against the number of revealed ground-truth grades.

    Args:
        stat_type: which per-trial summary to plot: 'mean', 'median' or 'max'.
        stat_variable: which error to plot: 'sub_grade', 'sub_var' or 'usr_var'.
        min_quality, max_quality, use_cover, vancouver_steps: forwarded to
            ``vancouver_statistics``.
        num_subs: number of submissions in each simulated round.
        num_grades_per_sub: forwarded as ``num_reviews``.
        num_trials: number of simulated rounds averaged for every point.
        step_size: spacing between consecutive ground-truth counts on the x axis.
    """
    # Sweep 0, step_size, 2*step_size, ... but never past num_subs: there are only num_subs
    # submissions whose ground truth could be revealed.
    truth_counts = list(range(0, num_subs + 1, step_size))
    stats = [
        vancouver_statistics(num_subs, num_grades_per_sub, num_true_grades, num_trials,
                             min_quality=min_quality, max_quality=max_quality, use_cover=use_cover,
                             vancouver_steps=vancouver_steps)[stat_type][stat_variable]
        for num_true_grades in truth_counts
    ]

    plt.plot(truth_counts, stats)
    plt.xlabel('Number of Ground-Truth Grades')
    plt.ylabel(stat_type + ' ' + stat_variable + ' Error')
    plt.title('Quality ranging from ' + str(min_quality) + ' to ' + str(max_quality))
    plt.show()

In [None]:
# Single-trial run (noisy) to check the plotting pipeline end to end; ground truths sampled without the cover.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=1, use_cover=False)

We now have a working plot function and another function that prints specific statistics. This should be enough to accumulate a reasonable amount of data. I want to start by examining what happens, without a cover, as we increase the number of ground-truth grades for graders of differing variances.

In [None]:
# Mean submission-grade error vs. number of ground truths, no cover, each point averaged over 10 trials.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=10, use_cover=False)

That appears to decrease linearly, but I am not convinced, so let's run it with a higher number of trials.

In [None]:
# Same sweep without a cover, averaged over 100 trials per point to smooth out the noise.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=100, use_cover=False)

This more precise examination shows a higher-order (nonlinear) curve of some sort, albeit only a very slightly curved one. We had hoped this effect would be greater, but it looks like the amount of ground truth added has much more effect than the "ripples" of this information through the other grades (if there were no secondary effects, the graph would be a straight linear decrease). This may imply that the number of TA grades needed scales linearly with the accuracy desired.

### Working with Covers

My next step will be to look at whether assigning graders to a cover causes a deflection of any sort in the plot.

In [None]:
# Mean submission-grade error vs. number of ground truths, now drawn from the cover; 25 trials per point.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=25, use_cover=True)

This result appears to be interesting enough to warrant a high-resolution version of the plot.

In [None]:
# Same cover-based sweep, averaged over 100 trials per point for a smoother curve.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=100, use_cover=True)