In [6]:
%matplotlib inline

from pprint import pprint
from simulations import *
import numpy as np
import matplotlib.pyplot as plt

ImportError: No module named simulations

In [None]:
def evaluate_vancouver(num_assignments, num_reviews, num_truths, min_quality=1, max_quality=5, use_cover=True,
                       vancouver_steps=10):
    """Simulate one peer-grading round and measure the errors Vancouver makes.

    One group of three graders is created per submission. ``peer_assignment``
    supplies the review assignment and a cover, every grader is given an
    integer quality from ``random.randint(min_quality, max_quality)``, and
    ``random_reviews`` produces the reviews. ``vancouver`` is then run twice:
    once seeing only ``num_truths`` ground-truth grades and once seeing the
    ground-truth grade of every submission (the "omniscient" run).

    Args:
        num_assignments: number of submissions (and therefore groups).
        num_reviews: forwarded unchanged to ``peer_assignment``.
        num_truths: how many ground-truth grades the first run may see.
        min_quality: lower bound passed to ``random.randint``.
        max_quality: upper bound passed to ``random.randint``.
        use_cover: when true, the visible truths are drawn from the cover and
            topped up with other submissions if the cover is too small;
            when false, they are sampled from all submissions.
        vancouver_steps: forwarded unchanged to ``vancouver``.

    Returns:
        A tuple ``(sub_score_error, sub_var_error, grader_var_error)`` of
        lists of absolute errors: submission grade versus the true grade,
        submission variance versus the omniscient run, and estimated grader
        quality versus the quality that was drawn for that grader.
    """
    # Every submission has this same true grade; it is a constant, not a random draw.
    true_grade = 0.5

    # generate random groups, assignments, qualities, and reviews
    submission_ids = [chr(ord('a') + z) for z in range(num_assignments)]
    groups = {sub: [sub + x for x in ['1', '2', '3']] for sub in submission_ids}
    assignments, cover = peer_assignment(groups, num_reviews)
    true_qualities = {i: random.randint(min_quality, max_quality) for i in assignments}
    reviews = random_reviews(assignments, true_qualities)

    # ground truth for all submissions (only the omniscient run sees all of it)
    truths = {i: true_grade for i in groups}

    # make a truths_visible dictionary for the algorithm to have access to
    if use_cover:
        if len(cover) > num_truths:
            # NOTE(review): i[0] is kept from the original; the element type of
            # `cover` is not visible here -- confirm against peer_assignment.
            truths_visible = {i[0]: true_grade for i in random.sample(list(cover), num_truths)}
        else:
            truths_visible = {i: true_grade for i in cover}
            # Top up with submissions that are not yet visible. Sampling from the
            # remaining ids (instead of random.choice on the dict, which raised
            # KeyError) also guarantees termination when num_truths is larger
            # than the number of submissions.
            remaining = [sub for sub in truths if sub not in truths_visible]
            shortfall = num_truths - len(truths_visible)
            for sub in random.sample(remaining, min(shortfall, len(remaining))):
                truths_visible[sub] = true_grade
    else:
        truths_visible = {i: true_grade for i in random.sample(list(truths), num_truths)}

    # run vancouver and omniscient vancouver
    scores, qualities = vancouver(reviews, truths_visible, vancouver_steps)
    omni_scores, omni_qualities = vancouver(reviews, truths, vancouver_steps)

    # generate statistics on the data
    sub_score_error = [abs(scores[submission][0] - true_grade) for submission in scores]
    sub_var_error = [abs(scores[submission][1] - omni_scores[submission][1]) for submission in scores]
    grader_var_error = [abs(qualities[grader] - true_qualities[grader]) for grader in qualities]

    return sub_score_error, sub_var_error, grader_var_error

    
def vancouver_statistics(num_assignments, num_reviews, num_truths, num_runs, min_quality=1, max_quality=5,
                         use_cover=True, vancouver_steps=10):
    """Average the per-trial error summaries of ``evaluate_vancouver``.

    ``evaluate_vancouver`` is called ``num_runs`` times with the given
    settings. For every trial the mean, median and maximum of each of its
    three error lists are recorded, and each of those summaries is then
    averaged across trials.

    Returns:
        A dict keyed by ``'mean'``, ``'median'`` and ``'max'``. Each value is
        a dict keyed by ``'sub_grade'``, ``'sub_var'`` and ``'usr_var'``,
        following the order of the error lists returned by
        ``evaluate_vancouver``.
    """
    error_labels = ('sub_grade', 'sub_var', 'usr_var')
    summarizers = (('mean', np.mean), ('median', np.median), ('max', max))

    # One row per trial for each kind of summary.
    per_trial = {kind: [] for kind, _unused in summarizers}
    for _trial in range(num_runs):
        errors = evaluate_vancouver(num_assignments, num_reviews, num_truths, min_quality, max_quality,
                                    use_cover, vancouver_steps)
        for kind, summarize in summarizers:
            per_trial[kind].append([summarize(stat) for stat in errors])

    # Collapse the trials column-wise and label the three columns.
    result = {}
    for kind, _unused in summarizers:
        averaged = np.mean(per_trial[kind], axis=0)
        result[kind] = {label: averaged[position] for position, label in enumerate(error_labels)}

    return result

def print_stats(stats):
    """Print the summary produced by ``vancouver_statistics`` as a text report.

    ``stats`` must map ``'mean'``, ``'max'`` and ``'median'`` to dicts keyed
    by ``'sub_grade'``, ``'sub_var'`` and ``'usr_var'``. One section is
    printed per error type, each listing the mean, maximum and median error.
    """
    report_sections = (
        ('Assignment Grades:', 'sub_grade'),
        ('Assignment Variances:', 'sub_var'),
        ('Grader Variances:', 'usr_var'),
    )
    final_index = len(report_sections) - 1

    print('Expectation of the Error in a Given Trial', '\n')

    for index, (heading, key) in enumerate(report_sections):
        print(heading)
        print('Mean Error: ', stats['mean'][key])
        print('Maximum Error: ', stats['max'][key])
        if index == final_index:
            # The last section is followed by an extra blank line.
            print('Median Error: ', stats['median'][key], '\n', '\n')
        else:
            print('Median Error: ', stats['median'][key], '\n')

In [None]:
# Quick check: 20 submissions, num_reviews=3, 5 ground-truth grades, averaged over 3 runs, using the cover.
print_stats(vancouver_statistics(20, 3, 5, 3, use_cover=True))

In [None]:
def plot_stats(stat_type, stat_variable, min_quality=1, max_quality=5, use_cover=True,
               vancouver_steps=10, num_subs=20, num_grades_per_sub=3, num_trials=10, step_size=1):
    """Plot one error statistic against the number of ground-truth grades.

    For every ground-truth count from 0 up to ``num_subs`` (in increments of
    ``step_size``), ``vancouver_statistics`` is run and the value at
    ``[stat_type][stat_variable]`` of its result is plotted.

    Args:
        stat_type: ``'mean'``, ``'median'`` or ``'max'``.
        stat_variable: ``'sub_grade'``, ``'sub_var'`` or ``'usr_var'``.
        min_quality, max_quality: forwarded to ``vancouver_statistics``.
        use_cover: forwarded to ``vancouver_statistics``.
        vancouver_steps: forwarded to ``vancouver_statistics``.
        num_subs: number of submissions in each simulated round.
        num_grades_per_sub: forwarded as ``num_reviews``.
        num_trials: forwarded as ``num_runs``.
        step_size: increment between consecutive ground-truth counts.
    """
    # Never ask for more ground-truth grades than there are submissions: the
    # original upper bound of num_subs + step_size overshot num_subs whenever
    # step_size did not divide it evenly.
    truth_counts = list(range(0, num_subs + 1, step_size))

    stats = []
    for num_true_grades in truth_counts:
        # vancouver_steps must be forwarded here, otherwise the simulation
        # always runs with vancouver_statistics' default number of steps.
        summary = vancouver_statistics(num_subs, num_grades_per_sub, num_true_grades, num_trials, min_quality,
                                       max_quality, use_cover, vancouver_steps)
        stats.append(summary[stat_type][stat_variable])

    plt.plot(truth_counts, stats)
    plt.xlabel('Number of Ground-Truth Grades')
    plt.ylabel(stat_type + ' ' + stat_variable + ' Error')
    plt.title('Quality ranging from ' + str(min_quality) + ' to ' + str(max_quality))
    plt.show()

In [None]:
# Single-trial run without a cover: a cheap check that plot_stats produces a plot.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=1, use_cover=False)

### Vancouver With Injected Ground Truth

We now have a working plot function and another function which can display specific stats. This should be enough to accumulate reasonable amounts of data. I want to start with an examination of what happens without a cover as we increase the number of ground truth grades with graders at different variances.

In [None]:
# Mean submission-grade error without a cover, averaged over 10 trials per point.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=10, use_cover=False)

That appears to decrease linearly, but I am not convinced, so let's run it with a higher number of trials.

In [None]:
# Same experiment without a cover, with 100 trials per point to smooth out noise.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=100, use_cover=False)

This does indeed show that the plot is linear without the use of covers. Any effects the TA grades have on the non-TA-graded submissions must be minimal, which is slightly disappointing.

### Working with Covers

My next step will be to look at whether assigning graders to a cover causes a deflection of any sort in the plot.

In [None]:
# Mean submission-grade error when ground truths are taken from the cover, 25 trials per point.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=25, use_cover=True)

Once again, this appears linear, but I want to run a high-resolution plot.

In [None]:
# Same cover experiment with 100 trials per point.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=100, use_cover=True)

This is pretty conclusive evidence that, unless the code I am running is wrong (which is a possibility, despite my best efforts), the mean submission grade error converges to zero linearly as the number of ground truth grades increases. This means, unfortunately, that the error scales down linearly with time and effort, and there is no real "tipping point" of how much effort should be put in. This is actually a somewhat surprising result, since it means that Vancouver is not really able to use the information from ground truth very effectively. This may change when more people are grading the same assignment, so that is what I want to try next.

In [None]:
# Cover experiment with num_grades_per_sub raised from the default 3 to 6, 20 trials per point.
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=20, use_cover=True, num_grades_per_sub=6)

That about halves the error, which is what I would expect since each assignment gets double the number of input grades. It's still linear, or too close to linear to be reliably distinguished from it.

### Verification of Vancouver

I want to take a moment out now to verify that I am using the correct number of Vancouver steps, just to be sure that these results are valid. I can do that by comparing plots at different numbers of iterations, to see if the error at all points is lower or the same.

In [None]:
# Cover experiment requesting 20 Vancouver steps instead of the default 10
# (only meaningful if plot_stats forwards vancouver_steps to the simulation).
plot_stats('mean', 'sub_grade', min_quality=1, max_quality=5, num_trials=20, use_cover=True, vancouver_steps=20)

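Doubling the number of Vancouver iterations doesn't appear to have done anything to the plot, so from now on I'm going to assume that, for this sample size at least, ten iterations is sufficient. (Caveat: this comparison is only valid if `plot_stats` actually passes `vancouver_steps` through to `vancouver_statistics`; if it does not, both plots were produced with the default ten iterations.)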

### Summary of Results

I have conducted simulations in several areas. First, I wanted to verify Vancouver, and this was easy. For a sample size of twenty students grading three submissions each, ten iterations appear to be sufficient to allow the algorithm to converge appropriately. Second, I wanted to determine the manner in which injection of ground truth grades would cause the error to change. I have found that said injection causes the mean error to decrease linearly, which implies that non-ground-truth assignments are not affected by ground truth injection in a meaningful way. Third, I attempted to determine whether using a planted cover would behave differently from not using one in this regard, to see if we might gain bonus benefits by having graded a cover. I was able to discern no such difference; the plots appear to be identical.

For the purpose of grading assignments, this means that the work required to be done by the TAs is linear with the error considered allowable by the instructors. The maximum error is even worse, decreasing slowly until it drops off sharply near the end, though this can be rendered largely irrelevant by the presence of an appeals process.