In [1]:
%cd ..
%matplotlib inline

D:\Projects\Python\PL-Heuristic


In [2]:
import pandas as pd
from scipy import stats 

from analysis.caching import get_cached_results

In [3]:
# Notebook-wide display settings: show up to 500 rows of a frame, and render
# floating point numbers with two decimals.
pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', '{:0.2f}'.format)

# Preparation

In [4]:
# Load the cached experiment results; every later cell reads from `data`.
# The summary doubles as a sanity check. In the output shown below, the ILP
# columns hold one observation fewer (7199) than the heuristic columns (7200),
# so at least one ILP value is missing and later statistics must cope with NaN.
data = get_cached_results()
data.describe()

Unnamed: 0,heuristic_classroom_utilisation,heuristic_instruction_size,heuristic_objective,heuristic_percentage_instruction,heuristic_percentage_self_study,heuristic_self_study_size,heuristic_teacher_utilisation,ilp_classroom_utilisation,ilp_instruction_size,ilp_objective,ilp_percentage_instruction,ilp_percentage_self_study,ilp_self_study_size,ilp_teacher_utilisation,experiment
count,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7199.0,7199.0,7199.0,7199.0,7199.0,7199.0,7199.0,7200.0
mean,88.24,10.67,6610.21,59.66,40.34,49.52,66.92,88.03,11.67,6748.48,65.4,34.6,44.58,67.23,36.5
std,11.33,3.55,2351.19,21.67,21.67,16.22,14.76,11.95,4.47,2425.31,21.49,21.49,16.18,17.16,20.78
min,60.24,5.4,3469.55,15.0,5.62,15.13,43.75,60.24,5.19,3540.31,18.62,1.12,5.0,39.38,1.0
25%,78.31,7.93,4371.92,43.42,19.5,39.81,53.75,78.31,8.41,4417.57,49.62,15.38,31.25,53.75,18.75
50%,90.36,10.28,6256.42,61.62,38.38,46.26,58.12,89.16,11.43,7713.88,65.5,34.5,44.0,62.5,36.5
75%,100.0,12.22,9011.09,80.5,56.58,63.5,81.25,100.0,14.0,9152.75,84.62,50.38,55.06,83.75,54.25
max,100.0,20.99,9932.6,94.38,85.0,80.0,100.0,100.0,24.36,10080.38,98.88,81.38,80.0,100.0,72.0


In [5]:
# Names of the performance measures. The data set stores each of them twice,
# once per solution method, as "ilp_<measure>" and "heuristic_<measure>".
MEASURES = [
    "objective",
    "instruction_size",
    "self_study_size",
    "percentage_instruction",
    "percentage_self_study",
    "classroom_utilisation",
    "teacher_utilisation",
]

# Experiment numbers, 1 through 72 (inclusive).
EXPERIMENTS = [*range(1, 73)]

# Analysis

In [6]:
def results_per_experiment(experiment):
    """
    Computes a dictionary of results for a single experiment.

    The rows of the global ``data`` frame whose ``experiment`` column equals
    the given experiment are selected. For each performance measure in
    ``MEASURES``, the returned dictionary then maps ``(measure, statistic)``
    tuples to a number, where the statistic is one of:

    - ``'heuristic'``, ``'ilp'``: mean of that method's column.
    - ``'difference'``: mean of the pairwise differences, heuristic minus ILP.
    - ``'percentage'``: the mean difference as a percentage of the ILP mean.
    - ``'t_stat'``: test statistic of a one-sample t-test of the pairwise
      differences against a mean of zero, with missing values omitted.
    """
    results = {}

    # The row selection is the same for every measure, so build it only once.
    in_experiment = data.experiment == experiment

    for measure in MEASURES:
        ilp_data = data["ilp_" + measure][in_experiment]
        heuristic_data = data["heuristic_" + measure][in_experiment]

        # Row-wise differences; both series share the same index, so each
        # heuristic value is paired with the ILP value of the same row.
        pairwise = heuristic_data - ilp_data

        results[measure, 'heuristic'] = heuristic_data.mean()
        results[measure, 'ilp'] = ilp_data.mean()

        results[measure, 'difference'] = pairwise.mean()
        results[measure, 'percentage'] = 100 * results[measure, 'difference'] / results[measure, 'ilp']

        # NOTE(review): the statistic can come back as NaN (presumably when
        # the differences have no variance); format_t_stat handles that case
        # when the table is rendered.
        results[measure, 't_stat'] = stats.ttest_1samp(pairwise, 0, nan_policy='omit').statistic

    return results

In [7]:
# One row per experiment, one column per (measure, statistic) pair. The
# explicit column list fixes the column order.
result = pd.DataFrame(
    [results_per_experiment(number) for number in EXPERIMENTS],
    columns=[(measure, statistic)
             for measure in MEASURES
             for statistic in ['heuristic', 'ilp', 'difference', 'percentage', 't_stat']])

# Keep only the mean per method by dropping every other statistic.
means = result.drop(columns=[label for label in result.columns
                             if label[1] not in ('heuristic', 'ilp')])

# Rows are labelled by experiment number (starting at 1); columns become a
# two-level index grouped by performance measure, then method.
means.index = EXPERIMENTS
means.columns = pd.MultiIndex.from_tuples(means.columns,
                                          names=['Measure', 'Method'])

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [8]:
# Pairwise-differences view: keep only the difference statistics of every
# measure by dropping the per-method means.
differences = result.drop(columns=[label for label in result.columns
                                   if label[1] not in ('difference', 'percentage', 't_stat')])

# Rows are labelled by experiment number; columns become a two-level index
# grouped by performance measure, then statistic.
differences.index = EXPERIMENTS
differences.columns = pd.MultiIndex.from_tuples(differences.columns,
                                                names=['Measure', 'Statistics'])

## Experiment results

In [9]:
# Writes the per-experiment means as a LaTeX table for the paper, with all
# floats rendered at two decimals. This runs before the `learners` column is
# joined onto `means` further down, so the exported table does not contain it.
with open('experiment_results.tex', 'w') as tex_file:
    means.to_latex(buf=tex_file, float_format="{:0.2f}".format)

## Means

Mean results per experiment, for each method and performance measure.

In [10]:
# Attach the number of learners of each experiment to the per-experiment
# means, matching on the experiment number. In the displayed output the
# (measure, method) column pairs appear as plain tuples next to `learners`.
# NOTE(review): `means` is rebound here, so re-running this cell on its own
# joins against a frame that already has `learners`; restart and run all
# cells instead.
experiments = pd.read_excel('experiments/experiments.xlsx')[['experiment', 'learners']]

means = means.join(experiments.set_index('experiment'))

means



Unnamed: 0,"(objective, heuristic)","(objective, ilp)","(instruction_size, heuristic)","(instruction_size, ilp)","(self_study_size, heuristic)","(self_study_size, ilp)","(percentage_instruction, heuristic)","(percentage_instruction, ilp)","(percentage_self_study, heuristic)","(percentage_self_study, ilp)","(classroom_utilisation, heuristic)","(classroom_utilisation, ilp)","(teacher_utilisation, heuristic)","(teacher_utilisation, ilp)",learners
1,4691.81,4726.15,14.65,15.14,70.1,64.8,73.1,75.7,26.9,24.3,100.0,100.0,53.75,53.75,800
2,4678.85,4717.15,14.48,15.05,70.22,66.06,72.03,75.23,27.96,24.77,99.98,100.0,53.74,53.75,800
3,4672.5,4720.07,14.51,15.15,65.33,69.74,69.28,73.85,30.72,26.15,97.67,97.67,52.5,52.5,800
4,4808.41,4845.05,10.77,12.17,54.39,38.78,85.82,88.5,14.18,11.5,79.43,73.55,82.41,76.31,800
5,4815.61,4853.04,10.8,12.1,54.79,34.51,85.23,88.06,14.77,11.94,78.83,74.69,81.79,77.49,800
6,4755.06,4800.44,11.83,12.83,62.41,36.33,79.3,83.1,20.7,16.9,67.98,67.95,70.53,70.5,800
7,9684.62,9773.15,18.23,22.5,62.06,47.34,89.72,92.19,10.28,7.81,94.8,79.92,50.96,42.96,1600
8,9656.9,9751.89,18.1,22.63,60.26,49.51,89.41,92.2,10.59,7.8,95.33,79.13,51.24,42.53,1600
9,9662.11,9751.2,18.17,22.68,63.32,49.33,87.72,90.41,12.28,9.59,93.6,78.85,50.31,42.38,1600
10,9569.82,9799.91,10.56,13.69,69.94,42.91,81.77,95.32,18.23,4.68,77.23,68.55,80.12,71.12,1600


### Averages by experiment size

In [11]:
# Column-wise average over the experiments with 800 learners.
means.loc[means['learners'] == 800].mean()

(objective, heuristic)                4302.65
(objective, ilp)                      4366.55
(instruction_size, heuristic)            9.70
(instruction_size, ilp)                 10.18
(self_study_size, heuristic)            43.41
(self_study_size, ilp)                  42.09
(percentage_instruction, heuristic)     52.47
(percentage_instruction, ilp)           57.35
(percentage_self_study, heuristic)      47.53
(percentage_self_study, ilp)            42.65
(classroom_utilisation, heuristic)      88.67
(classroom_utilisation, ilp)            90.46
(teacher_utilisation, heuristic)        67.24
(teacher_utilisation, ilp)              68.99
learners                               800.00
dtype: float64

In [12]:
# Column-wise average over the experiments with 1600 learners.
means.loc[means['learners'] == 1600].mean()

(objective, heuristic)                8917.77
(objective, ilp)                      9129.74
(instruction_size, heuristic)           11.63
(instruction_size, ilp)                 13.15
(self_study_size, heuristic)            55.64
(self_study_size, ilp)                  47.08
(percentage_instruction, heuristic)     66.84
(percentage_instruction, ilp)           73.43
(percentage_self_study, heuristic)      33.16
(percentage_self_study, ilp)            26.57
(classroom_utilisation, heuristic)      87.80
(classroom_utilisation, ilp)            85.59
(teacher_utilisation, heuristic)        66.60
(teacher_utilisation, ilp)              65.47
learners                              1600.00
dtype: float64

## Differences

Pairwise differences between heuristic and ILP solutions, as means per experiment. A one-sample t-test is performed on the differences, to test the null hypothesis that their population mean is zero.

In [13]:
def format_t_stat(value):
    """
    Renders a t-statistic as text with two decimals. A missing value (as
    detected by ``pd.isna``) is rendered as "0.00".
    """
    return "0.00" if pd.isna(value) else "{:0.2f}".format(value)

# Collapse the statistics of each measure into one LaTeX-ready string of the
# form "<percentage>\% (<t-statistic>)". The result is assigned back under the
# measure's top-level key; the duplicate removal below relies on every
# sub-column of that measure then holding the same text.
# NOTE(review): this mutates `differences` in place, so the cell is not meant
# to be re-run without first rebuilding `differences`.
for measure in MEASURES:
    differences[measure] = differences[measure].apply(
        # Raw string: "\%" is not a valid Python escape sequence, and writing
        # it in a normal string literal triggers an invalid-escape warning.
        # The resulting text (a backslash followed by "%") is unchanged.
        lambda x: r"{0:0.2f}\% ({1})".format(x.percentage, format_t_stat(x.t_stat)),
        axis=1)

# Drop the statistics level and keep a single column per measure.
differences.columns = differences.columns.droplevel(1)
differences = differences.loc[:, ~differences.columns.duplicated()]

In [14]:
# Writes the formatted pairwise differences as a LaTeX table for the paper.
# Escaping is turned off because the cells already contain LaTeX markup (the
# "\%" written when the table was formatted).
with open('pairwise_differences.tex', 'w') as tex_file:
    differences.to_latex(buf=tex_file, escape=False)

In [15]:
differences

Measure,objective,instruction_size,self_study_size,percentage_instruction,percentage_self_study,classroom_utilisation,teacher_utilisation
1,-0.73\% (-25.56),-3.25\% (-17.46),8.19\% (11.66),-3.43\% (-15.15),10.69\% (15.15),0.00\% (0.00),0.00\% (0.00)
2,-0.81\% (-26.47),-3.73\% (-20.22),6.31\% (6.90),-4.25\% (-17.90),12.89\% (17.90),-0.02\% (-1.00),-0.02\% (-1.00)
3,-1.01\% (-26.48),-4.19\% (-20.09),-6.33\% (-5.38),-6.18\% (-19.99),17.45\% (19.99),0.00\% (0.00),0.00\% (0.00)
4,-0.76\% (-43.83),-11.56\% (-37.44),40.26\% (9.63),-3.02\% (-31.50),23.26\% (31.50),7.99\% (17.84),7.99\% (17.84)
5,-0.77\% (-40.61),-10.79\% (-31.27),58.79\% (11.45),-3.21\% (-27.76),23.69\% (27.76),5.55\% (7.96),5.55\% (7.96)
6,-0.95\% (-40.30),-7.77\% (-32.44),71.81\% (12.49),-4.58\% (-28.88),22.52\% (28.88),0.04\% (0.07),0.04\% (0.07)
7,-0.91\% (-71.69),-18.99\% (-46.58),31.10\% (7.77),-2.68\% (-33.51),31.61\% (33.51),18.62\% (34.43),18.62\% (34.43)
8,-0.97\% (-68.34),-20.02\% (-54.39),21.72\% (5.73),-3.02\% (-38.94),35.74\% (38.94),20.47\% (45.19),20.47\% (45.19)
9,-0.91\% (-71.64),-19.89\% (-52.49),28.34\% (6.69),-2.97\% (-40.97),27.99\% (40.97),18.71\% (25.34),18.71\% (25.34)
10,-2.35\% (-166.27),-22.86\% (-81.81),63.00\% (12.06),-14.21\% (-175.21),289.48\% (175.21),12.65\% (31.72),12.65\% (31.72)


### Percentage differences across all experiments

In [16]:
# Average of the per-experiment percentage differences. A plain (unweighted)
# mean is appropriate because every experiment has the same number of
# instances.
columns = [(name, 'percentage') for name in MEASURES]

# Select the percentage column of every measure and label it by the measure
# name alone.
avg_percentage = result[columns]
avg_percentage.columns = [name for name, _ in columns]

avg_percentage.mean()

objective                -1.89
instruction_size         -6.75
self_study_size          17.23
percentage_instruction   -9.93
percentage_self_study    56.24
classroom_utilisation     0.58
teacher_utilisation       0.58
dtype: float64