In [1]:
# Change the working directory to the parent folder. This is relative to the
# current directory, so running the cell again climbs one more level: run it
# once per kernel session.
# NOTE(review): presumably done so the project-local `analysis` package can
# be imported below - confirm.
%cd ..
%matplotlib inline

D:\Projects\Python\PL-Heuristic


In [2]:
import pandas as pd
from scipy import stats 

from analysis.caching import get_cached_results

In [3]:
# Allow up to 500 rows to be rendered before pandas truncates a table
pd.options.display.max_rows = 500

# Preparation

In [4]:
# Load the cached results. Below, `data` is used as a table with an
# `experiment` column and one `ilp_<measure>` / `heuristic_<measure>` column
# per performance measure.
data = get_cached_results()
# Summary statistics for every column, as a quick sanity check
data.describe()

Unnamed: 0,heuristic_classroom_utilisation,heuristic_instruction_size,heuristic_objective,heuristic_percentage_instruction,heuristic_percentage_self_study,heuristic_self_study_size,heuristic_teacher_utilisation,ilp_classroom_utilisation,ilp_instruction_size,ilp_objective,ilp_percentage_instruction,ilp_percentage_self_study,ilp_self_study_size,ilp_teacher_utilisation,experiment
count,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7199.0,7199.0,7199.0,7199.0,7199.0,7199.0,7199.0,7200.0
mean,88.235918,10.668209,6610.209687,59.657344,40.342656,49.523178,66.920573,88.026393,11.666783,6748.484177,65.398423,34.601577,44.583444,67.23182,36.5
std,11.331711,3.545793,2351.192867,21.667173,21.667173,16.217169,14.763535,11.948423,4.471629,2425.308372,21.486972,21.486972,16.178022,17.161286,20.784048
min,60.240964,5.4,3469.548383,15.0,5.625,15.133333,43.75,60.240964,5.185185,3540.313588,18.625,1.125,5.0,39.375,1.0
25%,78.313253,7.931034,4371.918796,43.421875,19.5,39.8125,53.75,78.313253,8.408259,4417.573599,49.625,15.375,31.25,53.75,18.75
50%,90.361446,10.279151,6256.421236,61.625,38.375,46.263158,58.125,89.156627,11.428571,7713.877651,65.5,34.5,44.0,62.5,36.5
75%,100.0,12.220543,9011.085109,80.5,56.578125,63.5,81.25,100.0,14.0,9152.745958,84.625,50.375,55.055556,83.75,54.25
max,100.0,20.985075,9932.599786,94.375,85.0,80.0,100.0,100.0,24.360656,10080.377678,98.875,81.375,80.0,100.0,72.0


In [5]:
# Names of the performance measures. Each one occurs twice in the data set,
# prefixed by the solution method: "ilp_<measure>" and "heuristic_<measure>".
MEASURES = [
    "objective",
    "instruction_size",
    "self_study_size",
    "percentage_instruction",
    "percentage_self_study",
    "classroom_utilisation",
    "teacher_utilisation",
]

# Experiment numbers, 1 up to and including 72
EXPERIMENTS = [*range(1, 73)]

# Analysis

In [6]:
def results_per_experiment(experiment, frame=None, measures=None):
    """
    Computes a dictionary of summary results for a single experiment.

    For every performance measure, four entries are produced, keyed by
    ``(measure, statistic)`` tuples:

    - ``'heuristic'``: mean of the heuristic's values;
    - ``'ilp'``: mean of the ILP's values;
    - ``'difference'``: mean of the pairwise differences (heuristic - ILP);
    - ``'t_stat'``: statistic of a one-sample t-test of those differences
      against a population mean of zero, with missing values omitted. The
      formatting code further down guards against this statistic being NaN,
      so callers should not assume it is always a finite number.

    Parameters
    ----------
    experiment
        Experiment number. Only rows whose ``experiment`` column equals this
        value are used.
    frame : optional
        Table holding an ``experiment`` column plus an ``ilp_<measure>`` and
        a ``heuristic_<measure>`` column for each measure. Defaults to the
        notebook-level ``data``.
    measures : optional
        Names of the measures to summarise. Defaults to the notebook-level
        ``MEASURES``.

    Returns
    -------
    dict
        Mapping from ``(measure, statistic)`` to the computed value.
    """
    if frame is None:
        frame = data

    if measures is None:
        measures = MEASURES

    # The row selection does not depend on the measure, so do it only once
    # rather than twice per measure.
    rows = frame[frame.experiment == experiment]

    results = {}

    for measure in measures:
        ilp_data = rows["ilp_" + measure]
        heuristic_data = rows["heuristic_" + measure]
        difference = heuristic_data - ilp_data

        results[measure, 'heuristic'] = heuristic_data.mean()
        results[measure, 'ilp'] = ilp_data.mean()
        results[measure, 'difference'] = difference.mean()
        results[measure, 't_stat'] = stats.ttest_1samp(difference, 0, nan_policy='omit').statistic

    return results

In [7]:
# One row per experiment; the columns are (measure, statistic) pairs, grouped
# by measure in the order given by MEASURES.
result = pd.DataFrame(
    [results_per_experiment(number) for number in EXPERIMENTS],
    columns=[(measure, statistic)
             for measure in MEASURES
             for statistic in ['heuristic', 'ilp', 'difference', 't_stat']])

# Keep only the per-method means, by dropping every other statistic
means = result.drop(
    labels=[(measure, statistic)
            for measure, statistic in result.columns
            if statistic not in ('heuristic', 'ilp')],
    axis=1)

# Label the rows with the experiment numbers (which start at 1), and give the
# two column levels a name: performance measure first, then method.
means.index = EXPERIMENTS
means.columns = pd.MultiIndex.from_tuples(
    means.columns,
    names=['Measure', 'Method'])

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [8]:
# This presents the pairwise-differences approach
differences = result.drop(columns=[column for column in result.columns
                             if column[1] not in ['difference', 't_stat']])

differences.index = EXPERIMENTS
differences.columns = pd.MultiIndex.from_tuples(differences.columns,
                                                names=['Measure', 'Statistics'])

## Experiment results

In [9]:
# Used to render this table in the paper
with open('experiment_results.tex', 'w') as file:
    means.to_latex(buf=file, float_format="{:0.2f}".format)

## Means

Mean results per experiment, for each method and performance measure.

In [10]:
# Render the table of means: one row per experiment, one column per
# (measure, method) pair.
means

Measure,objective,objective,instruction_size,instruction_size,self_study_size,self_study_size,percentage_instruction,percentage_instruction,percentage_self_study,percentage_self_study,classroom_utilisation,classroom_utilisation,teacher_utilisation,teacher_utilisation
Method,heuristic,ilp,heuristic,ilp,heuristic,ilp,heuristic,ilp,heuristic,ilp,heuristic,ilp,heuristic,ilp
1,4691.814259,4726.152851,14.648488,15.14025,70.1045,64.796667,73.1025,75.70125,26.8975,24.29875,100.0,100.0,53.75,53.75
2,4678.852938,4717.152393,14.484949,15.04575,70.223333,66.056667,72.035,75.22875,27.965,24.77125,99.976744,100.0,53.7375,53.75
3,4672.496956,4720.069631,14.513145,15.148205,65.328167,69.74,69.28375,73.8475,30.71625,26.1525,97.674419,97.674419,52.5,52.5
4,4808.414374,4845.053921,10.765734,12.172777,54.394167,38.781595,85.8225,88.4975,14.1775,11.5025,79.433735,73.554217,82.4125,76.3125
5,4815.6142,4853.035536,10.795859,12.102173,54.792167,34.505818,85.23,88.05875,14.77,11.94125,78.831325,74.686747,81.7875,77.4875
6,4755.060905,4800.435014,11.829425,12.825381,62.413833,36.326216,79.29875,83.10375,20.70125,16.89625,67.975904,67.951807,70.525,70.5
7,9684.617822,9773.145696,18.227771,22.50086,62.056667,47.335256,89.71625,92.18625,10.28375,7.81375,94.802326,79.918605,50.95625,42.95625
8,9656.898105,9751.891352,18.100133,22.630385,60.262333,49.507095,89.40875,92.1975,10.59125,7.8025,95.325581,79.127907,51.2375,42.53125
9,9662.10606,9751.204444,18.168942,22.679756,63.315333,49.332229,87.72375,90.408125,12.27625,9.591875,93.604651,78.848837,50.3125,42.38125
10,9569.822303,9799.907043,10.558738,13.688373,69.943833,42.911,81.7725,95.32,18.2275,4.68,77.228916,68.554217,80.125,71.125


## Differences

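Pairwise differences between the heuristic and ILP solutions (heuristic minus ILP), reported as the mean per experiment with the t-statistic in parentheses. A one-sample t-test is performed on the differences, to test the null hypothesis that their population mean is zero. A t-statistic that is undefined (NaN) is shown as 0.00.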

In [11]:
def format_t_stat(value):
    """
    Formats a t-statistic with two decimals. A missing value (as judged by
    ``pd.isna``) is rendered as "0.00" instead.
    """
    return "0.00" if pd.isna(value) else "{:0.2f}".format(value)

# Collapse each measure's (difference, t_stat) pair into a single string of
# the form "<mean difference> (<t-statistic>)", both with two decimals.
#
# This cell changes the column structure of `differences`: afterwards, the
# `difference` / `t_stat` sub-columns read by the loop are gone. Re-run the
# cell that builds `differences` before running this one again.
for measure in MEASURES:
    # differences[measure] selects the sub-columns of this measure; with
    # axis=1, each row is passed to the lambda, which reads its `difference`
    # and `t_stat` fields.
    differences[measure] = differences[measure].apply(
        lambda x: "{0:0.2f} ({1})".format(x.difference, format_t_stat(x.t_stat)),
        axis=1)

# Drop the second column level ('Statistics'), which leaves repeated column
# labels, and then keep only the first column for every label.
# NOTE(review): presumably the assignment above writes the same formatted
# string into both sub-columns of a measure, so discarding the duplicates
# loses nothing - confirm.
differences.columns = differences.columns.droplevel(1)
differences = differences.loc[:, ~differences.columns.duplicated()]

In [12]:
# Used to render this table in the paper
with open('pairwise_differences.tex', 'w') as file:
    differences.to_latex(buf=file, float_format="{:0.2f}".format, escape=False)

In [13]:
# Render the table of pairwise differences: one row per experiment, one
# column per measure.
differences

Measure,objective,instruction_size,self_study_size,percentage_instruction,percentage_self_study,classroom_utilisation,teacher_utilisation
1,-34.34 (-25.56),-0.49 (-17.46),5.31 (11.66),-2.60 (-15.15),2.60 (15.15),0.00 (0.00),0.00 (0.00)
2,-38.30 (-26.47),-0.56 (-20.22),4.17 (6.90),-3.19 (-17.90),3.19 (17.90),-0.02 (-1.00),-0.01 (-1.00)
3,-47.57 (-26.48),-0.64 (-20.09),-4.41 (-5.38),-4.56 (-19.99),4.56 (19.99),0.00 (0.00),0.00 (0.00)
4,-36.64 (-43.83),-1.41 (-37.44),15.61 (9.63),-2.67 (-31.50),2.67 (31.50),5.88 (17.84),6.10 (17.84)
5,-37.42 (-40.61),-1.31 (-31.27),20.29 (11.45),-2.83 (-27.76),2.83 (27.76),4.14 (7.96),4.30 (7.96)
6,-45.37 (-40.30),-1.00 (-32.44),26.09 (12.49),-3.81 (-28.88),3.81 (28.88),0.02 (0.07),0.03 (0.07)
7,-88.53 (-71.69),-4.27 (-46.58),14.72 (7.77),-2.47 (-33.51),2.47 (33.51),14.88 (34.43),8.00 (34.43)
8,-94.99 (-68.34),-4.53 (-54.39),10.76 (5.73),-2.79 (-38.94),2.79 (38.94),16.20 (45.19),8.71 (45.19)
9,-89.10 (-71.64),-4.51 (-52.49),13.98 (6.69),-2.68 (-40.97),2.68 (40.97),14.76 (25.34),7.93 (25.34)
10,-230.08 (-166.27),-3.13 (-81.81),27.03 (12.06),-13.55 (-175.21),13.55 (175.21),8.67 (31.72),9.00 (31.72)
