# Bpref Modification for Bpref@*x*

In [1]:
import sys
import logging

# Setup desired logging level and format
log_level = logging.INFO
log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Get hierarchy top logger
logger = logging.getLogger()
# Alternatively work only on some specific logger inside the hierarchy
# log_gensim = logging.getLogger('gensim')
# log_gensim_word2vec = logging.getLogger('gensim.models.word2vec')


def remove_all_handlers(target_logger):
    """Detach every handler currently attached to ``target_logger``.

    ``Logger.removeHandler`` mutates ``target_logger.handlers`` in place, so the
    loop walks over a snapshot of the list; iterating the live list would skip
    every other handler and leave some of them attached.
    """
    for handler in list(target_logger.handlers):
        target_logger.removeHandler(handler)


# Remove all the current log handlers to not log every message multiple times
# (e.g. when this cell is executed repeatedly in the same kernel)
remove_all_handlers(logger)

# Create new console log handler
log_ch = logging.StreamHandler()
# Set logging format of the log handler
log_ch.setFormatter(log_formatter)
# Log verbosity can be decreased per log handler, if necessary
#log_ch.setLevel(logging.INFO)
# Add our console log handler
logger.addHandler(log_ch)

# Set logger object verbosity
logger.setLevel(log_level)

# Input Data

In [2]:
# Cut-off rank x for the Bpref@x runs: the Bpref functions stop after the first `at` results
at = 5

# Judged documents of the single topic used throughout this notebook.
# Documents g, o, v and y are intentionally absent: they appear in `results` as 'not judged'.
# NOTE: the non-relevant label is spelled 'irelevant'; the Bpref functions match on that exact
# string, so the spelling must stay consistent between `ground_truth` and `results`.
ground_truth = [
    # topic_id,  document_id,  judgement
    ('topic_1', 'document_a', 'relevant'),
    ('topic_1', 'document_b', 'irelevant'),
    ('topic_1', 'document_c', 'relevant'),
    ('topic_1', 'document_d', 'relevant'),
    ('topic_1', 'document_e', 'relevant'),
    ('topic_1', 'document_f', 'irelevant'),
    ('topic_1', 'document_h', 'relevant'),
    ('topic_1', 'document_i', 'relevant'),
    ('topic_1', 'document_j', 'relevant'),
    ('topic_1', 'document_k', 'relevant'),
    ('topic_1', 'document_l', 'irelevant'),
    ('topic_1', 'document_m', 'irelevant'),
    ('topic_1', 'document_n', 'relevant'),
    ('topic_1', 'document_p', 'irelevant'),
    ('topic_1', 'document_q', 'irelevant'),
    ('topic_1', 'document_r', 'relevant'),
    ('topic_1', 'document_s', 'irelevant'),
    ('topic_1', 'document_t', 'irelevant'),
    ('topic_1', 'document_u', 'irelevant'),
    ('topic_1', 'document_w', 'relevant'),
    ('topic_1', 'document_x', 'irelevant'),
    ('topic_1', 'document_z', 'relevant'),
]

# Ranked result list for the same topic, ordered by position (0 = best).
# The Bpref functions read only the list order and the judgement (index 4) of each tuple.
results = [
    # topic_id,  document_id, position, score, judgement
    ('topic_1', 'document_a', 0,  1.00, 'relevant'),
    ('topic_1', 'document_b', 1,  1.00, 'irelevant'),
    ('topic_1', 'document_c', 2,  1.00, 'relevant'),
    ('topic_1', 'document_d', 3,  1.00, 'relevant'),
    ('topic_1', 'document_e', 4,  1.00, 'relevant'),
    ('topic_1', 'document_f', 5,  1.00, 'irelevant'),
    ('topic_1', 'document_g', 6,  1.00, 'not judged'),
    ('topic_1', 'document_h', 7,  1.00, 'relevant'),
    ('topic_1', 'document_i', 8,  0.98, 'relevant'),
    ('topic_1', 'document_j', 9,  0.97, 'relevant'),
    ('topic_1', 'document_k', 10, 0.93, 'relevant'),
    ('topic_1', 'document_l', 11, 0.93, 'irelevant'),
    ('topic_1', 'document_m', 12, 0.93, 'irelevant'),
    ('topic_1', 'document_n', 13, 0.93, 'relevant'),
    ('topic_1', 'document_o', 14, 0.93, 'not judged'),
    ('topic_1', 'document_p', 15, 0.93, 'irelevant'),
    ('topic_1', 'document_q', 16, 0.93, 'irelevant'),
    ('topic_1', 'document_r', 17, 0.93, 'relevant'),
    ('topic_1', 'document_s', 18, 0.92, 'irelevant'),
    ('topic_1', 'document_t', 19, 0.92, 'irelevant'),
    ('topic_1', 'document_u', 20, 0.91, 'irelevant'),
    ('topic_1', 'document_v', 21, 0.89, 'not judged'),
    ('topic_1', 'document_w', 22, 0.88, 'relevant'),
    ('topic_1', 'document_x', 23, 0.88, 'irelevant'),
    ('topic_1', 'document_y', 24, 0.83, 'not judged'),
    ('topic_1', 'document_z', 25, 0.83, 'relevant'),
]

# Bpref Implementation

$$bpref = \frac{1}{R} \sum_r{\left(1 - \frac{number\ of\ n\ above\ r}{R}\right)}$$

See: https://dx.doi.org/10.1145/1008992.1009000

Actually using the `trec_eval` implementation variant (see: http://icb.med.cornell.edu/wiki/index.php/BPrefTrecEval2006)

$$bpref = \frac{1}{R} \sum_r{\left(1 - \frac{\min(number\ of\ n\ above\ r, R)}{\min(N, R)}\right)}$$

but further modified to work correctly on result lists with a lower number of results (Bpref@*x*) than the number of
known relevant results in the ground truth (*R*):

$$bpref = \frac{1}{\min(R, x)} \sum_r{\left(1 - \frac{\min(number\ of\ n\ above\ r, R)}{\min(N, R)}\right)}$$

## Original Bpref ``trec_eval`` Implementation

In [3]:
def bpref_trec_eval(ground_truth, results, at=None):
    """Compute Bpref following the ``trec_eval`` variant and return it.

    Every relevant result contributes ``1 - min(n_above, R) / min(N, R)`` where
    ``n_above`` is the number of judged irrelevant results ranked above it. The
    contributions are summed and divided by ``R``.

    Args:
        ground_truth: iterable of ``(topic_id, document_id, judgement)`` tuples.
            ``R`` / ``N`` are the counts of ``'relevant'`` / ``'irelevant'`` judgements.
        results: ranked iterable of ``(topic_id, document_id, position, score, judgement)``
            tuples. Only the order and the judgement (index 4) are used; results with any
            other judgement (e.g. ``'not judged'``) are skipped.
        at: optional cut-off; only the first ``at`` results are evaluated. ``None`` (or any
            other falsy value) evaluates the whole list.

    Returns:
        float: the Bpref value, ``0.0`` when the ground truth contains no relevant document.
    """
    R = sum(1 for doc in ground_truth if doc[2] == 'relevant')
    logger.info("R is %d", R)
    N = sum(1 for doc in ground_truth if doc[2] == 'irelevant')
    logger.info("N is %d", N)
    # Iterate over list of the results and compute Bpref on documents with known (i)relevance
    bpref_inner_sum = 0.0
    # Kept as float so the debug messages render the counter exactly as before
    bpref_inner_seen_non_relevant = 0.0
    for i, r in enumerate(results):
        if at and i >= at:
            break
        if r[4] == 'relevant':
            logger.debug("Bpref evaluation: using relevant result %s", r)
            if bpref_inner_seen_non_relevant:
                # NOTE(review): assumes judgements in `results` agree with `ground_truth`;
                # an 'irelevant' result while min(N, R) == 0 would still divide by zero.
                item_score = 1 - (float(min(bpref_inner_seen_non_relevant, R)) / (min(N, R)))
            else:
                # Same special case as trec_eval: nothing irrelevant ranked above means a
                # full score, which also avoids 0 / 0 when there are no irrelevant documents.
                item_score = 1.0
            logger.debug("Bpref evaluation: item sum = %s = 1 - ((min(%s, %s)) / (min(%s, %s)))",
                         item_score, bpref_inner_seen_non_relevant, R, N, R)
            bpref_inner_sum += item_score
            logger.debug("Bpref evaluation: current inner sum = %s", bpref_inner_sum)
        elif r[4] == 'irelevant':
            logger.debug("Bpref evaluation: incrementing counver for irrelevant result %s", r)
            bpref_inner_seen_non_relevant += 1
    # Without any known relevant document the measure is defined as 0 instead of dividing by zero
    bpref = (1.0 / R) * float(bpref_inner_sum) if R else 0.0
    logger.info("Bpref result: bpref = %s = (1.0 / %s) * %s", bpref, R, bpref_inner_sum)
    return bpref

## Modified Bpref Implementation Suitable for Bpref@*x*

In [4]:
def bpref_at_suitable(ground_truth, results, at=None):
    """Compute the Bpref variant adapted for Bpref@x and return it.

    Per-result contributions are ``1 - min(n_above, R) / min(N, R)``, where ``n_above`` is the
    number of judged irrelevant results ranked above a relevant one. The sum is normalised by
    ``min(R, at)`` instead of ``R``, so a short result list (``at < R``) can still reach 1.0.

    Args:
        ground_truth: iterable of ``(topic_id, document_id, judgement)`` tuples.
            ``R`` / ``N`` are the counts of ``'relevant'`` / ``'irelevant'`` judgements.
        results: ranked iterable of ``(topic_id, document_id, position, score, judgement)``
            tuples. Only the order and the judgement (index 4) are used; results with any
            other judgement (e.g. ``'not judged'``) are skipped.
        at: optional cut-off; only the first ``at`` results are evaluated. ``None`` (or any
            other falsy value) evaluates the whole list and normalises by ``R``.

    Returns:
        float: the Bpref@x value, ``0.0`` when the ground truth contains no relevant document.
    """
    R = sum(1 for doc in ground_truth if doc[2] == 'relevant')
    logger.info("R is %d", R)
    N = sum(1 for doc in ground_truth if doc[2] == 'irelevant')
    logger.info("N is %d", N)
    bpref_inner_sum = 0.0
    # Kept as float so the debug messages render the counter exactly as before
    bpref_inner_seen_non_relevant = 0.0
    for i, r in enumerate(results):
        if at and i >= at:
            break
        if r[4] == 'relevant':
            logger.debug("Bpref evaluation: using relevant result %s", r)
            if bpref_inner_seen_non_relevant:
                # NOTE(review): assumes judgements in `results` agree with `ground_truth`;
                # an 'irelevant' result while min(N, R) == 0 would still divide by zero.
                item_score = 1 - (float(min(bpref_inner_seen_non_relevant, R)) / (min(N, R)))
            else:
                # Nothing irrelevant ranked above: full score, and no 0 / 0 when N == 0
                item_score = 1.0
            logger.debug("Bpref evaluation: item sum = %s = 1 - ((min(%s, %s)) / (min(%s, %s)))",
                         item_score, bpref_inner_seen_non_relevant, R, N, R)
            bpref_inner_sum += item_score
            logger.debug("Bpref evaluation: current inner sum = %s", bpref_inner_sum)
        elif r[4] == 'irelevant':
            logger.debug("Bpref evaluation: incrementing counver for irrelevant result %s", r)
            bpref_inner_seen_non_relevant += 1
    # At most min(R, at) relevant documents can appear within the evaluated prefix
    normalizer = min(R, at if at else R)
    # Without any known relevant document the measure is defined as 0 instead of dividing by zero
    bpref = (1.0 / normalizer) * float(bpref_inner_sum) if normalizer else 0.0
    logger.info("Bpref result: bpref = %s = (1.0 / %s) * %s", bpref, normalizer, bpref_inner_sum)
    return bpref

# Bpref Computation

## Original Bpref ``trec_eval`` Implementation

### All Results

Number of results (*x*) is larger than number of known relevant documents in ground truth (*R*).

In [5]:
# trec_eval-style Bpref over the complete result list (no cut-off passed)
bpref_trec_eval(ground_truth=ground_truth, results=results)

2016-08-29 13:18:50,235 - root - INFO - R is 12
2016-08-29 13:18:50,236 - root - INFO - N is 10
2016-08-29 13:18:50,236 - root - INFO - Bpref result: bpref = 0.6666666666666665 = (1.0 / 12) * 7.999999999999999


### Bpref@5

Number of results (*x* = 5) is lower than number of known relevant documents in ground truth (*R*).

In [6]:
# trec_eval-style Bpref restricted to the first `at` entries of the result list
bpref_trec_eval(ground_truth=ground_truth, results=results, at=at)

2016-08-29 13:18:50,267 - root - INFO - R is 12
2016-08-29 13:18:50,267 - root - INFO - N is 10
2016-08-29 13:18:50,268 - root - INFO - Bpref result: bpref = 0.3083333333333333 = (1.0 / 12) * 3.6999999999999997


## Modified Bpref Implementation Suitable for Bpref@*x*

### All Results

Number of results (*x*) is larger than number of known relevant documents in ground truth (*R*).

In [7]:
# Modified Bpref over the complete result list (no cut-off passed, so it is normalised by R)
bpref_at_suitable(ground_truth=ground_truth, results=results)

2016-08-29 13:18:50,298 - root - INFO - R is 12
2016-08-29 13:18:50,299 - root - INFO - N is 10
2016-08-29 13:18:50,299 - root - INFO - Bpref result: bpref = 0.6666666666666665 = (1.0 / 12) * 7.999999999999999


### Bpref@5

Number of results (*x* = 5) is lower than number of known relevant documents in ground truth (*R*).

In [8]:
# Modified Bpref restricted to the first `at` entries of the result list
bpref_at_suitable(ground_truth=ground_truth, results=results, at=at)

2016-08-29 13:18:50,330 - root - INFO - R is 12
2016-08-29 13:18:50,330 - root - INFO - N is 10
2016-08-29 13:18:50,331 - root - INFO - Bpref result: bpref = 0.74 = (1.0 / 5) * 3.6999999999999997


# Other Experiments

In [9]:
# Experiment: document_b is judged relevant instead of irrelevant, so the first five
# results are all relevant. Both Bpref variants are then evaluated at the `at` cut-off.
# NOTE: this cell rebinds the global `ground_truth` / `results`; cells executed afterwards
# (including re-runs of earlier cells) see this data.
ground_truth = [
    # topic_id,  document_id,  judgement
    ('topic_1', 'document_a', 'relevant'),
    #('topic_1', 'document_b', 'irelevant'),
    ('topic_1', 'document_b', 'relevant'),
    ('topic_1', 'document_c', 'relevant'),
    ('topic_1', 'document_d', 'relevant'),
    ('topic_1', 'document_e', 'relevant'),
    ('topic_1', 'document_f', 'irelevant'),
    ('topic_1', 'document_h', 'relevant'),
    ('topic_1', 'document_i', 'relevant'),
    ('topic_1', 'document_j', 'relevant'),
    ('topic_1', 'document_k', 'relevant'),
    ('topic_1', 'document_l', 'irelevant'),
    ('topic_1', 'document_m', 'irelevant'),
    ('topic_1', 'document_n', 'relevant'),
    ('topic_1', 'document_p', 'irelevant'),
    ('topic_1', 'document_q', 'irelevant'),
    ('topic_1', 'document_r', 'relevant'),
    ('topic_1', 'document_s', 'irelevant'),
    ('topic_1', 'document_t', 'irelevant'),
    ('topic_1', 'document_u', 'irelevant'),
    ('topic_1', 'document_w', 'relevant'),
    ('topic_1', 'document_x', 'irelevant'),
    ('topic_1', 'document_z', 'relevant'),
]
# Result list with the judgement of document_b changed accordingly
results = [
    # topic_id,  document_id, position, score, judgement
    ('topic_1', 'document_a', 0,  1.00, 'relevant'),
    #('topic_1', 'document_b', 1,  1.00, 'irelevant'),
    ('topic_1', 'document_b', 1,  1.00, 'relevant'),
    ('topic_1', 'document_c', 2,  1.00, 'relevant'),
    ('topic_1', 'document_d', 3,  1.00, 'relevant'),
    ('topic_1', 'document_e', 4,  1.00, 'relevant'),
    ('topic_1', 'document_f', 5,  1.00, 'irelevant'),
    ('topic_1', 'document_g', 6,  1.00, 'not judged'),
    ('topic_1', 'document_h', 7,  1.00, 'relevant'),
    ('topic_1', 'document_i', 8,  0.98, 'relevant'),
    ('topic_1', 'document_j', 9,  0.97, 'relevant'),
    ('topic_1', 'document_k', 10, 0.93, 'relevant'),
    ('topic_1', 'document_l', 11, 0.93, 'irelevant'),
    ('topic_1', 'document_m', 12, 0.93, 'irelevant'),
    ('topic_1', 'document_n', 13, 0.93, 'relevant'),
    ('topic_1', 'document_o', 14, 0.93, 'not judged'),
    ('topic_1', 'document_p', 15, 0.93, 'irelevant'),
    ('topic_1', 'document_q', 16, 0.93, 'irelevant'),
    ('topic_1', 'document_r', 17, 0.93, 'relevant'),
    ('topic_1', 'document_s', 18, 0.92, 'irelevant'),
    ('topic_1', 'document_t', 19, 0.92, 'irelevant'),
    ('topic_1', 'document_u', 20, 0.91, 'irelevant'),
    ('topic_1', 'document_v', 21, 0.89, 'not judged'),
    ('topic_1', 'document_w', 22, 0.88, 'relevant'),
    ('topic_1', 'document_x', 23, 0.88, 'irelevant'),
    ('topic_1', 'document_y', 24, 0.83, 'not judged'),
    ('topic_1', 'document_z', 25, 0.83, 'relevant'),
]
# Compare both variants at the same cut-off
bpref_trec_eval(ground_truth, results, at)
bpref_at_suitable(ground_truth, results, at)

2016-08-29 13:18:50,363 - root - INFO - R is 13
2016-08-29 13:18:50,364 - root - INFO - N is 9
2016-08-29 13:18:50,364 - root - INFO - Bpref result: bpref = 0.38461538461538464 = (1.0 / 13) * 5.0
2016-08-29 13:18:50,365 - root - INFO - R is 13
2016-08-29 13:18:50,365 - root - INFO - N is 9
2016-08-29 13:18:50,366 - root - INFO - Bpref result: bpref = 1.0 = (1.0 / 5) * 5.0


In [10]:
# Experiment: documents a, c, d and e are judged irrelevant instead of relevant, so the
# first five results are all irrelevant. Both Bpref variants are evaluated at the `at` cut-off.
# NOTE: this cell rebinds the global `ground_truth` / `results`; cells executed afterwards
# (including re-runs of earlier cells) see this data.
ground_truth = [
    # topic_id,  document_id,  judgement
    #('topic_1', 'document_a', 'relevant'),
    ('topic_1', 'document_a', 'irelevant'),
    ('topic_1', 'document_b', 'irelevant'),
    #('topic_1', 'document_c', 'relevant'),
    #('topic_1', 'document_d', 'relevant'),
    #('topic_1', 'document_e', 'relevant'),
    ('topic_1', 'document_c', 'irelevant'),
    ('topic_1', 'document_d', 'irelevant'),
    ('topic_1', 'document_e', 'irelevant'),
    ('topic_1', 'document_f', 'irelevant'),
    ('topic_1', 'document_h', 'relevant'),
    ('topic_1', 'document_i', 'relevant'),
    ('topic_1', 'document_j', 'relevant'),
    ('topic_1', 'document_k', 'relevant'),
    ('topic_1', 'document_l', 'irelevant'),
    ('topic_1', 'document_m', 'irelevant'),
    ('topic_1', 'document_n', 'relevant'),
    ('topic_1', 'document_p', 'irelevant'),
    ('topic_1', 'document_q', 'irelevant'),
    ('topic_1', 'document_r', 'relevant'),
    ('topic_1', 'document_s', 'irelevant'),
    ('topic_1', 'document_t', 'irelevant'),
    ('topic_1', 'document_u', 'irelevant'),
    ('topic_1', 'document_w', 'relevant'),
    ('topic_1', 'document_x', 'irelevant'),
    ('topic_1', 'document_z', 'relevant'),
]
# Result list with the judgements of documents a, c, d and e changed accordingly
results = [
    # topic_id,  document_id, position, score, judgement
    #('topic_1', 'document_a', 0,  1.00, 'relevant'),
    ('topic_1', 'document_a', 0,  1.00, 'irelevant'),
    ('topic_1', 'document_b', 1,  1.00, 'irelevant'),
    #('topic_1', 'document_c', 2,  1.00, 'relevant'),
    #('topic_1', 'document_d', 3,  1.00, 'relevant'),
    #('topic_1', 'document_e', 4,  1.00, 'relevant'),
    ('topic_1', 'document_c', 2,  1.00, 'irelevant'),
    ('topic_1', 'document_d', 3,  1.00, 'irelevant'),
    ('topic_1', 'document_e', 4,  1.00, 'irelevant'),
    ('topic_1', 'document_f', 5,  1.00, 'irelevant'),
    ('topic_1', 'document_g', 6,  1.00, 'not judged'),
    ('topic_1', 'document_h', 7,  1.00, 'relevant'),
    ('topic_1', 'document_i', 8,  0.98, 'relevant'),
    ('topic_1', 'document_j', 9,  0.97, 'relevant'),
    ('topic_1', 'document_k', 10, 0.93, 'relevant'),
    ('topic_1', 'document_l', 11, 0.93, 'irelevant'),
    ('topic_1', 'document_m', 12, 0.93, 'irelevant'),
    ('topic_1', 'document_n', 13, 0.93, 'relevant'),
    ('topic_1', 'document_o', 14, 0.93, 'not judged'),
    ('topic_1', 'document_p', 15, 0.93, 'irelevant'),
    ('topic_1', 'document_q', 16, 0.93, 'irelevant'),
    ('topic_1', 'document_r', 17, 0.93, 'relevant'),
    ('topic_1', 'document_s', 18, 0.92, 'irelevant'),
    ('topic_1', 'document_t', 19, 0.92, 'irelevant'),
    ('topic_1', 'document_u', 20, 0.91, 'irelevant'),
    ('topic_1', 'document_v', 21, 0.89, 'not judged'),
    ('topic_1', 'document_w', 22, 0.88, 'relevant'),
    ('topic_1', 'document_x', 23, 0.88, 'irelevant'),
    ('topic_1', 'document_y', 24, 0.83, 'not judged'),
    ('topic_1', 'document_z', 25, 0.83, 'relevant'),
]
# Compare both variants at the same cut-off
bpref_trec_eval(ground_truth, results, at)
bpref_at_suitable(ground_truth, results, at)

2016-08-29 13:18:50,396 - root - INFO - R is 8
2016-08-29 13:18:50,396 - root - INFO - N is 14
2016-08-29 13:18:50,397 - root - INFO - Bpref result: bpref = 0.0 = (1.0 / 8) * 0.0
2016-08-29 13:18:50,397 - root - INFO - R is 8
2016-08-29 13:18:50,398 - root - INFO - N is 14
2016-08-29 13:18:50,398 - root - INFO - Bpref result: bpref = 0.0 = (1.0 / 5) * 0.0


In [11]:
# Experiment: document_d is judged irrelevant instead of relevant, so the first five
# results hold three relevant and two irrelevant documents.
# NOTE: this cell rebinds the global `ground_truth` / `results`; cells executed afterwards
# (including re-runs of earlier cells) see this data.
ground_truth = [
    # topic_id,  document_id,  judgement
    ('topic_1', 'document_a', 'relevant'),
    ('topic_1', 'document_b', 'irelevant'),
    ('topic_1', 'document_c', 'relevant'),
    #('topic_1', 'document_d', 'relevant'),
    ('topic_1', 'document_d', 'irelevant'),
    ('topic_1', 'document_e', 'relevant'),
    ('topic_1', 'document_f', 'irelevant'),
    ('topic_1', 'document_h', 'relevant'),
    ('topic_1', 'document_i', 'relevant'),
    ('topic_1', 'document_j', 'relevant'),
    ('topic_1', 'document_k', 'relevant'),
    ('topic_1', 'document_l', 'irelevant'),
    ('topic_1', 'document_m', 'irelevant'),
    ('topic_1', 'document_n', 'relevant'),
    ('topic_1', 'document_p', 'irelevant'),
    ('topic_1', 'document_q', 'irelevant'),
    ('topic_1', 'document_r', 'relevant'),
    ('topic_1', 'document_s', 'irelevant'),
    ('topic_1', 'document_t', 'irelevant'),
    ('topic_1', 'document_u', 'irelevant'),
    ('topic_1', 'document_w', 'relevant'),
    ('topic_1', 'document_x', 'irelevant'),
    ('topic_1', 'document_z', 'relevant'),
]
# Result list with the judgement of document_d changed accordingly
results = [
    # topic_id,  document_id, position, score, judgement
    ('topic_1', 'document_a', 0,  1.00, 'relevant'),
    ('topic_1', 'document_b', 1,  1.00, 'irelevant'),
    ('topic_1', 'document_c', 2,  1.00, 'relevant'),
    #('topic_1', 'document_d', 3,  1.00, 'relevant'),
    ('topic_1', 'document_d', 3,  1.00, 'irelevant'),
    ('topic_1', 'document_e', 4,  1.00, 'relevant'),
    ('topic_1', 'document_f', 5,  1.00, 'irelevant'),
    ('topic_1', 'document_g', 6,  1.00, 'not judged'),
    ('topic_1', 'document_h', 7,  1.00, 'relevant'),
    ('topic_1', 'document_i', 8,  0.98, 'relevant'),
    ('topic_1', 'document_j', 9,  0.97, 'relevant'),
    ('topic_1', 'document_k', 10, 0.93, 'relevant'),
    ('topic_1', 'document_l', 11, 0.93, 'irelevant'),
    ('topic_1', 'document_m', 12, 0.93, 'irelevant'),
    ('topic_1', 'document_n', 13, 0.93, 'relevant'),
    ('topic_1', 'document_o', 14, 0.93, 'not judged'),
    ('topic_1', 'document_p', 15, 0.93, 'irelevant'),
    ('topic_1', 'document_q', 16, 0.93, 'irelevant'),
    ('topic_1', 'document_r', 17, 0.93, 'relevant'),
    ('topic_1', 'document_s', 18, 0.92, 'irelevant'),
    ('topic_1', 'document_t', 19, 0.92, 'irelevant'),
    ('topic_1', 'document_u', 20, 0.91, 'irelevant'),
    ('topic_1', 'document_v', 21, 0.89, 'not judged'),
    ('topic_1', 'document_w', 22, 0.88, 'relevant'),
    ('topic_1', 'document_x', 23, 0.88, 'irelevant'),
    ('topic_1', 'document_y', 24, 0.83, 'not judged'),
    ('topic_1', 'document_z', 25, 0.83, 'relevant'),
]
# Compare both variants at the same cut-off
bpref_trec_eval(ground_truth, results, at)
bpref_at_suitable(ground_truth, results, at)

2016-08-29 13:18:50,429 - root - INFO - R is 11
2016-08-29 13:18:50,429 - root - INFO - N is 11
2016-08-29 13:18:50,430 - root - INFO - Bpref result: bpref = 0.2479338842975207 = (1.0 / 11) * 2.7272727272727275
2016-08-29 13:18:50,430 - root - INFO - R is 11
2016-08-29 13:18:50,430 - root - INFO - N is 11
2016-08-29 13:18:50,431 - root - INFO - Bpref result: bpref = 0.5454545454545455 = (1.0 / 5) * 2.7272727272727275


In [12]:
# Experiment: documents a and d are judged irrelevant instead of relevant, so the first
# five results hold two relevant and three irrelevant documents.
# NOTE: this cell rebinds the global `ground_truth` / `results`; cells executed afterwards
# (including re-runs of earlier cells) see this data.
ground_truth = [
    # topic_id,  document_id,  judgement
    #('topic_1', 'document_a', 'relevant'),
    ('topic_1', 'document_a', 'irelevant'),
    ('topic_1', 'document_b', 'irelevant'),
    ('topic_1', 'document_c', 'relevant'),
    #('topic_1', 'document_d', 'relevant'),
    ('topic_1', 'document_d', 'irelevant'),
    ('topic_1', 'document_e', 'relevant'),
    ('topic_1', 'document_f', 'irelevant'),
    ('topic_1', 'document_h', 'relevant'),
    ('topic_1', 'document_i', 'relevant'),
    ('topic_1', 'document_j', 'relevant'),
    ('topic_1', 'document_k', 'relevant'),
    ('topic_1', 'document_l', 'irelevant'),
    ('topic_1', 'document_m', 'irelevant'),
    ('topic_1', 'document_n', 'relevant'),
    ('topic_1', 'document_p', 'irelevant'),
    ('topic_1', 'document_q', 'irelevant'),
    ('topic_1', 'document_r', 'relevant'),
    ('topic_1', 'document_s', 'irelevant'),
    ('topic_1', 'document_t', 'irelevant'),
    ('topic_1', 'document_u', 'irelevant'),
    ('topic_1', 'document_w', 'relevant'),
    ('topic_1', 'document_x', 'irelevant'),
    ('topic_1', 'document_z', 'relevant'),
]
# Result list with the judgements of documents a and d changed accordingly
results = [
    # topic_id,  document_id, position, score, judgement
    #('topic_1', 'document_a', 0,  1.00, 'relevant'),
    ('topic_1', 'document_a', 0,  1.00, 'irelevant'),
    ('topic_1', 'document_b', 1,  1.00, 'irelevant'),
    ('topic_1', 'document_c', 2,  1.00, 'relevant'),
    #('topic_1', 'document_d', 3,  1.00, 'relevant'),
    ('topic_1', 'document_d', 3,  1.00, 'irelevant'),
    ('topic_1', 'document_e', 4,  1.00, 'relevant'),
    ('topic_1', 'document_f', 5,  1.00, 'irelevant'),
    ('topic_1', 'document_g', 6,  1.00, 'not judged'),
    ('topic_1', 'document_h', 7,  1.00, 'relevant'),
    ('topic_1', 'document_i', 8,  0.98, 'relevant'),
    ('topic_1', 'document_j', 9,  0.97, 'relevant'),
    ('topic_1', 'document_k', 10, 0.93, 'relevant'),
    ('topic_1', 'document_l', 11, 0.93, 'irelevant'),
    ('topic_1', 'document_m', 12, 0.93, 'irelevant'),
    ('topic_1', 'document_n', 13, 0.93, 'relevant'),
    ('topic_1', 'document_o', 14, 0.93, 'not judged'),
    ('topic_1', 'document_p', 15, 0.93, 'irelevant'),
    ('topic_1', 'document_q', 16, 0.93, 'irelevant'),
    ('topic_1', 'document_r', 17, 0.93, 'relevant'),
    ('topic_1', 'document_s', 18, 0.92, 'irelevant'),
    ('topic_1', 'document_t', 19, 0.92, 'irelevant'),
    ('topic_1', 'document_u', 20, 0.91, 'irelevant'),
    ('topic_1', 'document_v', 21, 0.89, 'not judged'),
    ('topic_1', 'document_w', 22, 0.88, 'relevant'),
    ('topic_1', 'document_x', 23, 0.88, 'irelevant'),
    ('topic_1', 'document_y', 24, 0.83, 'not judged'),
    ('topic_1', 'document_z', 25, 0.83, 'relevant'),
]
# Compare both variants at the same cut-off
bpref_trec_eval(ground_truth, results, at)
bpref_at_suitable(ground_truth, results, at)

2016-08-29 13:18:50,462 - root - INFO - R is 10
2016-08-29 13:18:50,462 - root - INFO - N is 12
2016-08-29 13:18:50,463 - root - INFO - Bpref result: bpref = 0.15000000000000002 = (1.0 / 10) * 1.5
2016-08-29 13:18:50,463 - root - INFO - R is 10
2016-08-29 13:18:50,464 - root - INFO - N is 12
2016-08-29 13:18:50,464 - root - INFO - Bpref result: bpref = 0.30000000000000004 = (1.0 / 5) * 1.5


In [13]:
# Experiment: documents a, d and e are judged irrelevant instead of relevant, so the first
# five results hold a single relevant document (c) ranked below two irrelevant ones.
# NOTE: this cell rebinds the global `ground_truth` / `results`; cells executed afterwards
# (including re-runs of earlier cells) see this data.
ground_truth = [
    # topic_id,  document_id,  judgement
    #('topic_1', 'document_a', 'relevant'),
    ('topic_1', 'document_a', 'irelevant'),
    ('topic_1', 'document_b', 'irelevant'),
    ('topic_1', 'document_c', 'relevant'),
    #('topic_1', 'document_d', 'relevant'),
    ('topic_1', 'document_d', 'irelevant'),
    #('topic_1', 'document_e', 'relevant'),
    ('topic_1', 'document_e', 'irelevant'),
    ('topic_1', 'document_f', 'irelevant'),
    ('topic_1', 'document_h', 'relevant'),
    ('topic_1', 'document_i', 'relevant'),
    ('topic_1', 'document_j', 'relevant'),
    ('topic_1', 'document_k', 'relevant'),
    ('topic_1', 'document_l', 'irelevant'),
    ('topic_1', 'document_m', 'irelevant'),
    ('topic_1', 'document_n', 'relevant'),
    ('topic_1', 'document_p', 'irelevant'),
    ('topic_1', 'document_q', 'irelevant'),
    ('topic_1', 'document_r', 'relevant'),
    ('topic_1', 'document_s', 'irelevant'),
    ('topic_1', 'document_t', 'irelevant'),
    ('topic_1', 'document_u', 'irelevant'),
    ('topic_1', 'document_w', 'relevant'),
    ('topic_1', 'document_x', 'irelevant'),
    ('topic_1', 'document_z', 'relevant'),
]
# Result list with the judgements of documents a, d and e changed accordingly
results = [
    # topic_id,  document_id, position, score, judgement
    #('topic_1', 'document_a', 0,  1.00, 'relevant'),
    ('topic_1', 'document_a', 0,  1.00, 'irelevant'),
    ('topic_1', 'document_b', 1,  1.00, 'irelevant'),
    ('topic_1', 'document_c', 2,  1.00, 'relevant'),
    #('topic_1', 'document_d', 3,  1.00, 'relevant'),
    ('topic_1', 'document_d', 3,  1.00, 'irelevant'),
    #('topic_1', 'document_e', 4,  1.00, 'relevant'),
    ('topic_1', 'document_e', 4,  1.00, 'irelevant'),
    ('topic_1', 'document_f', 5,  1.00, 'irelevant'),
    ('topic_1', 'document_g', 6,  1.00, 'not judged'),
    ('topic_1', 'document_h', 7,  1.00, 'relevant'),
    ('topic_1', 'document_i', 8,  0.98, 'relevant'),
    ('topic_1', 'document_j', 9,  0.97, 'relevant'),
    ('topic_1', 'document_k', 10, 0.93, 'relevant'),
    ('topic_1', 'document_l', 11, 0.93, 'irelevant'),
    ('topic_1', 'document_m', 12, 0.93, 'irelevant'),
    ('topic_1', 'document_n', 13, 0.93, 'relevant'),
    ('topic_1', 'document_o', 14, 0.93, 'not judged'),
    ('topic_1', 'document_p', 15, 0.93, 'irelevant'),
    ('topic_1', 'document_q', 16, 0.93, 'irelevant'),
    ('topic_1', 'document_r', 17, 0.93, 'relevant'),
    ('topic_1', 'document_s', 18, 0.92, 'irelevant'),
    ('topic_1', 'document_t', 19, 0.92, 'irelevant'),
    ('topic_1', 'document_u', 20, 0.91, 'irelevant'),
    ('topic_1', 'document_v', 21, 0.89, 'not judged'),
    ('topic_1', 'document_w', 22, 0.88, 'relevant'),
    ('topic_1', 'document_x', 23, 0.88, 'irelevant'),
    ('topic_1', 'document_y', 24, 0.83, 'not judged'),
    ('topic_1', 'document_z', 25, 0.83, 'relevant'),
]
# Compare both variants at the same cut-off
bpref_trec_eval(ground_truth, results, at)
bpref_at_suitable(ground_truth, results, at)

2016-08-29 13:18:50,495 - root - INFO - R is 9
2016-08-29 13:18:50,495 - root - INFO - N is 13
2016-08-29 13:18:50,496 - root - INFO - Bpref result: bpref = 0.08641975308641975 = (1.0 / 9) * 0.7777777777777778
2016-08-29 13:18:50,496 - root - INFO - R is 9
2016-08-29 13:18:50,497 - root - INFO - N is 13
2016-08-29 13:18:50,497 - root - INFO - Bpref result: bpref = 0.15555555555555556 = (1.0 / 5) * 0.7777777777777778


In [14]:
# Experiment: the judgements of documents a and b are swapped (a irrelevant, b relevant),
# so the single irrelevant document within the first five results is ranked first.
# NOTE: this cell rebinds the global `ground_truth` / `results`; cells executed afterwards
# (including re-runs of earlier cells) see this data.
ground_truth = [
    # topic_id,  document_id,  judgement
    #('topic_1', 'document_a', 'relevant'),
    #('topic_1', 'document_b', 'irelevant'),
    ('topic_1', 'document_a', 'irelevant'),
    ('topic_1', 'document_b', 'relevant'),
    ('topic_1', 'document_c', 'relevant'),
    ('topic_1', 'document_d', 'relevant'),
    ('topic_1', 'document_e', 'relevant'),
    ('topic_1', 'document_f', 'irelevant'),
    ('topic_1', 'document_h', 'relevant'),
    ('topic_1', 'document_i', 'relevant'),
    ('topic_1', 'document_j', 'relevant'),
    ('topic_1', 'document_k', 'relevant'),
    ('topic_1', 'document_l', 'irelevant'),
    ('topic_1', 'document_m', 'irelevant'),
    ('topic_1', 'document_n', 'relevant'),
    ('topic_1', 'document_p', 'irelevant'),
    ('topic_1', 'document_q', 'irelevant'),
    ('topic_1', 'document_r', 'relevant'),
    ('topic_1', 'document_s', 'irelevant'),
    ('topic_1', 'document_t', 'irelevant'),
    ('topic_1', 'document_u', 'irelevant'),
    ('topic_1', 'document_w', 'relevant'),
    ('topic_1', 'document_x', 'irelevant'),
    ('topic_1', 'document_z', 'relevant'),
]
# Result list with the judgements of documents a and b swapped accordingly
results = [
    # topic_id,  document_id, position, score, judgement
    #('topic_1', 'document_a', 0,  1.00, 'relevant'),
    #('topic_1', 'document_b', 1,  1.00, 'irelevant'),
    ('topic_1', 'document_a', 0,  1.00, 'irelevant'),
    ('topic_1', 'document_b', 1,  1.00, 'relevant'),
    ('topic_1', 'document_c', 2,  1.00, 'relevant'),
    ('topic_1', 'document_d', 3,  1.00, 'relevant'),
    ('topic_1', 'document_e', 4,  1.00, 'relevant'),
    ('topic_1', 'document_f', 5,  1.00, 'irelevant'),
    ('topic_1', 'document_g', 6,  1.00, 'not judged'),
    ('topic_1', 'document_h', 7,  1.00, 'relevant'),
    ('topic_1', 'document_i', 8,  0.98, 'relevant'),
    ('topic_1', 'document_j', 9,  0.97, 'relevant'),
    ('topic_1', 'document_k', 10, 0.93, 'relevant'),
    ('topic_1', 'document_l', 11, 0.93, 'irelevant'),
    ('topic_1', 'document_m', 12, 0.93, 'irelevant'),
    ('topic_1', 'document_n', 13, 0.93, 'relevant'),
    ('topic_1', 'document_o', 14, 0.93, 'not judged'),
    ('topic_1', 'document_p', 15, 0.93, 'irelevant'),
    ('topic_1', 'document_q', 16, 0.93, 'irelevant'),
    ('topic_1', 'document_r', 17, 0.93, 'relevant'),
    ('topic_1', 'document_s', 18, 0.92, 'irelevant'),
    ('topic_1', 'document_t', 19, 0.92, 'irelevant'),
    ('topic_1', 'document_u', 20, 0.91, 'irelevant'),
    ('topic_1', 'document_v', 21, 0.89, 'not judged'),
    ('topic_1', 'document_w', 22, 0.88, 'relevant'),
    ('topic_1', 'document_x', 23, 0.88, 'irelevant'),
    ('topic_1', 'document_y', 24, 0.83, 'not judged'),
    ('topic_1', 'document_z', 25, 0.83, 'relevant'),
]
# Compare both variants at the same cut-off
bpref_trec_eval(ground_truth, results, at)
bpref_at_suitable(ground_truth, results, at)

2016-08-29 13:18:50,528 - root - INFO - R is 12
2016-08-29 13:18:50,528 - root - INFO - N is 10
2016-08-29 13:18:50,529 - root - INFO - Bpref result: bpref = 0.3 = (1.0 / 12) * 3.6
2016-08-29 13:18:50,529 - root - INFO - R is 12
2016-08-29 13:18:50,530 - root - INFO - N is 10
2016-08-29 13:18:50,530 - root - INFO - Bpref result: bpref = 0.7200000000000001 = (1.0 / 5) * 3.6
