## Query Marketplace

This notebook constructs a "query marketplace": at runtime, our system selects the fingerprints with the highest utility-to-cost ratio and performs lookups only on those fingerprints.

In [1]:
import pickle
import numpy as np
from ExtractBootlegScores import *
import itertools
import numba as nb
from numba import njit
from collections import defaultdict
import dill
from glob import iglob

**Load in databases and counts**

In [2]:
# Directory holding one pickled database per n-gram type included in our marketplace database
# (each "*.pkl" file in it is loaded into the dictionary `d` below).
db_dir = "/data1/kji/databases_v4d/105mill"

In [3]:
# Path to the pickled dictionary that maps each n-gram type to its probability of correctness
# (loaded below as `utilities`).
probabilities_dir = "/data1/kji/databases_random/probabilities.pkl"

In [6]:
# Path to the pickled dictionary that maps each number to the piece name (a string)
# (loaded below as `num_to_piece`).
#piece_mapping_dir = "num_to_piece.pkl"
piece_mapping_dir = "/home/kji/ImprovedSheetID/num_to_piece.pkl"

In [7]:
# Databases keyed by combination string (e.g. "015"); filled by the loading loop below.
d = {}

In [None]:
# Load every "*.pkl" database in db_dir into d, keyed by the file name without
# its 4-character ".pkl" suffix (e.g. ".../015.pkl" -> d["015"]).
# recursive=True has no effect here because the pattern contains no "**".
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
for filename in iglob(f"{db_dir}/*.pkl", recursive=True):
    combination = filename.split('/')[-1][:-4]
    with open(filename, "rb") as f:
        d[combination] = pickle.load(f)
    print(f"finished {combination}")

finished 015
finished 014
finished 035
finished 02
finished 03
finished 05
finished 013
finished 04
finished 023
finished 025
finished 045


In [7]:
# Load the per-combination probabilities; utility() divides these by the match count.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(probabilities_dir, "rb") as f:
    utilities = pickle.load(f)

In [8]:
# Load the piece-number -> piece-name mapping used when recording offsets.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(piece_mapping_dir, 'rb') as f:
    num_to_piece = pickle.load(f)

In [9]:
# Enumerate the column-offset patterns used for fingerprints: every pattern starts at
# offset 0 and adds n_gram-1 further offsets drawn from 1..5, for n_gram = 1, 2, 3
# (1 + 5 + 10 = 16 patterns in total).
combinations = []
for n_gram in range(1, 4):
    combinations += [[0] + list(tup) for tup in itertools.combinations(range(1, 6), n_gram-1)]

In [10]:
# Encode each offset pattern as a string of digits, e.g. [0, 1, 5] -> "015";
# these strings are the keys used to index the databases and `utilities`.
combinations = ["".join(str(num) for num in combination) for combination in combinations]

In [11]:
# Bit weights 2**61, 2**60, ..., 2**0: compute_fingerprint takes the dot product of a
# 62-element column with these to obtain one integer per column.
powers = 1 << np.arange(62)[::-1]

In [12]:
def compute_fingerprint(cols):
    """Hash each column to an integer and bundle the hashes into a fingerprint tuple.

    Each column is reduced to ``int(column.dot(powers))``. Returns ``None`` when
    every hash is zero (this includes an empty ``cols``); otherwise returns the
    tuple of hashes in input order.
    """
    hashes = tuple(int(column.dot(powers)) for column in cols)
    # An all-zero fingerprint carries no information, so it is reported as None.
    if not any(hashes):
        return None
    return hashes

Our utility to cost ratio is defined as the probability of correctness (utility) divided by the number of matches (cost), because the number of matches equals the number of lookups we would have to do for that fingerprint.

In [13]:
def utility(combination, matches):
    """Return the utility-to-cost ratio: the combination's probability of
    correctness (from ``utilities``) divided by its number of matches."""
    probability = utilities[combination]
    return probability / matches

In [14]:
def get_ratios(bscore_query, rindex_dict):
    """Score every (combination, query position) pair by its utility-to-cost ratio.

    Inputs: an L x 62 bootleg score query and our dictionary, where
            rindex_dict[combination][fp] = (count, {dictionary of pieces and offsets})
    Output: an object array of shape (len(combinations), L, 3). Entry [idx, j] is
            (utility:cost ratio, combination, fingerprint) for combinations[idx]
            anchored at query position j, or (0, None, None) when that fingerprint
            cannot be formed, is all zeros, or has no matches in rindex_dict.
    """
    num_positions = len(bscore_query)
    # Size the table from the actual combination list instead of a hard-coded 16,
    # so adding or removing combinations cannot cause an IndexError or wasted rows.
    ratios = np.full((len(combinations), num_positions, 3), None, dtype=object)
    ratios[..., 0] = 0
    for j in range(num_positions):
        # calculate utility to cost ratio for every n-gram combination
        for idx, combination in enumerate(combinations):
            if combination not in rindex_dict:
                continue
            cols = []
            # we need enough remaining columns for all the offsets in our combination
            try:
                for i in combination:
                    cols.append(bscore_query[j+int(i)])
            except IndexError:
                continue
            fp = compute_fingerprint(cols)
            if not fp or fp not in rindex_dict[combination]:
                continue
            matches = rindex_dict[combination][fp][0]
            if matches == 0:
                continue
            # Assign field by field: writing the whole (ratio, combination, fp) tuple
            # at once would ask numpy to broadcast a tuple containing a nested tuple.
            ratios[idx, j, 0] = utility(combination, matches)
            ratios[idx, j, 1] = combination
            ratios[idx, j, 2] = fp
    return ratios

In [15]:
def update_offset_dict(offset_dict, pieces_and_offsets, i, num_lookups):
    """Record up to ``num_lookups`` matches of one fingerprint in ``offset_dict``.

    Input: offset_dict, a mapping from piece name to a list of offsets (it must
           create missing keys on access, e.g. defaultdict(list));
           pieces_and_offsets, a dictionary mapping pieces to offsets for a given
           n-gram; i, the query position the fingerprint was taken from; and
           num_lookups, the number of lookups we can do.
    Output: None. offset_dict is updated in place with (offset - i) for each
            lookup performed; it is later used for the histogram of offsets.
    """
    remaining = num_lookups
    # A non-positive budget means no lookups. Guarding with <= (not ==) keeps a
    # negative value from reaching the slice below, where [:negative] would
    # silently select the wrong elements.
    if remaining <= 0:
        return
    for piece, positions in pieces_and_offsets.items():
        # Slicing past the end simply yields the whole list.
        taken = positions[:remaining]
        offset_dict[num_to_piece[piece]].extend(j - i for j in taken)
        remaining -= len(positions)
        if remaining <= 0:
            break

In [16]:
def get_fingerprints(bscore_query, rindex_dict, ratios, runtime_budget):
    """Spend the runtime budget on the best fingerprints at each query position.

    The budget is split evenly across the L positions of the bootleg score query
    (runtime_budget // L per position); budget left unused at one position
    carries over to the next. At each position the candidates in ratios[:, i]
    are visited from highest to lowest utility:cost ratio, and their matches are
    recorded via update_offset_dict until the available budget runs out.

    Inputs: bscore_query, the query (only its length is used here);
            rindex_dict, where rindex_dict[combination][fp] = (count, {pieces and offsets});
            ratios, the table produced by get_ratios;
            runtime_budget, the maximum number of matches to process.
    Output: (offset_dict, matches_processed), where offset_dict holds all the
            lookups performed for the query and is later used to compute the
            histogram of offsets. An empty query yields an empty dictionary and 0.
    """
    l = len(bscore_query)
    offset_dict = defaultdict(list)
    matches_processed = 0
    # An empty query has no positions to split the budget over (and would
    # otherwise divide by zero below).
    if l == 0:
        return offset_dict, matches_processed
    aisle_budget = runtime_budget // l
    cur_budget = aisle_budget
    for i in range(l):
        ranked = sorted(ratios[:, i], key=lambda entry: entry[0], reverse=True)
        for _, combination, fp in ranked:
            # Stop at the first empty slot (fp is None) or once the budget is spent.
            if not fp or cur_budget <= 0:
                break
            matches, pieces_and_offsets = rindex_dict[combination][fp]
            num_lookups = min(matches, cur_budget)
            update_offset_dict(offset_dict, pieces_and_offsets, i, num_lookups)
            cur_budget -= num_lookups
            matches_processed += num_lookups
        # Unspent budget rolls over to the next position.
        cur_budget += aisle_budget
    return offset_dict, matches_processed

In [17]:
def rankHistograms(offset_dict, bin_size=3):
    """Rank pieces with the histogram of offsets method.

    For each piece, its offsets are binned into buckets of width ``bin_size``
    (starting at the piece's minimum offset) and the piece's score is the count
    in its fullest bucket. Pieces with no offsets are skipped.

    Inputs: offset_dict, a mapping from piece name to a list of offsets;
            bin_size, the bucket width. The default is 3, the value this
            function previously always used regardless of the argument.
    Output: a list of (piece, score) tuples sorted by score, highest first.
    Raises: ValueError if bin_size is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be positive")
    pieceScores = []
    for key, h in offset_dict.items():
        if not h:
            continue
        minh = min(h)
        # +2 leaves headroom so the index of the maximum offset is always in range.
        hist = np.zeros(int((max(h) - minh) / bin_size) + 2)
        for offset in h:
            hist[int((offset - minh) / bin_size)] += 1
        pieceScores.append((key, np.max(hist)))

    pieceScores.sort(key=lambda x: x[1], reverse=True)
    return pieceScores

In [18]:
def saveToFile(outfile, imagefile, pieceScores, profileDur, matches_processed):
    """Pickle (query name, pieceScores, profileDur, matches_processed) to outfile.

    The query name is the base name of imagefile without its extension.
    Nothing is written when outfile is falsy (None or an empty string).
    """
    if not outfile:
        return
    with open(outfile, 'wb') as out:
        query, _ext = os.path.splitext(os.path.basename(imagefile))
        record = (query, pieceScores, profileDur, matches_processed)
        pickle.dump(record, out)

In [47]:
def processQuery_cameraWrapper(queryfile, rindex, outdir, runtime_budget):
    """Run processSingleQuery with PDF=False and save the result to
    <outdir>/<query base name without extension>.hyp.

    Wrapper for running multiple jobs in parallel.
    """
    query_name, _ext = os.path.splitext(os.path.basename(queryfile)) # e.g. p1_q1
    hyp_outfile = "{}/{}.hyp".format(outdir, query_name)
    # might change later to print to outfile
    pieceScores = processSingleQuery(queryfile, rindex, runtime_budget, False, hyp_outfile)
    return pieceScores

In [53]:
def processQuery_scannedWrapper(queryfile, rindex, outdir, runtime_budget):
    """Run processSingleQuery with PDF=True and save the result to
    <outdir>/<query base name without extension>.hyp.

    Wrapper for running multiple jobs in parallel.
    """
    query_name, _ext = os.path.splitext(os.path.basename(queryfile)) # e.g. p1_q1
    hyp_outfile = "{}/{}.hyp".format(outdir, query_name)
    # might change later to print to outfile
    pieceScores = processSingleQuery(queryfile, rindex, runtime_budget, True, hyp_outfile)
    return pieceScores

In [57]:
def processSingleQuery(imagefile, rindex, runtime_budget, PDF = False, outfile = None):
    """Identify the piece shown in one query image.

    Inputs: a file representing a query image, a reverse index dictionary mapping each n-gram to its
            offsets in IMSLP, and a runtime budget for the query. When PDF is True the bootleg
            score comes from processQueryPDF (instead of processQuery) and every '-' in the image
            path is rewritten to '_q' before the result is saved. outfile, if given, is where the
            result is pickled.
    Output: a sorted list of predicted pieces and their scores based on the histogram of offsets method.
    """
    profileStart = time.time()

    # Get Bootleg Score, using the extractor that matches the input type
    extract = processQueryPDF if PDF else processQuery
    bscore_query = extract(imagefile).T
    if PDF:
        imagefile = imagefile.replace('-','_q')
    print(bscore_query.shape)

    # Start of the search phase (recorded but not currently reported)
    searchStart = time.time()

    # Generate and rank histograms
    ratios = get_ratios(bscore_query, rindex)
    offset_dict, matches_processed = get_fingerprints(bscore_query, rindex, ratios, runtime_budget)
    pieceScores = rankHistograms(offset_dict)

    # Profile & save to file
    profileDur = time.time() - profileStart
    print(matches_processed)
    print(profileDur)
    saveToFile(outfile, imagefile, pieceScores, profileDur, matches_processed)
    return pieceScores

In [58]:
# Sanity check on a single camera query with a budget of 1000 matches (PDF defaults to False, no output file).
processSingleQuery('data/CameraDataset/queries/p1_q1.jpg', d, 1000)

Processing data/CameraDataset/queries/p1_q1.jpg
(131, 62)
910
0.8897643089294434


[('p1', 115.0),
 ('dGouin,_PierreNocturnes,_Op.9_86550', 74.0),
 ('dGouin,_PierreNocturnes,_Op.9_112335', 45.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_86550', 43.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_00470', 37.0),
 ('dGouin,_PierreNocturnes,_Op.9_00470', 28.0),
 ('dGouin,_PierreNocturnes,_Op.9_34915', 23.0),
 ('dGouin,_PierreNocturnes,_Op.9_34916', 23.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_113996', 19.0),
 ('dGouin,_PierreNocturnes,_Op.9_80717', 16.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_112335', 11.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_34915', 10.0),
 ('dGouin,_PierreNocturnes,_Op.9_113996', 8.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_34916', 5.0),
 ('dClementi,_Muzio6_Piano_Sonatas,_Op.25_400279', 2.0),
 ('dScriabin,_Aleksandr24_Preludes,_Op.11_10496', 2.0),
 ('dBach,_Johann_SebastianPrelude_and_Fugue_in_A-flat_major,_BWV_886_02194',
  2.0),
 ('dTeilman,_ChristianNocturne_in_F_minor_296208', 2.0),
 ('dLiszt,_FranzSoir%C3%A

In [59]:
# Sanity check on a single scanned query; PDF=True routes extraction through processQueryPDF.
processSingleQuery('data/ScannedDataset/p1/p1-0.jpg', d, 1000, True)

Processing data/ScannedDataset/p1/p1-0.jpg
(283, 62)
843
1.28297758102417


[('p1', 154.0),
 ('dGouin,_PierreNocturnes,_Op.9_86550', 61.0),
 ('dGouin,_PierreNocturnes,_Op.9_112335', 53.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_86550', 32.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_00470', 28.0),
 ('dGouin,_PierreNocturnes,_Op.9_34915', 16.0),
 ('dGouin,_PierreNocturnes,_Op.9_34916', 8.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_113996', 5.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_34915', 4.0),
 ('dGouin,_PierreNocturnes,_Op.9_80717', 3.0),
 ('dDelacroix,_AugusteOffrande_505979', 3.0),
 ('dScriabin,_Aleksandr24_Preludes,_Op.11_10496', 2.0),
 ('dGouin,_PierreNocturnes,_Op.9_00470', 2.0),
 ('dGouin,_PierreNocturnes,_Op.9_113996', 2.0),
 ('dCzerny,_Carl3_Brillante_Fantasien_%C3%BCber_%27B%C3%A4bu%27,_Op.540_360617',
  2.0),
 ('dMandyczewski,_Eusebius8_Klavierst%C3%BCcke,_Op.76_84325', 2.0),
 ('dBeethoven,_Ludwig_vanPiano_Sonata_No.17,_Op.31_No.2_328874', 1.0),
 ('dMerikanto,_OskarPiano_Pieces,_Op.86_515812', 1.0),
 ('dHummel,_Johann_Nep

Here we set the runtime budget to 1000, which means we can process at most 1000 matches per query.

In [60]:
# Maximum number of matches to process per query in the batch run below.
runtime_budget = 1000

In [61]:
# Inputs and outputs of the batch run below (one query image path per line in query_list).
query_list = 'cfg_new/scanned.train' # list of query images
outdir = f'experiments/scannedDataset/hyp' # where to save hypothesis output files

In [62]:
# prep output directory
if not os.path.isdir(outdir):
    os.makedirs(outdir)

# load reverse index. Recommend keeping load=False and loading it earlier.
load = False
if load:
    print("LOADING RINDEX")
    rindex1 = []
    with open(pickle_file, 'rb') as f:
        rindex1 = pickle.load(f)
    rindex_filter = rindex1

print("STARTING PROCESSING")
# number of cores to use
multiprocess = False
if multiprocess:
    n_cores = 30 #multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=n_cores)

inputs = []
with open(query_list, 'r') as f:
    for line in f:
        inputs.append((line.rstrip(), outdir))

#########################
# Make sure to edit the wrapper to fit the dataset
#######################
if multiprocess:
    # process queries in parallel
    outputs = list(pool.starmap(processQuery_scannedWrapper, inputs))
else:
    for i in inputs:
        processQuery_scannedWrapper(i[0], d, i[1], runtime_budget)

STARTING PROCESSING
Processing data/ScannedDataset/p111/p111-1.jpg
(228, 62)
912
1.841083288192749
Processing data/ScannedDataset/p111/p111-0.jpg
(166, 62)
990
1.2156226634979248
Processing data/ScannedDataset/p141/p141-1.jpg
(229, 62)
912
1.1152005195617676
Processing data/ScannedDataset/p141/p141-2.jpg
(218, 62)
872
1.1586480140686035
Processing data/ScannedDataset/p141/p141-4.jpg
(237, 62)
944
1.1873936653137207
Processing data/ScannedDataset/p141/p141-0.jpg
(251, 62)
750
1.2124426364898682
Processing data/ScannedDataset/p141/p141-3.jpg
(233, 62)
920
1.1364452838897705
Processing data/ScannedDataset/p45/p45-1.jpg
(89, 62)
968
0.9568631649017334
Processing data/ScannedDataset/p45/p45-2.jpg
(84, 62)
913
0.9612371921539307
Processing data/ScannedDataset/p45/p45-3.jpg
(76, 62)
968
0.9585294723510742
Processing data/ScannedDataset/p45/p45-0.jpg
(86, 62)
924
0.9779052734375
Processing data/ScannedDataset/p181/p181-7.jpg
(235, 62)
936
1.191288948059082
Processing data/ScannedDataset/p181/p

(157, 62)
936
1.3204140663146973
Processing data/ScannedDataset/p125/p125-1.jpg
(167, 62)
830
1.3649499416351318
Processing data/ScannedDataset/p125/p125-4.jpg
(140, 62)
973
1.2018399238586426
Processing data/ScannedDataset/p125/p125-2.jpg
(184, 62)
915
1.395003080368042
Processing data/ScannedDataset/p125/p125-3.jpg
(202, 62)
800
4.41144323348999
Processing data/ScannedDataset/p125/p125-5.jpg
(187, 62)
930
1.2236106395721436
Processing data/ScannedDataset/p165/p165-1.jpg
(300, 62)
891
1.3172976970672607
Processing data/ScannedDataset/p165/p165-2.jpg
(302, 62)
903
1.584329605102539
Processing data/ScannedDataset/p165/p165-0.jpg
(86, 62)
924
0.7902061939239502
Processing data/ScannedDataset/p165/p165-3.jpg
(283, 62)
846
1.5806910991668701
Processing data/ScannedDataset/p115/p115-5.jpg
(201, 62)
804
1.6008546352386475
Processing data/ScannedDataset/p115/p115-4.jpg
(194, 62)
965
1.2414801120758057
Processing data/ScannedDataset/p115/p115-1.jpg
(227, 62)
900
1.2760655879974365
Processing d

  kmeans = KMeans(n_clusters=numClusters, n_init=1, random_state=0).fit(r.reshape(-1, 1))


(194, 62)
965
1.2929625511169434
Processing data/ScannedDataset/p61/p61-9.jpg
(160, 62)
960
1.350198745727539
Processing data/ScannedDataset/p61/p61-10.jpg
(165, 62)
984
1.4765689373016357
Processing data/ScannedDataset/p61/p61-4.jpg
(182, 62)
905
1.289121150970459
Processing data/ScannedDataset/p61/p61-0.jpg
(165, 62)
984
1.3666374683380127
Processing data/ScannedDataset/p61/p61-6.jpg
(185, 62)
925
1.1991572380065918
Processing data/ScannedDataset/p61/p61-7.jpg
(191, 62)
950
1.3604748249053955
Processing data/ScannedDataset/p61/p61-5.jpg
(174, 62)
865
1.3358383178710938
Processing data/ScannedDataset/p61/p61-8.jpg
(187, 62)
930
1.304861068725586
Processing data/ScannedDataset/p121/p121-1.jpg
(120, 62)
952
1.5898685455322266
Processing data/ScannedDataset/p121/p121-0.jpg
(90, 62)
979
1.3542070388793945
Processing data/ScannedDataset/p121/p121-5.jpg
(110, 62)
972
1.1812734603881836
Processing data/ScannedDataset/p121/p121-4.jpg
(94, 62)
940
1.3369765281677246
Processing data/ScannedData

  kmeans = KMeans(n_clusters=numClusters, n_init=1, random_state=0).fit(r.reshape(-1, 1))


(221, 62)
884
1.3327476978302002
Processing data/ScannedDataset/p151/p151-2.jpg


  kmeans = KMeans(n_clusters=numClusters, n_init=1, random_state=0).fit(r.reshape(-1, 1))


(236, 62)
936
1.4173409938812256
Processing data/ScannedDataset/p151/p151-1.jpg
(250, 62)
996
1.2993733882904053
Processing data/ScannedDataset/p55/p55-2.jpg
(36, 62)
972
0.7692539691925049
Processing data/ScannedDataset/p55/p55-0.jpg
(150, 62)
894
1.0515408515930176
Processing data/ScannedDataset/p55/p55-1.jpg
(178, 62)
880
0.9378688335418701
Processing data/ScannedDataset/p71/p71-0.jpg
(161, 62)
960
1.1033196449279785
Processing data/ScannedDataset/p71/p71-1.jpg
(182, 62)
910
1.056628942489624
Processing data/ScannedDataset/p41/p41-1.jpg
(254, 62)
762
1.4234552383422852
Processing data/ScannedDataset/p41/p41-0.jpg
(244, 62)
972
1.202993392944336
Processing data/ScannedDataset/p85/p85-2.jpg
(175, 62)
870
1.2143192291259766
Processing data/ScannedDataset/p85/p85-4.jpg
(152, 62)
912
1.1767516136169434
Processing data/ScannedDataset/p85/p85-0.jpg
(199, 62)
990
1.1962246894836426
Processing data/ScannedDataset/p85/p85-1.jpg
(160, 62)
954
1.2639071941375732
Processing data/ScannedDataset/p