In [3]:
import itertools
import multiprocessing
import os
import pickle
import time
from collections import defaultdict

import numba as nb
import numpy as np
from numba import njit

# from ExtractBootlegFeatures import *
from ExtractTest import *

In [4]:
import dill
from glob import iglob

**Load in databases and counts**

In [5]:
# Directory that holds one pickled database table per n-gram combination (*.pkl).
db_dir = "/data1/kji/databases_v3/100mill"

In [6]:
# Maps a combination name (the .pkl file stem) to its unpickled table.
d = {}

In [7]:
# Load every .pkl file found directly in db_dir into d.
# NOTE: pickle.load can execute arbitrary code; only load databases from a trusted source.
for filename in iglob(f"{db_dir}/*.pkl", recursive=True):
    # Key = file name without its directory and without the 4-character ".pkl" suffix.
    combination = filename.split('/')[-1][:-4]
    with open(filename, "rb") as f:
        d[combination] = pickle.load(f)
    print(f"finished {combination}")

finished 015
finished 014
finished 035
finished 02
finished 03
finished 05
finished 013
finished 04
finished 023
finished 025
finished 045
finished 034
finished 01
finished 012
finished 0
finished 024


In [13]:
# Per-combination utility values; utility() indexes this object by combination name.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("/data1/kji/databases/probabilities.pkl", "rb") as f:
    utilities = pickle.load(f)

In [14]:
# Lookup from the piece keys stored in the database tables to the keys used in the
# offset dictionary (see update_offset_dict).
with open("num_to_piece.pkl", 'rb') as f:
    num_to_piece = pickle.load(f)

In [15]:
# Enumerate the column-offset patterns: each pattern starts with offset 0 and adds
# 0, 1 or 2 further offsets chosen from 1..5, giving 1 + 5 + 10 = 16 patterns.
combinations = []
for n_gram in range(1, 4):
    combinations += [[0] + list(tup) for tup in itertools.combinations(range(1, 6), n_gram-1)]

In [16]:
# Encode each offset pattern as a digit string, e.g. [0, 1, 5] -> "015".
combinations = ["".join(str(num) for num in combination) for combination in combinations]

In [17]:
# Bit weights used by compute_fingerprint to pack a 62-element column into a single
# integer: the first element gets weight 2**61 and the last element gets weight 1.
# The dtype is pinned to int64 because 2**61 overflows a 32-bit integer, which is
# NumPy's default integer type on some platforms.
powers = 1 << np.arange(62, dtype=np.int64)[::-1]

In [18]:
def compute_fingerprint(cols):
    """Pack every column in ``cols`` into one integer and return them as a tuple.

    Each column is reduced with a dot product against the module-level
    ``powers`` weights and converted to a Python int.

    Returns:
        A tuple with one integer per column, or ``None`` when every packed
        value is zero (this includes an empty ``cols``).
    """
    hashes = tuple(int(column.dot(powers)) for column in cols)
    # A fingerprint made only of zeros is treated as "no fingerprint".
    if not any(hashes):
        return None
    return hashes

In [19]:
def utility(combination, matches):
    """Return the utility stored for ``combination`` divided by ``matches``.

    ``utilities`` is the module-level lookup loaded from disk; ``matches`` is
    the divisor, so a value of zero raises ``ZeroDivisionError``.
    """
    combination_utility = utilities[combination]
    return combination_utility / matches

In [51]:
def get_ratios(bscore_query, rindex_dict):
    """Score every (combination, query position) pair by its utility:cost ratio.

    Inputs:
        bscore_query: an L x 62 bootleg score query, indexed by position.
        rindex_dict: our dictionary, where
            rindex_dict[combination][fp] = (count, {dictionary of pieces and offsets})

    Output:
        An object array of shape (len(combinations), L, 3). Entry [idx][j] is
        (ratio, combination, fingerprint) for combinations[idx] anchored at
        position j. It stays (0, None, None) when the query is too short for
        the combination at that position, when the fingerprint is all zeros,
        or when the fingerprint is not present in the database.
    """
    l = len(bscore_query)
    # One row per combination (rather than a hard-coded 16) so the table always
    # agrees with the module-level ``combinations`` list.
    ratios = np.empty((len(combinations), l, 3), dtype=object)
    ratios[..., 0] = 0  # the remaining two slots keep np.empty's None default
    for idx, combination in enumerate(combinations):
        if combination not in rindex_dict:
            continue
        table = rindex_dict[combination]
        # Each digit of the combination string is a column offset from the anchor.
        offsets = [int(digit) for digit in combination]
        # Only anchors that leave room for the largest offset have all columns;
        # range() of a non-positive number is empty, so short queries are skipped.
        for j in range(l - max(offsets)):
            fp = compute_fingerprint([bscore_query[j + offset] for offset in offsets])
            if not fp or fp not in table:
                continue
            matches = table[fp][0]
            # Fill the slots one by one so the fingerprint tuple is stored as a
            # single object instead of being broadcast by NumPy.
            ratios[idx, j, 0] = utility(combination, matches)
            ratios[idx, j, 1] = combination
            ratios[idx, j, 2] = fp
    return ratios

In [27]:
def update_offset_dict(offset_dict, pieces_and_offsets, i, num_lookups):
    """Append the offsets of matching pieces, shifted by ``i``, to ``offset_dict``.

    For each piece in ``pieces_and_offsets`` the stored positions are shifted by
    the query position ``i`` and appended to ``offset_dict`` under the piece's
    ``num_to_piece`` name. ``num_lookups`` is a soft budget: it is checked only
    after a piece has been added, so the piece that exhausts it is still
    recorded in full.
    """
    remaining = num_lookups
    for piece_id in pieces_and_offsets:
        positions = pieces_and_offsets[piece_id]
        shifted = [position - i for position in positions]
        offset_dict[num_to_piece[piece_id]].extend(shifted)
        remaining -= len(positions)
        if remaining < 0:
            break

In [22]:
def get_fingerprints(bscore_query, rindex_dict, ratios, runtime_budget):
    """Look up the best-ranked fingerprints at each query position within a budget.

    Inputs:
        bscore_query: the query; only its length is used here.
        rindex_dict: rindex_dict[combination][fp] = (count, {pieces and offsets}).
        ratios: table produced by get_ratios, indexed [combination idx][position].
        runtime_budget: total number of matches that may be looked up; it is
            split evenly across positions and unused budget carries over to
            the next position.

    Output:
        A defaultdict(list) filled by update_offset_dict. It is empty for an
        empty query.
    """
    l = len(bscore_query)
    offset_dict = defaultdict(list)
    if l == 0:
        # Nothing to look up; also avoids dividing the budget by zero.
        return offset_dict
    aisle_budget = runtime_budget / l
    cur_budget = aisle_budget
    for i in range(l):
        col = ratios[:, i]
        # Highest utility:cost ratio first.
        lookups = sorted(col, key = lambda x: x[0], reverse = True)
        for _, combination, n_gram in lookups:
            # Stop at the first entry without a fingerprint or once the budget
            # has gone negative.
            if not n_gram or cur_budget < 0:
                break
            matches, pieces_and_offsets = rindex_dict[combination][n_gram]
            # Never request more lookups than the budget that is left.
            num_lookups = min(matches, cur_budget)
            update_offset_dict(offset_dict, pieces_and_offsets, i, num_lookups)
            cur_budget -= matches
        cur_budget += aisle_budget
    return offset_dict

In [31]:
def rankHistograms(offset_dict, bin_size=3):
    """Score each piece by the fullest bin of its offset histogram.

    Inputs:
        offset_dict: maps a piece key to a list of offsets.
        bin_size: width of a histogram bin. The default is 3, the value the
            function previously always used regardless of the argument.

    Output:
        A list of (key, score) pairs sorted by score, highest first. Pieces
        with an empty offset list are skipped.
    """
    pieceScores = []
    for key, offsets in offset_dict.items():
        if not offsets:
            # max()/min() are undefined for an empty list; no offsets means no evidence.
            continue
        maxh = max(offsets)
        minh = min(offsets)
        hist = np.zeros(int((maxh - minh) / bin_size) + 2)
        for offset in offsets:
            hist[int((offset - minh) / bin_size)] += 1
        pieceScores.append((key, np.max(hist)))

    pieceScores = sorted(pieceScores, key = lambda x: x[1], reverse=True)
    return pieceScores

In [39]:
def processSingleQuery(imagefile, rindex, runtime_budget, outfile = None):
    """Run one query image through the pipeline and return the ranked pieces.

    Inputs:
        imagefile: path of the query image, passed to processQuery.
        rindex: database dictionary, passed to get_ratios and get_fingerprints.
        runtime_budget: lookup budget passed to get_fingerprints.
        outfile: optional path passed to saveToFile together with the results.

    Output:
        The list returned by rankHistograms. The elapsed wall-clock time (which
        includes processQuery) is printed and also passed to saveToFile.
    """
    profileStart = time.time()

    # Get Bootleg Score; transposed before use (get_ratios documents an L x 62 input)
    bscore_query = processQuery(imagefile)
    bscore_query = bscore_query.T

    # Generate and rank histograms
    ratios = get_ratios(bscore_query, rindex)
    offset_dict = get_fingerprints(bscore_query, rindex, ratios, runtime_budget)
    pieceScores = rankHistograms(offset_dict)

    # Profile & save to file
    profileEnd = time.time()
    profileDur = profileEnd - profileStart
    print(profileDur)
    saveToFile(outfile, imagefile, pieceScores, profileDur)
    return pieceScores

In [38]:
# Try the pipeline on a single query image with a runtime budget of 15000.
# NOTE(review): processSingleQuery calls saveToFile, which this notebook defines in a
# later cell; on a fresh top-to-bottom run this call may fail unless that name is
# also provided by the ExtractTest star import - confirm.
processSingleQuery('data/queries/p1_q1.jpg', d, 15000)

Processing data/queries/p1_q1.jpg
0.8835165500640869


[('p1', 925.0),
 ('dGouin,_PierreNocturnes,_Op.9_86550', 464.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_86550', 450.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_113996', 282.0),
 ('dGouin,_PierreNocturnes,_Op.9_113996', 264.0),
 ('dGouin,_PierreNocturnes,_Op.9_34916', 240.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_34916', 224.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_00470', 208.0),
 ('dGouin,_PierreNocturnes,_Op.9_00470', 193.0),
 ('dGouin,_PierreNocturnes,_Op.9_34915', 176.0),
 ('dGouin,_PierreNocturnes,_Op.9_112335', 174.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_34915', 164.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_112335', 143.0),
 ('dGouin,_PierreNocturnes,_Op.9_80717', 116.0),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.9_80717', 104.0),
 ('p128', 7.0),
 ('dWolff,_%C3%89douardNocturne_et_romanesca,_Op.109_32805', 6.0),
 ('dClementi,_Muzio6_Piano_Sonatas,_Op.25_400279', 4.0),
 ('dBungart,_HeinrichPr%C3%A4ludien-Album_f%C3%BCr_Orgel_oder_H

In [40]:
def saveToFile(outfile, imagefile, pieceScores, profileDur):
    """Pickle ``(query name, pieceScores, profileDur)`` to ``outfile``.

    The query name is the base name of ``imagefile`` without its extension.
    Nothing is written when ``outfile`` is falsy (``None`` or an empty string).
    """
    if not outfile:
        return
    with open(outfile, 'wb') as sink:
        stem, _extension = os.path.splitext(os.path.basename(imagefile))
        record = (stem, pieceScores, profileDur)
        pickle.dump(record, sink)

In [41]:
def processQuery_wrapper(queryfile, rindex, outdir, runtime_budget):
    """Process one query and save its hypothesis file under ``outdir``.

    Wrapper for running multiple jobs in parallel. The output path is
    ``<outdir>/<basename>.hyp``, where basename is the query file name without
    directory or extension (e.g. p1_q1).

    Output:
        Whatever processSingleQuery returns for this query.
    """
    basename = os.path.splitext(os.path.basename(queryfile))[0] # e.g. p1_q1
    hyp_outfile = "{}/{}.hyp".format(outdir, basename)
    # might change later to print to outfile
    return processSingleQuery(queryfile, rindex, runtime_budget, hyp_outfile)

In [54]:
# Lookup budget handed to every query in the batch run below.
runtime_budget = 80000

In [55]:
query_list = 'cfg_files/query.train.list' # list of query images
outdir = 'experiments/v3_100mill_test/hyp' # where to save hypothesis output files

# prep output directory
os.makedirs(outdir, exist_ok=True)

# load reverse index. Recommend keeping load=False and loading it earlier.
load = False
if load:
    print("LOADING RINDEX")
    # NOTE(review): pickle_file is not defined anywhere in the visible notebook, and
    # rindex_filter is not used below; this branch fails if load is set to True.
    rindex1 = []
    with open(pickle_file, 'rb') as f:
        rindex1 = pickle.load(f)
    rindex_filter = rindex1

print("STARTING PROCESSING")
multiprocess = False

# One (query path, output directory) pair per non-blank line of the query list.
inputs = []
with open(query_list, 'r') as f:
    for line in f:
        queryfile = line.rstrip()
        if queryfile:
            inputs.append((queryfile, outdir))


def _process_one(queryfile, query_outdir):
    """Run one query against the loaded database ``d`` with ``runtime_budget``."""
    # The database and budget are read from the notebook globals instead of being
    # passed as task arguments, so the database is not pickled for every query.
    # NOTE(review): in the parallel branch this presumably relies on worker
    # processes inheriting the notebook globals (fork start method) - confirm.
    return processQuery_wrapper(queryfile, d, query_outdir, runtime_budget)


if multiprocess:
    # number of cores to use
    n_cores = 25 #multiprocessing.cpu_count()
    # process queries in parallel; the pool is released when the block exits
    with multiprocessing.Pool(processes=n_cores) as pool:
        outputs = pool.starmap(_process_one, inputs)
else:
    for i in inputs:
        _process_one(i[0], i[1])

STARTING PROCESSING
Processing data/queries/p1_q1.jpg
0.9776628017425537
Processing data/queries/p1_q2.jpg
1.1445624828338623
Processing data/queries/p1_q3.jpg
1.0990922451019287
Processing data/queries/p1_q4.jpg
0.8505363464355469
Processing data/queries/p1_q5.jpg
1.0592858791351318
Processing data/queries/p1_q6.jpg
0.8120021820068359
Processing data/queries/p1_q7.jpg
1.0669198036193848
Processing data/queries/p1_q8.jpg
1.1034445762634277
Processing data/queries/p1_q9.jpg
0.8795375823974609
Processing data/queries/p1_q10.jpg
0.7478196620941162
Processing data/queries/p5_q1.jpg
1.1108477115631104
Processing data/queries/p5_q2.jpg
0.9782388210296631
Processing data/queries/p5_q3.jpg
1.2532539367675781
Processing data/queries/p5_q4.jpg
1.1347575187683105
Processing data/queries/p5_q5.jpg
1.287778377532959
Processing data/queries/p5_q6.jpg
1.0436620712280273
Processing data/queries/p5_q7.jpg
1.1183760166168213
Processing data/queries/p5_q8.jpg
0.8969571590423584
Processing data/queries/p5

0.8897702693939209
Processing data/queries/p75_q4.jpg
1.1049864292144775
Processing data/queries/p75_q5.jpg
1.4371554851531982
Processing data/queries/p75_q6.jpg
0.9268512725830078
Processing data/queries/p75_q7.jpg
1.2466247081756592
Processing data/queries/p75_q8.jpg
1.2883086204528809
Processing data/queries/p75_q9.jpg
1.313241958618164
Processing data/queries/p75_q10.jpg
1.347487211227417
Processing data/queries/p81_q1.jpg
1.1918296813964844
Processing data/queries/p81_q2.jpg
0.840686559677124
Processing data/queries/p81_q3.jpg
1.1622340679168701
Processing data/queries/p81_q4.jpg
1.1638538837432861
Processing data/queries/p81_q5.jpg
1.033616542816162
Processing data/queries/p81_q6.jpg
1.0959069728851318
Processing data/queries/p81_q7.jpg
0.8609483242034912
Processing data/queries/p81_q8.jpg
1.1573572158813477
Processing data/queries/p81_q9.jpg
0.8653967380523682
Processing data/queries/p81_q10.jpg
0.9474050998687744
Processing data/queries/p85_q1.jpg
1.0704026222229004
Processing 

0.7456061840057373
Processing data/queries/p151_q5.jpg
0.8157694339752197
Processing data/queries/p151_q6.jpg
0.922234058380127
Processing data/queries/p151_q7.jpg
1.0710475444793701
Processing data/queries/p151_q8.jpg
0.9091172218322754
Processing data/queries/p151_q9.jpg
0.7842602729797363
Processing data/queries/p151_q10.jpg
0.8819808959960938
Processing data/queries/p155_q1.jpg
1.048748254776001
Processing data/queries/p155_q2.jpg
0.8526928424835205
Processing data/queries/p155_q3.jpg
0.9588112831115723
Processing data/queries/p155_q4.jpg
0.8592550754547119
Processing data/queries/p155_q5.jpg
0.8511083126068115
Processing data/queries/p155_q6.jpg
0.8372888565063477
Processing data/queries/p155_q7.jpg
0.22833728790283203
Processing data/queries/p155_q8.jpg
1.0826313495635986
Processing data/queries/p155_q9.jpg
0.9873802661895752
Processing data/queries/p155_q10.jpg
0.8960647583007812
Processing data/queries/p161_q1.jpg
0.6345846652984619
Processing data/queries/p161_q2.jpg
0.6591887

In [13]:
# Number of tables currently held in d.
len(d)

16

In [7]:
# Will hold the number of entries in each table of d, keyed like d.
n_gram_count = defaultdict(int)

In [8]:
# Record how many fingerprint entries each table of d contains.
for i in d:
    n_gram_count[i] = len(d[i])

In [9]:
# Display the per-table entry counts.
n_gram_count

defaultdict(int, {'three_grams': 46654034, 'two_grams': 11721009, '0': 616977})

In [10]:
# Running total of the match counts across all tables; filled by the next cell.
total_matches = 0

In [11]:
# Add up the count stored at index 0 of every fingerprint entry in every table.
# Re-running this cell without resetting total_matches adds to the previous total.
for combination in d:
    for fp in d[combination]:
        total_matches += d[combination][fp][0]

In [12]:
# Display the summed match count.
total_matches

5734709365

## Test code

In [13]:
def get_counts(d):
    """Count, for every piece, how many fingerprint entries list it.

    ``d`` maps a combination to a table whose values are
    ``(count, pieces)`` pairs; iterating ``pieces`` yields piece keys.
    Returns a ``defaultdict(int)`` mapping each piece to the number of
    (combination, fingerprint) entries it appears in.
    """
    counts = defaultdict(int)
    for table in d.values():
        for entry in table.values():
            # entry[1] holds the pieces that contain this fingerprint.
            for piece in entry[1]:
                counts[piece] += 1
    return counts

In [14]:
def get_coverage(counts, total_pieces):
    """Return the fraction of pieces that reach each fingerprint count.

    Inputs:
        counts: maps a piece to its number of fingerprints (see get_counts).
        total_pieces: denominator used for every fraction; must be non-zero.

    Output:
        A list ``a`` where ``a[i]`` is the number of pieces with at least
        ``i + 1`` fingerprints divided by ``total_pieces`` (a fraction, not a
        percentage). The list has one entry per count up to the largest count
        and is empty when ``counts`` is empty.
    """
    if not counts:
        # max() below is undefined for an empty mapping.
        return []
    num_pieces_per_count = defaultdict(int)
    for num in counts.values():
        num_pieces_per_count[num] += 1
    n = max(num_pieces_per_count)
    coverage = [0] * n
    cur_count = 0
    # Walk from the largest count down so cur_count is "pieces with >= i".
    for i in range(n, 0, -1):
        # .get() avoids inserting missing counts into the defaultdict.
        cur_count += num_pieces_per_count.get(i, 0)
        coverage[i - 1] = cur_count / total_pieces
    return coverage

In [15]:
# Per-piece fingerprint counts over every table in d.
counts = get_counts(d)

In [17]:
# Persist the per-piece counts.
with open("data/merged_23_grams_counts.pkl", "wb") as f:
    pickle.dump(counts, f)

In [18]:
# Number of pieces used as the denominator in get_coverage below.
total = 30275

In [19]:
# percents[i] = share of the `total` pieces that have at least i + 1 fingerprints.
percents = get_coverage(counts, total)

In [20]:
# Persist the coverage curve.
with open("data/percent_merged_23_grams.pkl", "wb") as f:
    pickle.dump(percents, f)