In [4]:
import pickle
import numpy as np
# from ExtractBootlegFeatures import *
from ExtractTest import *
import itertools
import numba as nb
from numba import jit
from collections import defaultdict

In [5]:
import dill
from glob import iglob

**Load in databases and counts**

In [6]:
# Directory that is globbed for "*.pkl" database files in the loading loop below.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
db_dir = "/data1/kji/databases_v2/200mill"

In [7]:
# Maps a database file's base name (without ".pkl") to the object unpickled from it;
# filled by the loading loop below and later passed to the query functions as the index.
d = {}

In [9]:
# Load every "*.pkl" file matched under db_dir into d, keyed by the file name with
# its last four characters (the ".pkl" suffix) removed.
# NOTE(review): the pattern contains no "**", so recursive=True presumably has no
# effect here -- confirm and drop it if so.
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
for filename in iglob(f"{db_dir}/*.pkl", recursive=True):
    combination = filename.split('/')[-1][:-4]
    with open(filename, "rb") as f:
        d[combination] = pickle.load(f)

In [11]:
# Load the table bound to `utilities`; utility() indexes it by combination.
# NOTE(review): this file lives under /data1/kji/databases, not the databases_v2
# tree used for db_dir -- confirm it is the table that matches the loaded databases.
with open("/data1/kji/databases/probabilities.pkl", "rb") as f:
    utilities = pickle.load(f)

In [12]:
# Load num_to_piece, which update_offset_dict() uses to translate the piece keys
# stored in the databases into the keys of the offset dictionary.
# The path is relative, so this depends on the kernel's working directory.
with open("num_to_piece.pkl", 'rb') as f:
    num_to_piece = pickle.load(f)

In [13]:
# Enumerate every index pattern used for fingerprints: each pattern starts at
# offset 0 and adds n_gram-1 further offsets chosen from 1..5, for n_gram = 1, 2, 3.
combinations = []
for n_gram in range(1, 4):
    combinations.extend(
        [0, *tup] for tup in itertools.combinations(range(1, 6), n_gram - 1)
    )

In [14]:
# Collapse each offset pattern into a digit string (e.g. [0, 2, 5] -> "025"); these
# strings are the keys used to look up a combination in the index and in `utilities`.
combinations = ["".join(map(str, combination)) for combination in combinations]

In [15]:
# Bit weights 2**61 ... 2**0 used by compute_fingerprint() to pack one 62-entry column
# into a single integer. The dtype is pinned to int64 because np.arange's default
# integer is platform dependent, and shifts up to 1 << 61 overflow a 32-bit integer.
powers = 1 << np.arange(62, dtype=np.int64)[::-1]

In [16]:
def compute_fingerprint(cols):
    """Hash each column to an integer and return the hashes as a tuple.

    Every element of ``cols`` is reduced to one integer by taking its dot product
    with the module-level ``powers`` vector.

    Returns:
        A tuple with one integer per column, or None when every hash is zero
        (which is also the result for an empty ``cols``).
    """
    hashes = tuple(int(column.dot(powers)) for column in cols)
    # A fingerprint made up solely of zero hashes is treated as "no fingerprint".
    if not any(hashes):
        return None
    return hashes

In [17]:
def utility(combination, matches):
    """Return the utility-to-cost ratio of one lookup.

    The utility is read from the module-level ``utilities`` table under
    ``combination`` and divided by ``matches``.
    """
    combination_utility = utilities[combination]
    return combination_utility / matches

In [54]:
def get_ratios(bscore_query, rindex_dict):
    """Score every (combination, query position) pair by its utility-to-cost ratio.

    Args:
        bscore_query: indexable sequence of query columns (the caller passes the
            transposed bootleg score, i.e. one row per time step); each column must
            be accepted by compute_fingerprint().
        rindex_dict: index where ``rindex_dict[combination][fp]`` is a pair whose
            first item is the match count and whose second item holds the pieces
            and offsets for that fingerprint.

    Returns:
        An object array of shape ``(len(combinations), len(bscore_query), 3)``.
        ``ratios[idx][j]`` holds ``(ratio, combination, fingerprint)`` for the
        ``idx``-th combination anchored at position ``j``, and stays
        ``(0, None, None)`` when the combination does not fit inside the query,
        the fingerprint is all zeros, or the fingerprint is not in the index.
    """
    l = len(bscore_query)
    # One row per combination rather than a hard-coded 16, so the table always
    # matches the module-level `combinations` list. Object arrays start out as None.
    ratios = np.empty((len(combinations), l, 3), dtype=object)
    ratios[..., 0] = 0
    # Column offsets encoded by each combination string (e.g. "025" -> [0, 2, 5]);
    # parsed once instead of on every inner iteration.
    offsets_by_combination = [[int(ch) for ch in combination] for combination in combinations]
    for j in range(l):
        # calculate utility to cost ratio for every combination anchored at column j
        for idx, combination in enumerate(combinations):
            offsets = offsets_by_combination[idx]
            # The combination needs all of its columns to exist inside the query.
            if j + max(offsets, default=0) >= l:
                continue
            cols = [bscore_query[j + offset] for offset in offsets]
            fp = compute_fingerprint(cols)
            if not fp or combination not in rindex_dict or fp not in rindex_dict[combination]:
                continue
            matches = rindex_dict[combination][fp][0]
            ratios[idx][j] = (utility(combination, matches), combination, fp)
    return ratios

In [55]:
def update_offset_dict(offset_dict, pieces_and_offsets, num_lookups, i):
    """Record the hits of one lookup in ``offset_dict`` (modified in place).

    For every piece key in ``pieces_and_offsets``, each stored position ``j`` becomes
    the pair ``(j - i, num_lookups)``; the pairs are appended to the list stored
    under ``num_to_piece[piece]`` (module-level mapping). Nothing is returned.
    """
    for piece_id in pieces_and_offsets:
        shifted = []
        for position in pieces_and_offsets[piece_id]:
            # Offset of the match relative to the query column being looked up.
            shifted.append((position - i, num_lookups))
        piece_name = num_to_piece[piece_id]
        offset_dict[piece_name].extend(shifted)

In [79]:
def get_fingerprints(bscore_query, rindex_dict, ratios, runtime_budget):
    """Perform index lookups in best-ratio-first order under a lookup budget.

    Args:
        bscore_query: the query columns; only its length is used here.
        rindex_dict: index where ``rindex_dict[combination][fp][1]`` holds the
            pieces and offsets for a fingerprint.
        ratios: table produced by get_ratios(); ``ratios[:, i]`` is the list of
            ``(ratio, combination, fingerprint)`` candidates for column ``i``.
        runtime_budget: total number of lookups to spread over the query. Each
            column is granted ``runtime_budget / len(bscore_query)`` lookups and
            whatever a column leaves unspent carries over to the next one.

    Returns:
        A ``defaultdict(list)`` filled by update_offset_dict(), whose values are
        lists of ``(offset, num_lookups)`` pairs. It is empty for an empty query.
    """
    offset_dict = defaultdict(list)
    l = len(bscore_query)
    if l == 0:
        # Nothing to look up; also avoids dividing the budget by zero below.
        return offset_dict
    aisle_budget = runtime_budget / l
    cur_budget = aisle_budget
    for i in range(l):
        col = ratios[:, i]
        # Number of candidate lookups for this column = rows that carry a fingerprint.
        # (np.count_nonzero(col) would count every truthy cell of the three-wide rows,
        # inflating the count by the ratio and combination cells as well.)
        num_lookups = sum(1 for row in col if row[2] is not None)
        lookups = sorted(col, key = lambda x: x[0], reverse = True)
        for _, combination, fp in lookups:
            # Stop at the first empty candidate or once the budget is overdrawn.
            if not fp or cur_budget < 0:
                break
            pieces_and_offsets = rindex_dict[combination][fp][1]
            update_offset_dict(offset_dict, pieces_and_offsets, num_lookups, i)
            cur_budget -= 1
        cur_budget += aisle_budget
    return offset_dict

In [98]:
def rankHistograms(offset_dict, bin_size=2):
    """Score each piece by the heaviest bin of its weighted offset histogram.

    Args:
        offset_dict: maps a piece to a list of ``(offset, num_lookups)`` pairs.
        bin_size: width of a histogram bin. The default of 2 is the value the
            function previously forced regardless of the argument; an explicit
            value is now honored.

    Returns:
        A list of ``(piece, score)`` pairs sorted by descending score. Pieces with
        an empty hit list are skipped.

    Raises:
        ValueError: if ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be positive")
    pieceScores = []
    for key in offset_dict:
        h = offset_dict[key]
        if not h:
            # No hits recorded for this piece; there is nothing to score.
            continue
        maxh = max(h)[0]
        minh = min(h)[0]
        if maxh > minh + bin_size:
            hist = np.zeros(int((maxh - minh) / bin_size) + 2)
            for offset, num_lookups in h:
                # each hit is down-weighted by the number of lookups made for its column
                hist[int((offset - minh) / bin_size)] += 1 / num_lookups
            pieceScores.append((key, np.max(hist)))
        else:
            # NOTE(review): every hit falls into one bin here, yet the score is a
            # constant 1 rather than the sum of the weights -- confirm this is intended.
            pieceScores.append((key, 1))

    return sorted(pieceScores, key = lambda x: x[1], reverse=True)

In [102]:
def processSingleQuery(imagefile, rindex, runtime_budget, outfile = None):
    """Run one query image through the full retrieval pipeline.

    Steps: processQuery() on the image, transpose the result, get_ratios(),
    get_fingerprints() under ``runtime_budget``, then rankHistograms().

    Args:
        imagefile: path of the query image, passed to processQuery().
        rindex: the index handed to get_ratios() and get_fingerprints().
        runtime_budget: lookup budget forwarded to get_fingerprints().
        outfile: optional path; forwarded to saveToFile(), which writes nothing
            when it is falsy.

    Returns:
        The ranked ``(piece, score)`` list from rankHistograms().

    Side effects:
        Prints the elapsed wall-clock time (which includes processQuery()) and
        calls saveToFile().
    """
    profileStart = time.time()

    # Get Bootleg Score; transposed so that the first axis indexes query columns
    bscore_query = processQuery(imagefile).T

    # Generate and rank histograms
    ratios = get_ratios(bscore_query, rindex)
    offset_dict = get_fingerprints(bscore_query, rindex, ratios, runtime_budget)
    pieceScores = rankHistograms(offset_dict)

    # Profile & save to file
    profileDur = time.time() - profileStart
    print(profileDur)
    saveToFile(outfile, imagefile, pieceScores, profileDur)
    return pieceScores

In [103]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [104]:
processSingleQuery('data/queries/p1_q3.jpg', d, 1000)

Processing data/queries/p1_q3.jpg
0.8124935626983643


[('dGoethe,_Walter4_Impromptus,_Op.6_505845', 1),
 ('dEckard,_Johann_Gottfried2_Keyboard_Sonatas,_Op.2_384198', 1),
 ('dWidor,_Charles-MarieCarnaval,_Op.61_103951', 1),
 ('dChopin,_Fr%C3%A9d%C3%A9ricNocturnes,_Op.55_112745', 1),
 ('dL%C3%BCders,_Conrad12_Grandes_%C3%A9tudes,_Op.26_357338', 1),
 ('dKullak,_TheodorKinderleben,_Op.62_365353', 1),
 ('dCramer,_Henri6_Fantaisies_%C3%A9l%C3%A9gantes_sur_des_th%C3%A8mes_favoris,_Op.74_521971',
  1),
 ('dWeiss,_JosefCarmenfantasie_nach_Bizets_Oper_10106', 1),
 ('dCzerny,_CarlPiano_Sonata_No.10,_Op.268_267371', 1),
 ('dBlumenfeld,_FelixValse-Impromptu,_Op.16_02918', 1),
 ('dTchaikovsky,_PyotrValse-Caprice,_Op.4_180667', 1),
 ('dSteibelt,_DanielCommemorative_Overture_485021', 1),
 ('dSchubert,_FranzPiano_Sonata_No.21,_D.960_31446', 1),
 ('dKowalski,_HenriBarcarolle,_Op.20_408195', 1),
 ('dPrisovsky,_VasilyChanson_du_Printemps,_Op.191_408229', 1),
 ('dBloch,_Ernest6_Preludes,_B.79_384108', 1)]

In [105]:
def saveToFile(outfile, imagefile, pieceScores, profileDur):
    """Pickle one query's result to ``outfile``; do nothing when ``outfile`` is falsy.

    The pickled object is the tuple ``(query, pieceScores, profileDur)``, where
    ``query`` is the base name of ``imagefile`` with its extension removed.
    """
    if not outfile:
        return
    with open(outfile, 'wb') as out:
        stem, _extension = os.path.splitext(os.path.basename(imagefile))
        record = (stem, pieceScores, profileDur)
        pickle.dump(record, out)

In [110]:
def processQuery_wrapper(queryfile, rindex, outdir, runtime_budget):
    """Process one query and save its hypothesis under ``outdir``.

    Wrapper intended for running multiple jobs in parallel. The output file is
    ``<outdir>/<basename>.hyp``, where ``basename`` is ``queryfile``'s file name
    without its extension.

    Returns:
        Whatever processSingleQuery() returns for this query.
    """
    basename = os.path.splitext(os.path.basename(queryfile))[0] # e.g. p1_q1
    hyp_outfile = "{}/{}.hyp".format(outdir, basename)
    # might change later to print to outfile
    return processSingleQuery(queryfile, rindex, runtime_budget, hyp_outfile)

In [111]:
# Total lookup budget per query used by the batch run below; get_fingerprints()
# divides it by the number of query columns to get the per-column allowance.
# (The single-query demo call earlier in the notebook passed 1000 instead.)
runtime_budget = 100000

In [112]:
query_list = 'cfg_files/query.train.list' # list of query images
outdir = 'experiments/v2_200mill/hyp' # where to save hypothesis output files

# prep output directory (no-op when it already exists)
os.makedirs(outdir, exist_ok=True)

# load reverse index. Recommend keeping load=False and loading it earlier.
# NOTE(review): `pickle_file` is not defined in the visible cells, and the queries
# below run against `d`, not against the index loaded here -- confirm before enabling.
load = False
if load:
    print("LOADING RINDEX")
    with open(pickle_file, 'rb') as f:
        rindex1 = pickle.load(f)
    rindex_filter = rindex1

print("STARTING PROCESSING")
multiprocess = False

# One (query path, output dir) pair per non-blank line of the list file.
inputs = []
with open(query_list, 'r') as f:
    for line in f:
        queryfile = line.rstrip()
        if queryfile:
            inputs.append((queryfile, outdir))


def _process_one(queryfile, query_outdir):
    """Run one query against the already-loaded index `d` using `runtime_budget`."""
    # Supplies the rindex and runtime_budget arguments that processQuery_wrapper
    # requires; passing the bare (queryfile, outdir) pairs to it would fail.
    return processQuery_wrapper(queryfile, d, query_outdir, runtime_budget)


if multiprocess:
    # number of cores to use
    n_cores = 25 #multiprocessing.cpu_count()
    # process queries in parallel; the context manager releases the workers afterwards
    with multiprocessing.Pool(processes=n_cores) as pool:
        outputs = pool.starmap(_process_one, inputs)
else:
    for i in inputs:
        _process_one(i[0], i[1])

STARTING PROCESSING
Processing data/queries/p1_q1.jpg
0.8815197944641113
Processing data/queries/p1_q2.jpg
0.5904567241668701
Processing data/queries/p1_q3.jpg
0.6073172092437744
Processing data/queries/p1_q4.jpg
0.5808777809143066
Processing data/queries/p1_q5.jpg
0.6070592403411865
Processing data/queries/p1_q6.jpg
0.5969784259796143
Processing data/queries/p1_q7.jpg
0.637418270111084
Processing data/queries/p1_q8.jpg
0.6267921924591064
Processing data/queries/p1_q9.jpg
0.7610173225402832
Processing data/queries/p1_q10.jpg
0.9325578212738037
Processing data/queries/p5_q1.jpg
0.6955029964447021
Processing data/queries/p5_q2.jpg
0.7891812324523926
Processing data/queries/p5_q3.jpg
0.8269245624542236
Processing data/queries/p5_q4.jpg
0.6973311901092529
Processing data/queries/p5_q5.jpg
0.7275066375732422
Processing data/queries/p5_q6.jpg
0.6883652210235596
Processing data/queries/p5_q7.jpg
0.9359574317932129
Processing data/queries/p5_q8.jpg
0.8989143371582031
Processing data/queries/p5

0.7109479904174805
Processing data/queries/p75_q4.jpg
0.6351916790008545
Processing data/queries/p75_q5.jpg
0.8547673225402832
Processing data/queries/p75_q6.jpg
0.7719199657440186
Processing data/queries/p75_q7.jpg
0.7771697044372559
Processing data/queries/p75_q8.jpg
0.7796361446380615
Processing data/queries/p75_q9.jpg
0.8234496116638184
Processing data/queries/p75_q10.jpg
0.9048278331756592
Processing data/queries/p81_q1.jpg
0.7382659912109375
Processing data/queries/p81_q2.jpg
0.6766402721405029
Processing data/queries/p81_q3.jpg
0.9021804332733154
Processing data/queries/p81_q4.jpg
0.7563443183898926
Processing data/queries/p81_q5.jpg
0.814875602722168
Processing data/queries/p81_q6.jpg
0.6906352043151855
Processing data/queries/p81_q7.jpg
0.6940679550170898
Processing data/queries/p81_q8.jpg
0.6749036312103271
Processing data/queries/p81_q9.jpg
0.712554931640625
Processing data/queries/p81_q10.jpg
0.5266304016113281
Processing data/queries/p85_q1.jpg
0.6167874336242676
Processin

0.7460205554962158
Processing data/queries/p151_q4.jpg
0.6108944416046143
Processing data/queries/p151_q5.jpg
0.6945722103118896
Processing data/queries/p151_q6.jpg
0.6814079284667969
Processing data/queries/p151_q7.jpg
0.6455521583557129
Processing data/queries/p151_q8.jpg
0.8593747615814209
Processing data/queries/p151_q9.jpg
0.7544386386871338
Processing data/queries/p151_q10.jpg
0.7301678657531738
Processing data/queries/p155_q1.jpg
0.7742505073547363
Processing data/queries/p155_q2.jpg
0.7578487396240234
Processing data/queries/p155_q3.jpg
1.0797295570373535
Processing data/queries/p155_q4.jpg
0.7820754051208496
Processing data/queries/p155_q5.jpg
0.7109434604644775
Processing data/queries/p155_q6.jpg
0.692011833190918
Processing data/queries/p155_q7.jpg
0.2183387279510498
Processing data/queries/p155_q8.jpg
0.6699185371398926
Processing data/queries/p155_q9.jpg
0.6642918586730957
Processing data/queries/p155_q10.jpg
0.7060999870300293
Processing data/queries/p161_q1.jpg
0.4603257

In [53]:
len(database)

138390672

In [202]:
n_gram_count = defaultdict(int)

In [203]:
# Tally how many keys of `database` have each length (keyed by len(key)).
# NOTE(review): `database` is not defined in any of the visible cells; it presumably
# comes from kernel state left by another cell -- confirm before Restart & Run All.
for i in database:
    n_gram_count[len(i)] += 1

In [204]:
n_gram_count

defaultdict(int, {1: 380992, 2: 28502993, 3: 109506687})

In [54]:
sum(counts.values())

249398153

## Test code

In [62]:
# For every piece, count the number of `database` keys (fingerprints) under which
# that piece appears.
# NOTE(review): `database` is not defined in any of the visible cells -- confirm its source.
piece_counts = defaultdict(int)
for fp in database:
    for piece in database[fp]:
        piece_counts[piece] += 1

In [63]:
def get_coverage(counts, total_pieces):
    """Compute, for each threshold, the fraction of pieces meeting it.

    Args:
        counts: dictionary mapping a piece to its number of unique fingerprints.
        total_pieces: denominator used for every entry (the total number of pieces).

    Returns:
        A list ``a`` where ``a[i]`` is the fraction (0..1, not a percentage) of
        ``total_pieces`` whose count is at least ``i + 1``. Its length is the
        largest count present; it is empty when ``counts`` is empty.
    """
    if not counts:
        return []
    # Histogram: how many pieces have exactly `num` fingerprints.
    num_pieces_per_count = defaultdict(int)
    for num in counts.values():
        num_pieces_per_count[num] += 1
    n = max(num_pieces_per_count)
    coverage = [0.0] * n
    cur_count = 0
    # Walk from the largest count down, accumulating "at least i" totals.
    for i in range(n, 0, -1):
        # .get() avoids inserting a zero entry for every count that never occurs.
        cur_count += num_pieces_per_count.get(i, 0)
        coverage[i - 1] = cur_count / total_pieces
    return coverage

In [64]:
total = len(num_to_piece)

In [67]:
percents = get_coverage(piece_counts, total)

In [69]:
# Persist the coverage list returned by get_coverage() so it can be reused elsewhere.
# Note that the stored values are fractions of `total`, despite the "percent" file name.
with open("data/percent_140mill.pkl", "wb") as f:
    pickle.dump(percents, f)

In [76]:
from collections import Counter

In [83]:
total

32234

In [123]:
num_to_piece[32144]

'p111'

In [87]:
# Count how many of the 200 piece ids in [32034, 32234) appear in piece_counts.
# NOTE(review): the id range is hard-coded; presumably these are the pieces that the
# queries come from (the upper bound equals `total` shown above) -- confirm.
query_count = 0
for i in range(32034, 32234):
    if i in piece_counts:
        query_count += 1

In [194]:
max(piece_counts.values())

198965

In [195]:
piece_counts[32144]

768

In [82]:
len(piece_counts)

29539