In [1]:
import pickle
import numpy as np
# from ExtractBootlegFeatures import *
from ExtractTest import *
import itertools
import numba as nb
from numba import jit
from collections import defaultdict

In [2]:
import dill
from glob import iglob

**Load in databases and counts**

In [3]:
# Directory that holds the pickled fingerprint databases (*.pkl files) loaded into `d` below.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
db_dir = "/data1/kji/databases_v2/200mill"

In [4]:
# Maps a combination name (the .pkl file name without its suffix, e.g. "015")
# to the database unpickled from that file.
d = {}

In [5]:
# Load every pickled database found directly inside db_dir into d.
# NOTE(review): recursive=True only changes how a "**" pattern is matched, so it has no effect here.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
for filename in iglob(f"{db_dir}/*.pkl", recursive=True):
    # key = file name without its directory and without the 4-character ".pkl" suffix
    combination = filename.split('/')[-1][:-4]
    with open(filename, "rb") as f:
        d[combination] = pickle.load(f)
    print(f"finished {combination}")

finished 015
finished 014
finished 02
finished 03
finished 05
finished 013
finished 04
finished 023
finished 045
finished 034
finished 01
finished 012
finished 0
finished 024


In [6]:
# Load the table that utility() indexes by combination name.
# NOTE(review): this file lives under ".../databases/" while db_dir points at
# ".../databases_v2/" - confirm the two are meant to be used together.
with open("/data1/kji/databases/probabilities.pkl", "rb") as f:
    utilities = pickle.load(f)

In [7]:
# Lookup used by update_offset_dict to translate the piece keys stored in the
# databases into the keys of the offset dictionary.
with open("num_to_piece.pkl", 'rb') as f:
    num_to_piece = pickle.load(f)

In [8]:
# Enumerate the column-offset patterns of all 1-, 2- and 3-grams: each pattern starts
# at offset 0 and adds n_gram-1 increasing offsets drawn from 1..5 (1 + 5 + 10 = 16 patterns).
combinations = []
for n_gram in range(1, 4):
    combinations += [[0] + list(tup) for tup in itertools.combinations(range(1, 6), n_gram-1)]

In [9]:
# Collapse each offset list into a digit string (e.g. [0, 1, 5] -> "015"), the same
# form as the keys of the database dictionary d.
# NOTE: this rebinds `combinations` from a list of lists to a list of strings.
combinations = ["".join(str(num) for num in combination) for combination in combinations]

In [10]:
# Descending powers of two [2**61, ..., 2**1, 2**0]; compute_fingerprint dots each
# query column with this vector to turn it into a single integer.
powers = 1 << np.arange(62)[::-1]

In [11]:
def compute_fingerprint(cols):
    """Hash a sequence of bootleg-score columns into an n-gram fingerprint.

    Each column is reduced to one integer by taking its dot product with the
    module-level `powers` vector (descending powers of two).

    Returns the tuple of per-column integers, or None when every column hashes
    to 0 (this includes an empty `cols`).
    """
    hashes = tuple(int(column.dot(powers)) for column in cols)
    # An all-zero n-gram carries no information, so it is reported as "no fingerprint".
    if not any(hashes):
        return None
    return hashes

In [12]:
def utility(combination, matches):
    """Return the utility-to-cost ratio of one lookup.

    The numerator is the entry of the module-level `utilities` table for
    `combination`; the denominator is `matches`, the match count stored with
    the fingerprint in the database.
    """
    gain = utilities[combination]
    return gain / matches

In [13]:
def get_ratios(bscore_query, rindex_dict):
    """Score every (n-gram combination, query position) pair by utility per match.

    Inputs: an L x 62 bootleg score query and our dictionary, where
            rindex_dict[combination][fp] = (count, {dictionary of pieces and offsets})

    Returns an object array of shape (len(combinations), L, 3). ratios[idx][j] is the
    triple (ratio, combination, fingerprint) for combination number idx anchored at
    query column j. A slot keeps the placeholder (0, None, None) when no lookup is
    possible: the n-gram runs past the end of the query, every selected column hashes
    to 0, or the combination / fingerprint is not in rindex_dict.
    """
    l = len(bscore_query)
    # One row per combination (previously a hard-coded 16, which only matched by
    # coincidence of the current combination list). Object arrays start out as None.
    ratios = np.empty((len(combinations), l, 3), dtype=object)
    ratios[:, :, 0] = 0
    # Column offsets of each combination, parsed once instead of once per position.
    offsets_per_combination = [[int(ch) for ch in combination] for combination in combinations]
    for j in range(l):
        # calculate utility to cost ratio for every combination anchored at column j
        for idx, combination in enumerate(combinations):
            offsets = offsets_per_combination[idx]
            # we need enough columns for all the indices in our combination
            if j + max(offsets) >= l:
                continue
            cols = [bscore_query[j + off] for off in offsets]
            fp = compute_fingerprint(cols)
            if not fp or combination not in rindex_dict or fp not in rindex_dict[combination]:
                continue
            matches = rindex_dict[combination][fp][0]
            # Fill the three slots one by one so the fingerprint tuple is stored as a
            # single object rather than being broadcast.
            ratios[idx, j, 0] = utility(combination, matches)
            ratios[idx, j, 1] = combination
            ratios[idx, j, 2] = fp
    return ratios

In [14]:
def update_offset_dict(offset_dict, pieces_and_offsets, num_lookups, i):
    """Record the hits of one database lookup in `offset_dict` (modified in place).

    For every piece in `pieces_and_offsets`, each stored position j is turned into
    the pair (j - i, num_lookups), where i is the query column of the lookup, and
    appended to the list kept under the piece's name (via `num_to_piece`).
    """
    for piece_id, positions in pieces_and_offsets.items():
        shifted = [(position - i, num_lookups) for position in positions]
        offset_dict[num_to_piece[piece_id]] += shifted

In [15]:
def get_fingerprints(bscore_query, rindex_dict, ratios, runtime_budget):
    """Spend the lookup budget on the best-ratio fingerprints and collect their hits.

    Parameters:
        bscore_query:   the query; only its length L is used here.
        rindex_dict:    rindex_dict[combination][fp] = (count, {piece: offsets}).
        ratios:         array from get_ratios; ratios[:, i] holds the
                        (ratio, combination, fingerprint) rows for query column i.
        runtime_budget: total number of lookups allowed for the whole query.

    Every column receives runtime_budget / L lookups; whatever a column does not
    use (or overdraws) carries over to the following columns. Within a column the
    lookups are taken in decreasing ratio order until an empty slot is reached or
    the running budget is negative.

    Returns a defaultdict(list) mapping piece name -> list of (offset, num_lookups)
    pairs. An empty query yields an empty dictionary.
    """
    offset_dict = defaultdict(list)
    l = len(bscore_query)
    if l == 0:
        # Nothing to look up; also avoids dividing the budget by zero below.
        return offset_dict
    aisle_budget = runtime_budget / l
    cur_budget = aisle_budget
    for i in range(l):
        col = ratios[:, i]
        # Number of candidate lookups in this column. Count rows that carry a
        # fingerprint: np.count_nonzero(col) would count every truthy cell of the
        # (rows, 3) slice, i.e. roughly three per candidate.
        num_lookups = sum(1 for entry in col if entry[2])
        lookups = sorted(col, key = lambda x: x[0], reverse = True)
        for _, combination, n_gram in lookups:
            if not n_gram or cur_budget < 0:
                break
            pieces_and_offsets = rindex_dict[combination][n_gram][1]
            update_offset_dict(offset_dict, pieces_and_offsets, num_lookups, i)
            cur_budget -= 1
        cur_budget += aisle_budget
    return offset_dict

In [20]:
def rankHistograms(offset_dict, bin_size=2):
    """Rank pieces by the height of the tallest bin in their offset histogram.

    Parameters:
        offset_dict: maps piece -> list of (offset, num_lookups) pairs.
        bin_size:    histogram bin width. The argument used to be overwritten with 2
                     inside the function; the default is now 2 (same results for
                     callers that omit it) and an explicit value is honoured.

    Returns a list of (piece, score) tuples sorted by decreasing score. Pieces with
    an empty hit list are skipped.
    """
    pieceScores = []
    for key, hits in offset_dict.items():
        if not hits:
            # no evidence for this piece; max()/min() below would raise on an empty list
            continue
        offsets = [hit[0] for hit in hits]
        maxh = max(offsets)
        minh = min(offsets)
        if maxh > minh + bin_size:
            hist = np.zeros(int((maxh - minh) / bin_size) + 2)
            for offset in offsets:
                # hit[1] (the number of lookups done for that column) is available
                # for weighting, but every hit currently counts as 1
                hist[int((offset - minh) / bin_size)] += 1
            score = np.max(hist)
        else:
            # NOTE(review): a piece whose offsets all lie within one bin width gets
            # score 1 no matter how many hits it has - confirm this is intended,
            # since tightly clustered hits look like the strongest evidence.
            score = 1
        pieceScores.append((key, score))

    pieceScores.sort(key = lambda x: x[1], reverse=True)
    return pieceScores

In [21]:
def processSingleQuery(imagefile, rindex, runtime_budget, outfile = None):
    """Run the full retrieval pipeline on one query image.

    Parameters:
        imagefile:      path of the query image, handed to processQuery.
        rindex:         database dictionary, rindex[combination][fp] = (count, {piece: offsets}).
        runtime_budget: lookup budget forwarded to get_fingerprints.
        outfile:        optional path; when given, the result is pickled there by saveToFile.

    Prints the elapsed wall-clock time in seconds (bootleg-score extraction included)
    and returns the ranked list of (piece, score) tuples from rankHistograms.
    """
    profileStart = time.time()

    # Get Bootleg Score; transposed because get_ratios indexes the query by its
    # first axis (it documents an L x 62 input)
    bscore_query = processQuery(imagefile).T

    # Generate and rank histograms
    ratios = get_ratios(bscore_query, rindex)
    offset_dict = get_fingerprints(bscore_query, rindex, ratios, runtime_budget)
    pieceScores = rankHistograms(offset_dict)

    # Profile & save to file
    profileDur = time.time() - profileStart
    print(profileDur)
    saveToFile(outfile, imagefile, pieceScores, profileDur)
    return pieceScores

In [22]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [23]:
# Run the pipeline once on a single query image against all loaded databases
# with a runtime budget of 50; the ranked (piece, score) list is displayed.
processSingleQuery('data/queries/p115_q2.jpg', d, 50)

Processing data/queries/p115_q2.jpg
1.406092643737793


[('dSaint-Sa%C3%ABns,_CamilleCaprice_sur_les_airs_de_ballet_d%27Alceste_de_Gluck_26963',
  1),
 ('dHeller,_Stephen3_St%C3%A4ndchen,_Op.131_253466', 1),
 ('dWidor,_Charles-MarieOrgan_Symphony_No.7,_Op.42_No.3_82911', 1),
 ('dBach,_August_WilhelmOrgelst%C3%BCcke,_Pr%C3%A4ludien_und_Fugen_96259', 1),
 ('dLachner,_Franz_PaulPiano_Sonata,_Op.25_468417', 1),
 ('dKullak,_TheodorLieder_aus_alter_Zeit,_Op.111_17356', 1),
 ('dReger,_MaxAus_meinem_Tagebuch,_Op.82_428964', 1),
 ('dLiszt,_FranzR%C3%A9miniscences_de_Don_Juan,_S.418_30802', 1),
 ('dKuhe,_WilhelmFantasia_on_%27Lucrezia_Borgia%27,_Op.96_418415', 1),
 ('dKnuth,_J%C3%BCrgenMusicus_Theoretico-Practicus_463740', 1),
 ('dHartmann,_Johan_Peter_EmiliusFantasy_for_Organ_134078', 1),
 ('dSmart,_Henry_Thomas12_Pieces_for_the_Organ_100242', 1),
 ('dSchytte,_Ludvig6_Brillante_Vortragset%C3%BCden,_Op.73_412935', 1),
 ('dBatiste,_Edouard12_Offertoires,_Opp.36-41_441857', 1),
 ('dKalkbrenner,_Christian12_Variations_on_%27Freut_euch_des_Lebens%27_5118

In [24]:
def saveToFile(outfile, imagefile, pieceScores, profileDur):
    """Pickle one query result to `outfile`; do nothing when `outfile` is falsy.

    The stored object is the tuple (query, pieceScores, profileDur), where query is
    the file name of `imagefile` without directory and extension.
    """
    if not outfile:
        return
    with open(outfile, 'wb') as f:
        stem, _extension = os.path.splitext(os.path.basename(imagefile))
        record = (stem, pieceScores, profileDur)
        pickle.dump(record, f)

In [25]:
def processQuery_wrapper(queryfile, rindex, outdir, runtime_budget):
    """Process one query and save its hypothesis file under `outdir`.

    Wrapper for running multiple jobs in parallel: derives the output path
    "<outdir>/<query basename>.hyp" and delegates to processSingleQuery,
    whose ranked (piece, score) list is returned.
    """
    basename = os.path.splitext(os.path.basename(queryfile))[0] # e.g. p1_q1
    hyp_outfile = "{}/{}.hyp".format(outdir, basename)
    return processSingleQuery(queryfile, rindex, runtime_budget, hyp_outfile)

In [26]:
# Lookup budget per query for the batch runs below; get_fingerprints spends one
# unit of it for every database lookup.
runtime_budget = 75

In [88]:
query_list = 'cfg_files/query.train.list' # list of query images
outdir = 'experiments/v2_200mill/hyp' # where to save hypothesis output files

# prep output directory (exist_ok avoids a separate check-then-create step)
os.makedirs(outdir, exist_ok=True)

# load reverse index. Recommend keeping load=False and loading it earlier.
# NOTE(review): `pickle_file` is not defined in this notebook, so load=True fails
# until it is set; the index loaded here is also never used below (queries run against `d`).
load = False
if load:
    print("LOADING RINDEX")
    with open(pickle_file, 'rb') as f:
        rindex1 = pickle.load(f)
    rindex_filter = rindex1

print("STARTING PROCESSING")

# one (query image path, output directory) pair per non-blank line of the query list
with open(query_list, 'r') as f:
    inputs = [(line.rstrip(), outdir) for line in f if line.strip()]

multiprocess = False
if multiprocess:
    # number of cores to use
    n_cores = 25 #multiprocessing.cpu_count()
    # processQuery_wrapper takes (queryfile, rindex, outdir, runtime_budget), so the
    # 2-tuples in `inputs` have to be expanded before they are handed to starmap.
    # NOTE(review): `d` must be pickled to reach the worker processes, which is
    # likely very expensive for an index of this size.
    jobs = [(queryfile, d, hyp_dir, runtime_budget) for queryfile, hyp_dir in inputs]
    # process queries in parallel; the with-block shuts the pool down afterwards
    with multiprocessing.Pool(processes=n_cores) as pool:
        outputs = pool.starmap(processQuery_wrapper, jobs)
else:
    for i in inputs:
        processQuery_wrapper(i[0], d, i[1], runtime_budget)

STARTING PROCESSING
Processing data/queries/p1_q1.jpg
0.8999388217926025
Processing data/queries/p1_q2.jpg
1.8985929489135742
Processing data/queries/p1_q3.jpg
1.3613214492797852
Processing data/queries/p1_q4.jpg
1.3633308410644531
Processing data/queries/p1_q5.jpg
1.7088799476623535
Processing data/queries/p1_q6.jpg
2.067652702331543
Processing data/queries/p1_q7.jpg
1.4007627964019775
Processing data/queries/p1_q8.jpg
1.581275463104248
Processing data/queries/p1_q9.jpg
1.5418422222137451
Processing data/queries/p1_q10.jpg
0.7797176837921143
Processing data/queries/p5_q1.jpg
1.5250487327575684
Processing data/queries/p5_q2.jpg
2.003523826599121
Processing data/queries/p5_q3.jpg
0.8271844387054443
Processing data/queries/p5_q4.jpg
1.9034366607666016
Processing data/queries/p5_q5.jpg
0.849172830581665
Processing data/queries/p5_q6.jpg
0.8464789390563965
Processing data/queries/p5_q7.jpg
1.0548293590545654
Processing data/queries/p5_q8.jpg
0.8119087219238281
Processing data/queries/p5_q9

3.716280221939087
Processing data/queries/p75_q4.jpg
0.6981306076049805
Processing data/queries/p75_q5.jpg
2.4975273609161377
Processing data/queries/p75_q6.jpg
1.2180607318878174
Processing data/queries/p75_q7.jpg
2.4001853466033936
Processing data/queries/p75_q8.jpg
3.5896670818328857
Processing data/queries/p75_q9.jpg
2.1268727779388428
Processing data/queries/p75_q10.jpg
0.9617490768432617
Processing data/queries/p81_q1.jpg
1.1448235511779785
Processing data/queries/p81_q2.jpg
3.0903913974761963
Processing data/queries/p81_q3.jpg
1.238372802734375
Processing data/queries/p81_q4.jpg
1.1835963726043701
Processing data/queries/p81_q5.jpg
1.1767463684082031
Processing data/queries/p81_q6.jpg
2.085871696472168
Processing data/queries/p81_q7.jpg
1.3498504161834717
Processing data/queries/p81_q8.jpg
0.7390382289886475
Processing data/queries/p81_q9.jpg
2.169781446456909
Processing data/queries/p81_q10.jpg
0.7418463230133057
Processing data/queries/p85_q1.jpg
0.6723167896270752
Processing 

1.1141505241394043
Processing data/queries/p151_q5.jpg
2.168172597885132
Processing data/queries/p151_q6.jpg
0.7283785343170166
Processing data/queries/p151_q7.jpg
1.834730863571167
Processing data/queries/p151_q8.jpg
1.1541404724121094
Processing data/queries/p151_q9.jpg
1.7682068347930908
Processing data/queries/p151_q10.jpg
0.8107888698577881
Processing data/queries/p155_q1.jpg
0.8016142845153809
Processing data/queries/p155_q2.jpg
2.246748685836792
Processing data/queries/p155_q3.jpg
1.020923137664795
Processing data/queries/p155_q4.jpg
1.0847489833831787
Processing data/queries/p155_q5.jpg
0.980384111404419
Processing data/queries/p155_q6.jpg
0.7284810543060303
Processing data/queries/p155_q7.jpg
0.34597301483154297
Processing data/queries/p155_q8.jpg
1.018390417098999
Processing data/queries/p155_q9.jpg
1.0567305088043213
Processing data/queries/p155_q10.jpg
2.6816747188568115
Processing data/queries/p161_q1.jpg
2.3513641357421875
Processing data/queries/p161_q2.jpg
0.61529088020

In [22]:
query_list = 'cfg_files/query.train.list' # list of query images
outdir = 'experiments/v3_75mill/hyp' # where to save hypothesis output files

# prep output directory (exist_ok avoids a separate check-then-create step)
os.makedirs(outdir, exist_ok=True)

# load reverse index. Recommend keeping load=False and loading it earlier.
# NOTE(review): `pickle_file` is not defined in this notebook, so load=True fails
# until it is set; the index loaded here is also never used below (queries run against `d`).
load = False
if load:
    print("LOADING RINDEX")
    with open(pickle_file, 'rb') as f:
        rindex1 = pickle.load(f)
    rindex_filter = rindex1

print("STARTING PROCESSING")

# one (query image path, output directory) pair per non-blank line of the query list
with open(query_list, 'r') as f:
    inputs = [(line.rstrip(), outdir) for line in f if line.strip()]

multiprocess = False
if multiprocess:
    # number of cores to use
    n_cores = 25 #multiprocessing.cpu_count()
    # processQuery_wrapper takes (queryfile, rindex, outdir, runtime_budget), so the
    # 2-tuples in `inputs` have to be expanded before they are handed to starmap.
    # NOTE(review): `d` must be pickled to reach the worker processes, which is
    # likely very expensive for an index of this size.
    jobs = [(queryfile, d, hyp_dir, runtime_budget) for queryfile, hyp_dir in inputs]
    # process queries in parallel; the with-block shuts the pool down afterwards
    with multiprocessing.Pool(processes=n_cores) as pool:
        outputs = pool.starmap(processQuery_wrapper, jobs)
else:
    for i in inputs:
        processQuery_wrapper(i[0], d, i[1], runtime_budget)

STARTING PROCESSING
Processing data/queries/p1_q1.jpg
1.1822164058685303
Processing data/queries/p1_q2.jpg
1.7531392574310303
Processing data/queries/p1_q3.jpg
1.4765760898590088
Processing data/queries/p1_q4.jpg
2.1240317821502686
Processing data/queries/p1_q5.jpg
1.6033704280853271
Processing data/queries/p1_q6.jpg
1.945582628250122
Processing data/queries/p1_q7.jpg
1.4316473007202148
Processing data/queries/p1_q8.jpg
1.3464686870574951
Processing data/queries/p1_q9.jpg
2.2000513076782227
Processing data/queries/p1_q10.jpg
0.7445323467254639
Processing data/queries/p5_q1.jpg
2.328204870223999
Processing data/queries/p5_q2.jpg
2.010500192642212
Processing data/queries/p5_q3.jpg
1.0483293533325195
Processing data/queries/p5_q4.jpg
3.1438090801239014
Processing data/queries/p5_q5.jpg
0.7531285285949707
Processing data/queries/p5_q6.jpg
1.7487502098083496
Processing data/queries/p5_q7.jpg
0.8355906009674072
Processing data/queries/p5_q8.jpg
0.8786613941192627
Processing data/queries/p5_q

3.2837178707122803
Processing data/queries/p75_q4.jpg
1.848721981048584
Processing data/queries/p75_q5.jpg
3.9737536907196045
Processing data/queries/p75_q6.jpg
1.0850961208343506
Processing data/queries/p75_q7.jpg
2.741612434387207
Processing data/queries/p75_q8.jpg
3.387824058532715
Processing data/queries/p75_q9.jpg
3.155956268310547
Processing data/queries/p75_q10.jpg
2.209009885787964
Processing data/queries/p81_q1.jpg
1.6094176769256592
Processing data/queries/p81_q2.jpg
4.445210933685303
Processing data/queries/p81_q3.jpg
0.9958322048187256
Processing data/queries/p81_q4.jpg
1.3492743968963623
Processing data/queries/p81_q5.jpg
1.750840187072754
Processing data/queries/p81_q6.jpg
3.1721482276916504
Processing data/queries/p81_q7.jpg
0.9957451820373535
Processing data/queries/p81_q8.jpg
2.0240020751953125
Processing data/queries/p81_q9.jpg
3.1776304244995117
Processing data/queries/p81_q10.jpg
0.9619016647338867
Processing data/queries/p85_q1.jpg
2.2059648036956787
Processing dat

2.672525644302368
Processing data/queries/p151_q5.jpg
2.0040667057037354
Processing data/queries/p151_q6.jpg
0.7360286712646484
Processing data/queries/p151_q7.jpg
1.69679856300354
Processing data/queries/p151_q8.jpg
2.4695727825164795
Processing data/queries/p151_q9.jpg
1.7701573371887207
Processing data/queries/p151_q10.jpg
1.974191665649414
Processing data/queries/p155_q1.jpg
1.1156551837921143
Processing data/queries/p155_q2.jpg
2.167680263519287
Processing data/queries/p155_q3.jpg
1.185950756072998
Processing data/queries/p155_q4.jpg
0.948591947555542
Processing data/queries/p155_q5.jpg
1.2476933002471924
Processing data/queries/p155_q6.jpg
0.7319552898406982
Processing data/queries/p155_q7.jpg
0.21937179565429688
Processing data/queries/p155_q8.jpg
1.0278651714324951
Processing data/queries/p155_q9.jpg
2.115509271621704
Processing data/queries/p155_q10.jpg
3.238645553588867
Processing data/queries/p161_q1.jpg
3.1037344932556152
Processing data/queries/p161_q2.jpg
0.99881982803344

In [27]:
# Number of databases (combinations) loaded into d.
len(d)

14

In [28]:
# Will map each combination name to the number of fingerprints in its database.
n_gram_count = defaultdict(int)

In [29]:
# Record the size (number of fingerprint keys) of every loaded database.
for i in d:
    n_gram_count[i] = len(d[i])

In [30]:
# Display the per-combination fingerprint counts.
n_gram_count

defaultdict(int,
            {'015': 20196344,
             '014': 19725622,
             '02': 6740511,
             '03': 6989825,
             '05': 7381172,
             '013': 18899889,
             '04': 7343753,
             '023': 18862532,
             '045': 10377125,
             '034': 19690242,
             '01': 5969618,
             '012': 17775101,
             '0': 380992,
             '024': 19667274})

In [31]:
# Accumulator for the sum of the match counts stored with every fingerprint.
total_matches = 0

In [32]:
# Sum the stored match count (first element of each entry) over every fingerprint
# of every database. The accumulator is reset here so that re-running this cell
# does not add to a previous total.
total_matches = 0
for combination in d:
    for fp in d[combination]:
        total_matches += d[combination][fp][0]

In [33]:
# Display the total number of matches across all databases.
total_matches

180000000

## Test code

In [34]:
def get_counts(d):
    """Count, for every piece, how many fingerprint entries list it.

    `d` maps combination -> database, and each database maps
    fingerprint -> (count, {piece: offsets}). A piece is counted once per
    (combination, fingerprint) entry whose piece dictionary contains it.

    Returns a defaultdict(int) keyed by piece.
    """
    counts = defaultdict(int)
    for db in d.values():
        for entry in db.values():
            for piece in entry[1]:
                counts[piece] += 1
    return counts

In [35]:
def get_coverage(counts, total_pieces):
    """
    given a dictionary of counts (piece -> number of fingerprints), return a list
    where a[i] is the fraction (0..1, not a percentage) of the total_pieces items
    that have at least i+1 fingerprints

    The list has one entry per value from 1 up to the largest count; it is empty
    when counts is empty or no piece has a positive count.
    """
    num_pieces_per_count = defaultdict(int)
    for num in counts.values():
        num_pieces_per_count[num] += 1
    # default=0 keeps an empty `counts` from raising in max()
    largest = max(num_pieces_per_count, default=0)
    coverage = []
    running = 0
    # Walk from the largest count down so `running` is the number of pieces
    # with at least i fingerprints.
    for i in range(largest, 0, -1):
        running += num_pieces_per_count.get(i, 0)
        coverage.append(running / total_pieces)
    coverage.reverse()
    return coverage

In [36]:
# Per-piece fingerprint counts over all loaded databases.
counts = get_counts(d)

In [37]:
# Persist the per-piece counts so they can be reused without reloading the databases.
with open("data/v2_counts.pkl", "wb") as f:
    pickle.dump(counts, f)

In [38]:
# Denominator passed to get_coverage as total_pieces.
# NOTE(review): presumably the number of pieces in the corpus - confirm where 30275 comes from.
total = 30275

In [39]:
# percents[i] = share of the `total` pieces that have at least i+1 fingerprints.
percents = get_coverage(counts, total)

In [40]:
# Persist the coverage curve computed above.
with open("data/percent_180mill_v2.pkl", "wb") as f:
    pickle.dump(percents, f)