In [3]:
import itertools
import multiprocessing
import os
import pickle
import time
from collections import defaultdict

import numba as nb
import numpy as np
from numba import njit

# from ExtractBootlegFeatures import *
from ExtractTest import *

In [4]:
import dill
from glob import iglob

**Load in databases and counts**

In [1]:
db_dir = "/data1/kji/databases/merged_dbs"

In [2]:
d = {}

In [5]:
# Load every pickled database that sits directly in db_dir into d, keyed by the
# file name without its ".pkl" extension.
# NOTE: recursive=True has no effect here because the pattern contains no "**".
# NOTE: pickle.load can run code embedded in the file; only load trusted databases.
for filename in iglob(f"{db_dir}/*.pkl", recursive=True):
    combination = filename.split('/')[-1][:-4]  # drop the directory part and ".pkl"
    with open(filename, "rb") as f:
        d[combination] = pickle.load(f)
    print(f"finished {combination}")

finished three_grams
finished two_grams
finished 0


In [3]:
db_dir = "/data1/kji/databases_v3/100mill"

In [4]:
d = {}

In [5]:
# Load every pickled database that sits directly in db_dir into d, keyed by the
# file name without its ".pkl" extension (the n-gram combination, e.g. "012").
# NOTE: recursive=True has no effect here because the pattern contains no "**".
# NOTE: pickle.load can run code embedded in the file; only load trusted databases.
for filename in iglob(f"{db_dir}/*.pkl", recursive=True):
    combination = filename.split('/')[-1][:-4]  # drop the directory part and ".pkl"
    with open(filename, "rb") as f:
        d[combination] = pickle.load(f)
    print(f"finished {combination}")

finished 015
finished 014
finished 035
finished 02
finished 03
finished 05
finished 013
finished 04
finished 023
finished 025
finished 045
finished 034
finished 01
finished 012
finished 0
finished 024


In [23]:
# utilities is indexed by n-gram combination and supplies the numerator of utility().
# NOTE: pickle.load can run code embedded in the file; only load trusted pickles.
with open("/data1/kji/databases/probabilities.pkl", "rb") as f:
    utilities = pickle.load(f)

In [24]:
# num_to_piece is indexed by the piece ids stored in the databases; it is used in
# update_offset_dict to translate those ids into the keys of the offset dictionary.
with open("num_to_piece.pkl", 'rb') as f:
    num_to_piece = pickle.load(f)

In [8]:
# Every pattern starts at column offset 0 and adds zero, one or two further
# offsets drawn from 1..5, which gives 1 + 5 + 10 = 16 offset lists.
combinations = []
for n_gram in range(1, 4):
    for tup in itertools.combinations(range(1, 6), n_gram - 1):
        combinations.append([0, *tup])

In [9]:
# Turn each offset list into its digit string, e.g. [0, 2, 4] -> "024".
combinations = ["".join(map(str, combination)) for combination in combinations]

In [10]:
# Bit weights 2**61 ... 2**0 that compute_fingerprint dots with a 62-element column.
# The dtype is pinned to int64: where NumPy's default integer is 32-bit, shifts of
# 31 bits or more would silently overflow.
powers = 1 << np.arange(62, dtype=np.int64)[::-1]

In [11]:
def compute_fingerprint(cols):
    """Pack each column into one integer and return the integers as a tuple.

    Every column is dotted with the module-level ``powers`` vector and the
    result is converted with ``int``.

    Returns ``None`` when all resulting integers are zero, which includes the
    case of an empty ``cols``.
    """
    hashes = tuple(int(column.dot(powers)) for column in cols)
    if not any(hashes):
        return None
    return hashes

In [12]:
def utility(combination, matches):
    """Return the utility-to-cost ratio of one lookup.

    The numerator is the entry for ``combination`` in the module-level
    ``utilities`` table; the denominator is ``matches``.
    """
    gain = utilities[combination]
    return gain / matches

In [145]:
def get_ratios(bscore_query, rindex_dict, max_matches=15000):
    """Score every candidate n-gram lookup at every query position.

    Inputs: an L x 62 bootleg score query and our dictionary, where
    rindex_dict[combination][fp] = (count, {dictionary of pieces and offsets}).

    Args:
        bscore_query: sequence of L columns; bscore_query[j] is one column.
        rindex_dict: databases keyed by combination string (e.g. "024").
        max_matches: fingerprints stored with more matches than this are
            skipped. Defaults to the previously hard-coded 15000.

    Returns:
        Object array of shape (len(combinations), L, 3) where ratios[idx][j] is
        (ratio, combination, fingerprint). A slot stays (0, None, None) when the
        combination would read past the end of the query, the fingerprint is
        all zero, the fingerprint is absent from the index, or it exceeds
        ``max_matches``.
    """
    l = len(bscore_query)
    # Sized from the combination list rather than a literal 16, so the two cannot drift apart.
    # np.empty with dtype=object pre-fills every slot with None; only the ratio slot needs 0.
    ratios = np.empty((len(combinations), l, 3), dtype=object)
    ratios[..., 0] = 0
    # Column offsets of each combination, e.g. "024" -> [0, 2, 4]; parsed once, not per position.
    offset_table = [[int(i) for i in combination] for combination in combinations]
    for j in range(l):
        # calculate utility to cost ratio for all n-grams starting at position j
        for idx, combination in enumerate(combinations):
            offsets = offset_table[idx]
            # we need enough remaining columns for all the indices in our combination
            if any(j + offset >= l for offset in offsets):
                continue
            fp = compute_fingerprint([bscore_query[j + offset] for offset in offsets])
            if not fp or combination not in rindex_dict or fp not in rindex_dict[combination]:
                continue
            matches = rindex_dict[combination][fp][0]
            if matches > max_matches:
                continue
            # Store slot by slot so the fingerprint tuple is kept as a single object.
            ratios[idx, j, 0] = utility(combination, matches)
            ratios[idx, j, 1] = combination
            ratios[idx, j, 2] = fp
    return ratios

In [103]:
def update_offset_dict(offset_dict, pieces_and_offsets, num_lookups, i):
    """Append query-relative offsets of every matched piece to ``offset_dict``.

    For each piece in ``pieces_and_offsets`` the stored positions are shifted
    by the query position ``i`` and appended to the list kept under
    ``num_to_piece[piece]``. ``num_lookups`` is accepted but not used.
    """
    for piece in pieces_and_offsets:
        relative = [position - i for position in pieces_and_offsets[piece]]
        bucket = offset_dict[num_to_piece[piece]]
        bucket.extend(relative)

In [104]:
def get_fingerprints(bscore_query, rindex_dict, ratios, runtime_budget):
    """Spend the lookup budget on the best-scoring n-grams and collect offsets.

    Args:
        bscore_query: the query columns; only its length is used here.
        rindex_dict: databases where rindex_dict[combination][fp][1] holds the
            pieces and offsets for a fingerprint.
        ratios: array from get_ratios; ratios[:, i] holds the
            (ratio, combination, fingerprint) candidates at position i.
        runtime_budget: total number of lookups, split evenly across positions.
            Budget left unused at one position carries over to the next.

    Returns:
        defaultdict(list) filled by update_offset_dict. It is empty when the
        query has no columns.
    """
    offset_dict = defaultdict(list)
    l = len(bscore_query)
    if l == 0:
        # Nothing to look up; this also avoids dividing the budget by zero.
        return offset_dict
    aisle_budget = runtime_budget / l
    cur_budget = aisle_budget
    for i in range(l):
        col = ratios[:, i]
        num_lookups = np.count_nonzero(col)
        # Best utility-to-cost ratio first; slots with no fingerprint carry ratio 0.
        lookups = sorted(col, key = lambda x: x[0], reverse = True)
        for _, combination, n_gram in lookups:
            if not n_gram or cur_budget < 0:
                break
            pieces_and_offsets = rindex_dict[combination][n_gram][1]
            update_offset_dict(offset_dict, pieces_and_offsets, num_lookups, i)
            cur_budget -= 1
        cur_budget += aisle_budget
    return offset_dict

In [126]:
def rankHistograms(offset_dict, bin_size=2):
    """Rank pieces by the tallest bin of their offset histogram.

    Args:
        offset_dict: maps each piece to a list of numeric offsets.
        bin_size: histogram bin width. The default is 2, the value that was
            previously forced inside the function regardless of the argument.

    Returns:
        List of (piece, score) sorted by descending score, where score is the
        largest number of offsets falling in one bin. A piece with no offsets
        scores 0.0.
    """
    pieceScores = []
    for key, offsets in offset_dict.items():
        if len(offsets) == 0:
            # max()/min() of an empty list would raise; an empty histogram scores 0.
            pieceScores.append((key, 0.0))
            continue
        offsets = np.asarray(offsets)
        # Bin index relative to the smallest offset, so every index is non-negative.
        bins = ((offsets - offsets.min()) / bin_size).astype(int)
        score = np.max(np.bincount(bins).astype(float))
        pieceScores.append((key, score))

    pieceScores.sort(key = lambda x: x[1], reverse=True)
    return pieceScores

In [132]:
def processSingleQuery(imagefile, rindex, runtime_budget, outfile = None):
    """Run one query image through the full retrieval pipeline.

    Args:
        imagefile: path of the query image.
        rindex: databases passed on to get_ratios and get_fingerprints.
        runtime_budget: lookup budget passed on to get_fingerprints.
        outfile: optional path; when given, the result is pickled there by
            saveToFile.

    Returns:
        The ranked (piece, score) list produced by rankHistograms.

    The elapsed wall-clock time covers everything from the bootleg score
    extraction to the ranking; it is printed and passed to saveToFile.
    """
    profileStart = time.time()

    # Get Bootleg Score (processQuery is not defined in this notebook), transposed
    # so that each row is one column of the score.
    bscore_query = processQuery(imagefile).T

    # Generate and rank histograms
    ratios = get_ratios(bscore_query, rindex)
    offset_dict = get_fingerprints(bscore_query, rindex, ratios, runtime_budget)
    pieceScores = rankHistograms(offset_dict)

    # Profile & save to file
    profileDur = time.time() - profileStart
    print(profileDur)
    saveToFile(outfile, imagefile, pieceScores, profileDur)
    return pieceScores

In [133]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [None]:
processSingleQuery('data/queries/p185_q7.jpg', d, 100)

In [21]:
def saveToFile(outfile, imagefile, pieceScores, profileDur):
    """Pickle ``(query, pieceScores, profileDur)`` to ``outfile``.

    ``query`` is the base name of ``imagefile`` without its extension.
    Nothing is written when ``outfile`` is falsy.
    """
    if not outfile:
        return
    with open(outfile, 'wb') as f:
        stem, _ext = os.path.splitext(os.path.basename(imagefile))
        record = (stem, pieceScores, profileDur)
        pickle.dump(record, f)

In [22]:
def processQuery_wrapper(queryfile, rindex, outdir, runtime_budget):
    """Process one query and write its hypothesis file into ``outdir``.

    Wrapper for running multiple jobs in parallel. The output file is named
    after the query image, e.g. ``<outdir>/p1_q1.hyp`` for ``p1_q1.jpg``.

    Returns:
        Whatever processSingleQuery returns for this query.
    """
    basename = os.path.splitext(os.path.basename(queryfile))[0] # e.g. p1_q1
    hyp_outfile = "{}/{}.hyp".format(outdir, basename)
    # might change later to print to outfile
    return processSingleQuery(queryfile, rindex, runtime_budget, hyp_outfile)

In [23]:
# Lookup budget per query; get_fingerprints divides it evenly across query positions.
runtime_budget = 125

In [147]:
query_list = 'cfg_files/query.train.list' # list of query images
outdir = 'experiments/v3_100mill_test/hyp' # where to save hypothesis output files

# prep output directory (exist_ok avoids the check-then-create race)
os.makedirs(outdir, exist_ok=True)

# load reverse index. Recommend keeping load=False and loading it earlier.
load = False
if load:
    print("LOADING RINDEX")
    # NOTE(review): `pickle_file` is not defined anywhere in this notebook, and
    # `rindex_filter` is never read below (queries run against `d`); confirm
    # both before enabling this branch.
    with open(pickle_file, 'rb') as f:
        rindex1 = pickle.load(f)
    rindex_filter = rindex1

print("STARTING PROCESSING")

# one (query image path, output directory) pair per non-blank line of the list
inputs = []
with open(query_list, 'r') as f:
    for line in f:
        queryfile = line.rstrip()
        if queryfile:
            inputs.append((queryfile, outdir))


def _run_query(queryfile, query_outdir):
    """Pool worker: run one query against the already-loaded databases in `d`.

    Reads `d` and `runtime_budget` from the notebook namespace so that only the
    two short strings are sent to the worker for each task.
    NOTE(review): this assumes workers inherit the notebook's globals (fork
    start method) -- confirm on the target platform.
    """
    return processQuery_wrapper(queryfile, d, query_outdir, runtime_budget)


multiprocess = False
if multiprocess:
    # number of cores to use
    n_cores = 25 #multiprocessing.cpu_count()
    # process queries in parallel; the context manager shuts the pool down afterwards
    with multiprocessing.Pool(processes=n_cores) as pool:
        outputs = list(pool.starmap(_run_query, inputs))
else:
    for queryfile, query_outdir in inputs:
        processQuery_wrapper(queryfile, d, query_outdir, runtime_budget)

STARTING PROCESSING
Processing data/queries/p1_q1.jpg
0.8061671257019043
Processing data/queries/p1_q2.jpg
0.6152687072753906
Processing data/queries/p1_q3.jpg
0.6072547435760498
Processing data/queries/p1_q4.jpg
0.6217086315155029
Processing data/queries/p1_q5.jpg
0.6363704204559326
Processing data/queries/p1_q6.jpg
0.631310224533081
Processing data/queries/p1_q7.jpg
0.6830594539642334
Processing data/queries/p1_q8.jpg
0.7114200592041016
Processing data/queries/p1_q9.jpg
0.6784603595733643
Processing data/queries/p1_q10.jpg
0.6146197319030762
Processing data/queries/p5_q1.jpg
0.6479716300964355
Processing data/queries/p5_q2.jpg
0.7547199726104736
Processing data/queries/p5_q3.jpg
0.7756764888763428
Processing data/queries/p5_q4.jpg
0.7104859352111816
Processing data/queries/p5_q5.jpg
0.7629561424255371
Processing data/queries/p5_q6.jpg
0.888373613357544
Processing data/queries/p5_q7.jpg
0.8237276077270508
Processing data/queries/p5_q8.jpg
0.8671705722808838
Processing data/queries/p5_

0.7956392765045166
Processing data/queries/p75_q4.jpg
0.6962921619415283
Processing data/queries/p75_q5.jpg
1.0720677375793457
Processing data/queries/p75_q6.jpg
0.8312463760375977
Processing data/queries/p75_q7.jpg
0.8642957210540771
Processing data/queries/p75_q8.jpg
0.9245352745056152
Processing data/queries/p75_q9.jpg
0.9954955577850342
Processing data/queries/p75_q10.jpg
0.8083291053771973
Processing data/queries/p81_q1.jpg
1.0982096195220947
Processing data/queries/p81_q2.jpg
0.9284722805023193
Processing data/queries/p81_q3.jpg
0.7963066101074219
Processing data/queries/p81_q4.jpg
0.9010531902313232
Processing data/queries/p81_q5.jpg
0.6743431091308594
Processing data/queries/p81_q6.jpg
0.7792637348175049
Processing data/queries/p81_q7.jpg
0.7626378536224365
Processing data/queries/p81_q8.jpg
0.7519230842590332
Processing data/queries/p81_q9.jpg
0.8452973365783691
Processing data/queries/p81_q10.jpg
0.6176803112030029
Processing data/queries/p85_q1.jpg
0.6974177360534668
Process

0.8806617259979248
Processing data/queries/p151_q4.jpg
0.7694265842437744
Processing data/queries/p151_q5.jpg
0.714806318283081
Processing data/queries/p151_q6.jpg
0.7243373394012451
Processing data/queries/p151_q7.jpg
0.6732492446899414
Processing data/queries/p151_q8.jpg
0.8032639026641846
Processing data/queries/p151_q9.jpg
0.6788501739501953
Processing data/queries/p151_q10.jpg
0.7704250812530518
Processing data/queries/p155_q1.jpg
0.78951096534729
Processing data/queries/p155_q2.jpg
0.8250391483306885
Processing data/queries/p155_q3.jpg
0.8628067970275879
Processing data/queries/p155_q4.jpg
0.9075124263763428
Processing data/queries/p155_q5.jpg
1.080026626586914
Processing data/queries/p155_q6.jpg
0.7155900001525879
Processing data/queries/p155_q7.jpg
0.2205491065979004
Processing data/queries/p155_q8.jpg
0.945547342300415
Processing data/queries/p155_q9.jpg
0.7025768756866455
Processing data/queries/p155_q10.jpg
0.8485629558563232
Processing data/queries/p161_q1.jpg
0.60346007347

In [13]:
len(d)

16

In [7]:
n_gram_count = defaultdict(int)

In [8]:
# Record, for every loaded database, how many distinct fingerprints it contains.
for i in d:
    n_gram_count[i] = len(d[i])

In [9]:
n_gram_count

defaultdict(int, {'three_grams': 46654034, 'two_grams': 11721009, '0': 616977})

In [10]:
total_matches = 0

In [11]:
# Add up the stored match count (element 0 of each entry) over every
# fingerprint of every database.
for combination in d:
    for fp in d[combination]:
        total_matches += d[combination][fp][0]

In [12]:
total_matches

5734709365

## Test code

In [13]:
def get_counts(d):
    """Count, for each piece, the fingerprints that list it.

    ``d`` maps each combination to a database whose entries hold the pieces of
    a fingerprint at index 1. A piece gains one count for every
    (combination, fingerprint) entry it appears in.

    Returns a ``defaultdict(int)`` from piece to that count.
    """
    counts = defaultdict(int)
    for db in d.values():
        for entry in db.values():
            for piece in entry[1]:
                counts[piece] += 1
    return counts

In [14]:
def get_coverage(counts, total_pieces):
    """
    given a dictionary of counts, return an array where a[i] is the fraction of
    the total_pieces items that have at least i+1 unique fingerprints

    The result has one entry per count from 1 up to the largest count present.
    An empty ``counts`` yields an empty list.
    """
    if not counts:
        # max() below would raise on an empty histogram; nothing is covered.
        return []
    num_pieces_per_count = defaultdict(int)
    for num in counts.values():
        num_pieces_per_count[num] += 1
    n = max(num_pieces_per_count)
    # Walk from the largest count downwards so the running total is the number
    # of pieces with at least i fingerprints.
    cumulative_counts = []
    cur_count = 0
    for i in range(n, 0, -1):
        cur_count += num_pieces_per_count.get(i, 0)
        cumulative_counts.append(cur_count)
    return [count / total_pieces for count in reversed(cumulative_counts)]

In [15]:
counts = get_counts(d)

In [17]:
# Persist the per-piece fingerprint counts returned by get_counts.
with open("data/merged_23_grams_counts.pkl", "wb") as f:
    pickle.dump(counts, f)

In [18]:
total = 30275

In [19]:
percents = get_coverage(counts, total)

In [20]:
# Persist the coverage list returned by get_coverage.
with open("data/percent_merged_23_grams.pkl", "wb") as f:
    pickle.dump(percents, f)