## Construct database for marketplace fingerprinting

This notebook constructs the database we use during marketplace fingerprinting. It uses the 22 n-gram databases that we previously generated and selects the fingerprints with the highest utility to include in the marketplace database.

In [None]:
%matplotlib inline

In [None]:
import itertools
import pickle
from collections import defaultdict
import ast
import numpy as np
from contextlib import closing
import numba as nb
import pickle
import itertools
import pandas as pd
import multiprocessing
import csv
import os
import time
import dill
from glob import iglob

### Load in dictionary of probabilities

In [None]:
# `utilities` is indexed by n-gram type label; generate_arrays multiplies the
# looked-up value by an n-gram's match count to obtain that n-gram's utility.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("/data1/kji/databases/probabilities.pkl", "rb") as f:
    utilities = pickle.load(f)

### Load in dictionary of n-grams and their counts

In [None]:
# Enumerate every n-gram type for n = 1..3: each type always contains index 0
# plus (n - 1) further indices chosen from 1..6, giving 1 + 6 + 15 = 22 types.
combinations = []
for n_gram in range(1, 4):
    combinations.extend([0, *rest] for rest in itertools.combinations(range(1, 7), n_gram - 1))

In [None]:
# Collapse each index list into its string label, e.g. [0, 1, 2] -> "012".
combinations = ["".join(map(str, combination)) for combination in combinations]

In [None]:
def store_DB(db, combination, outdir):
    """Pickle ``db`` to ``<outdir>/<combination>.pkl``.

    Args:
        db: Any picklable object (this notebook passes numpy arrays and dicts).
        combination: File stem of the output, e.g. an n-gram type label such
            as "012" or an array name such as "fps".
        outdir: Directory to write into. It is created, including parents,
            when it does not exist yet.
    """
    # Create the target directory first so that a new output location does not
    # fail with FileNotFoundError after an expensive computation.
    if outdir:
        os.makedirs(outdir, exist_ok=True)
    with open(f"{outdir}/{combination}.pkl", "wb") as f:
        pickle.dump(db, f, protocol=pickle.HIGHEST_PROTOCOL)

## Test database with in memory dictionary

Store the n-grams, their utilities, their match counts, and their types in four separate arrays for memory efficiency.

In [None]:
# n-gram type label -> integer type id (filled in by the next cell).
mapping = dict()

In [None]:
# Assign each type label its position in `combinations` as a compact integer id.
mapping.update((label, type_id) for type_id, label in enumerate(combinations))

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook.
total_pdfs = 29_310

Each n-gram's utility is computed as $P(\text{correctness}) \times m$, where $P(\text{correctness})$ is the probability of correctness for the n-gram's type and $m$ is the number of times the n-gram appears in IMSLP.

In [None]:
def generate_arrays(fp_matches, type_probabilities=None, type_ids=None):
    """Flatten a fingerprint matches table into four parallel arrays.

    Args:
        fp_matches: dict mapping an n-gram type label (e.g. "012") to a dict of
            ``{n_gram: (count, num_pdfs)}``. Only ``count`` is used.
            NOTE: every inner dict is emptied once it has been copied (to free
            memory), so the table cannot be reused after this call.
        type_probabilities: optional mapping from type label to that type's
            probability of correctness. Defaults to the notebook-level
            ``utilities`` loaded from probabilities.pkl.
        type_ids: optional mapping from type label to its integer id. Defaults
            to the notebook-level ``mapping``.

    Returns:
        Tuple ``(n_grams, utilities, matches, types)`` of equally long arrays:
        the fingerprints (object dtype), each fingerprint's utility
        (probability of correctness of its type times its match count), its
        match count, and its integer type id.
    """
    # Passing the lookups explicitly keeps the function independent of notebook
    # state; later cells rebind the global name `utilities` to an array.
    if type_probabilities is None:
        type_probabilities = utilities
    if type_ids is None:
        type_ids = mapping

    # Seed every list with a typed empty array so that an empty table still
    # produces arrays with the expected dtypes.
    fp_parts = [np.array([], dtype=object)]
    util_parts = [np.array([], dtype=float)]
    count_parts = [np.array([], dtype=int)]
    type_parts = [np.array([], dtype=int)]

    # iterate over every n-gram type and collect its information
    for combination in fp_matches:
        d = fp_matches[combination]
        length = len(d)
        cur_types = np.full(length, type_ids[combination])
        # Filled element by element: building an object array directly from a
        # list of equal-length tuples would yield a 2-D array instead.
        fps = np.empty(length, dtype=object)
        utils = np.empty(length, dtype=float)
        values = np.empty(length, dtype=int)
        # Loop-invariant lookup; skipped for empty tables, which never needed it.
        probability = type_probabilities[combination] if length else 0.0
        for i, (n_gram, (count, _num_pdfs)) in enumerate(d.items()):
            fps[i] = n_gram
            values[i] = count
            utils[i] = probability * count
        d.clear()
        fp_parts.append(fps)
        util_parts.append(utils)
        count_parts.append(values)
        type_parts.append(cur_types)
        print(f"finished processing {combination}")

    # Concatenate once at the end instead of re-copying the growing arrays on
    # every iteration.
    return (
        np.concatenate(fp_parts),
        np.concatenate(util_parts),
        np.concatenate(count_parts),
        np.concatenate(type_parts),
    )

Load in fingerprint matches table.

In [None]:
# Directory that contains the fingerprint matches table (fp_matches.pkl).
db_dir = "/data1/kji/databases_v4"

In [None]:
# Full path of the pickled fingerprint matches table inside db_dir.
fp_matches_file = "{}/fp_matches.pkl".format(db_dir)

In [None]:
# Load the fingerprint matches table. generate_arrays reads it as
# {type label: {n_gram: (count, num_pdfs)}}.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(fp_matches_file, "rb") as f:
    fp_matches = pickle.load(f)

In [None]:
# NOTE: generate_arrays empties every inner dict of fp_matches as it goes, so
# reload fp_matches before re-running this cell.
n_grams, probs, matches, types = generate_arrays(fp_matches)

After generating our four arrays, we store them to use in generating the database construction file.

In [None]:
# Output directory for the four array pickles (rebinds db_dir from the input directory).
db_dir = "/data1/kji/databases_v4_pdfs"

In [None]:
# Fingerprints -> fps.pkl (reloaded as `n_grams` in the construction-file section).
store_DB(n_grams, "fps", db_dir)

In [None]:
# `probs` holds utilities (type probability x match count), hence utils.pkl.
store_DB(probs, "utils", db_dir)

In [None]:
# Per-n-gram match counts -> matches.pkl (later compared against the threshold).
store_DB(matches, "matches", db_dir)

In [None]:
# Integer n-gram type ids (values of `mapping`) -> n_gram_types.pkl.
store_DB(types, "n_gram_types", db_dir)

## Generate database construction file

Instead of sorting all the n-grams and generating the database in the same step, first we write the sorted list of n-grams to a file for fast iteration speed (instead of regenerating the database every time, we can create multiple files and only generate the database for the finalized file).

In [None]:
# Rebuild the n-gram types (same construction as in the loading section):
# index 0 plus (n - 1) further indices from 1..6, for n = 1..3.
combinations = []
for n_gram in range(1, 4):
    combinations.extend([0, *rest] for rest in itertools.combinations(range(1, 7), n_gram - 1))

In [None]:
# Turn each index list into its string label, e.g. [0, 1, 2] -> "012".
combinations = ["".join(map(str, combination)) for combination in combinations]

In [None]:
# n-grams with more matches than this are left out of the construction list.
threshold = 10_000

In [None]:
def construct_db_thresholded(n_grams, utilities, types, matches, threshold, outdir, outfile_name):
    """Write the n-grams, ordered by descending utility, to a construction list.

    Args:
        n_grams: array of n-grams.
        utilities: array with the utility of each n-gram.
        types: array with the type id of each n-gram.
        matches: array with the number of matches of each n-gram in IMSLP.
        threshold: n-grams with more than ``threshold`` matches are skipped.
        outdir: directory of the output file.
        outfile_name: output file name without the ".txt" extension.

    Output: ``<outdir>/<outfile_name>.txt`` with one "<n_gram> <type>" line per
    kept n-gram, highest utility first.
    """
    # negate so that the ascending argsort yields a descending ranking
    ranking = np.argsort(-utilities)
    target = f"{outdir}/{outfile_name}.txt"

    with open(target, "w") as sink:
        sink.writelines(
            f"{n_grams[pos]} {types[pos]}\n"
            for pos in ranking
            if not matches[pos] > threshold
        )

In [None]:
# Directory holding the four array pickles written by store_DB.
db_dir = "/data1/kji/databases_v4_pdfs"

In [None]:
# Reload the four parallel arrays that were saved with store_DB.
# NOTE(review): this rebinds `utilities` from the per-type probability dict
# (loaded at the top of the notebook and read as a global by generate_arrays)
# to the per-n-gram utility array. Re-run the probability-loading cell before
# calling generate_arrays again.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(f"{db_dir}/fps.pkl", "rb") as f:
    n_grams = pickle.load(f)
with open(f"{db_dir}/utils.pkl", "rb") as f:
    utilities = pickle.load(f)
with open(f"{db_dir}/n_gram_types.pkl", "rb") as f:
    types = pickle.load(f)
with open(f"{db_dir}/matches.pkl", "rb") as f:
    matches = pickle.load(f)

In [None]:
# Writes /data1/kji/construction_lists/all_v4.0.d_pdfs.txt with one
# "<n_gram> <type id>" line per n-gram that has at most `threshold` matches.
construct_db_thresholded(n_grams, utilities, types, matches, threshold, "/data1/kji/construction_lists", "all_v4.0.d_pdfs")

## Construct database of offsets

Now that we have selected all the fingerprints, we construct the database containing each fingerprint and its offsets in IMSLP. The input file set below (`fp_file`) is the output of the previous step.

In [None]:
# NOTE(review): the construction step earlier in this notebook writes
# "all_v4.0.d_pdfs.txt", whereas this path names "103mill_v4.0d_pdfs.txt" --
# confirm this is the intended construction list and how it was produced.
fp_file = "/data1/kji/construction_lists/103mill_v4.0d_pdfs.txt"

Construct a dictionary with all the n-grams in our database.

In [None]:
# Type id (as written in the construction list, i.e. a string) -> type label.
reverse_mapping = dict()

In [None]:
# Keys are strings because the ids are read back from a text file.
reverse_mapping.update((str(type_id), label) for type_id, label in enumerate(combinations))

In [None]:
def initialize_entry(line):
    """Parse one construction-list line of the form "<n_gram> <type id>".

    The last whitespace-separated token is the type id; every token before it
    is glued back together (without whitespace) and evaluated as a Python
    literal to recover the n-gram.

    Returns:
        Tuple ``(n_gram, type label)``, the label being looked up in the
        notebook-level ``reverse_mapping``.
    """
    tokens = line.rstrip().split()
    type_id = tokens[-1]
    literal = ''.join(tokens[:-1])
    return ast.literal_eval(literal), reverse_mapping[type_id]

In [None]:
def make_db(fp_file, n_cores=30):
    """Create one placeholder dictionary per n-gram type from a construction list.

    Args:
        fp_file: path of the construction list, one "<n_gram> <type id>" entry
            per line.
        n_cores: number of worker processes used to parse the lines
            (default 30, the value that used to be hard-coded).

    Returns:
        dict mapping every label in the notebook-level ``combinations`` to a
        dict ``{n_gram: {}}`` holding the n-grams listed for that type; the
        empty dicts are placeholders for the values filled in later.
    """
    with open(fp_file) as f:
        lines = f.readlines()
    # The context manager shuts the worker pool down once map() has returned;
    # previously the pool was never closed, leaving the workers alive.
    with multiprocessing.Pool(n_cores) as pool:
        keys = pool.map(initialize_entry, lines)
    dbs = {combination: {} for combination in combinations}
    for fp, combination in keys:
        dbs[combination][fp] = {}
    return dbs

In [None]:
# {type label: {n_gram: {}}} -- the empty dicts are placeholders that
# add_db_values replaces in the final cell.
dbs = make_db(fp_file)

Now load in every single n-gram database and update our current database with the real offsets for each n-gram.

In [None]:
# Directory with one full "<type label>.pkl" database per n-gram type.
db_dir = "/data1/kji/databases_random"

In [None]:
def add_db_values(file, db):
    """Fill the n-grams selected in ``db`` with the entries stored in ``file``.

    Args:
        file: path of a dill-serialized dict ``{n_gram: {piece: entries}}``
            (presumably the offsets of the n-gram within each piece -- confirm
            against the code that wrote the file).
        db: dict whose keys are the selected n-grams; modified in place.

    For every n-gram of the file that is also a key of ``db``, the value in
    ``db`` becomes ``(total_count, {piece: tuple(entries)})`` where
    ``total_count`` is the number of entries summed over all pieces. Keys of
    ``db`` that do not occur in the file keep their previous value. The loaded
    dict is cleared afterwards to release memory.
    """
    with open(file, "rb") as handle:
        full_db = dill.load(handle)
    for n_gram, per_piece in full_db.items():
        if n_gram not in db:
            continue
        frozen = {piece: tuple(entries) for piece, entries in dict(per_piece).items()}
        db[n_gram] = (sum(map(len, frozen.values())), frozen)
    full_db.clear()

In [None]:
# For every n-gram type with selected fingerprints: pull the real entries from
# that type's full database, persist the result, then free the memory.
for combination in dbs:
    selected = dbs[combination]
    if not selected:
        continue
    add_db_values(f"{db_dir}/{combination}.pkl", selected)
    store_DB(selected, combination, "/data1/kji/databases_v4_pdfs/103mill")
    selected.clear()