In [1]:
%matplotlib inline

In [2]:
import itertools
import pickle
from collections import defaultdict
import ast
import numpy as np
from contextlib import closing
import numba as nb
import pickle
import itertools
import pandas as pd
import multiprocessing
import csv
import os
import time
import dill
from glob import iglob

### Load in dictionary of probabilities

In [5]:
# Per-type weights, looked up later as utilities[combination] in generate_arrays.
# pickle.load can execute arbitrary code, so this path must point at a trusted file.
with open("/data1/kji/databases/probabilities.pkl", "rb") as probabilities_file:
    utilities = pickle.load(probabilities_file)

### Load in dictionary of n-grams and their counts

In [7]:
# Each n-gram type (n = 1..3) is the index 0 followed by n-1 distinct indices
# chosen from 1..5, in increasing order: 1 + 5 + 10 = 16 index lists in total.
combinations = [
    [0, *extra_indices]
    for n_gram in range(1, 4)
    for extra_indices in itertools.combinations(range(1, 6), n_gram - 1)
]

In [8]:
# Collapse every index list into its string key, e.g. [0, 1, 2] -> "012".
combinations = ["".join(map(str, combination)) for combination in combinations]

In [9]:
def store_DB(db, combination, outdir):
    """Pickle ``db`` to the file ``{outdir}/{combination}.pkl``.

    Parameters
    ----------
    db : object
        Any picklable object to persist.
    combination : str
        Base name of the output file (without the ``.pkl`` suffix).
    outdir : str
        Directory the file is written into; it must already exist.

    Returns
    -------
    None
    """
    target_path = f"{outdir}/{combination}.pkl"
    with open(target_path, "wb") as sink:
        pickle.dump(db, sink, protocol=pickle.HIGHEST_PROTOCOL)

## Test database with in memory dictionary

Store the n-grams, their utilities, their match counts, and their n-gram types in four separate parallel arrays.

In [10]:
mapping = {}

In [11]:
# Give every n-gram type key its position in `combinations` as an integer id.
mapping.update((type_key, position) for position, type_key in enumerate(combinations))

In [12]:
total_pdfs = 30275

In [13]:
def generate_arrays(fp_matches, type_weights=None, type_ids=None):
    """Flatten the per-type fingerprint tables into four parallel arrays.

    Parameters
    ----------
    fp_matches : dict
        Maps an n-gram type key (e.g. ``"012"``) to a dict of
        ``n_gram -> (count, num_pdfs)``.  Each inner dict is emptied with
        ``clear()`` once it has been copied, so the caller's data is consumed.
    type_weights : mapping, optional
        Weight for each type key.  Defaults to the notebook-level ``utilities``
        object, which must still be the per-type dict (a later cell rebinds
        that name to an array).
    type_ids : mapping, optional
        Integer id for each type key.  Defaults to the notebook-level
        ``mapping`` dict.

    Returns
    -------
    tuple of numpy.ndarray
        ``(n_grams, probabilities, matches, types)`` with dtypes object, float,
        int and int.  Entry ``i`` of every array describes the same n-gram:
        the n-gram itself, ``weight * num_pdfs``, ``count``, and the type id.
        All four arrays are empty when ``fp_matches`` is empty.
    """
    if type_weights is None:
        type_weights = utilities
    if type_ids is None:
        type_ids = mapping

    # Each column starts with a typed empty array so that the final dtypes (and
    # the result for an empty input) do not depend on the data seen.
    fp_chunks = [np.array([], dtype=object)]
    util_chunks = [np.array([], dtype=float)]
    match_chunks = [np.array([], dtype=int)]
    type_chunks = [np.array([], dtype=int)]

    for combination, d in fp_matches.items():
        length = len(d)
        weight = type_weights[combination]
        fps = np.empty(length, dtype=object)
        utils = np.empty(length, dtype=float)
        values = np.empty(length, dtype=int)
        for i, (n_gram, (count, num_pdfs)) in enumerate(d.items()):
            fps[i] = n_gram
            values[i] = count
            # Utility is the type weight scaled by num_pdfs.  Earlier variants
            # scaled by `count` or by -log2(num_pdfs / total_pdfs) instead.
            utils[i] = weight * num_pdfs
        # Release the source table as soon as it has been copied.
        d.clear()
        fp_chunks.append(fps)
        util_chunks.append(utils)
        match_chunks.append(values)
        type_chunks.append(np.full(length, type_ids[combination]))
        print(f"finished processing {combination}")

    # Concatenate once at the end instead of re-copying the accumulated arrays
    # after every type.
    return (
        np.concatenate(fp_chunks),
        np.concatenate(util_chunks),
        np.concatenate(match_chunks),
        np.concatenate(type_chunks),
    )

Mark indices as boundaries for each type of n-gram.

In [14]:
db_dir = '/data1/kji/databases_v4'

In [15]:
fp_matches_file = f"{db_dir}/fp_matches.pkl"

In [16]:
# Load the per-type fingerprint tables that generate_arrays flattens.
# pickle.load can execute arbitrary code, so fp_matches_file must be trusted.
with open(fp_matches_file, "rb") as matches_file:
    fp_matches = pickle.load(matches_file)

In [17]:
n_grams, probs, matches, types = generate_arrays(fp_matches)

finished processing 0
finished processing 01
finished processing 02
finished processing 03
finished processing 04
finished processing 05
finished processing 012
finished processing 013
finished processing 014
finished processing 015
finished processing 023
finished processing 024
finished processing 025
finished processing 034
finished processing 035
finished processing 045


In [18]:
db_dir = '/data1/kji/databases_v4_pdfs'

In [19]:
store_DB(n_grams, "fps", db_dir)

In [20]:
store_DB(probs, "utils", db_dir)

In [21]:
store_DB(matches, "matches", db_dir)

In [22]:
store_DB(types, "n_gram_types", db_dir)

## Generate database construction file

In [3]:
# Same construction as the earlier cell: index 0 plus n-1 increasing indices
# from 1..5 for n = 1..3, which yields 16 index lists.
combinations = [
    [0] + list(tail)
    for size in (1, 2, 3)
    for tail in itertools.combinations(range(1, 6), size - 1)
]

In [4]:
# Turn each index list into its string key, e.g. [0, 4, 5] -> "045".
combinations = ["".join(map(str, combination)) for combination in combinations]

In [16]:
average_bscore_length = 104

In [10]:
threshold = 10000

In [6]:
def construct_db_thresholded(n_grams, utilities, types, matches, threshold, outdir, outfile_name):
    """Write the n-grams with at most ``threshold`` matches, highest utility first.

    Parameters
    ----------
    n_grams, utilities, types, matches : array-like
        Parallel arrays; entry ``i`` of each describes the same n-gram.
        ``utilities`` must support unary negation (e.g. a numpy array).
    threshold : number
        Entries whose ``matches`` value is greater than this are left out.
    outdir, outfile_name : str
        The listing is written to ``{outdir}/{outfile_name}.txt``.

    Each output line has the form ``"<n_gram> <type>"``.
    """
    # Negating the utilities makes the ascending argsort rank them high-to-low.
    ranked = np.argsort(-utilities)

    listing_path = f"{outdir}/{outfile_name}.txt"
    with open(listing_path, "w") as listing:
        for pos in ranked:
            if not matches[pos] > threshold:
                listing.write(f"{n_grams[pos]} {types[pos]}\n")

In [5]:
def construct_db(n_grams, utilities, types, outdir, outfile_name, max_entries=80000000):
    """Write a capped, ordered list of n-grams with their type and utility.

    Note that a later cell defines ``construct_db`` again with a different
    output format; whichever definition ran last is the one in effect.

    Parameters
    ----------
    n_grams, utilities, types : array-like
        Parallel arrays; entry ``i`` of each describes the same n-gram.
    outdir, outfile_name : str
        The listing is written to ``{outdir}/{outfile_name}.txt``.
    max_entries : int or None, optional
        Maximum number of lines to write (default 80,000,000).  ``None``
        writes every entry.

    Each output line has the form ``"<n_gram> <type> <-utility>"``.
    """
    # Entries are ordered by ascending stored value, and the negated value is
    # what gets written.
    # NOTE(review): this presumably expects `utilities` to hold negated scores,
    # so that the listing is in descending true utility -- confirm with caller.
    idx = np.argsort(utilities)

    # Slice up front so exactly `max_entries` lines are written; the previous
    # counter-based check let one extra line through.
    if max_entries is not None:
        idx = idx[:max(max_entries, 0)]

    with open(f"{outdir}/{outfile_name}.txt", "w") as out:
        for i in idx:
            out.write(f"{n_grams[i]} {types[i]} {-utilities[i]}\n")

In [9]:
def construct_db(n_grams, utilities, types, outdir, outfile_name):
    """Write every n-gram and its type, ordered from highest to lowest utility.

    This rebinds the name ``construct_db`` that an earlier cell also defines.

    Parameters
    ----------
    n_grams, utilities, types : array-like
        Parallel arrays; entry ``i`` of each describes the same n-gram.
        ``utilities`` must support unary negation (e.g. a numpy array).
    outdir, outfile_name : str
        The listing is written to ``{outdir}/{outfile_name}.txt``.

    Each output line has the form ``"<n_gram> <type>"``.
    """
    # Ascending argsort of the negated utilities gives a high-to-low ranking.
    descending = np.argsort(-utilities)

    with open(f"{outdir}/{outfile_name}.txt", "w") as listing:
        listing.writelines(f"{n_grams[k]} {types[k]}\n" for k in descending)

In [7]:
db_dir = '/data1/kji/databases_v4_pdfs'

In [8]:
# Reload the four parallel arrays from db_dir (the files store_DB wrote as
# fps / utils / n_gram_types / matches).
# NOTE: this rebinds `utilities`, which an earlier cell loaded as the per-type
# probability dict that generate_arrays reads by default.  Reload that dict
# before re-running generate_arrays after this cell.
with open(f"{db_dir}/fps.pkl", "rb") as fps_file:
    n_grams = pickle.load(fps_file)
with open(f"{db_dir}/utils.pkl", "rb") as utils_file:
    utilities = pickle.load(utils_file)
with open(f"{db_dir}/n_gram_types.pkl", "rb") as types_file:
    types = pickle.load(types_file)
with open(f"{db_dir}/matches.pkl", "rb") as matches_file:
    matches = pickle.load(matches_file)

In [9]:
construct_db_thresholded(n_grams, utilities, types, matches, 10000, "/data1/kji/construction_lists", "all_v4.0.d_pdfs")

In [8]:
construct_db(n_grams, utilities, types, "/data1/kji", "v3_test_10mill")

## Construct database of offsets

Now that we have selected all the fingerprints, we construct the database containing each fingerprint and its offsets in IMSLP.

In [12]:
fp_file = "/data1/kji/construction_lists/103mill_v4.0d_pdfs.txt"

Construct a dictionary with all the n-grams in our database.

In [13]:
reverse_mapping = {}

In [14]:
# Inverse of the type numbering: the stringified position in `combinations`
# (as it appears in the construction list) maps back to the type key.
reverse_mapping.update((str(position), type_key) for position, type_key in enumerate(combinations))

In [15]:
def initialize_entry(line):
    """Parse one construction-list line into ``(n_gram, type_key)``.

    The last whitespace-separated token is the type id, which is looked up in
    the notebook-level ``reverse_mapping``.  All preceding tokens are glued
    back together (the n-gram literal may itself contain spaces) and evaluated
    with ``ast.literal_eval``.

    Raises ``IndexError`` for a line with no tokens and ``KeyError`` for a type
    id missing from ``reverse_mapping``.
    """
    tokens = line.split()
    type_id = tokens.pop()
    n_gram_literal = "".join(tokens)
    return ast.literal_eval(n_gram_literal), reverse_mapping[type_id]

In [16]:
def make_db(fp_file, n_cores=30):
    """Build one empty lookup dict per n-gram type from a construction list.

    Parameters
    ----------
    fp_file : str
        Path to a text file with one entry per line, in the format that
        ``initialize_entry`` parses.
    n_cores : int, optional
        Number of worker processes used to parse the lines (default 30).

    Returns
    -------
    dict
        ``{type_key: {n_gram: {}}}`` with a key for every entry of the
        notebook-level ``combinations`` list; each listed n-gram maps to an
        empty dict placeholder.
    """
    with open(fp_file) as f:
        lines = f.readlines()

    # Use the pool as a context manager so the worker processes are shut down
    # once the (blocking) map has returned, instead of lingering in the kernel.
    with multiprocessing.Pool(n_cores) as pool:
        keys = pool.map(initialize_entry, lines)

    dbs = {combination: {} for combination in combinations}
    for fp, combination in keys:
        dbs[combination][fp] = {}
    return dbs

In [17]:
dbs = make_db(fp_file)

Now load in every single database file and update our current database with the real offsets.

In [18]:
db_dir = '/data1/kji/databases_random'

In [19]:
def add_db_values(file, db):
    """Fill the selected n-grams in ``db`` from a dill-serialized table.

    Parameters
    ----------
    file : str
        Path of a dill file holding a dict ``n_gram -> {piece: values}``.
        dill, like pickle, can run arbitrary code on load; the file must be
        trusted.
    db : dict
        Updated in place.  For every n-gram that is a key of both ``db`` and
        the loaded table, ``db[n_gram]`` becomes ``(total_count, pieces)``:
        ``pieces`` maps each piece to a tuple of its values, and
        ``total_count`` is the summed length over all pieces.  Keys of ``db``
        absent from the file are left untouched.
    """
    with open(file, "rb") as source:
        loaded = dill.load(source)

    for n_gram, raw_pieces in loaded.items():
        if n_gram not in db:
            continue
        pieces = dict(raw_pieces)
        total_count = 0
        for piece, values in pieces.items():
            total_count += len(values)
            pieces[piece] = tuple(values)
        db[n_gram] = (total_count, pieces)

    # Drop the loaded table's contents right away.
    loaded.clear()

In [20]:
# For each type with at least one selected fingerprint: fill it from the
# matching file in db_dir, pickle the result, then empty it before moving on.
for combination, selected in dbs.items():
    if not selected:
        continue
    add_db_values(f"{db_dir}/{combination}.pkl", selected)
    with open(f"/data1/kji/databases_v4_pdfs/103mill/{combination}.pkl", "wb") as out_file:
        pickle.dump(selected, out_file, protocol = pickle.HIGHEST_PROTOCOL)
    selected.clear()