In [1]:
%matplotlib inline

In [2]:
import itertools
import pickle
from collections import defaultdict
import ast
import numpy as np
from contextlib import closing
import numba as nb
import pickle
import itertools
import pandas as pd
import multiprocessing
import csv
import os
import time
import dill
from glob import iglob

### Load in dictionary of probabilities

In [3]:
with open("/data1/kji/databases_v2/probabilities.pkl", "rb") as f:
    utilities = pickle.load(f)

### Load in dictionary of n-grams and their counts

In [4]:
combinations = []
for n_gram in range(1, 4):
    combinations += [[0] + list(tup) for tup in itertools.combinations(range(1, 6), n_gram-1)]

In [5]:
combinations = ["".join(str(num) for num in combination) for combination in combinations]

In [6]:
def store_DB(db, combination, outdir):
    with open(f"{outdir}/{combination}.pkl", "wb") as f:
        pickle.dump(db, f, protocol=pickle.HIGHEST_PROTOCOL)

## Test database with in memory dictionary

Store each n-gram, utility, and the number of matches for an n-gram in a separate array.

In [8]:
mapping = {}

In [9]:
for i, c in enumerate(combinations):
    mapping[c] = i

In [20]:
total_pdfs = 30275

In [27]:
def generate_arrays(fp_matches):
    n_grams = np.array([], dtype = object)
    probabilities = np.array([], dtype = float)
    matches = np.array([], dtype = int)
    types = np.array([], dtype = int)
    for combination in fp_matches:
        d = fp_matches[combination]
        length = len(d)       
        # update indices for the given combination
        fps = np.empty(length, dtype = object)
        utils = np.empty(length, dtype = float)
        values = np.empty(length, dtype = int)
        cur_types = np.full(length, mapping[combination])
        for i, (n_gram, (count, num_pdfs)) in enumerate(d.items()):
            fps[i] = n_gram
            values[i] = count
            utils[i] = utilities[combination] * np.log2(num_pdfs / total_pdfs)
        d.clear()
        n_grams = np.concatenate([n_grams, fps])
        probabilities = np.concatenate([probabilities, utils])
        matches = np.concatenate([matches, values])
        types = np.concatenate([types, cur_types])
        print(f"finished processing {combination}")
    return n_grams, probabilities, matches, types

Mark indices as boundaries for each type of n-gram.

In [None]:
db_dir = '/data1/kji/databases_v3'

In [11]:
fp_matches_file = f"{db_dir}/fp_matches.pkl"

In [12]:
with open(fp_matches_file, "rb") as f:
    fp_matches = pickle.load(f)

In [28]:
n_grams, probs, matches, types = generate_arrays(fp_matches)

finished processing 0
finished processing 01
finished processing 02
finished processing 03
finished processing 04
finished processing 05
finished processing 012
finished processing 013
finished processing 014
finished processing 015
finished processing 023
finished processing 024
finished processing 025
finished processing 034
finished processing 035
finished processing 045


In [29]:
store_DB(n_grams, "fps", db_dir)

In [30]:
store_DB(probs, "utils", db_dir)

In [31]:
store_DB(matches, "matches", db_dir)

In [32]:
store_DB(types, "n_gram_types", db_dir)

## Generate database construction file

In [3]:
@nb.njit(parallel=True)
def parallel_ratio(matches, utils):
    z=np.empty(matches.shape)
    for i in nb.prange(len(matches)):
        z[i] = utils[i] / matches[i]
    return z

In [18]:
combinations = []
for n_gram in range(1, 4):
    combinations += [[0] + list(tup) for tup in itertools.combinations(range(1, 6), n_gram-1)]

In [19]:
combinations = ["".join(str(num) for num in combination) for combination in combinations]

In [6]:
def construct_db(n_grams, utilities, types, outdir, outfile_name):
    """
    Input: an array of n-grams, an array of each n-gram's utility, an array of the number of matches
           for each n-gram in IMSLP, and an array of each n-gram's type (e.g. '012')
    Output: a file specifying an ordered list of n-grams to include in the final database
    """
    # sort fingerprints in descending order
    idx = np.argsort(-utilities)
            
    # write all used fingerprints to a file
    with open(f"{outdir}/{outfile_name}.txt", "w") as out:
        for i in idx:
            out.write(f"{n_grams[i]} {types[i]}\n")

In [8]:
db_dir = '/data1/kji/databases_v3'

In [9]:
with open(f"{db_dir}/fps.pkl", "rb") as f:
    n_grams = pickle.load(f)
with open(f"{db_dir}/utils.pkl", "rb") as f:
    utilities = pickle.load(f)
with open(f"{db_dir}/n_gram_types.pkl", "rb") as f:
    types = pickle.load(f)

In [10]:
construct_db(n_grams, utilities, types, "/data1/kji/construction_lists", "all_v3")

## Construct database of offsets

Now that we have selected all the fingerprints, we construct the database containing each fingerprint and their offsets in IMSLP.

In [25]:
fp_file = "/data1/kji/construction_lists/75mill_v3.txt"

Construct a dictionary with all the n-grams in our database.

In [26]:
reverse_mapping = {}

In [27]:
for i, c in enumerate(combinations):
    reverse_mapping[str(i)] = c

In [28]:
def initialize_entry(line):
    line = line.rstrip().split()
    n_gram, combination = ''.join(line[:-1]), line[-1]
    return ast.literal_eval(n_gram), reverse_mapping[combination]

In [29]:
def make_db(fp_file):
    with open(fp_file) as f:
        lines = f.readlines()
    n_cores = 30
    pool = multiprocessing.Pool(n_cores)
    keys = pool.map(initialize_entry, lines)
    dbs = {combination: {} for combination in combinations}
    for fp, combination in keys:
        dbs[combination][fp] = {}
    return dbs

In [30]:
dbs = make_db(fp_file)

Now load in every single database file and update our current database with the real offsets.

In [31]:
db_dir = '/data1/kji/databases'

In [32]:
def add_db_values(file, db):
    with open(file, "rb") as f:
        d = dill.load(f)
    for n_gram in d.keys():
        if n_gram in db:
            if db[n_gram] is None:
                db[n_gram] = d[n_gram]
            else:
                db[n_gram].update(d[n_gram])
    d.clear()

In [33]:
for combination in combinations:
    if dbs[combination]:
        add_db_values(f"{db_dir}/{combination}.pkl", dbs[combination])
        d = dbs[combination]
        for fp in d:
            total_count = sum([len(d[fp][piece]) for piece in d[fp]])
            dbs[combination][fp] = (total_count, d[fp])
        with open(f"/data1/kji/databases_v3/75mill/{combination}.pkl", "wb") as f:
            pickle.dump(dbs[combination], f, protocol = pickle.HIGHEST_PROTOCOL)
        dbs[combination].clear()

## SQL database

In [5]:
engine = sqlalchemy.create_engine('sqlite:////data1/kji/databases/test.db')

In [12]:
test_files = ['/data1/kji/databases/0_counts.pkl', 
              '/data1/kji/databases/01_counts.pkl',
              '/data1/kji/databases/012_counts.pkl',
              '/data1/kji/databases/0123_counts.pkl']

In [6]:
for filename in test_files:
    with open(filename, "rb") as f:
        d = pickle.load(f)
        df_dict = defaultdict(list)
        combination = filename.split('/')[-1].split("_")[0]
        for n_gram, count in d.items():
            df_dict['combination'].append(combination)
            df_dict['n_gram'].append(str(n_gram))
            df_dict['matches'].append(count)
            df_dict['utility'].append(utilities[combination])
            df_dict['used'].append(0)
        d.clear()
        df = pd.DataFrame.from_dict(df_dict)
        with engine.begin() as connection:
            df.to_sql(combination, con=connection, index=False, if_exists='replace')
        print(f"finished processing {filename}")
        f.flush()
        df_dict.clear()

finished processing /data1/kji/databases/0_counts.pkl
finished processing /data1/kji/databases/01_counts.pkl
finished processing /data1/kji/databases/012_counts.pkl
finished processing /data1/kji/databases/0123_counts.pkl


### Generate database construction plan

In [4]:
def marginal_memory(num_matches, remaining_budget):
    return num_matches / remaining_budget

In [5]:
def marginal_runtime(matches_squared, num_matches, total_matches, total_avg_runtime, runtime_budget):
    cost = (matches_squared + num_matches ** 2) / (total_matches + num_matches) - total_avg_runtime
    return cost / (runtime_budget - total_avg_runtime)

In [6]:
def metric(utility, num_matches, memory_budget, runtime_budget, total_matches, matches_squared, total_avg_runtime):
    marginal_memory_cost = marginal_memory(num_matches, memory_budget-total_matches)
    marginal_runtime_cost = marginal_runtime(matches_squared, num_matches, total_matches, total_avg_runtime, runtime_budget)
    return utility / max(marginal_memory_cost, marginal_runtime_cost)

In [14]:
conn = sqlite3.connect("/data1/kji/databases/test.db")
conn.create_function("metric", 7, metric)
c = conn.cursor()

In [10]:
db_items = ['(2048, 0, 0, 0, 0, 0)', '(1073741824, 0, 0, 0, 0, 0)']

In [11]:
query = "select * from fingerprints where used == 0 order by utility desc limit 1"

In [15]:
result = c.execute(query).fetchone()

In [7]:
def query(db_path, table_name, memory_budget, runtime_budget, total_matches, matches_squared, total_avg_runtime):
    with closing(sqlite3.connect(db_path)) as con, con,  \
            closing(con.cursor()) as cur:
        con.create_function("metric", 7, metric)
        query = f"select * from '{table_name}' where used = 0 order by metric(utility, matches, ?, ?, ?, ?, ?) desc limit 1"
        cur.execute(query, (memory_budget, runtime_budget, total_matches, matches_squared, total_avg_runtime))
        result = cur.fetchone()
        ratio = metric(result[3], result[2], memory_budget, runtime_budget, total_matches, matches_squared, total_avg_runtime)
        return (result, ratio)

In [8]:
def construct_db(db_path, memory_budget, runtime_budget, outdir):
    """
    Inputs: a database which contains n-grams, their utility, and number of matches,
            a memory budget, and a runtime budget
    Output: a file specifying an ordered list of n-grams to include in the final database
    """
    conn = sqlite3.connect(db_path)
    
    with closing(conn.cursor()) as c:
        c.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = c.fetchall()
        
    # clear all used bits in the database
    for table in tables:
        with closing(conn.cursor()) as cur:
            cur.execute(f"UPDATE '{table[0]}' SET used = 0")
    
    total_matches = 0
    matches_squared = 0
    total_avg_runtime = 0
    with open(f"{outdir}/Bm_10k_Bt_8k_fingerprints.txt", "w") as f1, open(f"{outdir}/Bm_10k_Bt_8k_info.csv", "w") as f2:
        writer = csv.writer(f2)
        writer.writerow(["fingerprint", "ratio", "m_i", "Ct_i", "Bm-Dm", "Bt-Dt"])
        while memory_budget - total_matches > 0 and runtime_budget - total_avg_runtime > 0:
            start = time.time()
            inputs = [(db_path, table[0], memory_budget, runtime_budget, total_matches, matches_squared, total_avg_runtime) for table in tables]
            with multiprocessing.Pool(processes = 26) as pool:
                results = pool.starmap(query, inputs)
            # get the pair with the highest ratio
            result = max(results, key = lambda pair: pair[1])
            n_gram = result[0][0]
            f1.write(f"{n_gram}\n")
            ratio = result[1]
            m_i = result[0][2]
            cost = (matches_squared + m_i ** 2) / (total_matches + m_i) - total_avg_runtime
            remaining_memory = memory_budget - total_matches
            remaining_runtime = runtime_budget - total_avg_runtime
            print(remaining_memory)
            print(remaining_runtime)
            writer.writerow([result[0][1], ratio, m_i, cost, remaining_memory, remaining_runtime])
            total_matches += m_i
            matches_squared += m_i**2
            total_avg_runtime += matches_squared / total_matches  
            combination = result[0][0]
            with closing(conn.cursor()) as c:
                c.execute(f"UPDATE '{combination}' SET used = 1 WHERE combination = '{combination}' and n_gram = '{n_gram}'")
            print(f"finished in {time.time() - start} seconds")

In [9]:
db_path = "/data1/kji/databases/test.db"

In [10]:
memory_budget = 10000

In [11]:
runtime_budget = 8000

In [11]:
%load_ext line_profiler

No multithreading takes 166 seconds per fingerprint.

In [21]:
%lprun -f construct_db construct_db(db_path, memory_budget, runtime_budget, "experiments/db_tests")

UsageError: Line magic function `%lprun` not found.


In [12]:
construct_db(db_path, memory_budget, runtime_budget, "experiments/db_tests")

10000
8000
finished in 49.985451221466064 seconds
9999
7999.0
finished in 46.40796089172363 seconds
9998
7998.0
finished in 44.28696370124817 seconds
9997
7997.0
finished in 46.907904386520386 seconds
9996
7996.0
finished in 46.40547823905945 seconds
9995
7995.0
finished in 52.72178411483765 seconds
9994
7994.0
finished in 45.79409456253052 seconds
9993
7993.0
finished in 46.2898633480072 seconds
9992
7992.0
finished in 46.557454109191895 seconds
9991
7991.0
finished in 44.22396373748779 seconds
9990
7990.0
finished in 49.21182632446289 seconds


Process ForkPoolWorker-311:
Process ForkPoolWorker-308:
Process ForkPoolWorker-297:
Process ForkPoolWorker-293:
Process ForkPoolWorker-310:
Process ForkPoolWorker-309:
Process ForkPoolWorker-304:
Process ForkPoolWorker-291:
Process ForkPoolWorker-305:
Process ForkPoolWorker-296:
Process ForkPoolWorker-307:
Process ForkPoolWorker-312:
Process ForkPoolWorker-306:
Process ForkPoolWorker-300:
Process ForkPoolWorker-298:
Process ForkPoolWorker-294:
Process ForkPoolWorker-299:
Process ForkPoolWorker-301:
Process ForkPoolWorker-303:


KeyboardInterrupt: 