## Databases for each n-gram combination

There are 22 different n-gram combinations (1-3 grams) with maximum context 6. This notebook constructs a database for each type of n-gram and also a database storing the counts for each n-gram type.

In [None]:
%matplotlib inline

In [None]:
import itertools
import pickle
from collections import defaultdict
import numpy as np
import pickle
import pandas as pd
import os
import time
import dill
from glob import iglob

In [None]:
# Enumerate the column-offset pattern of every 1-, 2- and 3-gram. Each pattern
# starts at offset 0 and adds 0, 1 or 2 later offsets drawn from 1..5.
# NOTE(review): range(1, 6) yields 1 + 5 + 10 = 16 patterns, while the
# notebook text speaks of 22 (which is what offsets 1..6 would give) --
# confirm which one is intended.
combinations = []
for extra_offsets in range(3):
    for later_offsets in itertools.combinations(range(1, 6), extra_offsets):
        combinations.append([0, *later_offsets])

In [None]:
# Turn each offset pattern into a compact string key, e.g. [0, 1, 3] -> "013".
combinations = ["".join(map(str, offsets)) for offsets in combinations]

In [None]:
def getTotalBscore(bscore_file):
    """Load a pickled bootleg score and return it as a single 2-D array.

    The pickle is expected to hold an iterable of pages, each page being a
    sequence of integers that encode one 62-bit column apiece.  Every page is
    decoded into a ``(62, n_columns)`` array and all pages are joined along
    the column axis.

    Parameters
    ----------
    bscore_file : str
        Path of the pickle file to read.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(62, total_columns)``; ``(62, 0)`` when the
        file contains no pages.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(bscore_file, 'rb') as f:
        bscore_array = pickle.load(f)

    # Seeding every concatenation with an empty float array keeps the result
    # dtype and the (62, 0) shape for empty inputs.
    empty = np.array([]).reshape(62, 0)
    pages = [empty]
    for page in bscore_array:
        try:
            # Fast path: decode the whole page with vectorised bit masking.
            total_page = unpackbits(np.array(page), 62)
        except TypeError:
            # Fallback: decode column by column (presumably needed when the
            # page values do not fit a native integer dtype -- confirm).
            cols = [np.array(decodeColumn(num)).reshape(62, -1) for num in page]
            total_page = np.concatenate([empty] + cols, axis=1)
        pages.append(total_page)
    # Concatenate once at the end instead of re-copying the growing array on
    # every page.
    return np.concatenate(pages, axis=1)

In [None]:
def decodeColumn(num, num_bits=62):
    """Return the binary digits of ``num`` as a list, most significant first.

    Parameters
    ----------
    num : int
        Non-negative integer encoding one column.
    num_bits : int, optional
        Number of digits to produce (default 62); higher bits of ``num`` are
        discarded and missing ones are zero-padded on the left.

    Returns
    -------
    list
        ``num_bits`` values, each 0 or 1.
    """
    col = []
    for _ in range(num_bits):
        col.append(num % 2)
        # Floor division stays exact for arbitrarily large ints; the former
        # int(num / 2) went through a float and lost bits above 2**53.
        num //= 2
    # Digits were collected least significant first.
    col.reverse()
    return col

In [None]:
def unpackbits(x, num_bits):
    """Expand an integer array into its bits, one column per input value.

    ``x`` is flattened; the result has shape ``(num_bits, x.size)`` with
    float entries 0.0/1.0 and the most significant bit in row 0.
    """
    flat = x.reshape([-1, 1])
    # One mask per bit position: 1, 2, 4, ... in the dtype of the input.
    bit_values = 2 ** np.arange(num_bits, dtype=flat.dtype).reshape([1, num_bits])
    bits_set = (flat & bit_values).astype(bool)
    # Flip so that the highest bit comes first, then put bits on the rows.
    return np.flip(bits_set.astype(float), 1).T

We precompute powers of 2 from $2^0$ to $2^{61}$ to speed up calculating hash values.

In [None]:
# Place values 2**61, 2**60, ..., 2**0 used to turn a 62-entry column into an
# integer hash via a dot product.
powers = np.left_shift(1, np.arange(61, -1, -1))

In [None]:
def make_fps(data, combinations, dbs, piece):
    """Add the n-gram fingerprints of one piece to the databases, in place.

    Parameters
    ----------
    data : sequence of numpy.ndarray
        One entry per column; each entry is dotted with the module-level
        ``powers`` vector to obtain an integer hash.
    combinations : iterable of str
        Offset patterns such as ``"013"``; every character is an offset
        relative to the current column.
    dbs : dict
        Maps each combination to a nested mapping
        ``fingerprint -> piece -> list of column indices``; it is mutated.
    piece : hashable
        Identifier under which the occurrences are recorded.

    Fingerprints whose hashes are all zero are skipped, as are positions
    where a pattern would reach past the last column.
    """
    # Hash every column exactly once; the same column takes part in many
    # combinations, so hashing inside the inner loop repeated this work.
    # NOTE(review): for float columns the dot product is a float64, so hashes
    # above 2**53 are rounded before int(); this is kept as is because stored
    # databases depend on these exact values.
    hashes = [int(column.dot(powers)) for column in data]

    for colindex in range(len(hashes)):
        for combination in combinations:
            try:
                fp = tuple(hashes[colindex + int(i)] for i in combination)
            except IndexError:
                # Not enough columns left for this pattern.
                continue
            if not any(fp):
                # Every column in the n-gram hashed to zero: nothing to index.
                continue
            dbs[combination][fp][piece].append(colindex)

Now we iterate over every file in our file list, load the bootleg score for that file, generate all possible n-grams from the bootleg score, and store them in our database, which is an in-memory Python dictionary.

In [None]:
def make_DB(filelist, combinations, dbs = None):
    """Build the n-gram databases for every file named in ``filelist``.

    Parameters
    ----------
    filelist : str
        Path of a text file with one input path per line.  The 0-based line
        number is used as the piece identifier stored in the databases.
    combinations : iterable of str
        Offset patterns for which databases are built.
    dbs : dict, optional
        Existing databases to extend.  When omitted (or empty) a fresh
        ``{combination: defaultdict(lambda: defaultdict(list))}`` is created.

    Returns
    -------
    dict
        The databases, keyed by combination.

    Notes
    -----
    Files whose base name starts with ``'d'`` are read with
    ``getTotalBscore``; all others are unpickled directly.  A file that
    cannot be processed is skipped; a single warning listing the skipped
    files is issued at the end.
    """
    import warnings

    if not dbs:
        dbs = {combination: defaultdict(lambda : defaultdict(list)) for combination in combinations}
    failed = []
    with open(filelist, 'r') as f:
        for i, curfile in enumerate(f):
            curfile = curfile.strip()
            try:
                if curfile.split('/')[-1].startswith('d'):
                    data = getTotalBscore(curfile)
                else:
                    # NOTE: pickle.load can execute arbitrary code; only use
                    # trusted files.
                    with open(curfile, 'rb') as pickle_file:
                        data = pickle.load(pickle_file)
                # Transpose so that make_fps iterates over columns.
                make_fps(data.T, combinations, dbs, i)
            except Exception as err:
                # Best effort: keep going, but remember what went wrong.
                # (KeyboardInterrupt / SystemExit are deliberately not caught.)
                failed.append((curfile, err))
    if failed:
        preview = "; ".join(f"{name!r}: {err!r}" for name, err in failed[:5])
        warnings.warn(
            f"make_DB: {len(failed)} file(s) could not be processed "
            f"(first {min(len(failed), 5)} shown): {preview}",
            stacklevel=2,
        )
    return dbs

In [None]:
def store_DB(db, combination, outdir):
    """Write one database to ``<outdir>/<combination>.pkl`` using dill."""
    target = f"{outdir}/{combination}.pkl"
    with open(target, "wb") as out_file:
        dill.dump(db, out_file)

In [None]:
# Text file listing the input files, one path per line.
filelist = 'cfg_files/db.list'

In [None]:
# Directory into which the per-combination database pickles are written.
db_dir = '/data1/kji/databases_random'

In [None]:
# Create the output directory; exist_ok=True makes this a no-op if it exists.
os.makedirs(db_dir, exist_ok=True)

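We map each piece to an integer index (its line number in the file list) so that the databases can store small integers instead of piece names, which saves memory.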

In [None]:
# Maps an integer index to the name of the corresponding piece.
num_to_piece = {}

In [None]:
# Record, for every line of the file list, the piece name under its 0-based
# line number.
with open(filelist, 'r') as f:
    for i, curfile in enumerate(f):
        curfile = curfile.strip()
        # Piece name = base name without its last four characters
        # (presumably a 4-character extension such as ".pkl" -- confirm).
        piece = curfile.split('/')[-1][:-4]
        num_to_piece[i] = piece

In [None]:
# Save the index -> piece-name mapping next to the notebook.
with open("num_to_piece_random.pkl", "wb") as mapping_file:
    pickle.dump(num_to_piece, mapping_file)

In [None]:
# Read the index -> piece-name mapping back from disk.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open("num_to_piece_random.pkl", 'rb') as mapping_file:
    num_to_piece = pickle.load(mapping_file)

Here we create a database for each n-gram type (total 22). Each n-gram database maps from the n-gram (a tuple) to a dictionary. This dictionary maps each piece that the n-gram appears in to a list of offsets within the piece.

In [None]:
# Build and store the databases three combinations at a time, releasing each
# batch before the next one is built.
batch_size = 3
for start in range(0, len(combinations), batch_size):
    batch = combinations[start:start + batch_size]
    dbs = make_DB(filelist, batch)
    for combination, db in dbs.items():
        store_DB(db, combination, db_dir)
    dbs.clear()

## Make fingerprint matches table

The fingerprint matches table is a nested dictionary. 
* The first level maps each n-gram type to another dictionary. 
* The second dictionary maps each n-gram of that type to a tuple (x, y), where x is the total number of times the fingerprint occurs in IMSLP and y is the total number of PDFs in IMSLP with that fingerprint.

In [None]:
# Directory from which the per-combination database pickles are read below.
db_dir = "/data1/kji/databases_random"

In [None]:
# One (initially empty) fingerprint -> counts table per combination.
fp_matches = dict((combination, {}) for combination in combinations)

In [None]:
# Fill the table one stored database at a time.
for combination in combinations:
    with open(f"{db_dir}/{combination}.pkl", "rb") as f:
        d = dill.load(f)
    counts = fp_matches[combination]
    for fp, pieces in d.items():
        # (total occurrences over all pieces, number of pieces containing it)
        counts[fp] = (sum(map(len, pieces.values())), len(pieces))
    # Drop the loaded database before reading the next one.
    d.clear()