## Databases for each n-gram combination

There are 16 different n-gram combinations (1- to 3-grams) with maximum context 6. This notebook constructs a database for each type of n-gram and also a database storing the counts for each n-gram type.

In [5]:
%matplotlib inline

In [6]:
import itertools
import pickle
from collections import defaultdict
import numpy as np
import pickle
import pandas as pd
import os
import time
import dill
from glob import iglob
import tqdm

In [7]:
# Number of consecutive columns an n-gram may span (offsets 0 .. context-1).
context = 6

# Every combination is a list of column offsets that always starts at 0;
# an n-gram adds n-1 further offsets picked from 1 .. context-1.
combinations = []
for n_gram in range(1, 4):
    extra_offsets = itertools.combinations(range(1, context), n_gram - 1)
    combinations.extend([0, *offsets] for offsets in extra_offsets)

In [8]:
# Turn each offset list into a compact string key, e.g. [0, 1, 3] -> "013".
combinations = ["".join(map(str, combination)) for combination in combinations]
len(combinations)

16

In [9]:
def getTotalBscore(bscore_file):
    """Load a pickled bootleg score and return it as a single 62-row matrix.

    The pickle is expected to contain a sequence of pages, where each page is
    a sequence of numbers and every number encodes one 62-bit column. Each
    page is expanded to a 62-row 0/1 matrix and the pages are joined from
    left to right.

    Args:
        bscore_file: Path of the pickle file to read.

    Returns:
        A float ndarray with 62 rows and one column per encoded number. The
        shape is (62, 0) when the file holds no columns.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
    with open(bscore_file, 'rb') as f:
        bscore_array = pickle.load(f)

    # Start with an empty float block so the result is well-shaped (and float)
    # even when there are no pages. Pages are collected and joined once at the
    # end instead of re-copying the growing matrix on every iteration.
    pages = [np.array([]).reshape(62, 0)]
    for page in bscore_array:
        try:
            total_page = unpackbits(np.array(page), 62)
        except TypeError:
            # Vectorised unpacking raised TypeError for this page, so decode
            # the numbers one at a time; each decoded list becomes one column.
            columns = [decodeColumn(num) for num in page]
            total_page = np.array(columns, dtype=float).reshape(-1, 62).T
        pages.append(total_page)
    return np.concatenate(pages, axis=1)

In [10]:
def decodeColumn(num):
    """Decode a non-negative number into its 62 lowest bits, highest bit first.

    Args:
        num: Non-negative integer (or integer-valued float) encoding a column.

    Returns:
        A list of 62 values, each 0 or 1; index 0 holds bit 61 and index 61
        holds bit 0.
    """
    bits_low_to_high = []
    for _ in range(62):
        bits_low_to_high.append(num % 2)
        # Floor division keeps integer arithmetic exact. True division
        # (num / 2) goes through a float and silently drops low bits once
        # the value exceeds 2**53.
        num = int(num // 2)
    return bits_low_to_high[::-1]

In [11]:
def unpackbits(x, num_bits):
    """Expand an integer array into a (num_bits, x.size) matrix of 0.0/1.0.

    Each input value becomes one column; row 0 holds the highest of the
    `num_bits` bits and the last row holds bit 0.
    """
    as_column = x.reshape([-1, 1])
    # One mask per bit position, 2**0 .. 2**(num_bits-1), in the input's dtype.
    bit_masks = 2**np.arange(num_bits, dtype=as_column.dtype).reshape([1, num_bits])
    bits_low_first = (as_column & bit_masks).astype(bool).astype(float)
    # Reverse the bit axis so the highest bit comes first, then put bits on rows.
    return bits_low_first[:, ::-1].T

We precompute powers of 2 from $2^0$ to $2^{61}$ to speed up calculating hash values.

In [12]:
# Bit weights 2**61, 2**60, ..., 2**0 (highest first) used to hash a 62-bit column.
powers = np.left_shift(1, np.arange(61, -1, -1))

In [13]:
def make_fps(data, combinations, dbs, piece):
    """Add the n-gram fingerprints of one piece to the in-memory databases.

    Args:
        data: Sequence of columns (one row of `data` per column); each column
            must support `.dot(powers)`.
        combinations: Iterable of combinations; each combination is an
            iterable of column offsets accepted by `int()` (e.g. "013").
        dbs: Mapping combination -> fingerprint tuple -> piece -> list of
            start columns. It is updated in place.
        piece: Identifier under which the start columns are recorded.

    Returns:
        None. Fingerprints whose columns all hash to 0 are not stored, and a
        combination is skipped at positions where it would run past the end
        of `data`.
    """
    # Hash every column once up front; the value does not depend on the
    # combination, so there is no need to recompute it per combination/offset.
    # NOTE(review): if `data` is a float array this dot product is evaluated
    # in float64, which cannot represent every 62-bit integer exactly, so
    # distinct columns may share a hash. Left unchanged because stored
    # databases (and presumably the query side) rely on these exact values.
    column_hashes = [int(column.dot(powers)) for column in data]
    offsets_by_combination = [
        (combination, [int(i) for i in combination]) for combination in combinations
    ]

    for colindex in range(len(column_hashes)):
        for combination, offsets in offsets_by_combination:
            # we need at least enough columns for all the indices in our combination
            try:
                fp = tuple(column_hashes[colindex + offset] for offset in offsets)
            except IndexError:
                continue
            # Skip fingerprints made only of empty (all-zero) columns.
            if not any(fp):
                continue
            dbs[combination][fp][piece].append(colindex)

Now we iterate over every file in our file list, load the bootleg score for that file, generate all possible n-grams from the bootleg score, and store them in our database, which is an in-memory Python dictionary.

In [14]:
def make_DB(filelist, combinations, dbs = None):
    """Build (or extend) the n-gram databases from every file in a file list.

    Args:
        filelist: Path of a text file with one input path per line. The line
            index is used as the piece identifier, so blank lines still
            consume an index.
        combinations: Combinations to fingerprint; they are the keys of `dbs`.
        dbs: Optional existing mapping to extend. Missing combinations are
            added; when omitted a new mapping is created.

    Returns:
        The mapping combination -> fingerprint -> piece index -> offsets.

    Files that cannot be read or processed are skipped (best effort) and a
    summary of the skipped files is printed at the end.
    """
    combinations = list(combinations)
    if dbs is None:
        dbs = {}
    for combination in combinations:
        if combination not in dbs:
            dbs[combination] = defaultdict(lambda : defaultdict(list))

    failed = []
    with open(filelist, 'r') as f:
        for i, curfile in enumerate(f):
            curfile = curfile.strip()
            if not curfile:
                # Blank line: nothing to load, but its index stays reserved.
                continue
            try:
                # Files whose basename starts with 'd' hold packed per-page
                # columns; any other file is a pickled matrix.
                if curfile.split('/')[-1][:1] == 'd':
                    data = getTotalBscore(curfile)
                else:
                    # NOTE(review): pickle.load can execute arbitrary code;
                    # only list trusted files.
                    with open(curfile, 'rb') as pickle_file:
                        data = pickle.load(pickle_file)
                make_fps(data.T, combinations, dbs, i)
            except Exception as exc:
                # Keep going on bad files, but remember them for the report.
                # Catching Exception (not a bare except) lets Ctrl-C through.
                failed.append((curfile, exc))

    if failed:
        first_file, first_error = failed[0]
        print(f"make_DB: skipped {len(failed)} file(s) that could not be processed; "
              f"first failure: {first_file!r} ({first_error!r})")
    return dbs

In [15]:
def store_DB(db, combination, outdir):
    """Serialize one n-gram database to ``<outdir>/<combination>.pkl`` with dill."""
    target_path = f"{outdir}/{combination}.pkl"
    with open(target_path, "wb") as out_file:
        dill.dump(db, out_file)

In [16]:
# Text file that lists the input files, one path per line.
filelist = 'cfg_new/db.list'

In [17]:
# Directory that receives one "<combination>.pkl" database per n-gram type.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
db_dir = '/data1/dyang/Marketplace_db'

In [18]:
# Create the output directory if it is missing; exist_ok keeps re-runs from failing.
os.makedirs(db_dir, exist_ok=True)

We make a mapping from a number to each piece to save memory.

In [19]:
# Maps the line index of each entry in the file list to its piece name.
num_to_piece = {}

In [20]:
# Rebuild the mapping from scratch so re-running this cell never keeps stale
# entries. The key is the line index in the file list; the value is the file's
# basename with its last four characters removed.
# NOTE(review): [:-4] presumably drops a ".pkl" suffix — confirm for all inputs.
num_to_piece = {}
with open(filelist, 'r') as f:
    for i, curfile in enumerate(f):
        curfile = curfile.strip()
        piece = curfile.split('/')[-1][:-4]
        num_to_piece[i] = piece

FileNotFoundError: [Errno 2] No such file or directory: 'cfg_new/db.list'

In [21]:
# Persist the index -> piece-name mapping to disk.
with open("num_to_piece_random.pkl", "wb") as f:
    pickle.dump(num_to_piece, f)

In [22]:
# Reload the index -> piece-name mapping from disk (only unpickle trusted files).
with open("num_to_piece_random.pkl", 'rb') as f:
    num_to_piece = pickle.load(f)

Here we create a database for each n-gram type (16 in total). Each n-gram database maps from the n-gram (a tuple) to a dictionary. This dictionary maps each piece that the n-gram appears in to a list of offsets within the piece.

In [None]:
# Build and store the databases three combinations at a time, clearing each
# batch after it is written (presumably to bound peak memory). Every batch is
# a separate make_DB call, so the file list is re-read once per batch.
for i in range(0, len(combinations), 3):
    dbs = make_DB(filelist, combinations[i: i+3])
    for combination in dbs:
        store_DB(dbs[combination], combination, db_dir)
    dbs.clear()

## Make fingerprint matches table

The fingerprint matches table is a nested dictionary. 
* The first level maps each n-gram type to another dictionary. 
* The second dictionary maps each n-gram of that type to a tuple (x, y), where x is the total number of times the fingerprint occurs in IMSLP and y is the total number of PDFs in IMSLP with that fingerprint.

In [23]:
# Directory holding the per-combination "<combination>.pkl" databases.
# NOTE(review): repeats the earlier db_dir assignment — presumably so this
# section can be run on its own; confirm and keep the two in sync.
db_dir = '/data1/dyang/Marketplace_db'

In [24]:
# One (initially empty) fingerprint -> counts table per n-gram type.
fp_matches = {combination: {} for combination in combinations}

In [25]:
for combination in combinations:
    print(combination)
    # NOTE(review): dill.load, like pickle.load, can execute arbitrary code;
    # only load database files you created yourself.
    with open(f"{db_dir}/{combination}.pkl", "rb") as f:
        d = dill.load(f)
    # Iterate the items directly: this avoids building a list of every key
    # and avoids hashing each fingerprint again for repeated d[fp] lookups.
    for fp, pieces in tqdm.tqdm(d.items(), total=len(d)):
        # first element is number of times the fingerprint occurs, second element is number of PDFs containing the fingerprint
        fp_matches[combination][fp] = (sum(len(offsets) for offsets in pieces.values()), len(pieces))
    # Release the loaded database before the next one is read.
    d.clear()

0


100%|██████████| 525138/525138 [00:04<00:00, 111562.92it/s]


01


100%|██████████| 7784183/7784183 [00:36<00:00, 213746.37it/s]


02


100%|██████████| 8609973/8609973 [00:30<00:00, 279922.99it/s]


03


100%|██████████| 8851338/8851338 [00:31<00:00, 278038.82it/s]


04


100%|██████████| 9219432/9219432 [00:33<00:00, 275258.59it/s]


05


100%|██████████| 9235715/9235715 [00:34<00:00, 266335.67it/s]


012


100%|██████████| 22307224/22307224 [01:14<00:00, 298736.53it/s]


013


100%|██████████| 23445645/23445645 [01:29<00:00, 263050.90it/s]


014


100%|██████████| 24267480/24267480 [01:38<00:00, 246681.55it/s]


015


100%|██████████| 24713977/24713977 [01:45<00:00, 234066.94it/s]


023


100%|██████████| 23413888/23413888 [01:40<00:00, 232365.81it/s]


024


100%|██████████| 24079679/24079679 [01:49<00:00, 219046.66it/s]


025


100%|██████████| 25050140/25050140 [01:56<00:00, 214741.05it/s]


034


100%|██████████| 24231752/24231752 [01:41<00:00, 239820.72it/s]


035


100%|██████████| 25061971/25061971 [01:44<00:00, 240875.58it/s]


045


100%|██████████| 24707724/24707724 [01:45<00:00, 233763.08it/s]


In [None]:
# Save the fingerprint-match table into the same directory as the
# per-combination databases, using the standard pickle module.
fp_matches_file = f"{db_dir}/fp_matches.pkl"
with open(fp_matches_file, "wb") as f:
    pickle.dump(fp_matches, f)

In [18]:
# Sanity check: number of distinct fingerprints stored for combination "01".
len(fp_matches['01'])

7784183

In [19]:
# NOTE(review): loads a fingerprint-match table from a different, hard-coded
# location — presumably an earlier build kept for comparison; confirm it is
# still needed. Only unpickle files you trust.
with open('/data1/kji/databases_v4/fp_matches.pkl', 'rb') as f:
    a = pickle.load(f)
len(a['01'].keys())

KeyboardInterrupt: 

In [21]:
# Highest pickle protocol version supported by this interpreter.
pickle.HIGHEST_PROTOCOL

5