## Databases for each n-gram combination

There are 26 different n-gram combinations with various stride lengths. This notebook constructs a database for each type of n-gram and also a database storing the counts for each n-gram type.

In [1]:
%matplotlib inline

In [2]:
import itertools
import pickle
from collections import defaultdict
import numpy as np
import pickle
import pandas as pd
import sqlalchemy
import os
import time
import dill
from tqdm import tqdm
from glob import iglob

In [3]:
# Each combination is a list of column offsets: it always starts at offset 0 and
# adds (n_gram - 1) further offsets chosen, in increasing order, from 1..5.
combinations = [
    [0, *extra_offsets]
    for n_gram in range(1, 5)
    for extra_offsets in itertools.combinations(range(1, 6), n_gram - 1)
]

In [4]:
# Collapse each offset list into a compact string key, e.g. [0, 1, 3] -> "013".
combinations = ["".join(map(str, offsets)) for offsets in combinations]

In [5]:
def getTotalBscore(bscore_file):
    """Load a pickled, page-wise score and unpack it into a single 62-row matrix.

    The pickle is expected to hold an iterable of pages, each page being a
    sequence of integers where every integer encodes one 62-bit column.

    Parameters
    ----------
    bscore_file : str
        Path to the pickle file.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(62, total_columns)`` with all pages laid side
        by side in file order; shape ``(62, 0)`` when there are no columns.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(bscore_file, 'rb') as f:
        bscore_array = pickle.load(f)

    # Collect the per-page blocks and concatenate once at the end instead of
    # re-copying the growing matrix for every page. The empty float block keeps
    # the result shape (62, 0) / float dtype when nothing else is appended.
    page_blocks = [np.array([]).reshape(62, 0)]
    for page in bscore_array:
        try:
            # Fast path: unpack the whole page with vectorised bit operations.
            total_page = unpackbits(np.array(page), 62)
        except TypeError:
            # Fallback for pages whose array dtype does not support the bitwise
            # operation in unpackbits: decode one column at a time.
            column_blocks = [np.array([]).reshape(62, 0)]
            column_blocks.extend(
                np.array(decodeColumn(num)).reshape(62, -1) for num in page
            )
            total_page = np.concatenate(column_blocks, axis=1)
        page_blocks.append(total_page)
    return np.concatenate(page_blocks, axis=1)

In [6]:
def decodeColumn(num):
    """Expand a non-negative integer into its 62 lowest bits, most significant first.

    Parameters
    ----------
    num : int
        Encoded column; expected to be non-negative.

    Returns
    -------
    list
        62 entries, each 0 or 1; index 0 is bit 61 and index 61 is bit 0.
    """
    bits = []
    for _ in range(62):
        # Exact integer arithmetic: true division (num / 2) goes through a float
        # and silently corrupts values above 2**53.
        num, bit = divmod(num, 2)
        bits.append(bit)
    # Bits were produced least significant first.
    bits.reverse()
    return bits

In [7]:
def unpackbits(x, num_bits):
    """Unpack an array of integers into a matrix of their lowest ``num_bits`` bits.

    Parameters
    ----------
    x : numpy.ndarray
        Integer array; it is flattened, so every element yields one output column.
    num_bits : int
        Number of low-order bits to extract per element.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_bits, x.size)`` holding 0.0 / 1.0. Row 0 is
        the most significant extracted bit, the last row is bit 0.

    Raises
    ------
    TypeError
        If ``x`` has a dtype for which NumPy does not define bitwise AND
        (e.g. float), which callers may use to fall back to a slower decoder.
    """
    # One row per element so the AND below broadcasts against the bit masks.
    x = x.reshape([-1, 1])
    # Masks 2**0 .. 2**(num_bits-1), built in x's own dtype.
    mask = 2**np.arange(num_bits, dtype=x.dtype).reshape([1, num_bits])
    # Flip so the most significant bit comes first, then transpose to bits-by-element.
    return np.flip((x & mask).astype(bool).astype(float), 1).T

We precompute powers of 2 from $2^0$ to $2^{61}$ to speed up calculating hash values.

In [8]:
# Weights 2**61, 2**60, ..., 2**0 used to turn a 62-entry 0/1 column into an integer hash.
# The dtype is pinned to int64 because NumPy's default integer can be 32-bit on
# some platforms, where shifting by up to 61 would overflow.
powers = 1 << np.arange(62, dtype=np.int64)[::-1]

In [9]:
def make_fps(data, combinations, dbs, piece):
    """Add the n-gram fingerprints of one piece to the per-combination databases.

    Parameters
    ----------
    data : sequence of 1-D arrays
        One entry per column (e.g. the transpose of a 62-row matrix); each entry
        must support ``.dot(powers)``.
    combinations : iterable of str
        Each string is a sequence of single-digit column offsets, e.g. ``"013"``.
    dbs : dict
        ``dbs[combination][fingerprint][piece]`` must yield a list (nested
        defaultdicts); it is updated in place.
    piece : hashable
        Identifier stored as the key for this piece's offsets.

    Notes
    -----
    A fingerprint is the tuple of column hashes at ``colindex + offset`` for each
    offset in the combination. Fingerprints whose hashes are all zero are skipped,
    as are positions where the combination would run past the last column.
    """
    # Hash every column once up front; the original recomputed the same dot
    # product for every combination and offset that touched a column.
    # NOTE(review): if the columns are float (as unpackbits produces), the dot
    # product is evaluated in float64, so hashes needing more than 53 significant
    # bits are rounded. Left unchanged because existing databases and any
    # query-side code presumably rely on the same values -- confirm.
    hashes = [int(column.dot(powers)) for column in data]
    num_cols = len(hashes)

    # Parse the offset strings once instead of once per column.
    parsed = [(combination, [int(ch) for ch in combination]) for combination in combinations]

    for colindex in range(num_cols):
        for combination, offsets in parsed:
            # We need enough columns for all the offsets in this combination.
            if colindex + max(offsets, default=0) >= num_cols:
                continue
            fp = tuple(hashes[colindex + offset] for offset in offsets)
            # Skip fingerprints made only of empty (all-zero) columns.
            if not any(fp):
                continue
            dbs[combination][fp][piece].append(colindex)

In [10]:
def make_DB(filelist, combinations, dbs = None):
    """Build fingerprint databases for ``combinations`` from every file in ``filelist``.

    Parameters
    ----------
    filelist : str
        Path to a text file with one input path per line. The zero-based line
        number is used as the piece identifier.
    combinations : list of str
        Offset strings to fingerprint (see ``make_fps``).
    dbs : dict, optional
        Existing databases to extend. When omitted (or empty) a fresh
        ``{combination: defaultdict(lambda: defaultdict(list))}`` is created.

    Returns
    -------
    dict
        ``dbs[combination][fingerprint][piece] -> list of column offsets``.

    Notes
    -----
    Files that cannot be read or processed are skipped (best effort); a short
    summary of the skipped files is printed so failures are not silent.
    """
    if not dbs:
        dbs = {combination: defaultdict(lambda : defaultdict(list)) for combination in combinations}
    failed = []
    with open(filelist, 'r') as f:
        for i, curfile in enumerate(f):
            curfile = curfile.strip()
            try:
                # Files whose basename starts with 'd' hold the packed page-wise
                # format; everything else is a pickled matrix.
                if curfile.split('/')[-1][0] == 'd':
                    data = getTotalBscore(curfile)
                else:
                    # NOTE: pickle.load can execute arbitrary code; trusted files only.
                    with open(curfile, 'rb') as pickle_file:
                        data = pickle.load(pickle_file)
                # make_fps iterates over columns, hence the transpose.
                make_fps(data.T, combinations, dbs, i)
            except Exception as err:
                # Catch Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit still stop a long-running build.
                failed.append((curfile, err))
    if failed:
        print(f"make_DB: skipped {len(failed)} file(s) that could not be processed")
        for path, err in failed[:5]:
            print(f"  {path!r}: {type(err).__name__}: {err}")
    return dbs

In [11]:
def store_DB(db, combination, outdir):
    """Serialize ``db`` with dill to ``<outdir>/<combination>.pkl``.

    dill is used here because the databases are defaultdicts built with a
    lambda factory.
    """
    target_path = f"{outdir}/{combination}.pkl"
    with open(target_path, "wb") as out_file:
        dill.dump(db, out_file)

In [12]:
# Text file with one input path per line; the line order defines each piece's integer id.
filelist = 'cfg_files/db.list'

In [21]:
# Directory that receives the per-combination databases and the count files.
# NOTE(review): hard-coded absolute path -- adjust for your machine before running.
db_dir = '/data1/kji/databases2'

In [14]:
# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(db_dir, exist_ok=True)

We map each piece to an integer index so that the databases can store small integers instead of piece names, which saves memory.

In [77]:
# Maps the line index of a file in `filelist` to its piece name.
num_to_piece = {}

In [78]:
# Fill num_to_piece using the same line numbering that make_DB uses for piece ids.
with open(filelist, 'r') as f:
    for i, curfile in enumerate(f):
        curfile = curfile.strip()
        # Piece name = basename without its last four characters
        # (assumes a 4-character extension such as ".pkl").
        piece = curfile.split('/')[-1][:-4]
        num_to_piece[i] = piece

In [80]:
# Uncomment to persist the mapping to "num_to_piece.pkl" (run once after building it).
# with open("num_to_piece.pkl", "wb") as f:
#     pickle.dump(num_to_piece, f)

In [19]:
# Reload the previously saved mapping; this replaces any in-memory num_to_piece.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("num_to_piece.pkl", 'rb') as f:
    num_to_piece = pickle.load(f)

The 26 combinations are processed in batches of two to limit memory use.
For each batch, loop over every file listed in `filelist` (all of IMSLP).
Within each file, loop over every column offset and compute the n-gram fingerprint for each combination in the batch.
Append the offset to that fingerprint's list for the current piece in the combination's dictionary.
When a batch is done, serialize its dictionaries to disk and free the memory.

In [None]:
# Build and persist the databases a few combinations at a time so that only one
# batch has to be held in memory.
batch_size = 2
# Derive the range from `combinations` instead of hard-coding its length (26).
for i in range(0, len(combinations), batch_size):
    dbs = make_DB(filelist, combinations[i: i+batch_size])
    for combination in dbs:
        store_DB(dbs[combination], combination, db_dir)
    # Drop the references so the batch can be garbage-collected before the next one.
    dbs.clear()

0
0
0
0
0
0
0
0
0


In [None]:
# Timed build of a slice of four combinations (indices -10 .. -7), reporting how
# long construction and storage take separately.
# NOTE(review): these combinations are also covered by the batched loop above --
# presumably an ad-hoc timing re-run; confirm before executing both cells.
start = time.time()
dbs = make_DB(filelist, combinations[-10:-6])
db_end_time = time.time()
print(f"Finished constructing databases in {db_end_time - start} seconds")
for combination in dbs:
    store_DB(dbs[combination], combination, db_dir)
print(f"Finished storing databases on disk in {time.time() - db_end_time} seconds")

## Construct database of counts

This database maps each unique n-gram, across all 26 combination types, to its total number of occurrences in all of IMSLP.

In [12]:
# Collect every pickle file directly inside db_dir (the pattern contains no "**",
# so recursive=True does not descend into sub-directories).
files = list(iglob(f"{db_dir}/*.pkl", recursive=True))

In [13]:
# Order paths by string length (shortest first); the sort is stable, so paths of
# equal length keep the order in which glob returned them.
files.sort(key = len)

In [None]:
def store_DB(db, combination, outdir):
    """Serialize ``db`` with pickle to ``<outdir>/<combination>.pkl``.

    This redefines the earlier dill-based ``store_DB`` in this notebook; cells
    executed after this one write with the standard-library pickle instead.
    """
    target_path = f"{outdir}/{combination}.pkl"
    with open(target_path, "wb") as out_file:
        pickle.dump(db, out_file)

In [None]:
# One-off: derive the per-fingerprint match counts for a single database file.
filename = f"/data1/kji/databases/split21_Bt_200mill.pkl"
with open(filename, "rb") as f:
    d = pickle.load(f)
# Total matches for a fingerprint = number of stored offsets summed over all pieces.
counts = {
    fingerprint: sum(len(offsets) for offsets in per_piece.values())
    for fingerprint, per_piece in d.items()
}
# Release the large database before writing the much smaller counts table.
d.clear()
print(f"finished processing {filename}")
store_DB(counts, f"split21_Bt_200mill_counts", db_dir)
print(f"stored database for {filename}")

In [9]:
# Writes `counts` again under the same name as the cell above; only needed if that
# cell's store step did not complete.
store_DB(counts, f"split21_Bt_200mill_counts", db_dir)

In [None]:
# For every combination, load its database, reduce it to
# {fingerprint: total number of matches} and store that as "<combination>_counts".
for combination in combinations:
    # Read from db_dir (where the databases were stored) rather than repeating
    # the path as a literal.
    filename = f"{db_dir}/{combination}.pkl"
    # NOTE: dill.load can execute arbitrary code; only load files you created yourself.
    with open(filename, "rb") as f:
        d = dill.load(f)
    # The file is closed as soon as it is read; a read-only handle needs no flush.
    counts = {}
    for fp, per_piece in d.items():
        # Total matches = number of stored offsets summed over all pieces.
        counts[fp] = sum(len(offsets) for offsets in per_piece.values())
    # Release the large database before writing the much smaller counts table.
    d.clear()
    print(f"finished processing {filename}")
    store_DB(counts, f"{combination}_counts", db_dir)
    print(f"stored database for {filename}")

In [None]:
# NOTE(review): in the visible notebook `matches` is only assigned by the cell
# below, so this cell has to be run after it (it raises NameError on a fresh
# top-to-bottom run).
store_DB(matches, "matches_array", db_dir)

In [None]:
# Flatten all per-combination count tables into two parallel arrays:
# n_grams[k] is a fingerprint and matches[k] is its total match count.
# Start from empty chunks so the cell runs on a fresh kernel and re-running it
# rebuilds the arrays instead of appending to leftovers.
n_gram_chunks = [np.empty(0, dtype = object)]
match_chunks = [np.empty(0, dtype = int)]
for combination in combinations:
    # NOTE(review): this reads from /data1/kji/databases/, not from db_dir --
    # confirm that this is where the *_counts.pkl files live.
    filename = f"/data1/kji/databases/{combination}_counts.pkl"
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(filename, "rb") as f:
        d = pickle.load(f)
    length = len(d)
    # Object array so that each element can hold a whole fingerprint.
    fps = np.empty(length, dtype = object)
    values = np.empty(length, dtype = int)
    for i, (n_gram, count) in enumerate(d.items()):
        fps[i] = n_gram
        values[i] = count
    d.clear()
    n_gram_chunks.append(fps)
    match_chunks.append(values)
    print(f"finished processing {filename}")
# Concatenate once at the end instead of re-copying the growing arrays per file.
n_grams = np.concatenate(n_gram_chunks)
matches = np.concatenate(match_chunks)

finished processing /data1/kji/databases/0125_counts.pkl
finished processing /data1/kji/databases/0134_counts.pkl
finished processing /data1/kji/databases/0135_counts.pkl
finished processing /data1/kji/databases/0145_counts.pkl
finished processing /data1/kji/databases/0234_counts.pkl
