# Test a version of falcon without Faiss

In [4]:
import os, sys, time
import pickle
import functools
import queue, threading

import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
from sklearn.cluster import DBSCAN
from sklearn.utils import murmurhash3_32

from tqdm.notebook import tqdm
sys.path.append('../')
from cluster import spectrum

In [5]:
# ---------------------------------------------------------------------------
# Run configuration: input buckets, output file, spectrum preprocessing and
# the hashing table used to build the low-dimensional vectors.
# ---------------------------------------------------------------------------
_min_mz, _max_mz = 101., 1500.
path = '/media/maesk/WD/falcon/PXD000561/spectra/'
charge = 2
# One candidate pickle per integer split value; for a quick trial run this can
# be narrowed down, e.g. to [600, 601].
mz_splits = range(50,1501)
# Keep only the bucket files that actually exist on disk.
files = list(filter(os.path.isfile,
                    (f'{path}{charge}_{mz}.pkl' for mz in mz_splits)))

# Start the export file from scratch with only the header row; the clustering
# results are appended to it afterwards.
export_filename = '/media/maesk/WD/clusters_brute_force.csv'
pd.DataFrame(
    columns=['identifier', 'precursor_charge', 'precursor_mz', 'cluster']
).to_csv(export_filename, index=False)

# Spectrum preprocessing with all of its settings frozen, so that it can be
# called with a single spectrum argument.
process_spectrum = functools.partial(
    spectrum.process_spectrum,
    min_peaks=5,
    min_mz_range=250.,
    mz_min=_min_mz,
    mz_max=_max_mz,
    remove_precursor_tolerance=0.5,
    min_intensity=0.01,
    max_peaks_used=50,
    scaling=None,
)

# Dimensions for a bin size of 0.05 over the configured m/z window.
vec_len, min_mz, max_mz = spectrum.get_dim(_min_mz, _max_mz, 0.05)
# Lookup table sending each of the `vec_len` indices to one of 800 slots via
# a non-negative MurmurHash3 value.
hash_lookup = np.asarray(
    [murmurhash3_32(bin_idx, seed=0, positive=True) % 800
     for bin_idx in range(vec_len)],
    dtype=np.uint32)

/!\ Buckets with more than 30,000 spectra are skipped: their pairwise distance matrix does not fit in memory.

In [None]:
# Brute-force clustering: for every bucket file, vectorise the spectra, build
# the full pairwise cosine distance matrix and run DBSCAN on it. Cluster
# assignments are appended to `export_filename`; per-bucket timings are written
# to 'bruteForcePerformances.csv'.
curr_cl = 0  # Offset added to cluster labels so ids stay unique across buckets.
bucket_names = []
bucket_sizes = []
duration_tot = []
duration_dist = []
duration_clust = []

for pkl_filename in tqdm(files):
    # NOTE: unpickling can execute arbitrary code; only load trusted bucket files.
    with open(pkl_filename, 'rb') as f_in:
        processed = (process_spectrum(spec) for spec in pickle.load(f_in))
        # Discard low-quality spectra (those for which processing gave None).
        sps = [spec for spec in processed if spec is not None]

    # Skip buckets with no usable spectrum (the distance computation rejects
    # empty input) and buckets too large for a dense distance matrix.
    if not sps or len(sps) > 30000:
        continue

    time_1 = time.time()
    vec_ld = spectrum.to_vector_parallel(sps, dim=800, min_mz=min_mz,
                                         max_mz=max_mz, bin_size=0.05,
                                         hash_lookup=hash_lookup, norm=True)
    time_2 = time.time()
    dist_mat = sklearn.metrics.pairwise.cosine_distances(vec_ld)
    time_3 = time.time()
    clustering = DBSCAN(eps=0.1, min_samples=2, metric='precomputed').fit(dist_mat)
    labels = clustering.labels_
    mask_noise = labels == -1
    # Shift the non-noise labels so they do not reuse ids of earlier buckets;
    # noise points keep the label -1.
    labels[~mask_noise] += curr_cl
    time_4 = time.time()
    pd.DataFrame({'identifier': [sp.identifier for sp in sps],
                  'precursor_charge': [sp.precursor_charge for sp in sps],
                  'precursor_mz': [sp.precursor_mz for sp in sps],
                  'cluster': labels}).to_csv(export_filename, mode='a',
                                             header=False, index=False)
    # Advance by the number of clustered spectra. This is an upper bound on
    # the number of clusters, so ids stay unique but are not contiguous.
    curr_cl += int(np.count_nonzero(~mask_noise))

    # Record this bucket's statistics and rewrite the performance file, so
    # that partial results survive an interrupted run. The bucket names are
    # kept in a list: passing the scalar `pkl_filename` would label every row
    # with the most recent file.
    bucket_names.append(pkl_filename)
    bucket_sizes.append(len(sps))
    duration_tot.append(time_4 - time_1)
    duration_dist.append(time_3 - time_2)
    duration_clust.append(time_4 - time_3)
    pd.DataFrame({'bucket': bucket_names,
                  'size': bucket_sizes,
                  'time_total': duration_tot,
                  'time_dist': duration_dist,
                  'time_dbscan': duration_clust}).to_csv('bruteForcePerformances.csv')

  0%|          | 0/1201 [00:00<?, ?it/s]

Size 9994
Size 8798
Size 5096
Size 4884
Size 5138
Size 4600
Size 4536
Size 7921
Size 9423
Size 5693
Size 5366
Size 5911
Size 6601
Size 5699
Size 7443
Size 11181
Size 10117
Size 6235
Size 7103
Size 4881
Size 4869
Size 6343
Size 16413
Size 7960
Size 7690
Size 6515
Size 6674
Size 7018
Size 6822
Size 11794
Size 11702
Size 7376
Size 7945
Size 6867
Size 6206
Size 8527
Size 12237
Size 11787
Size 6065
Size 8009
Size 6939
Size 5803
Size 7209
Size 12821
Size 13782
Size 9264
Size 7369
Size 7958
Size 7202
Size 9957
Size 16400
Size 16483
Size 14408
Size 11102
Size 8932
Size 7020
Size 8900
Size 14063
Size 15310
Size 13783
Size 10138
Size 11785
Size 11140
Size 11467
Size 13495
Size 19874
Size 15481
Size 12974
Size 10422
Size 10068
Size 8515
Size 10463
Size 15708
Size 16251
Size 13732
Size 11670
Size 9236
Size 9380
Size 11567
Size 18642
Size 16491
Size 12317
Size 14637
Size 12557
Size 11408
Size 11241
Size 19889
Size 14182
Size 13647
Size 12392
Size 13218
Size 13706
Size 16246
Size 18458
Size 22681
Si

In [None]:
# Load 