# Compare the distance matrices

This notebook can be used to compare the distance matrix obtained by 
* cosine distance s.t. it is defined in MS-cluster
* cosine distance between the high dimensional vectors of falcon 
* the ANN search used by falcon

In [11]:
import os
import sys
import scipy.sparse as ss
import matplotlib
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

sys.path.append('..')
import config
from cluster import cluster, spectrum

import nb_utils
read_spectra = nb_utils.read_spectra
exact_sparse_matrix = nb_utils.exact_sparse_matrix

In [12]:
data = {}

data['erwinia'] = {'path': '/media/maesk/WD/falcon/erwinia'}
data['CCLE_7000'] = {'path': '/media/maesk/WD/falcon/CCLE_7000'}
#data['CCLE_5M'] = {'path': os.path.abspath("/media/maesk/WD/falcon/CCLE_5M")}
#data['PXD000561'] = {'path': os.path.abspath("/media/maesk/WD/falcon/PXD000561_all/spectra")}

charge = 2
exportPath = "compare_CCLE_PXD000561"

# Extract the precursor mzs for each dataset
for ds in data.values():
    precursor_mzs = []
    for spec in tqdm(read_spectra(charge, os.path.join(ds['path'], 'spectra'))):
        precursor_mzs.append(spec.precursor_mz)
        
    ds['precursor_mzs'] = precursor_mzs

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

### Show the precursor mz's distribution (1 m/z wide bins)

In [13]:
"""
# TODO fix bug, the 1st histogram takes the yaxis of the second one
fig, axs = plt.subplots(1, len(data), figsize=(12,4))
fig.suptitle(f'Precursos mz\' distribution (charge {charge})')
ds_names = list(data.keys())
for i in range(0,len(ds_names)):
    ds = data[ds_names[i]]
    min_mz = math.floor(min(ds['precursor_mzs']))
    max_mz = math.floor(max(ds['precursor_mzs']))
    bins = range(min_mz, max_mz+1, 1)
    axs[i].hist(ds['precursor_mzs'], bins=bins)
    axs[i].set(title=ds_names[i], xlabel="Precursor mz", ylabel="Number of spectra")
plt.tight_layout()
plt.savefig(os.path.join(exportPath, 'precursor_mzs'), dpi=150)
plt.show()
"""

'\n# TODO fix bug, the 1st histogram takes the yaxis of the second one\nfig, axs = plt.subplots(1, len(data), figsize=(12,4))\nfig.suptitle(f\'Precursos mz\' distribution (charge {charge})\')\nds_names = list(data.keys())\nfor i in range(0,len(ds_names)):\n    ds = data[ds_names[i]]\n    min_mz = math.floor(min(ds[\'precursor_mzs\']))\n    max_mz = math.floor(max(ds[\'precursor_mzs\']))\n    bins = range(min_mz, max_mz+1, 1)\n    axs[i].hist(ds[\'precursor_mzs\'], bins=bins)\n    axs[i].set(title=ds_names[i], xlabel="Precursor mz", ylabel="Number of spectra")\nplt.tight_layout()\nplt.savefig(os.path.join(exportPath, \'precursor_mzs\'), dpi=150)\nplt.show()\n'

### Compute the exact pairwise matrix

In [14]:
precursor_tol_mass = 20 # ppm

for ds_name, ds in data.items():
    dist_exact, n_sp_diff_bucket = \
        exact_sparse_matrix(
            os.path.join(ds['path'], 'spectra'),
            charge,
            precursor_tol_mass)

    dist_falcon = ss.load_npz(os.path.join(ds['path'], 'nn', f'dist_{charge}.npz'))

    assert dist_falcon.shape == dist_exact.shape
    n_spectra = dist_falcon.shape[0]
    n_lost = dist_exact.nnz - dist_falcon.nnz

    print('Dataset: %s' % (ds_name,) )
    print('\tShape: (%d,%d)' % (n_spectra, n_spectra) )
    print('\tNumber of entries in the exact matrix: %d' % (dist_exact.nnz,) )
    print('\tNumber of entries in the falcon matrix: %d' % (dist_falcon.nnz,) )
    print('\tNumber of spectra lost: %d' % (n_lost,) )
    print('\tProportion of spectra lost: %.4f %% \n\n' % (n_lost / (n_spectra**2) * 100 ,) )

  0%|          | 0/3749 [00:00<?, ?it/s]

Dataset: erwinia
	Shape: (3716,3716)
	Number of entries in the exact matrix: 11470
	Number of entries in the falcon matrix: 11454
	Number of spectra lost: 16
	Proportion of spectra lost: 0.0001 % 


Dataset: CCLE_7000
	Shape: (3587,3587)
	Number of entries in the exact matrix: 9017
	Number of entries in the falcon matrix: 9017
	Number of spectra lost: 0
	Proportion of spectra lost: 0.0000 % 




  0%|          | 0/3603 [00:00<?, ?it/s]

In [None]:
fig, ax = plt.subplots()


Sanity check: verify that all elements in the falcon distance matrix are in the
exact matrice. The falcon matrix will contain less elements because:
* It only contains *n_neighbors* entries for each spectrum
* Spectra with close precursor mzs in neighboring buckets are not compared

For small datasets (less than 1000 spectra per bucket), such that no partitioning is used
and a bruteforce approach is used, the following sanity tests should pass

In [15]:
"""assert len(indices_lost( (dist_falcon, dist_exact) )) == 0
print("OK, all the entries of the falcon matrix are in the exact matrix")

assert len(indices_lost( (dist_exact, dist_falcon) )) == n_missed_sp
print("OK, all the indices lost correspond to spectra in different buckets")"""

'assert len(indices_lost( (dist_falcon, dist_exact) )) == 0\nprint("OK, all the entries of the falcon matrix are in the exact matrix")\n\nassert len(indices_lost( (dist_exact, dist_falcon) )) == n_missed_sp\nprint("OK, all the indices lost correspond to spectra in different buckets")'