In [21]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import spectra_cluster.clustering_parser as clustering_parser
import spectrum_utils.plot as sup
import nb_utils

from ms_io import ms_io


### Function to parse the clusters contained in a file

In [22]:
def extract_clusters_stats(file):
    """Collect summary statistics and membership for the clusters in a clustering file.

    Every cluster yielded by ``clustering_parser.ClusteringParser(file)`` adds
    its ``n_spectra`` to the overall spectrum count, but only clusters with at
    least two spectra are counted as clusters and have their members recorded.

    Parameters
    ----------
    file
        Passed unchanged to ``clustering_parser.ClusteringParser``.

    Returns
    -------
    totalSp : int
        Sum of ``n_spectra`` over all parsed clusters.
    totalClusteredSp : int
        Sum of ``n_spectra`` over clusters holding at least two spectra.
    nClusters : int
        Number of clusters holding at least two spectra.
    avSize : float
        Mean number of recorded members per retained cluster, or ``0.0`` when
        no cluster was retained.
    clusters : dict
        Maps each retained cluster id to a list of ``(src, index)`` tuples
        parsed from the spectrum titles.

    Raises
    ------
    ValueError
        If a spectrum title does not split into the expected number of
        ``#``- and ``=``-separated fields, or its index is not an integer.
    """
    parser = clustering_parser.ClusteringParser(file)

    totalSp = 0
    totalClusteredSp = 0
    nClusters = 0  # incremented only if the cluster contains at least 2 spectra
    clusters = {}

    for cluster in parser:
        totalSp += cluster.n_spectra
        if cluster.n_spectra < 2:
            continue

        nClusters += 1
        totalClusteredSp += cluster.n_spectra

        members = []
        # NOTE(review): `_spectra` is a private attribute of the cluster object;
        # switch to a public accessor if the library provides one.
        for sp in cluster._spectra:
            # The title must have four '#'-separated fields; the 2nd carries the
            # source as 'key=src', the 3rd carries the index as 'key=key=index'.
            _, src_verb, id_verb, _ = sp.title.split('#')
            _, _, raw_index = id_verb.split('=')
            _, src = src_verb.split('=')
            # Presumably converts a 1-based index to a 0-based one — TODO confirm.
            members.append((src, int(raw_index) - 1))
        clusters[cluster.id] = members

    sizes = [len(members) for members in clusters.values()]
    # np.mean([]) yields nan and emits a RuntimeWarning; report 0.0 instead
    # when no cluster was retained.
    avSize = np.mean(sizes) if sizes else 0.0

    return totalSp, totalClusteredSp, nClusters, avSize, clusters


### Show the clustering statistics

In [26]:
# Location of the clustering result to summarise.
dir_clustering = "prideClusters/Prot_01/fragm_0.05"
clustering_file = os.path.join(dir_clustering, "0-99_noFiltering.clustering")

totalSp, totalClusteredSp, nClusters, avSize, clusters = extract_clusters_stats(clustering_file)

# Guard the ratio: a clustering file without any spectrum would otherwise
# raise ZeroDivisionError here.
propClusteredSp = totalClusteredSp/totalSp*100 if totalSp else 0.0
print("Total number of spectra:", totalSp)
print("Total clustered spectra:", totalClusteredSp)
print("Proportion of clustered spectra: {0:.2f}%".format(propClusteredSp))
print("Number of clusters:", nClusters)
print("Average size: {0:.2f}".format(avSize))


Total number of spectra: 463455
Total clustered spectra: 12797
Proportion of clustered spectra: 2.76%
Number of clusters: 5476
Average size: 2.34


### Plot some clusters

In [24]:
# Configuration for the plotting / scoring run.
dir_mgf = "../datasets/CCLE_Protein_01"
# Figures are written next to the clustering results loaded above.
export_path = os.path.join(dir_clustering, 'fig_0-99999')
limit = 100  # maximum number of clusters to plot and score

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(export_path, exist_ok=True)

# Per-cluster statistics, one list entry per processed cluster.
df_dic = {'gl_cl_id': [],
          'gl_min_sim': [],
          'gl_max_sim': [],
          'gl_av_sim': [],
          'gl_prec_diff': [],
          'cl_size': []}

# Slice up front so that exactly `limit` clusters (or fewer, if fewer exist)
# are processed and the progress bar total matches the real amount of work.
selected_clusters = list(clusters.items())[:limit]

for id_cl, cl in tqdm(selected_clusters):
    # Load every member spectrum of the cluster from its source file.
    sps = [ms_io.get_one_spectrum(os.path.join(dir_mgf, filename), scanid)
           for (filename, scanid) in cl]

    nsp = len(sps)
    # squeeze=False always returns a 2-D array of axes, so two-member clusters
    # (a single subplot) need no special case.
    fig, axs = plt.subplots(nsp-1, 1, figsize=(12, 6*(nsp-1)), squeeze=False)
    fig.suptitle(f"Cluster {id_cl}")

    # Plot the spectra: the first member is mirrored against each other member.
    for j in range(1, nsp):
        ax = axs[j-1, 0]
        spectrum_top, spectrum_bottom = sps[0], sps[j]
        ax.set_title("({}/{}) : {} and {}".format(
            j, nsp-1,
            spectrum_top.identifier,
            spectrum_bottom.identifier
        ))
        sup.mirror(spectrum_top, spectrum_bottom, ax=ax)

    # Save and close this specific figure rather than whichever one pyplot
    # considers "current", so figures do not accumulate in memory.
    fig.savefig(os.path.join(export_path, f'{id_cl}.png'), dpi=300)
    plt.close(fig)

    # Compute the pairwise score and precursor m/z gap for every pair of members.
    sim = []
    prec_diff = []
    for i in range(nsp-1):
        for j in range(i+1, nsp):
            sim.append(nb_utils.hdvectors_distance( (sps[i], sps[j]) ))
            prec_diff.append(abs(sps[i].precursor_mz - sps[j].precursor_mz))

    df_dic['gl_cl_id'].append(id_cl)
    df_dic['gl_min_sim'].append(min(sim))
    df_dic['gl_max_sim'].append(max(sim))
    df_dic['gl_av_sim'].append(np.mean(sim))
    df_dic['gl_prec_diff'].append(max(prec_diff))
    df_dic['cl_size'].append(nsp)

  0%|          | 0/100 [00:00<?, ?it/s]

In [25]:
# Give the collected statistics their final column names. An explicit mapping
# keeps each rename tied to its source key instead of relying on the order in
# which the keys of df_dic were inserted.
df = pd.DataFrame(data=df_dic).rename(columns={
    'gl_cl_id': 'cluster_id',
    'gl_min_sim': 'min_distance',
    'gl_max_sim': 'max_distance',
    'gl_av_sim': 'av_distance',  # was misspelled 'av_distane'
    'gl_prec_diff': 'max_prec_mz_diff',
    'cl_size': 'cluster_size',
})
# The DataFrame index is written as the first (unnamed) CSV column.
df.to_csv(os.path.join(export_path, "distances.csv"))