In [5]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import itertools

import warnings

warnings.filterwarnings('ignore')

In [2]:
expected = pd.read_csv(
        "/Users/lukas/University/Bachelor_Thesis/Project/PeptideDeNovoSequencing/Data/BD7_Thermo_Pool52_HCD/Thermo_SRM_Pool_52_01_01_3xHCD-1h-R2-tryptic/msmsScans.txt",
        sep='\t')
count_spectra = expected.shape[0]
print("amount expected spectra:", count_spectra)
count_identified = expected.query("Identified == '+'").shape[0]
print("amount idenfied spectra:",count_identified )

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lukas/University/Bachelor_Thesis/Project/PeptideDeNovoSequencing/Data/BD7_Thermo_Pool52_HCD/Thermo_SRM_Pool_52_01_01_3xHCD-1h-R2-tryptic/msmsScans.txt'

In [7]:
algorithms = [('DeepNovo', 'deepnovo'), ('Novor', 'novor'), ('DirecTag', 'direcTag'), ('PEAKS', 'peaks')]
pools = ['Pool_49', 'Pool_52', 'Pool_60']
combinations = itertools.product(pools, algorithms)
for comb in combinations:
    pool = comb[0]
    result_file = comb[1][1]
    alg = comb[1][0]
    # read dataset
    data = pd.read_csv(f'../../Data/ScoringResults/{pool}/{result_file}_scored.tsv', sep='\t', index_col=0)

    # plot similarity distribution
    plt.figure()
    similarity_pt = sns.histplot(data['Similarity'], bins=100)
    similarity_pt.set_title(f'{" ".join(pool.split("_"))} | {alg} - Similarity Distribution')
    plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/hist_similarity.png')
    plt.close()

    # plot identity distribution
    plt.figure()
    identity_plt =sns.histplot(data['Identity'], bins=100, color='red')
    identity_plt.set_title(f'{" ".join(pool.split("_"))} | {alg} - Identity Distribution')
    plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/hist_identity.png')
    plt.close()

    # plot similarity vs identity
    plt.figure()
    sns.scatterplot(data=data, x='Similarity', y='Identity')
    plt.title(f'{alg} - Similarity vs Identity')
    plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/scatter_similarity_identity.png')
    plt.close()

    # plot levenstein distance distribution
    if 'Levenshtein' in data.columns:
        plt.figure()
        sns.histplot(data['Levenshtein'], bins=100, color='green')
        plt.title(f'{" ".join(pool.split("_"))} | {alg} - Levenshtein Distance Distribution')
        plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/hist_levenshtein.png')
        plt.close()

    # plot local alignment score distribution
    plt.figure()
    sns.histplot(data['Local Alignment'], bins=100, color='purple')
    plt.title(f'{" ".join(pool.split("_"))} | {alg} - Local Alignment Score Distribution')
    plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/hist_local_alignment.png')
    plt.close()

    # plot global alignment score distribution
    plt.figure()
    sns.histplot(data['Global Alignment'], bins=100, color='orange')
    plt.title(f'{" ".join(pool.split("_"))} | {alg} - Global Alignment Score Distribution')
    plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/hist_global_alignment.png')
    plt.close()

    # plot normalized local alignment score distribution
    plt.figure()
    sns.histplot(data['Normalized Local Alignment'], bins=100, color='brown')
    plt.title(f'{" ".join(pool.split("_"))} | {alg} - Normalized Local Alignment Score Distribution')
    plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/hist_norm_local_alignment.png')
    plt.close()
    # plot normalized global alignment score distribution
    plt.figure()
    sns.histplot(data['Normalized Global Alignment'], bins=100, color='pink')
    plt.title(f'{" ".join(pool.split("_"))} | {alg} - Normalized Global Alignment Score Distribution')
    plt.savefig(f'../../Data/Analysis/{pool}/{result_file}/hist_norm_global_alignment.png')
    plt.close()

    # print details
    print(f'Algorithm: {alg}', pool)
    print('100% similarity:', (data.query('Similarity == 1.0').shape[0]))
    print('100% identity:', (data.query('Identity == 1.0').shape[0]))
    if 'Levenshtein' in data.columns:
        print('0 levenshtein:', (data.query('Levenshtein == 0').shape[0]))
    print('total count:', data.shape[0], '\n')

Algorithm: DeepNovo Pool_49
100% similarity: 10896
100% identity: 3589
0 levenshtein: 3589
total count: 30256 

Algorithm: Novor Pool_49
100% similarity: 15629
100% identity: 8250
0 levenshtein: 8250
total count: 33877 

Algorithm: DirecTag Pool_49
100% similarity: 121002
100% identity: 60068
0 levenshtein: 0
total count: 251849 

Algorithm: PEAKS Pool_49
100% similarity: 15204
100% identity: 8156
0 levenshtein: 8156
total count: 28481 

Algorithm: DeepNovo Pool_52
100% similarity: 11688
100% identity: 2657
0 levenshtein: 2657
total count: 27962 

Algorithm: Novor Pool_52
100% similarity: 15252
100% identity: 8848
0 levenshtein: 8848
total count: 31591 

Algorithm: DirecTag Pool_52
100% similarity: 105899
100% identity: 46951
0 levenshtein: 0
total count: 230256 

Algorithm: PEAKS Pool_52
100% similarity: 16174
100% identity: 9555
0 levenshtein: 9555
total count: 30035 

Algorithm: DeepNovo Pool_60
100% similarity: 13072
100% identity: 4186
0 levenshtein: 4186
total count: 24707 

Algo

In [13]:
for p in pools:
    metrics = ['Local Alignment', 'Global Alignment', 'Normalized Local Alignment', 'Normalized Global Alignment']
    metrics_file = ['local_alignment', 'global_alignment', 'norm_local_alignment', 'norm_global_alignment']
    novor_df = pd.read_csv(f'../../Data/ScoringResults/{p}/novor_scored.tsv', sep='\t', index_col=0)
    deepnovo_df = pd.read_csv(f'../../Data/ScoringResults/{p}/deepnovo_scored.tsv', sep='\t', index_col=0)
    directTag_df = pd.read_csv(f'../../Data/ScoringResults/{p}/direcTag_scored.tsv', sep='\t', index_col=0)
    peaks_df = pd.read_csv(f'../../Data/ScoringResults/{p}/peaks_scored.tsv', sep='\t', index_col=0)
    for met, file_name in zip(metrics, metrics_file):
        plt.figure()
        plt.hist(novor_df[met], bins=100, color='blue', alpha=0.3, label='Novor', density=True)
        plt.hist(deepnovo_df[met], bins=100, color='red', alpha=0.3, label='DeepNovo', density=True)
        plt.hist(directTag_df[met], bins=100, color='green', alpha=0.3, label='DirectTag', density=True)
        plt.hist(peaks_df[met], bins=100, color='orange', alpha=0.3, label='PEAKS', density=True)
        plt.title(f'{" ".join(p.split("_"))} | {met} Density Distribution')
        plt.legend()
        plt.xlabel(f'{met} Score')
        plt.ylabel('Density')
        plt.savefig(f'../../Data/Analysis/{p}/hist_{file_name}.png')
        plt.close()