In [22]:
import pandas as pd
import seaborn as sns
import numpy as np

from scipy import stats

import plotly.express as px
import plotly.graph_objects as go

In [2]:
quant_df = pd.read_csv("../../data/processed/lc-ms_sc_1_iimn_gnps_quant.csv")
quantifications_df = quant_df[[col for col in quant_df.columns if "Peak area" in col]]

In [62]:
fig = go.Figure(
                data=go.Heatmap(x=quantifications_df.columns,
                                y=quantifications_df.index,
                                z=quantifications_df,
                                colorscale="Inferno"),
                layout={"height": 800})
fig.show()

## Accumulation in strains

In [4]:
quantifications_xblank = quantifications_df[[col for col in quantifications_df.columns if "blank" not in col.lower()]]

In [52]:
summed_signals = pd.DataFrame({"Signals found": np.sum(quantifications_xblank > 0.0, axis=0)})
fig = px.histogram(summed_signals, x="Signals found", nbins=20)
fig.update_layout( yaxis_title="Number of strains (#)" )
fig.show()

In [53]:
summed_signals = pd.DataFrame({"Signal intensity": quantifications_xblank.sum()})
fig = px.histogram(summed_signals, x="Signal intensity", nbins=20)
fig.update_layout( yaxis_title="Number of strains (#)" )
fig.show()

## Scoring

In [7]:
zscores = stats.zscore(quantifications_df, axis=1)

In [67]:
fig = go.Figure(data=go.Heatmap(x=quantifications_df.columns,
                                y=quantifications_df.index,
                                z=zscores,
                                zmin=-15.0,
                                zmax=15,
                                colorscale="Tropic"),
                layout={"height": 800})
fig.show()

In [12]:
def plot_cutoff_accumulation(zscores:pd.DataFrame, cutoff_range:tuple, axis:int=0, sample_marker=None):
    score_cutoffs = pd.DataFrame({i: np.sum((zscores > i) | (zscores < -i), axis=axis) for i in range(*cutoff_range)})
    names = score_cutoffs.index
    if sample_marker:
        marked_samples = [sample_marker in name for name in names] 
        score_cutoffs[sample_marker] = marked_samples
        fig = px.strip(score_cutoffs, color=sample_marker, hover_name=names)
    else:
        fig = px.strip(score_cutoffs, hover_name=names)
    fig.update_layout( xaxis_title="z-score cutoff", yaxis_title="Strains per metabolites" if axis else "Metabolites per strain",
                       hovermode="x", )
    #fig.data[0].update(span = (0, None), spanmode='manual')
    fig.show()

In [13]:
plot_cutoff_accumulation(zscores=zscores, cutoff_range=(3,10), axis=1)

In [14]:
plot_cutoff_accumulation(zscores=zscores, cutoff_range=(4,10), axis=0, sample_marker="blank")

## Merge annotations

In [15]:
structures_df = pd.read_csv("../../project_batches/sirius/summaries/structure_identifications.tsv", sep="\t")

In [16]:
annot_df = pd.concat([quant_df["row ID"], zscores], axis=1, )

In [17]:
def read_filter_sirius(file_path, filter_column, threshold_value:float) -> pd.DataFrame:
    df = pd.read_csv( file_path, sep="\t")
    df.replace("-Infinity", -np.inf, inplace=True)
    df[filter_column] = df[filter_column].replace(",", ".", regex=True).astype(float)
    solid_df = df.loc[df[filter_column] >= threshold_value]
    return solid_df.sort_values(filter_column, ascending=False)

In [18]:
solid_sirius_formulas = read_filter_sirius("../../project_batches/sirius/summaries/formula_identifications.tsv", "ZodiacScore", 0.5)
solid_sirius_formulas

Unnamed: 0,formulaRank,molecularFormula,adduct,precursorFormula,ZodiacScore,SiriusScore,TreeScore,IsotopeScore,numExplainedPeaks,explainedIntensity,...,medianAbsoluteMassErrorFragmentPeaks(ppm),massErrorPrecursor(ppm),lipidClass,ionMass,retentionTimeInSeconds,retentionTimeInMinutes,formulaId,alignedFeatureId,mappingFeatureId,overallFeatureQuality
318,1,C17H22O2,[M + H3N + H]+,C17H26NO2+,1.000,141567,138148,3419,37,0599,...,4439,-0471,,276196,120,1992,633592464038598105,633591814802215644,9980,
0,1,C7H10N2O,[M + H3N + H]+,C7H14N3O+,1.000,65864,65864,0000,26,0829,...,9135,-17386,,156110,3,0045,633592390407567197,633591789191793566,85,
294,1,C8H8O4,[M + H3N + H]+,C8H12NO4+,1.000,172340,170074,2267,28,0676,...,6862,-0003,,186076,88,1461,633592390399178521,633591813489398313,7905,
292,1,C2H4ClFN2O2S,[M - H]-,C2H3ClFN2O2S-,1.000,48196,48196,0000,11,0494,...,5650,0649,,172959,85,1421,633592390281737826,633591814399562390,8673,
289,1,C5H6O4,[M + H3N + H]+,C5H10NO4+,1.000,72754,72754,0000,17,0762,...,6378,0076,,148060,83,1381,633592363530458132,633591813975937631,8073,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,1,C32H62N4O10,[M + H3N + H]+,C32H66N5O10+,0.517,72103,64722,7381,34,0823,...,2958,-0639,,680480,14,0225,633592536625229444,633591793704864855,1870,
107,1,H2O3P,[M + Na]+,H2NaO3P+,0.512,-5215,-5215,0000,0,0000,...,,-9198,,103962,28,0472,633592343829801032,633591805046263539,4549,
256,1,C4H6O3,[M - H]-,C4H5O3-,0.506,4500,4500,0000,0,0000,...,,8339,,101025,75,1245,633592359793327303,633591808368152748,6504,
15,1,C40H60O7,[M + H3N + H]+,C40H64NO7+,0.505,85401,76112,9289,34,0793,...,6171,-0266,,670468,14,0228,633592487920967984,633591790454279180,920,


In [19]:
solid_sirius_structures = read_filter_sirius("../../project_batches/sirius/summaries/structure_identifications.tsv", "ConfidenceScoreApproximate", 0.5)
solid_sirius_structures

Unnamed: 0,structurePerIdRank,formulaRank,ConfidenceScoreExact,ConfidenceScoreApproximate,CSI:FingerIDScore,ZodiacScore,SiriusScore,molecularFormula,adduct,precursorFormula,...,pubchemids,links,dbflags,ionMass,retentionTimeInSeconds,retentionTimeInMinutes,formulaId,alignedFeatureId,mappingFeatureId,overallFeatureQuality
141,1,1,990,0.99,-6895,1000,60344,C9H19NO4,[M - H]-,C9H18NO4-,...,154731314;163322579;5748487;153710400;54315096...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0004231 ...,140382571374,204126,34,560,633592440412068526,633591804387757716,4099,
196,1,1,936,0.986,-7672,1000,53832,C6H9NO5,[M - H]-,C6H8NO5-,...,4217221;6941837;163321899;51037023;51038524;77...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000812)...,138906175854,174042,72,1208,633592390495647718,633591810209452374,7080,
96,1,1,878,0.976,-15395,1000,76945,C6H9NO5,[M - H]-,C6H8NO5-,...,4217221;6941837;163321899;51037023;51038524;77...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000812)...,138906175854,174042,30,500,633592390520813690,633591802609372699,3699,
225,1,1,930,0.968,-8759,946,103022,C10H17N3O6S,[M - H]-,C10H16N3O6S-,...,42615251;58833436;9839547;20756463;40467184;87...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000125 ...,139577465214,306079,80,1330,633592476902530524,633591812126249463,7579,
230,1,1,722,0.879,-37394,1000,164031,C5H9NO4,[M - H]-,C5H8NO4-,...,42615584;101073736;101129201;101129202;2157512...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000148 ...,139577465726,146047,82,1370,633592357792642500,633591808938578152,6768,
146,1,1,873,0.873,-27361,693,38514,C9H17NO5,[M - H]-,C9H16NO5-,...,101257666;6997253;163322206;5191579;163581050;...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000210 ...,139577465726,218105,35,587,633592403799982124,633591804639415981,4181,
200,1,1,863,0.863,-15052,986,56451,C5H6N2O4,[M - H]-,C5H5N2O4-,...,136898439;6971032;5460289;5461056;53627468;252...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000528 ...,139443247486,157027,73,1212,633592390508230666,633591809911656765,7046,
236,1,2,387,0.857,-4535,12,42148,C4H6O5,[M - H]-,C4H5O5-,...,17754080;101429368;101429369;157366046;1573660...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000156 ...,139577461630,133015,83,1386,633592354504306084,633591812252078588,7673,
94,1,1,813,0.813,-425904,1000,50474,C37H66N6O6,[M - H]-,C37H65N6O6-,...,122220106,PUBCHEM:(122220106),2,689500,30,497,633592786920336211,633591796540214505,2535,
187,1,2,807,0.807,-25826,15,94566,C7H15NO3,[M + H]+,C7H16NO3+,...,118524994;118524995;6914103;6914104;87285198;1...,PUBCHEMANNOTATIONBIO:(null);HMDB:(HMDB0000062 ...,139575364462,162112,69,1145,633592377472326382,633591807206330389,5437,


In [20]:
solid_sirius_denovo_structures = read_filter_sirius("../../project_batches/sirius/summaries/denovo_structure_identifications.tsv", "CSI:FingerIDScore", -1000)
solid_sirius_denovo_structures

# INCLUDE DE NOVO IF CSI:FingerIDScore De Novo < CSI:FingerIDScore Structures

Unnamed: 0,structurePerIdRank,formulaRank,CSI:FingerIDScore,ModelScore,ZodiacScore,SiriusScore,molecularFormula,adduct,precursorFormula,InChIkey2D,InChI,name,smiles,ionMass,retentionTimeInSeconds,retentionTimeInMinutes,formulaId,alignedFeatureId,mappingFeatureId,overallFeatureQuality
60,1,1,-3.739,-1913,1000,125130,C37H72NO8P,[M - H]-,C37H71NO8P-,CZOSTDZGCCEZTJ,InChI=1S/C37H72NO8P/c1-3-5-7-9-11-13-15-17-19-...,(2-aminoethoxy)[2-(hexadec-9-enoyloxy)-3-(hexa...,CCCCCCCCCCCCCCCC(=O)OCC(COP(=O)(O)OCCN)OC(=O)C...,688496,18,0302,633592753785332260,633591797379075380,2802,
25,1,3,-4.027,-0153,0027,13888,C4HF9O3S,[M - H]-,C4F9O3S-,JGTNAGYHADQMCM,"InChI=1S/C4HF9O3S/c5-1(6,3(9,10)11)2(7,8)4(12,...",,C(C(C(F)(F)S(=O)(=O)O)(F)F)(C(F)(F)F)(F)F,298945,15,0252,633592471043087220,633591792849226799,1536,
32,1,1,-4.237,-0742,1000,7030431,C42H79O10P,[M - H]-,C42H78O10P-,DSNRWDQKZIEDDB,InChI=1S/C42H79O10P/c1-3-5-7-9-11-13-15-17-19-...,"1,2-Di-(9E-Octadecenoyl)-Sn-Glycero-3-Phospho-...",CCCCCCCCC=CCCCCCCCC(=O)OCC(COP(=O)(O)OCC(CO)O)...,773538,15,0254,633593053522894358,633591797597179203,2856,
245,1,2,-4.535,-0155,0012,42148,C4H6O5,[M - H]-,C4H5O5-,BJEPYKJPYRNKOW,"InChI=1S/C4H6O5/c5-2(4(8)9)1-3(6)7/h2,5H,1H2,(...",Malate,C(C(C(=O)O)O)C(=O)O,133015,83,1386,633592354504306084,633591812252078588,7673,
100,1,1,-5.267,-4414,1000,74517,C39H74NO8P,[M - H]-,C39H73NO8P-,UIELPOKGTOHFNL,InChI=1S/C39H74NO8P/c1-3-5-7-9-11-13-15-17-18-...,(2-aminoethoxy)[2-(hexadec-9-enoyloxy)-3-(octa...,CCCCCCCCC=CCCCCCCCC(=O)OCC(COP(=O)(O)OCCN)OC(=...,714512,30,0498,633592775587325707,633591802961694255,3770,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,1,2,-370.673,-14659,0034,84105,C39H59NO13,[M + H]+,C39H60NO13+,FASQQSYDIVROID,InChI=1S/C39H59NO13/c1-17-10-27(51-36(49)24(17...,,CC1=C(COC2OC(CO)C(O)C(O)C2O)C(=O)OC(C(C)C(O)C(...,750407,17,0280,633593289393775745,633591793931357296,2036,
38,1,1,-428.159,-14779,1000,63299,C43H67N3O6,[M - H]-,C43H66N3O6-,BWWQFRYTRDMDLG,InChI=1S/C43H67N3O6/c1-4-6-8-10-12-14-15-16-17...,,CCCCCCCCCCCCCCCC(=O)NC1C(=O)C(OC(=O)C2=Nc3cccc...,720495,15,0258,633593299418162597,633591800763878874,3498,
77,1,1,-479.432,-11898,0000,41198,C26H36N2O6,[M + H]+,C26H37N2O6+,HTUDYBHLJVZOOG,InChI=1S/C26H36N2O6/c1-18(2)9-8-11-20(4)15-19(...,,COc1ccc(OC(=O)O)c(NC(=O)CNC(=O)C=CC=CC(C)=CC(C...,473264,22,0368,633592555336020123,633591796364053722,2510,
17,1,1,-485.463,-13697,0994,148905,C25H32N2O8,[M + H]+,C25H33N2O8+,BWNNXKLVEMGRJO,InChI=1S/C25H32N2O8/c1-13(2)11-16(28)20-18(24(...,,COC(=O)C1=C(C(O)CC(C)C)NC(=O)C(C(=O)OCc2ccccc2...,489227,14,0233,633592499778266576,633591800457694647,3257,


In [21]:
solid_sirius_denovo_structures = read_filter_sirius("../../project_batches/sirius/summaries/canopus_structure_summary.tsv", "CSI:FingerIDScore", -1000)
solid_sirius_denovo_structures

# Strctures:
"""
NPC#pathway	NPC#pathway Probability	NPC#superclass	NPC#superclass Probability	NPC#class	NPC#class Probability
ClassyFire#superclass	ClassyFire#superclass probability	ClassyFire#class	ClassyFire#class Probability	ClassyFire#subclass	ClassyFire#subclass Probability
ClassyFire#level 5	ClassyFire#level 5 Probability	ClassyFire#most specific class	ClassyFire#most specific class Probability	ClassyFire#all classifications	
ionMass	retentionTimeInSeconds	retentionTimeInMinutes	formulaId	alignedFeatureId	mappingFeatureId	overallFeatureQuality
"""
# 

KeyError: 'CSI:FingerIDScore'