In [1]:
import sys

import pandas as pd

from os.path import join

sys.path.append(join("..", "..", ".."))

from rampt.helpers.types import StrPath
from rampt.steps.analysis.statistics import *
from rampt.helpers.openms import OpenMS_File_Handler

In [2]:
# Load the example summary table from tests/example_files (three directory
# levels up). The file is tab-separated and its first column becomes the index.
summary = pd.read_csv(
	join("..", "..", "..", "tests", "example_files", "summary.tsv"), sep="\t", index_col=0
)
# Display the first five rows as the cell output.
summary.head(5)

Unnamed: 0,ID,m/z,retention time,acnA_R1_P3-C1_pos.mzML Peak area,Sirius_formula,Sirius_formula_confidence,Sirius_formula_NPC_pathway,Sirius_formula_NPC_pathway_confidence,Sirius_formula_NPC_superclass,Sirius_formula_NPC_superclass_confidence,...,Sirius_structure_ClassyFire_level_5,Sirius_structure_ClassyFire_level_5_confidence,Sirius_structure_ClassyFire_most_specific_class,Sirius_structure_ClassyFire_most_specific_class_confidence,Sirius_denovo_structure_smiles,Sirius_denovo_structure_CSI:FingerIDScore,FBMN_compound_name,FBMN_MQ_score(cosine),FBMN_m/z_error(ppm),FBMN_shared_peaks
0,2,267.12273,0.258567,8440.623,C14H18O5,0.971,Polyketides,0.397,Styrylpyrones,0.313,...,Carboxylic acid esters,0.555,Carboxylic acid esters,0.555,CC=C(OC)C(C)OC(=O)C=CC1=CC(C)OC1=O,-137.273,GLUTATHIONE - 40.0 eV,0.951968,1.58486,15.0
1,5,387.181498,0.258567,103012.305,C22H26O6,0.231,Shikimates and Phenylpropanoids,0.873,Lignans,0.097,...,,,Benzene and substituted derivatives,0.961,CCOC1C(C(C2C(O1)COC(O2)C3=CC=CC=C3)OCC4=CC=CC=...,-124.133,Adenosine - 40.0 eV,0.919218,0.682964,7.0
2,8,404.207959,0.258567,84145.45,C22H26O6,0.515,Shikimates and Phenylpropanoids,0.846,Styrylpyrones,0.132,...,,,Benzene and substituted derivatives,0.918,OCC1OC2COCOC2C(OCc2ccccc2)C1OCc1ccccc1,-157.411,N-Acetyl-L-aspartic acid - 40.0 eV,0.862974,7.0139,8.0
3,14,489.227182,0.258567,5538.1426,C30H32O6,0.0,Shikimates and Phenylpropanoids,0.952,Coumarins,0.311,...,,,Anisoles,0.554,COc1ccc(C(=O)COc2cc(OC)c(C(=O)c3ccc(C(C)C)cc3)...,-339.451,,,,
4,16,506.25372,0.258567,4641.331,C25H35N3O8,0.241,Carbohydrates,0.282,Anthranilic acid alkaloids,0.17,...,,,Pyranones and derivatives,0.572,CC(=O)OCC1OC2COC(c3ccccc3)OC2C(OCc2ccccc2)C1c1...,-237.828,AEG(o-16:3/16:0),0.819039,0.555382,14.0


In [3]:
# Instantiate the OpenMS file handler with its default arguments.
# NOTE(review): this object is not used by any cell visible in this notebook — confirm it is still needed.
oms_file_handler = OpenMS_File_Handler()

In [4]:
def search_check_peak_info(
	summary: pd.DataFrame,
	keywords_peaks: list[str] = ["peak area", "peak height"],
	keywords_pos: list[str] = ["pos", "+"],
	keywords_neg: list[str] = ["neg", "-"],
) -> dict:
	"""
	Sort the numeric peak columns of a summary table by ionisation mode.

	A column is considered only if its name contains one of ``keywords_peaks``
	(case-insensitive substring match) and its dtype name contains "float" or
	"int". Such a column is filed under "positive" when its name contains one
	of ``keywords_pos``; otherwise under "negative" when it contains one of
	``keywords_neg``. Columns matching neither mode are left out.

	:param summary: Table whose column names are inspected.
	:param keywords_peaks: Substrings that mark a column as holding peak information.
	:param keywords_pos: Substrings that mark a column as positive mode.
	:param keywords_neg: Substrings that mark a column as negative mode.
	:return: Dictionary with the keys "positive" and "negative", each mapping to
		a list of matching column names in column order.
	"""
	positive_columns = []
	negative_columns = []

	def name_contains_any(name, keywords) -> bool:
		# Case-insensitive substring test of every keyword against the column name.
		return any([keyword.lower() in name.lower() for keyword in keywords])

	for column_name in summary.columns:
		# Skip columns that do not carry peak information at all.
		if not name_contains_any(column_name, keywords_peaks):
			continue

		# Only numeric (float/int) columns qualify.
		dtype_name = summary[column_name].dtype.name
		if "float" not in dtype_name and "int" not in dtype_name:
			continue

		is_positive = name_contains_any(column_name, keywords_pos)
		is_negative = name_contains_any(column_name, keywords_neg)

		# Positive keywords take precedence when both kinds match.
		if is_positive:
			positive_columns.append(column_name)
		elif is_negative:
			negative_columns.append(column_name)

	return {"positive": positive_columns, "negative": negative_columns}


# Collect the peak columns of the example summary, grouped under the keys
# "positive" and "negative" (default keywords).
peak_columns = search_check_peak_info(summary=summary)

In [5]:
def z_score(summary: pd.DataFrame, peak_columns_mode: list) -> pd.DataFrame:
	"""
	Compute z-scores across the given peak columns for every row.

	Each row is standardised over the selected columns (``axis=1``), i.e. a
	value is compared to the other samples' values of the same row.

	NOTE(review): ``warn`` and ``stats`` are not imported explicitly in this
	notebook; presumably they arrive through the star import of
	``rampt.steps.analysis.statistics`` and ``stats`` is ``scipy.stats`` — confirm.

	:param summary: Table that contains the peak columns.
	:param peak_columns_mode: Names of the peak columns of one ionisation mode.
	:return: DataFrame with the same index and the selected columns. It holds the
		z-scores, or the unchanged values if fewer than two columns were given.
	"""
	if len(peak_columns_mode) < 2:
		warn(
			"Data must contain at least 2 columns with peak information to calculate z-scores between samples. Returning unchanged."
		)
		return summary[peak_columns_mode]

	peaks = summary[peak_columns_mode]
	scores = stats.zscore(peaks, axis=1)
	if isinstance(scores, pd.DataFrame):
		return scores
	# The z-score routine may hand back a bare array; restore the row and column
	# labels so the result matches the declared return type and the early-return branch.
	return pd.DataFrame(scores, index=peaks.index, columns=peaks.columns)


# Compute the z-scores separately for the positive-mode and the negative-mode
# peak columns found above. z_score warns and returns the selection unchanged
# when a mode has fewer than two columns.
analysis_positive = z_score(summary, peak_columns["positive"])
analysis_negative = z_score(summary, peak_columns["negative"])



In [6]:
def complete_analysis(summary: pd.DataFrame, analysis: pd.DataFrame):
	"""
	Run the peak-column search and the z-score analysis on a summary table.

	The peak columns are located with ``search_check_peak_info`` and the
	z-scores are computed separately for each ionisation mode, because
	``z_score`` expects a list of column names rather than the whole
	mode-to-columns dictionary. The per-mode results are joined column-wise.

	:param summary: Table that contains the peak columns.
	:param analysis: Currently ignored; kept so existing callers keep working.
	:return: DataFrame indexed like ``summary`` holding the result of ``z_score``
		for the positive-mode columns followed by the negative-mode columns.
	"""
	peak_columns = search_check_peak_info(summary=summary)
	scored_modes = []
	for columns_mode in peak_columns.values():
		scores = z_score(summary, columns_mode)
		# z_score may return a bare array; restore labels so the modes can be joined.
		if not isinstance(scores, pd.DataFrame):
			scores = pd.DataFrame(scores, index=summary.index, columns=columns_mode)
		scored_modes.append(scores)
	return pd.concat(scored_modes, axis=1)


def export_results(summary: pd.DataFrame, peak_columns: list, out_path: StrPath):
	"""
	Write the peak columns and the full summary as tab-separated files.

	Two files are created inside ``out_path``: "analysis.tsv" with only the
	columns named in ``peak_columns`` and "analysis_full.tsv" with every column.

	:param summary: Table to export.
	:param peak_columns: Names of the columns written to "analysis.tsv".
	:param out_path: Directory that receives both files.
	"""
	exports = {"analysis.tsv": summary[peak_columns], "analysis_full.tsv": summary}
	for file_name, table in exports.items():
		table.to_csv(join(out_path, file_name), sep="\t")