In [60]:
import os
import re

import numpy as np
import pandas as pd
import plotly.express as px

import domaps

In [5]:
# Root folder that the relative result directories in `dirs` are appended to.
# The default is the original drive location; set the DIA_MAPS_BASEDIR
# environment variable to run the notebook against a copy stored elsewhere
# without editing this cell.
basedir = os.environ.get(
    "DIA_MAPS_BASEDIR", r"H:\Proteomics Data\Ongoing Projects\VA1 - DIA maps"
)

In [14]:
# Result directory of each run, keyed by the label used in all summary tables.
# Every value is appended verbatim to `basedir`, so it must start with a single
# backslash and must not end with one.
dirs = {
    "equal windows": r"\VA1.2 - DIA nano EXPL\EXPL1_nano_100min_DIA_120K_15K_1-875s\MQ\combined_directDIA\txt",
    "variable windows": r"\VA1.2 - DIA nano EXPL\EXPL3_nano_100min_DIA_MQlive\MQ\combined_directDIA\txt",
    # NOTE(review): this entry uses `directDIA\combined` whereas every other one
    # uses `combined_directDIA` - confirm it matches the folder on disk.
    "44 1.2": r"\VA1.4 - DIA Evo Expl\single shot\44min\DIA_120K_15K_1-2s\MQ\directDIA\combined\txt",
    "44 1.5": r"\VA1.4 - DIA Evo Expl\single shot\44min\DIA_120K_15K_1-5s\MQ\combined_directDIA\txt",
    # Fixed: the raw string previously contained a doubled backslash after `MQ`.
    "44 2.0": r"\VA1.4 - DIA Evo Expl\single shot\44min\DIA_120K_15K_2s\MQ\combined_directDIA\txt",
    "21 1.2": r"\VA1.4 - DIA Evo Expl\single shot\21min\Evosep_21min_DIA_120K_15K_1-2s\MQ\combined_directDIA\txt",
    "21 1.5": r"\VA1.4 - DIA Evo Expl\single shot\21min\Evosep_21min_DIA_120K_15K_1-5s\MQ\combined_directDIA\txt",
    "21 2.0": r"\VA1.4 - DIA Evo Expl\single shot\21min\Evosep_21min_DIA_120K_15K_2s\MQ\combined_directDIA\txt",
}

In [77]:
# Load the evidence table of every run, keeping only the columns listed here.
evidence_columns = [
    "Precursor num scans",
    "Fragment median num scans",
    "Leading razor protein",
    "id",
    "Experiment",
]

data = dict()
for label in dirs:
    print(f"Reading {label}")
    evidence_path = basedir + dirs[label] + r"\evidence.txt"
    data[label] = pd.read_csv(evidence_path, sep="\t", usecols=evidence_columns)

Reading equal windows
Reading variable windows
Reading 44 1.2
Reading 44 1.5
Reading 44 2.0
Reading 21 1.2
Reading 21 1.5
Reading 21 2.0


In [78]:
# Mean of "Precursor num scans" per run, taken over the evidence rows whose
# value is greater than 0 (rows with 0 or a missing value are excluded).
# The column is built from a dict in one go so that it is a float64 column;
# filling a pre-allocated empty frame cell by cell leaves it as object dtype.
ppp = pd.Series(
    {
        k: df.loc[df["Precursor num scans"] > 0, "Precursor num scans"].mean()
        for k, df in data.items()
    },
    name="points per peak",
    dtype="float64",
).to_frame()
ppp.index.name = "Experiment"
ppp

Unnamed: 0_level_0,points per peak
Experiment,Unnamed: 1_level_1
equal windows,10.382963
variable windows,7.972049
44 1.2,12.790485
44 1.5,9.230067
44 2.0,7.94356
21 1.2,8.620195
21 1.5,6.912417
21 2.0,5.656492


In [82]:
# Mean of "Fragment median num scans" per run, taken over the evidence rows
# whose value is greater than 0 (rows with 0 or a missing value are excluded).
# The column is built from a dict in one go so that it is a float64 column;
# filling a pre-allocated empty frame cell by cell leaves it as object dtype.
ppp2 = pd.Series(
    {
        k: df.loc[
            df["Fragment median num scans"] > 0, "Fragment median num scans"
        ].mean()
        for k, df in data.items()
    },
    name="points per peak MS2",
    dtype="float64",
).to_frame()
ppp2.index.name = "Experiment"
ppp2

Unnamed: 0_level_0,points per peak MS2
Experiment,Unnamed: 1_level_1
equal windows,7.992802
variable windows,6.924264
44 1.2,8.869119
44 1.5,7.057978
44 2.0,6.22108
21 1.2,6.267233
21 1.5,5.451366
21 2.0,4.777741


In [70]:
# Import settings shared by every run. They do not depend on the run being
# read, so they (and the column-selection regex derived from them) are built
# once instead of on every loop iteration.
# NOTE(review): assumes domaps.generate_usecols_regex does not modify
# `settings` - confirm against the domaps implementation.
settings = dict(
    column_filters={
        "Potential contaminant": ["!=", "'+'"],
        "Reverse": ["!=", "'+'"],
    },
    sets={"Intensity": "Intensity .*"},
    original_protein_ids="Proteins",
    genes="Gene names",
    name_pattern=".* (?P<rep>.*)_(?P<frac>.*)",
)
regex = domaps.generate_usecols_regex(settings)
usecols_pattern = re.compile(regex)

# Read peptides.txt of every run, pivot it with domaps and apply the
# single-column filters on "Reverse" and "Potential contaminant".
peptides = {}
for k, d in dirs.items():
    print(f"Reading {k}")
    df = domaps.format_data_pivot(
        pd.read_csv(
            basedir + d + r"\peptides.txt",
            sep="\t",
            # Only load the columns whose name matches the generated regex.
            usecols=lambda x: bool(usecols_pattern.match(x)),
            # Infer each column's dtype from the whole file rather than per
            # chunk; this avoids the "Columns (...) have mixed types"
            # DtypeWarning that this cell emitted before.
            low_memory=False,
        ),
        sets=settings["sets"],
        original_protein_ids=settings["original_protein_ids"],
        genes=settings["genes"],
        name_pattern=settings["name_pattern"],
        index_cols=["Potential contaminant", "Reverse"],
    )
    # presumably removes decoy and contaminant entries; verify against domaps
    df = domaps.filter_singlecolumn_keep(df, column="Reverse")
    peptides[k] = domaps.filter_singlecolumn_keep(df, column="Potential contaminant")

Reading equal windows
Reading variable windows
Reading 44 1.2
Reading 44 1.5
Reading 44 2.0



Columns (110) have mixed types.Specify dtype option on import or set low_memory=False.



Reading 21 1.2
Reading 21 1.5
Reading 21 2.0


In [89]:
# Number of rows per run that have at least one value which is neither 0 nor
# missing: zeros are turned into NaN first, then rows that are NaN in every
# column are dropped and the remaining rows are counted.
# The column is built from a dict in one go so that it is an int64 column;
# filling a pre-allocated empty frame cell by cell leaves it as object dtype.
depth = pd.Series(
    {
        k: len(df.replace({0: np.nan}).dropna(how="all"))
        for k, df in peptides.items()
    },
    name="peptide depth",
    dtype="int64",
).to_frame()
depth.index.name = "Experiment"
depth

Unnamed: 0_level_0,peptide depth
Experiment,Unnamed: 1_level_1
equal windows,71213
variable windows,99176
44 1.2,42065
44 1.5,53606
44 2.0,51301
21 1.2,32877
21 1.5,36741
21 2.0,35841


In [76]:
# Per run: the number of rows left after stacking the "Map" and "Fraction"
# column levels and dropping NaN, divided by the total number of cells
# (rows x columns) of the table.
# NOTE(review): unlike the peptide-depth cell, zeros are not converted to NaN
# here - confirm whether the pivoted tables can still contain zeros.
# The column is built from a dict in one go so that it is a float64 column;
# filling a pre-allocated empty frame cell by cell leaves it as object dtype.
coverage = pd.Series(
    {
        k: (
            df.stack(["Map", "Fraction"]).dropna().shape[0]
            / df.shape[0]
            / df.shape[1]
        )
        for k, df in peptides.items()
    },
    name="peptide coverage",
    dtype="float64",
).to_frame()
coverage.index.name = "Experiment"
coverage

Unnamed: 0_level_0,peptide coverage
Experiment,Unnamed: 1_level_1
equal windows,0.469921
variable windows,0.448515
44 1.2,0.364988
44 1.5,0.413831
44 2.0,0.379122
21 1.2,0.402137
21 1.5,0.392165
21 2.0,0.375929


In [90]:
# Put the four per-run metric tables side by side (one row per run) and write
# the combined table to MSQC.csv in the working directory.
msqc = pd.concat([ppp, ppp2, depth, coverage], axis="columns")
msqc.to_csv("MSQC.csv")