More linting ii #175

Merged · 9 commits · Jul 17, 2024
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -12,5 +12,7 @@ repos:
     hooks:
       - id: ruff-format
       - id: ruff
+        args:
+          - "--fix"
 
 exclude: .bumpversion.cfg
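With `args: ["--fix"]`, the ruff hook now applies safe autofixes (the `dict.get`, ternary, and `.keys()` rewrites below) instead of only reporting them. A rough command-line equivalent — assuming `ruff` is on PATH, with a hypothetical target path:

```python
import subprocess

# Report-only run: lists violations, exits non-zero if any remain.
subprocess.run(["ruff", "check", "alphabase/"])

# What the updated hook effectively does: apply safe fixes in place,
# then report whatever could not be fixed automatically.
subprocess.run(["ruff", "check", "--fix", "alphabase/"])
```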
2 changes: 1 addition & 1 deletion alphabase/constants/aa.py
@@ -69,7 +69,7 @@ def reset_AA_df():
 def reset_AA_Composition():
     global AA_Composition
     AA_Composition = {}
-    for aa, formula, mass in AA_DF.values:
+    for aa, formula, _mass in AA_DF.values:
         AA_Composition[aa] = dict(parse_formula(formula))
     return AA_Composition
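Renaming the unused tuple element to `_mass` satisfies ruff's unused-loop-variable rule (B007) without changing behavior. A self-contained sketch of the pattern, with made-up rows standing in for `AA_DF.values`:

```python
# Hypothetical (aa, formula, mass) rows; not the real AA_DF contents.
rows = [("A", "C(3)H(5)N(1)O(1)", 71.03711), ("G", "C(2)H(3)N(1)O(1)", 57.02146)]

composition = {}
for aa, formula, _mass in rows:  # leading underscore marks the value as unused
    composition[aa] = formula
```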
5 changes: 1 addition & 4 deletions alphabase/constants/modification.py
@@ -276,10 +276,7 @@ def calc_modloss_mass_with_importance(
     mod_losses = np.zeros(nAA + 2)
     mod_losses[mod_sites] = [MOD_LOSS_MASS[mod] for mod in mod_names]
     _loss_importance = np.zeros(nAA + 2)
-    _loss_importance[mod_sites] = [
-        MOD_LOSS_IMPORTANCE[mod] if mod in MOD_LOSS_IMPORTANCE else 0
-        for mod in mod_names
-    ]
+    _loss_importance[mod_sites] = [MOD_LOSS_IMPORTANCE.get(mod, 0) for mod in mod_names]
 
     # Will not consider the modloss if the corresponding modloss_importance is 0
     mod_losses[_loss_importance == 0] = 0
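`dict.get(key, default)` replaces the membership-test conditional (a flake8-simplify rewrite, SIM401) and does one lookup instead of two. With illustrative values rather than the real `MOD_LOSS_IMPORTANCE` table:

```python
importance = {"Phospho@S": 1.0}  # hypothetical subset of MOD_LOSS_IMPORTANCE
mods = ["Phospho@S", "Oxidation@M"]

# Before: importance[mod] if mod in importance else 0 -- two dict lookups.
# After: a single lookup with a default.
values = [importance.get(mod, 0) for mod in mods]
assert values == [1.0, 0]
```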
21 changes: 10 additions & 11 deletions alphabase/io/hdf.py
@@ -199,14 +199,16 @@ def set_truncate(self, truncate: bool = True):
     def __setattr__(self, name, value):
         try:
             super().__setattr__(name, value)
-        except NotImplementedError:
+        except NotImplementedError as e:
             if not self.truncate:
                 if name in self.group_names:
-                    raise KeyError(f"Group name '{name}' cannot be truncated")
+                    raise KeyError(f"Group name '{name}' cannot be truncated") from e
                 elif name in self.dataset_names:
-                    raise KeyError(f"Dataset name '{name}' cannot be truncated")
+                    raise KeyError(f"Dataset name '{name}' cannot be truncated") from e
                 elif name in self.dataframe_names:
-                    raise KeyError(f"Dataframe name '{name}' cannot be truncated")
+                    raise KeyError(
+                        f"Dataframe name '{name}' cannot be truncated"
+                    ) from e
             if isinstance(value, (np.ndarray, pd.core.series.Series)):
                 self.add_dataset(name, value)
             elif isinstance(value, (dict, pd.DataFrame)):
@@ -217,7 +219,7 @@ def __setattr__(self, name, value):
                 "Only (str, bool, int, float, np.ndarray, "
                 "pd.core.series.Series, dict pd.DataFrame) types are "
                 "accepted.",
-            )
+            ) from e
 
     def add_dataset(
         self,
@@ -252,12 +254,12 @@ def add_dataset(
                 # chunks=array.shape,
                 maxshape=tuple([None for i in array.shape]),
             )
-        except TypeError:
+        except TypeError as e:
             raise NotImplementedError(
                 f"Type {array.dtype} is not understood. "
                 "If this is a string format, try to cast it to "
                 "np.dtype('O') as possible solution."
-            )
+            ) from e
         dataset = HDF_Dataset(
             file_name=self.file_name,
             name=f"{self.name}/{name}",
@@ -496,10 +498,7 @@ def __init__(
         >>> hdf_file.dfs.df1.data_from
         "colleagues"
         """
-        if delete_existing:
-            mode = "w"
-        else:
-            mode = "a"
+        mode = "w" if delete_existing else "a"
         with h5py.File(file_name, mode):  # , swmr=True):
             pass
         super().__init__(
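Two idioms recur in this file: `raise ... from e` (bugbear B904) preserves the original exception as `__cause__` instead of masking it, and the one-line ternary replaces a four-line assignment (SIM108). Standalone sketches of both, not alphabase API:

```python
def open_mode(delete_existing: bool) -> str:
    # SIM108: conditional assignment as a single expression.
    return "w" if delete_existing else "a"


def as_int(value):
    try:
        return int(value)
    except ValueError as e:
        # B904: chaining keeps the ValueError's traceback attached as
        # __cause__, so debugging shows both errors, not just the KeyError.
        raise KeyError(f"cannot use {value!r} as an integer key") from e
```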
9 changes: 4 additions & 5 deletions alphabase/io/tempmmap.py
@@ -78,11 +78,10 @@ def _get_file_location(abs_file_path: str, overwrite=False) -> str:
         The file path if it is valid.
     """
     # check overwrite status and existence of file
-    if not overwrite:
-        if os.path.exists(abs_file_path):
-            raise ValueError(
-                "The file already exists. Set overwrite to True to overwrite the file or choose a different name."
-            )
+    if not overwrite and os.path.exists(abs_file_path):
+        raise ValueError(
+            "The file already exists. Set overwrite to True to overwrite the file or choose a different name."
+        )
 
     # ensure that the filename conforms to the naming convention
     if not os.path.basename.endswith(".hdf"):
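Collapsing the nested `if` into a single `and` condition (SIM102) removes one indentation level; short-circuiting keeps behavior identical, since `os.path.exists` only runs when `not overwrite` already holds. A standalone sketch:

```python
import os


def check_target(path: str, overwrite: bool = False) -> str:
    # One guard clause; the existence check is skipped when overwrite=True.
    if not overwrite and os.path.exists(path):
        raise ValueError("The file already exists. Set overwrite=True.")
    return path
```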
47 changes: 22 additions & 25 deletions alphabase/peptide/fragment.py
@@ -1047,25 +1047,24 @@ def create_fragment_mz_dataframe(
     pd.DataFrame
         `fragment_mz_df` with given `charged_frag_types`
     """
-    if reference_fragment_df is None:
-        if "frag_start_idx" in precursor_df.columns:
-            # raise ValueError(
-            #     "`precursor_df` contains 'frag_start_idx' column, "\
-            #     "please provide `reference_fragment_df` argument"
-            # )
-            fragment_mz_df = init_fragment_by_precursor_dataframe(
-                precursor_df,
-                charged_frag_types,
-                dtype=dtype,
-            )
-            return create_fragment_mz_dataframe(
-                precursor_df=precursor_df,
-                charged_frag_types=charged_frag_types,
-                reference_fragment_df=fragment_mz_df,
-                inplace_in_reference=True,
-                batch_size=batch_size,
-                dtype=dtype,
-            )
+    if reference_fragment_df is None and "frag_start_idx" in precursor_df.columns:
+        # raise ValueError(
+        #     "`precursor_df` contains 'frag_start_idx' column, "\
+        #     "please provide `reference_fragment_df` argument"
+        # )
+        fragment_mz_df = init_fragment_by_precursor_dataframe(
+            precursor_df,
+            charged_frag_types,
+            dtype=dtype,
+        )
+        return create_fragment_mz_dataframe(
+            precursor_df=precursor_df,
+            charged_frag_types=charged_frag_types,
+            reference_fragment_df=fragment_mz_df,
+            inplace_in_reference=True,
+            batch_size=batch_size,
+            dtype=dtype,
+        )
     if "nAA" not in precursor_df.columns:
         # fast
         return create_fragment_mz_dataframe_by_sort_precursor(
@@ -1255,12 +1254,10 @@ def filter_fragment_number(
     if not set(["frag_start_idx", "frag_stop_idx"]).issubset(precursor_df.columns):
         raise KeyError("frag_start_idx and frag_stop_idx not in dataframe")
 
-    for i, (start_idx, stop_idx, n_allowed_lib) in enumerate(
-        zip(
-            precursor_df["frag_start_idx"].values,
-            precursor_df["frag_stop_idx"].values,
-            precursor_df[n_fragments_allowed_column_name].values,
-        )
+    for start_idx, stop_idx, n_allowed_lib in zip(
+        precursor_df["frag_start_idx"].values,
+        precursor_df["frag_stop_idx"].values,
+        precursor_df[n_fragments_allowed_column_name].values,
     ):
         _allowed = min(n_allowed_lib, n_allowed)
 
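Two rewrites here: the early-return guard merges its two conditions into one `if` (SIM102), and `enumerate` disappears because its index was never used (B007). A toy version of the de-enumerated loop, with hypothetical fragment ranges:

```python
# Hypothetical per-precursor fragment ranges and library limits.
starts = [0, 3]
stops = [3, 7]
allowed = [2, 4]

# zip() alone suffices once the positional index is unused.
for start_idx, stop_idx, n_allowed_lib in zip(starts, stops, allowed):
    print(start_idx, stop_idx, min(n_allowed_lib, 3))
```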
24 changes: 11 additions & 13 deletions alphabase/peptide/precursor.py
@@ -25,18 +25,16 @@ def refine_precursor_df(
     """
     if ensure_data_validity:
         df.fillna("", inplace=True)
-        if "charge" in df.columns:
-            if df.charge.dtype not in [
-                "int",
-                "int8",
-                "int64",
-                "int32",
-                # np.int64, np.int32, np.int8,
-            ]:
-                df["charge"] = df["charge"].astype(np.int8)
-        if "mod_sites" in df.columns:
-            if df.mod_sites.dtype not in ["O", "U"]:
-                df["mod_sites"] = df.mod_sites.astype("U")
+        if "charge" in df.columns and df.charge.dtype not in [
+            "int",
+            "int8",
+            "int64",
+            "int32",
+            # np.int64, np.int32, np.int8,
+        ]:
+            df["charge"] = df["charge"].astype(np.int8)
+        if "mod_sites" in df.columns and df.mod_sites.dtype not in ["O", "U"]:
+            df["mod_sites"] = df.mod_sites.astype("U")
 
     if "nAA" not in df.columns:
         df["nAA"] = df.sequence.str.len().astype(np.int32)
@@ -107,7 +105,7 @@ def update_precursor_mz(
     # precursor_mz_idx = precursor_df.columns.get_loc(
     #     'precursor_mz'
    # )
-    for nAA, big_df_group in _grouped:
+    for _, big_df_group in _grouped:
         for i in range(0, len(big_df_group), batch_size):
             batch_end = i + batch_size
 
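Same pair of patterns in this file: nested column-presence and dtype checks collapse into single `and` conditions, and the unused `groupby` key becomes `_`. A sketch of the latter on a toy frame:

```python
import pandas as pd

df = pd.DataFrame({"nAA": [7, 7, 9], "mz": [400.2, 410.8, 502.3]})

# Only the per-group frame is needed; the key (the nAA value) is discarded.
for _, group in df.groupby("nAA"):
    print(len(group), group["mz"].mean())
```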
11 changes: 4 additions & 7 deletions alphabase/protein/fasta.py
@@ -43,7 +43,7 @@ def read_fasta_file(fasta_filename: str = ""):
         protein information,
         {protein_id:str, full_name:str, gene_name:str, description:str, sequence:str}
     """
-    with open(fasta_filename, "rt", encoding="utf-8") as handle:
+    with open(fasta_filename, encoding="utf-8") as handle:
         iterator = SeqIO.parse(handle, "fasta")
         while iterator:
             try:
@@ -429,8 +429,8 @@ def add_single_peptide_labeling(
     nterm_label_mod: str,
     cterm_label_mod: str,
 ):
-    add_nterm_label = True if nterm_label_mod else False
-    add_cterm_label = True if cterm_label_mod else False
+    add_nterm_label = bool(nterm_label_mod)
+    add_cterm_label = bool(cterm_label_mod)
     if mod_sites:
         _sites = mod_sites.split(";")
         if "0" in _sites:
@@ -478,10 +478,7 @@ def create_labeling_peptide_df(
     if len(peptide_df) == 0:
         return peptide_df
 
-    if inplace:
-        df = peptide_df
-    else:
-        df = peptide_df.copy()
+    df = peptide_df if inplace else peptide_df.copy()
 
     (label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod) = parse_labels(labels)
 
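Three small cleanups: `"rt"` is already `open`'s default mode (pyupgrade UP015), `bool(x)` replaces `True if x else False` (SIM210), and the copy-vs-inplace choice becomes a ternary. A quick sketch of the truthiness idiom (the modification name is made up):

```python
def labeling_flags(nterm_label_mod: str, cterm_label_mod: str):
    # bool() of a string is False only for "", matching the old ternaries.
    return bool(nterm_label_mod), bool(cterm_label_mod)


assert labeling_flags("Dimethyl@Any N-term", "") == (True, False)  # hypothetical mod
```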
7 changes: 2 additions & 5 deletions alphabase/psm_reader/alphapept_reader.py
@@ -18,10 +18,7 @@ def parse_ap(precursor):
     Parser to parse peptide strings
     """
     items = precursor.split("_")
-    if len(items) == 3:
-        decoy = 1
-    else:
-        decoy = 0
+    decoy = 1 if len(items) == 3 else 0
     modseq = items[0]
     charge = items[-1]
 
@@ -81,7 +78,7 @@ def _init_modification_mapping(self):
     def _load_file(self, filename):
         with h5py.File(filename, "r") as _hdf:
             dataset = _hdf[self.hdf_dataset]
-            df = pd.DataFrame({col: dataset[col] for col in dataset.keys()})
+            df = pd.DataFrame({col: dataset[col] for col in dataset})
             df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")]
             df["precursor"] = df["precursor"].str.decode("utf-8")
             # df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
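Besides the one-line ternary for `decoy`, iterating the HDF5 group directly replaces `.keys()` (SIM118), since `h5py` groups are Mapping-like and yield member names. A sketch against a hypothetical file layout:

```python
import h5py
import pandas as pd

# Hypothetical file and group names, for illustration only.
with h5py.File("results.ms_data.hdf", "r") as f:
    group = f["peptide_fdr"]
    # Iterating a mapping yields its keys; .keys() adds nothing.
    df = pd.DataFrame({col: group[col][...] for col in group})
```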
9 changes: 4 additions & 5 deletions alphabase/psm_reader/maxquant_reader.py
@@ -75,10 +75,7 @@ def parse_mod_seq(
         0 for N-term; -1 for C-term; 1 to N for normal modifications.
     """
     PeptideModSeq = modseq
-    if modseq[0] == "_":
-        underscore_for_ncterm = True
-    else:
-        underscore_for_ncterm = False
+    underscore_for_ncterm = modseq[0] == "_"
     mod_list = []
     site_list = []
     site = PeptideModSeq.find(mod_sep[0])
@@ -136,7 +133,7 @@ def __init__(
         fdr=0.01,
         keep_decoy=False,
         fixed_C57=True,
-        mod_seq_columns=["Modified sequence"],
+        mod_seq_columns=None,
         **kwargs,
     ):
         """Reader for MaxQuant msms.txt and evidence.txt
@@ -168,6 +165,8 @@ def __init__(
             The columns to find modified sequences,
             by default ['Modified sequence']
         """
+        if mod_seq_columns is None:
+            mod_seq_columns = ["Modified sequence"]
         super().__init__(
             column_mapping=column_mapping,
             modification_mapping=modification_mapping,
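Swapping the mutable default `mod_seq_columns=["Modified sequence"]` for a `None` sentinel addresses bugbear B006: a list default is created once at definition time and shared across all calls. A minimal demonstration of the hazard and the fix:

```python
def buggy(columns=[]):  # B006: one shared list for every call
    columns.append("Modified sequence")
    return columns


def fixed(columns=None):  # fresh list per call
    if columns is None:
        columns = []
    columns.append("Modified sequence")
    return columns


assert buggy() == ["Modified sequence"]
assert buggy() == ["Modified sequence", "Modified sequence"]  # state leaked
assert fixed() == ["Modified sequence"]
assert fixed() == ["Modified sequence"]  # no leak
```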
5 changes: 1 addition & 4 deletions alphabase/psm_reader/msfragger_reader.py
@@ -13,10 +13,7 @@
 
 
 def _is_fragger_decoy(proteins):
-    for prot in proteins:
-        if not prot.lower().startswith("rev_"):
-            return False
-    return True
+    return all(prot.lower().startswith("rev_") for prot in proteins)
 
 
 mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"]
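The explicit loop is a textbook `all()` rewrite (SIM110): a hit counts as decoy only when every mapped protein carries the `rev_` prefix. The generator short-circuits on the first non-decoy protein, exactly like the old early `return False`:

```python
def is_fragger_decoy(proteins):
    return all(prot.lower().startswith("rev_") for prot in proteins)


assert is_fragger_decoy(["rev_sp|P1", "REV_sp|P2"])
assert not is_fragger_decoy(["rev_sp|P1", "sp|P2"])
assert is_fragger_decoy([])  # vacuously True, same as the loop version
```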
29 changes: 16 additions & 13 deletions alphabase/psm_reader/psm_reader.py
@@ -240,9 +240,10 @@ def _reverse_mod_mapping(self):
         for this_mod, other_mod in self.modification_mapping.items():
             if isinstance(other_mod, (list, tuple)):
                 for _mod in other_mod:
-                    if _mod in self.rev_mod_mapping:
-                        if this_mod.endswith("Protein N-term"):
-                            continue
+                    if _mod in self.rev_mod_mapping and this_mod.endswith(
+                        "Protein N-term"
+                    ):
+                        continue
                     self.rev_mod_mapping[_mod] = this_mod
             else:
                 self.rev_mod_mapping[other_mod] = this_mod
@@ -344,7 +345,7 @@ def normalize_rt_by_raw_name(self):
         self.norm_rt()
         if "raw_name" not in self.psm_df.columns:
             return
-        for raw_name, df_group in self.psm_df.groupby("raw_name"):
+        for _, df_group in self.psm_df.groupby("raw_name"):
             self.psm_df.loc[df_group.index, "rt_norm"] = (
                 df_group.rt_norm / df_group.rt_norm.max()
             )
@@ -510,19 +511,21 @@ def _post_process(self, origin_df: pd.DataFrame):
 
     def filter_psm_by_modifications(
         self,
-        include_mod_set=set(
-            [
-                "Oxidation@M",
-                "Phospho@S",
-                "Phospho@T",
-                "Phospho@Y",
-                "Acetyl@Protein N-term",
-            ]
-        ),
+        include_mod_set=None,
     ):
         """
         Only keeps peptides with modifications in `include_mod_list`.
         """
+        if include_mod_set is None:
+            include_mod_set = set(
+                [
+                    "Oxidation@M",
+                    "Phospho@S",
+                    "Phospho@T",
+                    "Phospho@Y",
+                    "Acetyl@Protein N-term",
+                ]
+            )
         self._psm_df.mods = self._psm_df.mods.apply(
             keep_modifications, mod_set=include_mod_set
        )
6 changes: 3 additions & 3 deletions alphabase/quantification/quant_reader/config_dict_loader.py
@@ -24,7 +24,7 @@ def get_input_type_and_config_dict(input_file, input_type_to_use=None):
 
     uploaded_data_columns = set(pd.read_csv(input_file, sep=sep, nrows=1).columns)
 
-    for input_type in type2relevant_columns.keys():
+    for input_type in type2relevant_columns:
         if (input_type_to_use is not None) and (input_type != input_type_to_use):
             continue
         relevant_columns = type2relevant_columns.get(input_type)
@@ -65,7 +65,7 @@ def _load_config(config_yaml):
 
 def _get_type2relevant_cols(config_all):
     type2relcols = {}
-    for type in config_all.keys():
+    for type in config_all:
         config_typedict = config_all.get(type)
         relevant_cols = get_relevant_columns_config_dict(config_typedict)
         type2relcols[type] = relevant_cols
@@ -78,7 +78,7 @@ def get_relevant_columns_config_dict(config_typedict):
     for filtconf in config_typedict.get("filters", {}).values():
         filtcols.append(filtconf.get("param"))
 
-    if "ion_hierarchy" in config_typedict.keys():
+    if "ion_hierarchy" in config_typedict:
         for headr in config_typedict.get("ion_hierarchy").values():
             ioncols = list(itertools.chain.from_iterable(headr.get("mapping").values()))
             dict_ioncols.extend(ioncols)
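`.keys()` is redundant in both iteration and membership tests (SIM118): `for k in d` and `"k" in d` already operate on the keys. A tiny sketch with a hypothetical config mapping:

```python
config_all = {"spectronaut": {"ion_hierarchy": {}}, "diann": {}}  # hypothetical

for input_type in config_all:  # iterates keys directly
    if "ion_hierarchy" in config_all[input_type]:  # membership test on keys
        print(input_type, "defines an ion hierarchy")
```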
6 changes: 3 additions & 3 deletions alphabase/quantification/quant_reader/longformat_reader.py
@@ -153,7 +153,7 @@ def adapt_subtable(input_df_subset, config_dict, use_alphaquant_format):
     input_df_subset = quantreader_utils.filter_input(
         config_dict.get("filters", {}), input_df_subset
     )
-    if "ion_hierarchy" in config_dict.keys():
+    if "ion_hierarchy" in config_dict:
         return table_reformatter.merge_protein_cols_and_config_dict(
             input_df_subset, config_dict, use_alphaquant_format
         )
@@ -240,9 +240,9 @@ def process_with_dask(
 
 def get_hierarchy_names_from_config_dict(config_dict_for_type):
     hierarchy_names = []
-    if "ion_hierarchy" in config_dict_for_type.keys():
+    if "ion_hierarchy" in config_dict_for_type:
         ion_hierarchy = config_dict_for_type.get("ion_hierarchy")
-        for hierarchy_type in ion_hierarchy.keys():
+        for hierarchy_type in ion_hierarchy:
             hierarchy_names += ion_hierarchy.get(hierarchy_type).get("order")
         return list(set(hierarchy_names))
     else: