More linting ii #175

**Open** · wants to merge 6 commits into `more_linting` · Changes from 2 commits
**alphabase/constants/aa.py** — 6 changes: 4 additions & 2 deletions

```diff
@@ -69,15 +69,17 @@ def reset_AA_df():
 def reset_AA_Composition():
     global AA_Composition
     AA_Composition = {}
-    for aa, formula, mass in AA_DF.values:
+    for aa, formula, _mass in AA_DF.values:
         AA_Composition[aa] = dict(parse_formula(formula))
     return AA_Composition


 reset_AA_Composition()


-def reset_AA_atoms(atom_replace_dict: typing.Dict = {}):
+def reset_AA_atoms(atom_replace_dict: typing.Dict = None):
+    if atom_replace_dict is None:
+        atom_replace_dict = {}
     reset_elements()
     replace_atoms(atom_replace_dict)
     reset_AA_mass()
```
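For reference, the `None`-sentinel rewrite guards against Python's shared-mutable-default pitfall: a default like `{}` or `[]` is created once, when the `def` statement runs, and then reused across calls. A minimal standalone sketch (hypothetical names, not alphabase code):

```python
def append_item_buggy(item, bucket=[]):
    # the default list is created once, at function definition time
    bucket.append(item)
    return bucket

def append_item_fixed(item, bucket=None):
    if bucket is None:
        bucket = []  # a fresh list on every call
    bucket.append(item)
    return bucket

print(append_item_buggy("a"))  # ['a']
print(append_item_buggy("b"))  # ['a', 'b']  <- state leaked across calls
print(append_item_fixed("a"))  # ['a']
print(append_item_fixed("b"))  # ['b']
```

In `reset_AA_atoms` the dict appears to be only read, so the change looks preventive rather than a bug fix, but it satisfies the linter and stays safe if the argument is ever mutated.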
**alphabase/constants/isotope.py** — 18 changes: 10 additions & 8 deletions

```diff
@@ -155,14 +155,7 @@ def _calc_one_elem_cum_dist(element_cum_dist: np.ndarray, element_cum_mono: np.n
 class IsotopeDistribution:
     def __init__(
         self,
-        max_elem_num_dict: dict = {
-            "C": 2000,
-            "H": 5000,
-            "N": 1000,
-            "O": 1000,
-            "S": 200,
-            "P": 200,
-        },
+        max_elem_num_dict: dict = None,
     ):
         """Faster calculation of isotope abundance distribution by pre-building
         isotope distribution tables for most common elements.
```

> **@jalew188** (Collaborator) commented on May 23, 2024:
> The old default parameter style is good for Sphinx documentation: `def func(arg=generate_args())` is compiled once, so the values of `generate_args()` are displayed in the docs.

```diff
@@ -193,6 +186,15 @@ def __init__(
         {element: mono position array of cumulated isotope distribution},
         and mono position array is a 1-D int np.ndarray.
         """
+        if max_elem_num_dict is None:
+            max_elem_num_dict = {
+                "C": 2000,
+                "H": 5000,
+                "N": 1000,
+                "O": 1000,
+                "S": 200,
+                "P": 200,
+            }
         self.element_to_cum_dist_dict = {}
         self.element_to_cum_mono_idx = {}
         for elem, n in max_elem_num_dict.items():
```
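If the reviewer's Sphinx concern needs addressing, one common workaround is to state the default in the numpydoc `Parameters` section so it still appears in rendered docs. A sketch with a hypothetical standalone function, not the PR's actual docstring:

```python
def build_isotope_tables(max_elem_num_dict: dict = None) -> dict:
    """Pre-build per-element isotope tables.

    Parameters
    ----------
    max_elem_num_dict : dict, optional
        Maximum atom count per element, by default
        ``{"C": 2000, "H": 5000, "N": 1000, "O": 1000, "S": 200, "P": 200}``.
    """
    if max_elem_num_dict is None:
        max_elem_num_dict = {
            "C": 2000, "H": 5000, "N": 1000, "O": 1000, "S": 200, "P": 200
        }
    return dict(max_elem_num_dict)  # placeholder for the real table building
```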
**alphabase/constants/modification.py** — 5 changes: 1 addition & 4 deletions

```diff
@@ -276,10 +276,7 @@ def calc_modloss_mass_with_importance(
     mod_losses = np.zeros(nAA + 2)
     mod_losses[mod_sites] = [MOD_LOSS_MASS[mod] for mod in mod_names]
     _loss_importance = np.zeros(nAA + 2)
-    _loss_importance[mod_sites] = [
-        MOD_LOSS_IMPORTANCE[mod] if mod in MOD_LOSS_IMPORTANCE else 0
-        for mod in mod_names
-    ]
+    _loss_importance[mod_sites] = [MOD_LOSS_IMPORTANCE.get(mod, 0) for mod in mod_names]

     # Will not consider the modloss if the corresponding modloss_importance is 0
     mod_losses[_loss_importance == 0] = 0
```
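`dict.get(key, default)` is an exact drop-in for the `x[k] if k in x else default` pattern; a toy equivalence check with hypothetical table contents:

```python
MOD_LOSS_IMPORTANCE = {"Phospho@S": 1.0}  # toy stand-in for the real table

mod_names = ["Phospho@S", "Oxidation@M"]
old = [MOD_LOSS_IMPORTANCE[m] if m in MOD_LOSS_IMPORTANCE else 0 for m in mod_names]
new = [MOD_LOSS_IMPORTANCE.get(m, 0) for m in mod_names]
assert old == new == [1.0, 0]
```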
**alphabase/io/hdf.py** — 21 changes: 10 additions & 11 deletions

```diff
@@ -199,14 +199,16 @@ def set_truncate(self, truncate: bool = True):
     def __setattr__(self, name, value):
         try:
             super().__setattr__(name, value)
-        except NotImplementedError:
+        except NotImplementedError as e:
             if not self.truncate:
                 if name in self.group_names:
-                    raise KeyError(f"Group name '{name}' cannot be truncated")
+                    raise KeyError(f"Group name '{name}' cannot be truncated") from e
                 elif name in self.dataset_names:
-                    raise KeyError(f"Dataset name '{name}' cannot be truncated")
+                    raise KeyError(f"Dataset name '{name}' cannot be truncated") from e
                 elif name in self.dataframe_names:
-                    raise KeyError(f"Dataframe name '{name}' cannot be truncated")
+                    raise KeyError(
+                        f"Dataframe name '{name}' cannot be truncated"
+                    ) from e
             if isinstance(value, (np.ndarray, pd.core.series.Series)):
                 self.add_dataset(name, value)
             elif isinstance(value, (dict, pd.DataFrame)):
@@ -217,7 +219,7 @@ def __setattr__(self, name, value):
                 "Only (str, bool, int, float, np.ndarray, "
                 "pd.core.series.Series, dict pd.DataFrame) types are "
                 "accepted.",
-            )
+            ) from e

     def add_dataset(
         self,
@@ -252,12 +254,12 @@ def add_dataset(
                 # chunks=array.shape,
                 maxshape=tuple([None for i in array.shape]),
             )
-        except TypeError:
+        except TypeError as e:
             raise NotImplementedError(
                 f"Type {array.dtype} is not understood. "
                 "If this is a string format, try to cast it to "
                 "np.dtype('O') as possible solution."
-            )
+            ) from e
         dataset = HDF_Dataset(
             file_name=self.file_name,
             name=f"{self.name}/{name}",
@@ -496,10 +498,7 @@ def __init__(
         >>> hdf_file.dfs.df1.data_from
         "colleagues"
         """
-        if delete_existing:
-            mode = "w"
-        else:
-            mode = "a"
+        mode = "w" if delete_existing else "a"
         with h5py.File(file_name, mode): # , swmr=True):
             pass
         super().__init__(
```
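The `raise ... from e` additions preserve the caught exception as an explicit cause instead of the implicit "During handling of the above exception, another exception occurred" context. A self-contained sketch of the mechanics, with hypothetical names:

```python
def set_value(name: str):
    try:
        raise NotImplementedError("h5py cannot serialize this type")
    except NotImplementedError as e:
        raise KeyError(f"'{name}' cannot be truncated") from e

try:
    set_value("df1")
except KeyError as err:
    assert isinstance(err.__cause__, NotImplementedError)  # explicit chain
    assert err.__suppress_context__  # 'from' replaces the implicit context
```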
**alphabase/io/tempmmap.py** — 9 changes: 4 additions & 5 deletions

```diff
@@ -119,11 +119,10 @@ def create_empty_mmap(shape: tuple, dtype: np.dtype, path: str = None, overwrite
         )
     else:
         # check that if overwrite is false the file does not already exist
-        if not overwrite:
-            if os.path.exists(path):
-                raise ValueError(
-                    "The file already exists. Set overwrite to True to overwrite the file or choose a different name."
-                )
+        if not overwrite and os.path.exists(path):
+            raise ValueError(
+                "The file already exists. Set overwrite to True to overwrite the file or choose a different name."
+            )
         if not os.path.basename.endswith(".hdf"):
             raise ValueError("The chosen file name needs to end with .hdf")
         if os.path.isdir(os.path.commonpath(path)):
```
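Merging the nested `if`s is behavior-preserving because `and` short-circuits: the `os.path.exists` call still runs only when `overwrite` is false. A toy check with hypothetical names:

```python
import os

calls = []

def exists_logged(path: str) -> bool:
    calls.append(path)
    return os.path.exists(path)

overwrite = True
if not overwrite and exists_logged("temp.hdf"):
    raise ValueError("The file already exists.")
assert calls == []  # short-circuit: the existence check never ran
```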
**alphabase/peptide/fragment.py** — 59 changes: 31 additions & 28 deletions

```diff
@@ -494,10 +494,12 @@ def mask_fragments_for_charge_greater_than_precursor_charge(
     precursor_charge_array: np.ndarray,
     nAA_array: np.ndarray,
     *,
-    candidate_fragment_charges: list = [2, 3, 4],
+    candidate_fragment_charges: list = None,
 ):
     """Mask the fragment dataframe when
     the fragment charge is larger than the precursor charge"""
+    if candidate_fragment_charges is None:
+        candidate_fragment_charges = [2, 3, 4]
     precursor_charge_array = np.repeat(precursor_charge_array, nAA_array - 1)
     for col in fragment_df.columns:
         for charge in candidate_fragment_charges:
@@ -681,8 +683,8 @@ def flatten_fragments(
     fragment_intensity_df: pd.DataFrame,
     min_fragment_intensity: float = -1,
     keep_top_k_fragments: int = 1000,
-    custom_columns: list = ["type", "number", "position", "charge", "loss_type"],
-    custom_df: Dict[str, pd.DataFrame] = {},
+    custom_columns: list = None,
+    custom_df: Dict[str, pd.DataFrame] = None,
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Converts the tabular fragment format consisting of
@@ -750,6 +752,10 @@ def flatten_fragments(
     - charge: uint8, fragment charge
     - loss_type: int16, fragment loss type, 0=noloss, 17=NH3, 18=H2O, 98=H3PO4 (phos), ...
     """
+    if custom_df is None:
+        custom_df = {}
+    if custom_columns is None:
+        custom_columns = ["type", "number", "position", "charge", "loss_type"]
     if len(precursor_df) == 0:
         return precursor_df, pd.DataFrame()
     # new dataframes for fragments and precursors are created
@@ -1047,25 +1053,24 @@ def create_fragment_mz_dataframe(
     pd.DataFrame
         `fragment_mz_df` with given `charged_frag_types`
     """
-    if reference_fragment_df is None:
-        if "frag_start_idx" in precursor_df.columns:
-            # raise ValueError(
-            #     "`precursor_df` contains 'frag_start_idx' column, "\
-            #     "please provide `reference_fragment_df` argument"
-            # )
-            fragment_mz_df = init_fragment_by_precursor_dataframe(
-                precursor_df,
-                charged_frag_types,
-                dtype=dtype,
-            )
-            return create_fragment_mz_dataframe(
-                precursor_df=precursor_df,
-                charged_frag_types=charged_frag_types,
-                reference_fragment_df=fragment_mz_df,
-                inplace_in_reference=True,
-                batch_size=batch_size,
-                dtype=dtype,
-            )
+    if reference_fragment_df is None and "frag_start_idx" in precursor_df.columns:
+        # raise ValueError(
+        #     "`precursor_df` contains 'frag_start_idx' column, "\
+        #     "please provide `reference_fragment_df` argument"
+        # )
+        fragment_mz_df = init_fragment_by_precursor_dataframe(
+            precursor_df,
+            charged_frag_types,
+            dtype=dtype,
+        )
+        return create_fragment_mz_dataframe(
+            precursor_df=precursor_df,
+            charged_frag_types=charged_frag_types,
+            reference_fragment_df=fragment_mz_df,
+            inplace_in_reference=True,
+            batch_size=batch_size,
+            dtype=dtype,
+        )
     if "nAA" not in precursor_df.columns:
         # fast
         return create_fragment_mz_dataframe_by_sort_precursor(
@@ -1255,12 +1260,10 @@ def filter_fragment_number(
     if not set(["frag_start_idx", "frag_stop_idx"]).issubset(precursor_df.columns):
         raise KeyError("frag_start_idx and frag_stop_idx not in dataframe")

-    for i, (start_idx, stop_idx, n_allowed_lib) in enumerate(
-        zip(
-            precursor_df["frag_start_idx"].values,
-            precursor_df["frag_stop_idx"].values,
-            precursor_df[n_fragments_allowed_column_name].values,
-        )
+    for start_idx, stop_idx, n_allowed_lib in zip(
+        precursor_df["frag_start_idx"].values,
+        precursor_df["frag_stop_idx"].values,
+        precursor_df[n_fragments_allowed_column_name].values,
     ):
         _allowed = min(n_allowed_lib, n_allowed)
```
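The loop rewrite in `filter_fragment_number` drops `enumerate` because the index was never used; zipping the three column arrays directly is equivalent. A minimal sketch with toy arrays:

```python
import numpy as np

frag_start = np.array([0, 5])
frag_stop = np.array([5, 9])
n_allowed_lib_col = np.array([3, 2])

# the old loop wrapped this zip in enumerate() but never used the index
for start_idx, stop_idx, n_allowed_lib in zip(frag_start, frag_stop, n_allowed_lib_col):
    print(start_idx, stop_idx, min(n_allowed_lib, 4))
```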
**alphabase/peptide/precursor.py** — 24 changes: 11 additions & 13 deletions

```diff
@@ -25,18 +25,16 @@ def refine_precursor_df(
     """
     if ensure_data_validity:
         df.fillna("", inplace=True)
-        if "charge" in df.columns:
-            if df.charge.dtype not in [
-                "int",
-                "int8",
-                "int64",
-                "int32",
-                # np.int64, np.int32, np.int8,
-            ]:
-                df["charge"] = df["charge"].astype(np.int8)
-        if "mod_sites" in df.columns:
-            if df.mod_sites.dtype not in ["O", "U"]:
-                df["mod_sites"] = df.mod_sites.astype("U")
+        if "charge" in df.columns and df.charge.dtype not in [
+            "int",
+            "int8",
+            "int64",
+            "int32",
+            # np.int64, np.int32, np.int8,
+        ]:
+            df["charge"] = df["charge"].astype(np.int8)
+        if "mod_sites" in df.columns and df.mod_sites.dtype not in ["O", "U"]:
+            df["mod_sites"] = df.mod_sites.astype("U")

     if "nAA" not in df.columns:
         df["nAA"] = df.sequence.str.len().astype(np.int32)
@@ -107,7 +105,7 @@ def update_precursor_mz(
     # precursor_mz_idx = precursor_df.columns.get_loc(
    #     'precursor_mz'
     # )
-    for nAA, big_df_group in _grouped:
+    for _, big_df_group in _grouped:
         for i in range(0, len(big_df_group), batch_size):
             batch_end = i + batch_size
```
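Renaming the unused `groupby` key to `_` documents that only the sub-frame matters in this loop; for example:

```python
import pandas as pd

df = pd.DataFrame({"nAA": [7, 7, 9], "precursor_mz": [400.2, 401.7, 502.3]})

for _, group in df.groupby("nAA"):  # group key intentionally unused
    print(len(group), group["precursor_mz"].mean())
```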
**alphabase/protein/fasta.py** — 29 changes: 18 additions & 11 deletions

```diff
@@ -429,8 +429,8 @@ def add_single_peptide_labeling(
     nterm_label_mod: str,
     cterm_label_mod: str,
 ):
-    add_nterm_label = True if nterm_label_mod else False
-    add_cterm_label = True if cterm_label_mod else False
+    add_nterm_label = bool(nterm_label_mod)
+    add_cterm_label = bool(cterm_label_mod)
     if mod_sites:
         _sites = mod_sites.split(";")
         if "0" in _sites:
@@ -478,10 +478,7 @@ def create_labeling_peptide_df(
     if len(peptide_df) == 0:
         return peptide_df

-    if inplace:
-        df = peptide_df
-    else:
-        df = peptide_df.copy()
+    df = peptide_df if inplace else peptide_df.copy()

     (label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod) = parse_labels(labels)

@@ -507,7 +504,7 @@ def protein_idxes_to_names(protein_idxes: str, protein_names: list):

 def append_special_modifications(
     df: pd.DataFrame,
-    var_mods: list = ["Phospho@S", "Phospho@T", "Phospho@Y"],
+    var_mods: list = None,
     min_mod_num: int = 0,
     max_mod_num: int = 1,
     max_peptidoform_num: int = 100,
@@ -553,6 +550,8 @@ def append_special_modifications(
     pd.DataFrame
         The precursor_df with new modification added.
     """
+    if var_mods is None:
+        var_mods = ["Phospho@S", "Phospho@T", "Phospho@Y"]
     if len(var_mods) == 0 or len(df) == 0:
         return df

@@ -644,7 +643,7 @@ class SpecLibFasta(SpecLibBase):

     def __init__(
         self,
-        charged_frag_types: list = ["b_z1", "b_z2", "y_z1", "y_z2"],
+        charged_frag_types: list = None,
         *,
         protease: str = "trypsin",
         max_missed_cleavages: int = 2,
@@ -654,12 +653,12 @@ def __init__(
         precursor_charge_max: int = 4,
         precursor_mz_min: float = 400.0,
         precursor_mz_max: float = 2000.0,
-        var_mods: list = ["Acetyl@Protein_N-term", "Oxidation@M"],
+        var_mods: list = None,
         min_var_mod_num: int = 0,
         max_var_mod_num: int = 2,
-        fix_mods: list = ["Carbamidomethyl@C"],
+        fix_mods: list = None,
         labeling_channels: dict = None,
-        special_mods: list = [],
+        special_mods: list = None,
         min_special_mod_num: int = 0,
         max_special_mod_num: int = 1,
         special_mods_cannot_modify_pep_n_term: bool = False,
@@ -758,6 +757,14 @@ def __init__(
         include_contaminants : bool, optional
             If include contaminants.fasta, by default False
         """
+        if special_mods is None:
+            special_mods = []
+        if fix_mods is None:
+            fix_mods = ["Carbamidomethyl@C"]
+        if var_mods is None:
+            var_mods = ["Acetyl@Protein_N-term", "Oxidation@M"]
+        if charged_frag_types is None:
+            charged_frag_types = ["b_z1", "b_z2", "y_z1", "y_z2"]
         super().__init__(
             charged_frag_types=charged_frag_types,
             precursor_mz_min=precursor_mz_min,
```
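Two of the simplifications above, checked on plain values (hypothetical inputs): `bool(x)` replaces `True if x else False`, and a conditional expression replaces the four-line `if/else` assignment:

```python
nterm_label_mod = "Dimethyl@Any_N-term"  # hypothetical label
add_nterm_label = bool(nterm_label_mod)  # was: True if nterm_label_mod else False
assert add_nterm_label is True
assert bool("") is False  # an empty string still maps to False

inplace = False
peptides = ["PEPTIDE", "SEQUENCE"]
df = peptides if inplace else list(peptides)  # stand-in for peptide_df.copy()
assert df == peptides and df is not peptides
```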
**alphabase/psm_reader/alphapept_reader.py** — 7 changes: 2 additions & 5 deletions

```diff
@@ -18,10 +18,7 @@ def parse_ap(precursor):
     Parser to parse peptide strings
     """
     items = precursor.split("_")
-    if len(items) == 3:
-        decoy = 1
-    else:
-        decoy = 0
+    decoy = 1 if len(items) == 3 else 0
     modseq = items[0]
     charge = items[-1]

@@ -81,7 +78,7 @@ def _init_modification_mapping(self):
     def _load_file(self, filename):
         with h5py.File(filename, "r") as _hdf:
             dataset = _hdf[self.hdf_dataset]
-            df = pd.DataFrame({col: dataset[col] for col in dataset.keys()})
+            df = pd.DataFrame({col: dataset[col] for col in dataset})
             df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")]
             df["precursor"] = df["precursor"].str.decode("utf-8")
             # df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
```
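Both patterns here rely on standard Python semantics: iterating a mapping yields its keys (h5py groups behave the same way), and the conditional expression matches the removed `if/else`. A quick check on plain objects:

```python
dataset = {"precursor": [b"PEPTIDE_2"], "charge": [2]}  # dict stand-in for the h5py group
assert list(dataset) == list(dataset.keys())

items = "MPEPTIDEK_decoy_2".split("_")
decoy = 1 if len(items) == 3 else 0
assert decoy == 1
```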