diff --git a/pyproject.toml b/pyproject.toml index 0099751e9..5572c5284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "pandas>=2.0,<=2.2.1", "pybtex", "pybtex-apa-style", + "robustica>=0.1.3", "scikit-learn>=0.21, <=1.4.1.post1", "scipy>=1.2.0, <=1.13.0", "threadpoolctl", diff --git a/tedana/config.py b/tedana/config.py new file mode 100644 index 000000000..11cf2011c --- /dev/null +++ b/tedana/config.py @@ -0,0 +1,19 @@ +"""Setting default values for ICA decomposition.""" + +DEFAULT_ICA_METHOD = "robustica" +DEFAULT_N_ROBUST_RUNS = 30 +DEFAULT_N_MAX_ITER = 500 +DEFAULT_N_MAX_RESTART = 10 +DEFAULT_SEED = 42 + + +"""Setting extreme values for number of robust runs.""" + +MIN_N_ROBUST_RUNS = 5 +MAX_N_ROBUST_RUNS = 500 +WARN_N_ROBUST_RUNS = 200 + + +"""Setting the warning threshold for the index quality.""" + +WARN_IQ = 0.6 diff --git a/tedana/decomposition/ica.py b/tedana/decomposition/ica.py index b1e88e908..fc387f478 100644 --- a/tedana/decomposition/ica.py +++ b/tedana/decomposition/ica.py @@ -4,15 +4,33 @@ import warnings import numpy as np +from robustica import RobustICA from scipy import stats from sklearn.decomposition import FastICA +from tedana.config import ( + DEFAULT_ICA_METHOD, + DEFAULT_N_MAX_ITER, + DEFAULT_N_MAX_RESTART, + DEFAULT_N_ROBUST_RUNS, + WARN_IQ, + WARN_N_ROBUST_RUNS, +) + LGR = logging.getLogger("GENERAL") RepLGR = logging.getLogger("REPORT") -def tedica(data, n_components, fixed_seed, maxit=500, maxrestart=10): - """Perform ICA on ``data`` and return mixing matrix. +def tedica( + data, + n_components, + fixed_seed, + ica_method=DEFAULT_ICA_METHOD, + n_robust_runs=DEFAULT_N_ROBUST_RUNS, + maxit=DEFAULT_N_MAX_ITER, + maxrestart=DEFAULT_N_MAX_RESTART, +): + """Perform ICA on `data` with the user selected ica method and returns mixing matrix. Parameters ---------- @@ -20,9 +38,13 @@ def tedica(data, n_components, fixed_seed, maxit=500, maxrestart=10): Dimensionally reduced optimally combined functional data, where `S` is samples and `T` is time n_components : :obj:`int` - Number of components retained from PCA decomposition + Number of components retained from PCA decomposition. fixed_seed : :obj:`int` - Seed for ensuring reproducibility of ICA results + Seed for ensuring reproducibility of ICA results. + ica_method : :obj: `str' + slected ICA method, can be fastica or robutica. + n_robust_runs : :obj: `int' + selected number of robust runs when robustica is used. Default is 30. maxit : :obj:`int`, optional Maximum number of iterations for ICA. Default is 500. maxrestart : :obj:`int`, optional @@ -38,9 +60,6 @@ def tedica(data, n_components, fixed_seed, maxit=500, maxrestart=10): fixed_seed : :obj:`int` Random seed from final decomposition. - Notes - ----- - Uses `sklearn` implementation of FastICA for decomposition """ warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd") RepLGR.info( @@ -48,6 +67,155 @@ def tedica(data, n_components, fixed_seed, maxit=500, maxrestart=10): "decompose the dimensionally reduced dataset." ) + ica_method = ica_method.lower() + + if ica_method == "robustica": + mmix, fixed_seed = r_ica( + data, + n_components=n_components, + fixed_seed=fixed_seed, + n_robust_runs=n_robust_runs, + max_it=maxit, + ) + elif ica_method == "fastica": + mmix, fixed_seed = f_ica( + data, + n_components=n_components, + fixed_seed=fixed_seed, + maxit=maxit, + maxrestart=maxrestart, + ) + else: + raise ValueError("The selected ICA method is invalid!") + + return mmix, fixed_seed + + +def r_ica(data, n_components, fixed_seed, n_robust_runs, max_it): + """Perform robustica on `data` and returns mixing matrix. + + Parameters + ---------- + data : (S x T) :obj:`numpy.ndarray` + Dimensionally reduced optimally combined functional data, where `S` is + samples and `T` is time + n_components : :obj:`int` + Number of components retained from PCA decomposition. + fixed_seed : :obj:`int` + Seed for ensuring reproducibility of ICA results. + n_robust_runs : :obj: `int' + selected number of robust runs when robustica is used. Default is 30. + maxit : :obj:`int`, optional + Maximum number of iterations for ICA. Default is 500. + + Returns + ------- + mmix : (T x C) :obj:`numpy.ndarray` + Z-scored mixing matrix for converting input data to component space, + where `C` is components and `T` is the same as in `data` + fixed_seed : :obj:`int` + Random seed from final decomposition. + """ + if n_robust_runs > WARN_N_ROBUST_RUNS: + LGR.warning( + "The selected n_robust_runs is a very big number! The process will take a long time!" + ) + + RepLGR.info("RobustICA package was used for ICA decomposition \\citep{Anglada2022}.") + + if fixed_seed == -1: + fixed_seed = np.random.randint(low=1, high=1000) + + for robust_method in ("DBSCAN", "AgglomerativeClustering"): + + try: + rica = RobustICA( + n_components=n_components, + robust_runs=n_robust_runs, + whiten="arbitrary-variance", + max_iter=max_it, + random_state=fixed_seed, + robust_dimreduce=False, + fun="logcosh", + robust_method=robust_method, + ) + + s, mmix = rica.fit_transform(data) + q = rica.evaluate_clustering( + rica.S_all, rica.clustering.labels_, rica.signs_, rica.orientation_ + ) + + except Exception: + continue + + LGR.info( + f"The {robust_method} clustering algorithm was used clustering " + f"components across different runs" + ) + + iq = np.array( + np.mean(q[q["cluster_id"] >= 0].iq) + ) # Excluding outliers (cluster -1) from the index quality calculation + + if iq < WARN_IQ: + LGR.warning( + f"The resultant mean Index Quality is low ({iq}). It is recommended to rerun the " + "process with a different seed." + ) + + mmix = mmix[ + :, q["cluster_id"] >= 0 + ] # Excluding outliers (cluster -1) when calculating the mixing matrix + mmix = stats.zscore(mmix, axis=0) + + LGR.info( + f"RobustICA with {n_robust_runs} robust runs and seed {fixed_seed} was used. " + f"The mean Index Quality is {iq}." + ) + + no_outliers = np.count_nonzero(rica.clustering.labels_ == -1) + if no_outliers: + LGR.info( + f"The {robust_method} clustering algorithm detected outliers when clustering " + f"components for different runs. These outliers are excluded when calculating " + f"the index quality and the mixing matrix to maximise the robustness of the " + f"decomposition." + ) + + return mmix, fixed_seed + + +def f_ica(data, n_components, fixed_seed, maxit, maxrestart): + """Perform FastICA on `data` and returns mixing matrix. + + Parameters + ---------- + data : (S x T) :obj:`numpy.ndarray` + Dimensionally reduced optimally combined functional data, where `S` is + samples and `T` is time + n_components : :obj:`int` + Number of components retained from PCA decomposition + fixed_seed : :obj:`int` + Seed for ensuring reproducibility of ICA results + maxit : :obj:`int`, optional + Maximum number of iterations for ICA. Default is 500. + maxrestart : :obj:`int`, optional + Maximum number of attempted decompositions to perform with different + random seeds. ICA will stop running if there is convergence prior to + reaching this limit. Default is 10. + + Returns + ------- + mmix : (T x C) :obj:`numpy.ndarray` + Z-scored mixing matrix for converting input data to component space, + where `C` is components and `T` is the same as in `data` + fixed_seed : :obj:`int` + Random seed from final decomposition. + + Notes + ----- + Uses `sklearn` implementation of FastICA for decomposition + """ if fixed_seed == -1: fixed_seed = np.random.randint(low=1, high=1000) diff --git a/tedana/resources/references.bib b/tedana/resources/references.bib index 366fd3e63..b60fb7e9b 100644 --- a/tedana/resources/references.bib +++ b/tedana/resources/references.bib @@ -333,3 +333,14 @@ @article{tedana_decision_trees year = {2024}, doi = {10.6084/m9.figshare.25251433.v2} } + +@Article{Anglada2022, + Author = {Anglada-Girotto Miquel and Miravet-Verde Samuel and Serrano Luis and Head Sarah}, + Title = {robustica: customizable robust independent component analysis}, + Journal = {BMC Bioinformatics}, + Volume = {23}, + Number = {519}, + doi = {10.1186/s12859-022-05043-9}, + year = 2022 +} + diff --git a/tedana/tests/test_integration.py b/tedana/tests/test_integration.py index 23a320509..3d9d31d8b 100644 --- a/tedana/tests/test_integration.py +++ b/tedana/tests/test_integration.py @@ -257,6 +257,8 @@ def test_integration_five_echo(skip_integration): tedana_cli.tedana_workflow( data=datalist, tes=echo_times, + ica_method="robustica", + n_robust_runs=6, out_dir=out_dir, tedpca=0.95, fittype="curvefit", @@ -302,6 +304,7 @@ def test_integration_four_echo(skip_integration): data=datalist, mixm=op.join(op.dirname(datalist[0]), "desc-ICA_mixing_static.tsv"), tes=[11.8, 28.04, 44.28, 60.52], + ica_method="fastica", out_dir=out_dir, tedpca="kundu-stabilize", gscontrol=["gsr", "mir"], @@ -346,6 +349,8 @@ def test_integration_three_echo(skip_integration): tedana_cli.tedana_workflow( data=f"{test_data_path}/three_echo_Cornell_zcat.nii.gz", tes=[14.5, 38.5, 62.5], + ica_method="robustica", + n_robust_runs=5, out_dir=out_dir, low_mem=True, tedpca="aic", @@ -361,6 +366,10 @@ def test_integration_three_echo(skip_integration): "62.5", "--out-dir", out_dir_manual, + "--ica_method", + "robustica", + "--n_robust_runs", + "5", "--debug", "--verbose", "-f", diff --git a/tedana/workflows/parser_utils.py b/tedana/workflows/parser_utils.py index b0db74bdc..db0f13b37 100644 --- a/tedana/workflows/parser_utils.py +++ b/tedana/workflows/parser_utils.py @@ -3,6 +3,8 @@ import argparse import os.path as op +from tedana.config import MAX_N_ROBUST_RUNS, MIN_N_ROBUST_RUNS + def check_tedpca_value(string, is_parser=True): """ @@ -33,6 +35,31 @@ def check_tedpca_value(string, is_parser=True): return intarg +def check_n_robust_runs_value(string, is_parser=True): + """ + Check n_robust_runs argument. + + Check if argument is an int between MIN_N_ROBUST_RUNS and MAX_N_ROBUST_RUNS. + """ + error = argparse.ArgumentTypeError if is_parser else ValueError + try: + intarg = int(string) + except ValueError: + msg = ( + f"Argument to n_robust_runs must be an integer " + f"between {MIN_N_ROBUST_RUNS} and {MAX_N_ROBUST_RUNS}." + ) + raise error(msg) + + if not (MIN_N_ROBUST_RUNS <= intarg <= MAX_N_ROBUST_RUNS): + raise error( + f"n_robust_runs must be an integer between {MIN_N_ROBUST_RUNS} " + f"and {MAX_N_ROBUST_RUNS}." + ) + else: + return intarg + + def is_valid_file(parser, arg): """Check if argument is existing file.""" if not op.isfile(arg) and arg is not None: diff --git a/tedana/workflows/tedana.py b/tedana/workflows/tedana.py index 79f61ff13..9b63983cd 100644 --- a/tedana/workflows/tedana.py +++ b/tedana/workflows/tedana.py @@ -29,9 +29,20 @@ utils, ) from tedana.bibtex import get_description_references +from tedana.config import ( + DEFAULT_ICA_METHOD, + DEFAULT_N_MAX_ITER, + DEFAULT_N_MAX_RESTART, + DEFAULT_N_ROBUST_RUNS, + DEFAULT_SEED, +) from tedana.selection.component_selector import ComponentSelector from tedana.stats import computefeats2 -from tedana.workflows.parser_utils import check_tedpca_value, is_valid_file +from tedana.workflows.parser_utils import ( + check_n_robust_runs_value, + check_tedpca_value, + is_valid_file, +) LGR = logging.getLogger("GENERAL") RepLGR = logging.getLogger("REPORT") @@ -174,6 +185,21 @@ def _get_parser(): ), default="tedana_orig", ) + optional.add_argument( + "--ica_method", + dest="ica_method", + help=( + "The applied ICA method. fastica runs FastICA from " + "sklearn once with the seed value. " + "robustica will run FastICA n_robust_runs times and uses " + "clustering methods to overcome the randomness of the FastICA " + "algorithm. FastICA was the default in tedana version 23 and earlier. " + "robustica will be slower." + ), + choices=["robustica", "fastica"], + type=str.lower, + default=DEFAULT_ICA_METHOD, + ) optional.add_argument( "--seed", dest="fixed_seed", @@ -183,9 +209,21 @@ def _get_parser(): "Value used for random initialization of ICA " "algorithm. Set to an integer value for " "reproducible ICA results. Set to -1 for " - "varying results across ICA calls. " + "varying results across ICA calls. This" + "applies to both fastica and robustica methods." + ), + default=DEFAULT_SEED, + ) + optional.add_argument( + "--n_robust_runs", + dest="n_robust_runs", + type=check_n_robust_runs_value, + help=( + "The number of times robustica will run. " + "This is only effective when ica_method is " + "set to robustica." ), - default=42, + default=DEFAULT_N_ROBUST_RUNS, ) optional.add_argument( "--maxit", @@ -193,7 +231,7 @@ def _get_parser(): metavar="INT", type=int, help=("Maximum number of iterations for ICA."), - default=500, + default=DEFAULT_N_MAX_ITER, ) optional.add_argument( "--maxrestart", @@ -207,7 +245,7 @@ def _get_parser(): "convergence is achieved before maxrestart " "attempts, ICA will finish early." ), - default=10, + default=DEFAULT_N_MAX_RESTART, ) optional.add_argument( "--tedort", @@ -334,10 +372,12 @@ def tedana_workflow( fittype="loglin", combmode="t2s", tree="tedana_orig", + ica_method=DEFAULT_ICA_METHOD, + n_robust_runs=DEFAULT_N_ROBUST_RUNS, tedpca="aic", - fixed_seed=42, - maxit=500, - maxrestart=10, + fixed_seed=DEFAULT_SEED, + maxit=DEFAULT_N_MAX_ITER, + maxrestart=DEFAULT_N_MAX_RESTART, tedort=False, gscontrol=None, no_reports=False, @@ -399,6 +439,17 @@ def tedana_workflow( to be a simpler process, but it accepts and rejects some distinct components compared to the others. Testing to better understand the effects of the differences is ongoing. Default is 'tedana_orig'. + ica_method : {'robustica', 'fastica'}, optional + The applied ICA method. fastica runs FastICA from sklearn + once with the seed value. 'robustica' will run + 'FastICA' n_robust_runs times and uses clustering methods to overcome + the randomness of the FastICA algorithm. + FastICA was the default in tedana version 23 and earlier. + robustica will be slower. + Default is 'robustica' + n_robust_runs : :obj:`int`, optional + The number of times robustica will run. This is only effective when 'ica_method' is + set to 'robustica'. tedpca : {'mdl', 'aic', 'kic', 'kundu', 'kundu-stabilize', float, int}, optional Method with which to select components in TEDPCA. If a float is provided, then it is assumed to represent percentage of variance @@ -408,8 +459,8 @@ def tedana_workflow( Default is 'aic'. fixed_seed : :obj:`int`, optional Value passed to ``mdp.numx_rand.seed()``. - Set to a positive integer value for reproducible ICA results; - otherwise, set to -1 for varying results across calls. + Set to a positive integer value for reproducible ICA results (fastica/robustica); + otherwise, set to -1 for varying results across ICA (fastica/robustica) calls. maxit : :obj:`int`, optional Maximum number of iterations for ICA. Default is 500. maxrestart : :obj:`int`, optional @@ -681,7 +732,13 @@ def tedana_workflow( seed = fixed_seed while keep_restarting: mmix, seed = decomposition.tedica( - dd, n_components, seed, maxit, maxrestart=(maxrestart - n_restarts) + dd, + n_components, + seed, + ica_method, + n_robust_runs, + maxit, + maxrestart=(maxrestart - n_restarts), ) seed += 1 n_restarts = seed - fixed_seed