diff --git a/dev_tools/pca_check.py b/dev_tools/pca_check.py new file mode 100644 index 000000000..ceca6f450 --- /dev/null +++ b/dev_tools/pca_check.py @@ -0,0 +1,144 @@ + +import matplotlib + +matplotlib.use("AGG") + +import logging + +LGR = logging.getLogger("GENERAL") +MPL_LGR = logging.getLogger("matplotlib") +MPL_LGR.setLevel(logging.WARNING) +RepLGR = logging.getLogger("REPORT") + +import json +import os +import subprocess + +import nibabel as nib +import numpy as np +import pandas as pd +from scipy import io as scipy_io +from tedana.workflows import tedana as tedana_cli + +from mapca import MovingAveragePCA + +repoid = "ds000258" +wdir = "/home/portega/Downloads" + +repo = os.path.join(wdir, repoid) +if not os.path.isdir(repo): + os.mkdir(repo) + +print(">>> INSTALLING DATASET:") +subprocess.run( + f"cd {wdir} && datalad install git@github.com:OpenNeuroDatasets/{repoid}.git", shell=True +) + +subjects = [] + +# Create pandas dataframe to store results with the following columns: +# Subject, maPCA_AIC, GIFT_AIC, maPCA_KIC, GIFT_KIC, maPCA_MDL, GIFT_MDL +results_df = pd.DataFrame( + columns=["Subject", "maPCA_AIC", "GIFT_AIC", "maPCA_KIC", "GIFT_KIC", "maPCA_MDL", "GIFT_MDL"] +) + + +for sbj in os.listdir(repo): + sbj_dir = os.path.join(repo, sbj) + + # Access subject directory + if os.path.isdir(sbj_dir) and "sub-" in sbj_dir: + echo_times = [] + func_files = [] + + subjects.append(os.path.basename(sbj_dir)) + + print(">>>> Downloading subject", sbj) + + subprocess.run(f"datalad get {sbj}/func", shell=True, cwd=repo) + + print(">>>> Searching for functional files and echo times") + + # Get functional filenames and echo times + for func_file in os.listdir(os.path.join(repo, sbj, "func")): + if func_file.endswith(".json"): + with open(os.path.join(repo, sbj, "func", func_file)) as f: + data = json.load(f) + echo_times.append(data["EchoTime"]) + elif func_file.endswith(".nii.gz"): + func_files.append(os.path.join(repo, sbj, "func", func_file)) + + # Sort 
echo_times values from lowest to highest and multiply by 1000 + echo_times = np.array(sorted(echo_times)) * 1000 + + # Sort func_files + func_files = sorted(func_files) + print(func_files) + + # Tedana output directory + tedana_output_dir = os.path.join(sbj_dir, "tedana") + + print("Running tedana") + + # Run tedana + tedana_cli.tedana_workflow( + data=func_files, + tes=echo_times, + out_dir=tedana_output_dir, + tedpca="mdl", + ) + + # Find tedana optimally combined data and mask + tedana_optcom = os.path.join(tedana_output_dir, "desc-optcom_bold.nii.gz") + tedana_mask = os.path.join(tedana_output_dir, "desc-adaptiveGoodSignal_mask.nii.gz") + + # Read tedana optimally combined data and mask + tedana_optcom_img = nib.load(tedana_optcom) + tedana_mask_img = nib.load(tedana_mask) + + # Save tedana optimally combined data and mask into mat files + tedana_optcom_mat = os.path.join(sbj_dir, "optcom_bold.mat") + tedana_mask_mat = os.path.join(sbj_dir, "mask.mat") + print("Saving tedana optimally combined data and mask into mat files") + scipy_io.savemat(tedana_optcom_mat, {"data": tedana_optcom_img.get_fdata()}) + scipy_io.savemat(tedana_mask_mat, {"mask": tedana_mask_img.get_fdata()}) + + # Run mapca + print("Running mapca") + pca = MovingAveragePCA(normalize=True) + _ = pca.fit_transform(tedana_optcom_img, tedana_mask_img) + + # Get AIC, KIC and MDL values + aic = pca.aic_ + kic = pca.kic_ + mdl = pca.mdl_ + + # Remove tedana output directory and the anat and func directories + subprocess.run(f"rm -rf {tedana_output_dir}", shell=True, cwd=repo) + subprocess.run(f"rm -rf {sbj}/anat", shell=True, cwd=repo) + subprocess.run(f"rm -rf {sbj}/func", shell=True, cwd=repo) + + # Here run matlab script with subprocess.run + print("Running GIFT version of maPCA") + + # Append AIC, KIC and MDL values to a pandas dataframe + print("Appending AIC, KIC and MDL values to a pandas dataframe") + results_df = results_df.append( + { + "Subject": sbj, + "maPCA_AIC": aic, + "GIFT_AIC": 0, + 
"maPCA_KIC": kic, + "GIFT_KIC": 0, + "maPCA_MDL": mdl, + "GIFT_MDL": 0, + }, + ignore_index=True, + ) + + print("Subject", sbj, "done") + +# Save pandas dataframe to csv file +results_df.to_csv(os.path.join(wdir, "results.csv"), index=False) + + diff --git a/tedana/decomposition/pca.py b/tedana/decomposition/pca.py index c95afaa52..1ec6de536 100644 --- a/tedana/decomposition/pca.py +++ b/tedana/decomposition/pca.py @@ -22,12 +22,10 @@ def low_mem_pca(data): """ Run Singular Value Decomposition (SVD) on input data. - Parameters ---------- data : (S [*E] x T) array_like Optimally combined (S x T) or full multi-echo (S*E x T) data. - Returns ------- u : (S [*E] x C) array_like @@ -185,6 +183,7 @@ def tedpca( :py:mod:`tedana.constants` : The module describing the filenames for various naming conventions """ + # TODO refactor this to make it nice if algorithm == "kundu": alg_str = ( "followed by the Kundu component selection decision tree \\citep{kundu2013integrated}" @@ -268,20 +267,15 @@ def tedpca( f"90% varexp: {varex_90_varexp}% | 95% varexp: {varex_95_varexp}%" ) - pca_optimization_curves = np.array([aic["value"], kic["value"], mdl["value"]]) - pca_criteria_components = np.array( - [ - n_aic, - n_kic, - n_mdl, - n_varex_90, - n_varex_95, - ] - ) - - # Plot maPCA optimization curves - LGR.info("Plotting maPCA optimization curves") - plot_pca_results(pca_optimization_curves, pca_criteria_components, all_varex, io_generator) + # # Plot maPCA optimization curves once all relevant mehtods have been + # # computed + # LGR.info("Plotting maPCA optimization curves") + # plot_pca_results( + # pca_optimization_curves, + # pca_criteria_components, + # all_varex, # TODO find out what this is doing + # io_generator # TODO find out what this is doing + # ) # Save maPCA results into a dictionary mapca_results = { @@ -300,6 +294,7 @@ def tedpca( "explained_variance_total": mdl_varexp, "curve": mdl["value"], }, + # TODO add the new values "varex_90": { "n_components": n_varex_90, 
"explained_variance_total": varex_90_varexp, @@ -387,7 +382,23 @@ def tedpca( io_generator.save_file(comp_maps, "z-scored PCA components img") # Select components using decision tree - if algorithm == "kundu": + if isinstance(algorithm, float): + alg_str = "variance explained-based" + elif isinstance(algorithm, int): + alg_str = "a fixed number of components and no" + else: + alg_str = algorithm + LGR.info( + f"Selected {comptable.shape[0]} components with {round(100*varex_norm.sum(),2)}% " + f"normalized variance explained using {alg_str} dimensionality detection" + ) + comptable["classification"] = "accepted" + comptable["rationale"] = "" + + if algorithm in ["mdl", "aic", "kic"]: + # Continue the standard branch of the logic with Kundu + # TODO the logic here needs a big refactoring + comptable, metric_metadata = kundu_tedpca( comptable, n_echos, @@ -395,7 +406,9 @@ def tedpca( rdaw, stabilize=False, ) - elif algorithm == "kundu-stabilize": + n_kundu = np.sum(comptable["classification"] == "accepted") + values_kundu = np.array(comptable["variance explained"]) + comptable, metric_metadata = kundu_tedpca( comptable, n_echos, @@ -403,19 +416,46 @@ def tedpca( rdaw, stabilize=True, ) - else: - if isinstance(algorithm, float): - alg_str = "variance explained-based" - elif isinstance(algorithm, int): - alg_str = "a fixed number of components and no" + n_kundu_st = np.sum(comptable["classification"] == "accepted") + values_kundu_st = np.array(comptable["variance explained"]) + + n_val_kundu = values_kundu_st.shape[0] + n_val = aic["value"].shape[0] + if n_val > n_val_kundu: + values_kundu = np.concatenate( + [values_kundu, np.array([np.nan] * (n_val - n_val_kundu))] + ) + values_kundu_st = np.concatenate( + [values_kundu_st, np.array([np.nan] * (n_val - n_val_kundu))] + ) else: - alg_str = algorithm - LGR.info( - f"Selected {comptable.shape[0]} components with {round(100*varex_norm.sum(),2)}% " - f"normalized variance explained using {alg_str} dimensionality estimate" 
+ values_kundu = values_kundu[:n_val] + values_kundu_st = values_kundu_st[:n_val] + + pca_optimization_curves = np.concatenate( + [ + aic["value"].reshape(1, -1), + kic["value"].reshape(1, -1), + mdl["value"].reshape(1, -1), + values_kundu.reshape(1, -1), + values_kundu_st.reshape(1, -1), + ] + ) + pca_criteria_components = np.array( + [ + n_aic, + n_kic, + n_mdl, + n_kundu, + n_kundu_st, + n_varex_90, + n_varex_95, + ] ) - comptable["classification"] = "accepted" - comptable["rationale"] = "" + + # Plot PCA selection curves + LGR.info("Plotting PCA selection curves") + plot_pca_results(pca_optimization_curves, pca_criteria_components, all_varex, io_generator) # Save decomposition files comp_names = [ diff --git a/tedana/reporting/static_figures.py b/tedana/reporting/static_figures.py index 3740f1637..76fcd0f65 100644 --- a/tedana/reporting/static_figures.py +++ b/tedana/reporting/static_figures.py @@ -343,26 +343,26 @@ def pca_results(criteria, n_components, all_varex, io_generator): ) # Vertical line depicting the optimal number of components for 90% variance explained plt.vlines( - n_components[3], + n_components[5], ymin=np.min(criteria), ymax=np.max(criteria), - color="tab:red", + color="tab:brown", linestyles="dashed", label="90% varexp", ) # Vertical line depicting the optimal number of components for 95% variance explained plt.vlines( - n_components[4], + n_components[6], ymin=np.min(criteria), ymax=np.max(criteria), - color="tab:purple", + color="tab:pink", linestyles="dashed", label="95% varexp", ) plt.legend() - #  Save the plot + # Save the plot plot_name = "pca_criteria.png" pca_criteria_name = os.path.join(io_generator.out_dir, "figures", plot_name) plt.savefig(pca_criteria_name) @@ -403,28 +403,46 @@ def pca_results(criteria, n_components, all_varex, io_generator): linestyles="dashed", label="MDL", ) - # Vertical line depicting the optimal number of components for 90% variance explained + # Vertical line depicting the optimal number of components for 
Kundu plt.vlines( n_components[3], ymin=0, ymax=1, color="tab:red", linestyles="dashed", - label="90% varexp", + label="Kundu", ) - # Vertical line depicting the optimal number of components for 95% variance explained + # Vertical line depicting the optimal number of components for Kundu stabilized plt.vlines( n_components[4], ymin=0, ymax=1, color="tab:purple", linestyles="dashed", + label="Kundu stabilized", + ) + # Vertical line depicting the optimal number of components for 90% variance explained + plt.vlines( + n_components[5], + ymin=0, + ymax=1, + color="tab:brown", + linestyles="dashed", + label="90% varexp", + ) + # Vertical line depicting the optimal number of components for 95% variance explained + plt.vlines( + n_components[6], + ymin=0, + ymax=1, + color="tab:pink", + linestyles="dashed", label="95% varexp", ) plt.legend() - #  Save the plot + # Save the plot plot_name = "pca_variance_explained.png" pca_variance_explained_name = os.path.join(io_generator.out_dir, "figures", plot_name) plt.savefig(pca_variance_explained_name) diff --git a/tedana/tests/test_integration.py b/tedana/tests/test_integration.py index a8de177f4..2db0cba4f 100644 --- a/tedana/tests/test_integration.py +++ b/tedana/tests/test_integration.py @@ -336,7 +336,8 @@ def test_integration_three_echo(skip_integration): low_mem=True, tedpca="aic", ) - + # TODO push changes to my branch + # TODO script downloading data... 
# Test re-running, but use the CLI args = [ "-d", @@ -353,7 +354,7 @@ "--mix", os.path.join(out_dir, "desc-ICA_mixing.tsv"), ] - tedana_cli._main(args) + tedana_cli._main(args) # FIXME output files do not match # compare the generated output files fn = resource_filename("tedana", "tests/data/cornell_three_echo_outputs.txt") diff --git a/tedana/workflows/tedana.py index 4d443033d..5d75360bc 100644 --- a/tedana/workflows/tedana.py +++ b/tedana/workflows/tedana.py @@ -618,6 +618,7 @@ def tedana_workflow( if mixm is None: # Identify and remove thermal noise from data + # TODO this is the entry point for the PCA algorithm dd, n_components = decomposition.tedpca( catd, data_oc,