Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exp/3 pca seq #909

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
144 changes: 144 additions & 0 deletions dev_tools/pca_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@

import matplotlib

matplotlib.use("AGG")

import logging

LGR = logging.getLogger("GENERAL")
MPL_LGR = logging.getLogger("matplotlib")
MPL_LGR.setLevel(logging.WARNING)
RepLGR = logging.getLogger("REPORT")

import json
import os
import subprocess

import nibabel as nib
import numpy as np
import pandas as pd
from scipy import io as scipy_io
from tedana.workflows import tedana as tedana_cli

from mapca import MovingAveragePCA

# --- Configuration --------------------------------------------------------
# OpenNeuro dataset ID and the local working directory it is cloned into.
repoid = "ds000258"
wdir = "/home/portega/Downloads"

# Local clone location; create it if it does not exist yet.
repo = os.path.join(wdir, repoid)
if not os.path.isdir(repo):
    os.mkdir(repo)

print(">>> INSTALLING DATASET:")
# Clone the dataset with datalad. Use an argument list (no shell) so the
# URL is not re-parsed by /bin/sh; `cwd=wdir` replaces the former
# `cd {wdir} && ...` shell prefix.
subprocess.run(
    ["datalad", "install", f"git@github.com:OpenNeuroDatasets/{repoid}.git"],
    cwd=wdir,
)

# Names of the subjects that were processed (filled by the main loop).
subjects = []

# Results table: one row per subject with the number-of-components
# estimates from maPCA (Python) and GIFT (MATLAB) for each criterion
# (AIC, KIC, MDL). The GIFT_* columns are placeholders for now.
results_df = pd.DataFrame(
    columns=["Subject", "maPCA_AIC", "GIFT_AIC", "maPCA_KIC", "GIFT_KIC", "maPCA_MDL", "GIFT_MDL"]
)


# --- Per-subject processing ----------------------------------------------
# For each subject: fetch the functional data, run tedana to get the
# optimally combined time series, estimate the number of PCA components
# with maPCA, export inputs for the GIFT (MATLAB) comparison, and record
# the estimates.
for sbj in os.listdir(repo):
    sbj_dir = os.path.join(repo, sbj)

    # Only process subject directories (e.g. "sub-01"). Check the basename,
    # not the full path: a "sub-" substring anywhere in the parent path
    # would otherwise trigger a false match.
    if os.path.isdir(sbj_dir) and sbj.startswith("sub-"):
        echo_times = []
        func_files = []

        subjects.append(os.path.basename(sbj_dir))

        print(">>>> Downloading subject", sbj)
        subprocess.run(f"datalad get {sbj}/func", shell=True, cwd=repo)

        print(">>>> Searching for functional files and echo times")

        # Collect echo times from the JSON sidecars and the paths of the
        # multi-echo functional images.
        for func_file in os.listdir(os.path.join(repo, sbj, "func")):
            if func_file.endswith(".json"):
                with open(os.path.join(repo, sbj, "func", func_file)) as f:
                    data = json.load(f)
                echo_times.append(data["EchoTime"])
            elif func_file.endswith(".nii.gz"):
                func_files.append(os.path.join(repo, sbj, "func", func_file))

        # Echo times sorted ascending and converted from seconds to
        # milliseconds (the unit tedana expects).
        echo_times = np.array(sorted(echo_times)) * 1000

        # Sort filenames so they line up with the sorted echo times
        # (assumes BIDS echo-N naming sorts in echo order — TODO confirm).
        func_files = sorted(func_files)
        print(func_files)

        # Tedana output directory
        tedana_output_dir = os.path.join(sbj_dir, "tedana")

        print("Running tedana")
        tedana_cli.tedana_workflow(
            data=func_files,
            tes=echo_times,
            out_dir=tedana_output_dir,
            tedpca="mdl",
        )

        # tedana's optimally combined data and adaptive good-signal mask
        tedana_optcom = os.path.join(tedana_output_dir, "desc-optcom_bold.nii.gz")
        tedana_mask = os.path.join(tedana_output_dir, "desc-adaptiveGoodSignal_mask.nii.gz")

        tedana_optcom_img = nib.load(tedana_optcom)
        tedana_mask_img = nib.load(tedana_mask)

        # Export data and mask to .mat files so the GIFT (MATLAB)
        # implementation can read the same inputs.
        tedana_optcom_mat = os.path.join(sbj_dir, "optcom_bold.mat")
        tedana_mask_mat = os.path.join(sbj_dir, "mask.mat")
        print("Saving tedana optimally combined data and mask into mat files")
        scipy_io.savemat(tedana_optcom_mat, {"data": tedana_optcom_img.get_fdata()})
        scipy_io.savemat(tedana_mask_mat, {"mask": tedana_mask_img.get_fdata()})

        # Run maPCA on the optimally combined data.
        print("Running mapca")
        pca = MovingAveragePCA(normalize=True)
        _ = pca.fit_transform(tedana_optcom_img, tedana_mask_img)

        # Component-number estimates for each information criterion.
        aic = pca.aic_
        kic = pca.kic_
        mdl = pca.mdl_

        # Free disk space: drop the tedana output and the raw subject data.
        subprocess.run(f"rm -rf {tedana_output_dir}", shell=True, cwd=repo)
        subprocess.run(f"rm -rf {sbj}/anat", shell=True, cwd=repo)
        subprocess.run(f"rm -rf {sbj}/func", shell=True, cwd=repo)

        # TODO: run the GIFT MATLAB script here (subprocess.run) and fill
        # in the GIFT_* columns below instead of the 0 placeholders.
        print("Running GIFT version of maPCA")

        print("Appending AIC, KIC and MDL values to a pandas dataframe")
        # DataFrame.append was deprecated in pandas 1.4 and removed in
        # pandas 2.0; build a one-row frame and concatenate instead.
        new_row = pd.DataFrame(
            [
                {
                    "Subject": sbj,
                    "maPCA_AIC": aic,
                    "GIFT_AIC": 0,
                    "maPCA_KIC": kic,
                    "GIFT_KIC": 0,
                    "maPCA_MDL": mdl,
                    "GIFT_MDL": 0,
                }
            ]
        )
        results_df = pd.concat([results_df, new_row], ignore_index=True)

        print("Subject", sbj, "done")

# Persist the per-subject comparison table.
results_df.to_csv(os.path.join(wdir, "results.csv"), index=False)


98 changes: 69 additions & 29 deletions tedana/decomposition/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@
def low_mem_pca(data):
"""
Run Singular Value Decomposition (SVD) on input data.

Parameters
----------
data : (S [*E] x T) array_like
Optimally combined (S x T) or full multi-echo (S*E x T) data.

Returns
-------
u : (S [*E] x C) array_like
Expand Down Expand Up @@ -185,6 +183,7 @@ def tedpca(
:py:mod:`tedana.constants` : The module describing the filenames for
various naming conventions
"""
# TODO refactor this to make it nice
if algorithm == "kundu":
alg_str = (
"followed by the Kundu component selection decision tree \\citep{kundu2013integrated}"
Expand Down Expand Up @@ -268,20 +267,15 @@ def tedpca(
f"90% varexp: {varex_90_varexp}% | 95% varexp: {varex_95_varexp}%"
)

pca_optimization_curves = np.array([aic["value"], kic["value"], mdl["value"]])
pca_criteria_components = np.array(
[
n_aic,
n_kic,
n_mdl,
n_varex_90,
n_varex_95,
]
)

# Plot maPCA optimization curves
LGR.info("Plotting maPCA optimization curves")
plot_pca_results(pca_optimization_curves, pca_criteria_components, all_varex, io_generator)
# # Plot maPCA optimization curves once all relevant mehtods have been
# # computed
# LGR.info("Plotting maPCA optimization curves")
# plot_pca_results(
# pca_optimization_curves,
# pca_criteria_components,
# all_varex, # TODO find out what this is doing
# io_generator # TODO find out what this is doing
# )

# Save maPCA results into a dictionary
mapca_results = {
Expand All @@ -300,6 +294,7 @@ def tedpca(
"explained_variance_total": mdl_varexp,
"curve": mdl["value"],
},
# TODO add the new values
"varex_90": {
"n_components": n_varex_90,
"explained_variance_total": varex_90_varexp,
Expand Down Expand Up @@ -387,35 +382,80 @@ def tedpca(
io_generator.save_file(comp_maps, "z-scored PCA components img")

# Select components using decision tree
if algorithm == "kundu":
if isinstance(algorithm, float):
alg_str = "variance explained-based"
elif isinstance(algorithm, int):
alg_str = "a fixed number of components and no"
else:
alg_str = algorithm
LGR.info(
f"Selected {comptable.shape[0]} components with {round(100*varex_norm.sum(),2)}% "
f"normalized variance explained using {alg_str} dimensionality detection"
)
comptable["classification"] = "accepted"
comptable["rationale"] = ""

if algorithm in ["mdl", "aic", "kic"]:
# Continue the standard branch of the logic with Kundu
# TODO the logic here needs a big refactoring

comptable, metric_metadata = kundu_tedpca(
comptable,
n_echos,
kdaw,
rdaw,
stabilize=False,
)
elif algorithm == "kundu-stabilize":
n_kundu = np.sum(comptable["classification"] == "accepted")
values_kundu = np.array(comptable["variance explained"])

comptable, metric_metadata = kundu_tedpca(
comptable,
n_echos,
kdaw,
rdaw,
stabilize=True,
)
else:
if isinstance(algorithm, float):
alg_str = "variance explained-based"
elif isinstance(algorithm, int):
alg_str = "a fixed number of components and no"
n_kundu_st = np.sum(comptable["classification"] == "accepted")
values_kundu_st = np.array(comptable["variance explained"])

n_val_kundu = values_kundu_st.shape[0]
n_val = aic["value"].shape[0]
if n_val > n_val_kundu:
values_kundu = np.concatenate(
[values_kundu, np.array([np.nan] * (n_val - n_val_kundu))]
)
values_kundu_st = np.concatenate(
[values_kundu_st, np.array([np.nan] * (n_val - n_val_kundu))]
)
else:
alg_str = algorithm
LGR.info(
f"Selected {comptable.shape[0]} components with {round(100*varex_norm.sum(),2)}% "
f"normalized variance explained using {alg_str} dimensionality estimate"
values_kundu = values_kundu[:n_val]
values_kundu_st = values_kundu_st[:n_val]

pca_optimization_curves = np.concatenate(
[
aic["value"].reshape(1, -1),
kic["value"].reshape(1, -1),
mdl["value"].reshape(1, -1),
values_kundu.reshape(1, -1),
values_kundu_st.reshape(1, -1),
]
)
pca_criteria_components = np.array(
[
n_aic,
n_kic,
n_mdl,
n_kundu,
n_kundu_st,
n_varex_90,
n_varex_95,
]
)
comptable["classification"] = "accepted"
comptable["rationale"] = ""

# Plot PCA selection curves
LGR.info("Plotting PCA selection curves")
plot_pca_results(pca_optimization_curves, pca_criteria_components, all_varex, io_generator)

# Save decomposition files
comp_names = [
Expand Down
36 changes: 27 additions & 9 deletions tedana/reporting/static_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,26 +343,26 @@ def pca_results(criteria, n_components, all_varex, io_generator):
)
# Vertical line depicting the optimal number of components for 90% variance explained
plt.vlines(
n_components[3],
n_components[5],
ymin=np.min(criteria),
ymax=np.max(criteria),
color="tab:red",
color="tab:brown",
linestyles="dashed",
label="90% varexp",
)
# Vertical line depicting the optimal number of components for 95% variance explained
plt.vlines(
n_components[4],
n_components[6],
ymin=np.min(criteria),
ymax=np.max(criteria),
color="tab:purple",
color="tab:pink",
linestyles="dashed",
label="95% varexp",
)

plt.legend()

#  Save the plot
# Save the plot
plot_name = "pca_criteria.png"
pca_criteria_name = os.path.join(io_generator.out_dir, "figures", plot_name)
plt.savefig(pca_criteria_name)
Expand Down Expand Up @@ -403,28 +403,46 @@ def pca_results(criteria, n_components, all_varex, io_generator):
linestyles="dashed",
label="MDL",
)
# Vertical line depicting the optimal number of components for 90% variance explained
# Vertical line depicting the optimal number of components for Kundu
plt.vlines(
n_components[3],
ymin=0,
ymax=1,
color="tab:red",
linestyles="dashed",
label="90% varexp",
label="Kundu",
)
# Vertical line depicting the optimal number of components for 95% variance explained
# Vertical line depicting the optimal number of components for Kundu stabilized
plt.vlines(
n_components[4],
ymin=0,
ymax=1,
color="tab:purple",
linestyles="dashed",
label="Kundu stabilized",
)
# Vertical line depicting the optimal number of components for 90% variance explained
plt.vlines(
n_components[5],
ymin=0,
ymax=1,
color="tab:brown",
linestyles="dashed",
label="90% varexp",
)
# Vertical line depicting the optimal number of components for 95% variance explained
plt.vlines(
n_components[6],
ymin=0,
ymax=1,
color="tab:pink",
linestyles="dashed",
label="95% varexp",
)

plt.legend()

#  Save the plot
# Save the plot
plot_name = "pca_variance_explained.png"
pca_variance_explained_name = os.path.join(io_generator.out_dir, "figures", plot_name)
plt.savefig(pca_variance_explained_name)
Expand Down
5 changes: 3 additions & 2 deletions tedana/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,8 @@ def test_integration_three_echo(skip_integration):
low_mem=True,
tedpca="aic",
)

# TODO push changes to my branch
# TODO script downloading data...
# Test re-running, but use the CLI
args = [
"-d",
Expand All @@ -353,7 +354,7 @@ def test_integration_three_echo(skip_integration):
"--mix",
os.path.join(out_dir, "desc-ICA_mixing.tsv"),
]
tedana_cli._main(args)
tedana_cli._main(args) # FIXME ficheros no coinciden

# compare the generated output files
fn = resource_filename("tedana", "tests/data/cornell_three_echo_outputs.txt")
Expand Down