In [59]:
import os.path as op

import plotly.express as px
import pandas as pd
import nibabel as nib
from nilearn._utils.niimg_conversions import _check_same_fov
from nilearn.image import concat_imgs, resample_to_img
from nimare.stats import pearson
from nimare.utils import get_masker

In [2]:
# Drop duplicate images within a collection, assuming they are in the same space and
# have the same number of subjects

In [51]:
def _n_contrast(df):
    return df.cognitive_contrast_cogatlas_id.unique().size


def _n_definitions(df):
    return df.contrast_definition.unique().size


def _get_count_by_collection(data_df, groupby, label, count_func=None):
    if count_func:
        out_df = data_df.groupby(groupby).apply(count_func)
    else:
        out_df = data_df.groupby(groupby).size()

    out_df = pd.DataFrame(out_df)
    out_df = out_df.rename(columns={out_df.columns[0]: label})

    return out_df.reset_index()

In [2]:
# Scores used to rank contrast names against each other. The placeholder labels
# below all get the lowest possible score.
HIERARCHY_DICT = dict.fromkeys(("None / Other", "Other", "None"), float("-inf"))

In [3]:
def _get_corr_matrix(images, masker):
    """Correlate every image with every other image inside the masker's mask.

    Parameters
    ----------
    images : list of str
        Paths of the images to compare; each one is opened with ``nib.load``.
    masker : masker object
        Must expose ``mask_img`` and ``transform``.

    Returns
    -------
    list
        One ``pearson`` result per input image, each computed between that
        image's masked data and the masked data of all images.
    """
    reference_img = masker.mask_img

    aligned_imgs = []
    for img_fn in images:
        img = nib.load(img_fn)
        # Only resample images whose field of view differs from the mask's.
        if not _check_same_fov(img, reference_masker=reference_img):
            img = resample_to_img(img, reference_img, clip=True, interpolation="linear")
        aligned_imgs.append(img)

    data = masker.transform(concat_imgs(aligned_imgs, ensure_ndim=4))

    return [pearson(img_map, data) for img_map in data]

In [4]:
def _n_cogats(df):
    return df.cognitive_paradigm_cogatlas_id.unique().size

In [5]:
def _infer_contrast(text):
    """Infer a contrast name from free text (not implemented yet).

    This is currently a placeholder: ``text`` is returned unchanged.
    """
    # TODO: infer the contrast name. Ideas noted so far:
    #   - use a regex to match the text to a contrast name;
    #   - fuzzy-match against the list of cogatlas contrast names and select the
    #     most similar one;
    #   - tokenize the text with scikit-learn and compare it with the contrast names.
    return text


def _select_relevant_contrast(contrast_names):
    """Pick the contrast name with the highest hierarchy score.

    Parameters
    ----------
    contrast_names : sequence of str
        Candidate contrast names. Must contain at least one element.

    Returns
    -------
    str
        The name with the highest score. When several names share the highest
        score, the first of them is returned.

    Raises
    ------
    ValueError
        If ``contrast_names`` is empty.
    """
    if len(contrast_names) == 0:
        raise ValueError("At least one contrast name is required")

    if len(contrast_names) == 1:
        return contrast_names[0]

    # Names that are not listed in HIERARCHY_DICT (even after inference) get a
    # neutral score. It is higher than the -inf given to the placeholder labels,
    # so a real contrast name always wins over "Other"/"None". Indexing the
    # dictionary directly here raised KeyError for every unlisted name.
    default_score = 0.0

    scores = []
    for contrast_name in contrast_names:
        if pd.isna(contrast_name):
            # A missing name carries no information; rank it last.
            score = -float("inf")
        elif contrast_name in HIERARCHY_DICT:
            score = HIERARCHY_DICT[contrast_name]
        else:
            inferred_name = _infer_contrast(contrast_name)
            score = HIERARCHY_DICT.get(inferred_name, default_score)
        scores.append(score)

    # list.index returns the first position of the maximum, which breaks ties.
    return contrast_names[scores.index(max(scores))]


def _heuristic_selection(data_df, columns):
    """Heuristic to select the most relevant image from a collection of images.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Dataframe containing the images to select from.
    columns : list of str
        List of columns to use for the heuristic sorted by priority.
        This columns are use for the recursive selection of the image.

    Returns
    -------
    pandas.DataFrame
        Dataframe containing the selected image.
    """
    assert len(columns) > 0, "At least one column is required"

    # Drop rows with NaN in the first column
    sub_df = data_df.dropna(subset=[columns[0]])

    if sub_df.shape[0] == 0:
        # No contrasts name, check the contrast_definition if the next element
        # from columns is available
        if len(columns) > 1:
            return _heuristic_selection(data_df, columns[1:])
        else:
            # No contrast definition. Select random image from the collection
            return data_df.sample(1)

    elif sub_df.shape[0] == 1:
        # Only one contrast definition, select it
        return sub_df.copy()

    else:
        # Multiple contrast definitions
        # Get unique contrast names, and select one based on the hierarchy
        contrast_names = sub_df.cognitive_contrast_cogatlas_name.unique()
        contrast_selected = _select_relevant_contrast(contrast_names)

        # Select the image with the selected contrast
        image_selected = sub_df[sub_df.cognitive_contrast_cogatlas_name == contrast_selected]
        if image_selected.shape[0] == 1:
            # Only one image with the selected contrast
            return image_selected.copy()
        else:
            if len(columns) > 1:
                # Actually, if if columns[0] it need to look at the next column
                return _heuristic_selection(image_selected, columns[1:])
            else:
                # Multiple images with the selected contrast, select one randomly.
                return image_selected.sample(1)

In [16]:
# Folder holding the CSV tables, relative to this notebook.
data_dir = "../data"

# Load the image table and turn the placeholder contrast id "Other" into a
# missing value.
nv_collections_images_df = pd.read_csv(op.join(data_dir, "nv_collections_images.csv")).assign(
    cognitive_contrast_cogatlas_id=lambda df: df["cognitive_contrast_cogatlas_id"].replace(
        {"Other": None}
    )
)

In [48]:
# Per-collection summaries: number of distinct paradigm ids and number of images.
unique_cogats = nv_collections_images_df.groupby("collection_id").apply(_n_cogats).sort_values()
img_per_coll = nv_collections_images_df.groupby("collection_id").size().sort_values()

# Collections with exactly one paradigm id and more than one image.
coll_unique_cogat = unique_cogats[unique_cogats == 1]
img_per_coll_unique_cogat = img_per_coll[img_per_coll.index.isin(coll_unique_cogat.index)]
coll_unique_cogat_multi_images = (
    pd.DataFrame(img_per_coll_unique_cogat[img_per_coll_unique_cogat > 1])
    .pipe(lambda df: df.rename(columns={df.columns[0]: "n_images"}))
    .reset_index()
    .assign(n_tasks=1)
)

# Collections with a single image do not need to go through the
# multiple-image selection process.
coll_one_image = img_per_coll[img_per_coll == 1]
coll_one_image_df = nv_collections_images_df.loc[
    nv_collections_images_df.collection_id.isin(coll_one_image.index)
]

# TODO: what about the collections with more than one paradigm id?

In [54]:
# Subsets restricted to images that have a contrast id / a contrast definition.
nv_collections_contrast_df = nv_collections_images_df.dropna(
    subset=["cognitive_contrast_cogatlas_id"]
)
nv_collections_definition_df = nv_collections_images_df.dropna(subset=["contrast_definition"])

# Contrast ids per collection: number of annotated images and number of distinct ids.
n_contrasts_by_coll = _get_count_by_collection(
    nv_collections_contrast_df, groupby="collection_id", label="n_contrasts"
)
n_unique_contrasts_by_coll = _get_count_by_collection(
    nv_collections_contrast_df,
    groupby="collection_id",
    label="n_unique_contrasts",
    count_func=_n_contrast,
)

# Contrast definitions per collection: same two counts.
n_contrasts_def_by_coll = _get_count_by_collection(
    nv_collections_definition_df, groupby="collection_id", label="n_contrasts_def"
)
n_unique_contrasts_def_by_coll = _get_count_by_collection(
    nv_collections_definition_df,
    groupby="collection_id",
    label="n_unique_contrasts_def",
    count_func=_n_definitions,
)

In [55]:
# Attach the four per-collection counts to the summary table.
# NOTE(review): this reassigns the same name, so running the cell twice would
# presumably merge the count columns a second time; rerun the cell that first
# builds the table before rerunning this one.
coll_unique_cogat_multi_images = (
    coll_unique_cogat_multi_images
    .merge(n_contrasts_by_coll, on="collection_id", how="left")
    .merge(n_unique_contrasts_by_coll, on="collection_id", how="left")
    .merge(n_contrasts_def_by_coll, on="collection_id", how="left")
    .merge(n_unique_contrasts_def_by_coll, on="collection_id", how="left")
)

# A collection missing from a count table gets NaN from the left merge: store 0
# instead and restore the integer dtype.
for column in ("n_contrasts", "n_unique_contrasts", "n_contrasts_def", "n_unique_contrasts_def"):
    coll_unique_cogat_multi_images[column] = (
        coll_unique_cogat_multi_images[column].fillna(0).astype(int)
    )

In [58]:
# Save the per-collection summary table next to the input data, without the index.
coll_unique_cogat_multi_images.to_csv(op.join(data_dir, "nv_colls_unique_cogat_multi_images.csv"), index=False)

In [13]:
# Iterate over the collection ids themselves. The summary table went through
# ``reset_index`` and column merges, so its index is just the row position
# (0..n-1) and the ids live in the "collection_id" column.
collections = coll_unique_cogat_multi_images["collection_id"].to_list()

image_selected_lst = []
for collection in collections:
    sub_df = nv_collections_images_df[nv_collections_images_df.collection_id == collection]

    # Prefer the contrast name; fall back to the contrast definition.
    image_selected = _heuristic_selection(
        sub_df,
        [
            "cognitive_contrast_cogatlas_name",
            "contrast_definition",
        ],
    )
    image_selected_lst.append(image_selected)
# TODO: look at the concept links to contrast

image_selected_df = pd.concat(image_selected_lst)

In [14]:
# Stack the images of the single-image collections on top of the images picked
# by the heuristic, then display the combined table.
coll_image_selected_df = pd.concat([coll_one_image_df, image_selected_df])
coll_image_selected_df

Unnamed: 0,pmid,pmcid,doi,secondary_doi,collection_id,collection_name,source,image_name,map_type,image_file,collection_id.1,image_id,number_of_subjects,cognitive_paradigm_cogatlas_id,cognitive_contrast_cogatlas_id,contrast_definition,cognitive_paradigm_cogatlas_name,cognitive_contrast_cogatlas_name,image_path
3317,38066069,10709616.0,,,13520,Experience sampling reveals the role that cove...,pubmed,Non-target,Z,images/13520/nontarget_group.nii.gz,13520,798383,57,trm_4da86cfe8cf1b,Other,,sustained attention to response task,Other,13520-798383_nontarget_group.nii.gz
1218,29069521,5716095.0,10.1093/scan/nsx125,,3952,Love flows downstream: mothers’ and children’s...,neurovault,children's brain self-family contrast negative...,Z,images/3952/zstat8.nii.gz,3952,65066,22,trm_4da890594742a,cnt_553a6ea7469af,mothers' attention to self pain > their child ...,emotional regulation task,all,3952-65066_zstat8.nii.gz
1163,29753107,,10.1016/j.neuroimage.2018.05.025,,3814,The neural basis of free language choice in bi...,neurovault,Figure 2a,T,images/3814/spmT_0002.nii.gz,3814,64145,15,tsk_4a57abb949cfb,cnt_4df6a77053b3f,naming in English (L2) versus naming in German...,picture naming task,accuracy of participant minus average accuracy...,3814-64145_spmT_0002.nii.gz
1165,25062683,,10.1016/j.biopsych.2014.06.005,,3655,Corticostriatal Control of Goal-Directed Actio...,neurovault,Valued actions simple effect in 18 healthy adults,T,images/3655/spmT_0002.nii.gz,3655,62601,18,trm_4f2414059baa8,,Simple effect of valued actions delta function,instrumental learning task,None / Other,3655-62601_spmT_0002.nii.gz
1001,29437891,5858598.0,,,3554,Abstract memory representations in the ventrom...,pubmed,Prototype model correlates,Z,images/3554/zstat1.nii.gz,3554,61824,29,trm_4f24112057e90,cnt_55a3034ca0943,prototype model predictor versus implicit base...,categorization task,percent correct,3554-61824_zstat1.nii.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2601,34147817,8220377.0,10.1016/j.nicl.2021.102723,,8993,Whole-brain functional correlates of memory fo...,neurovault,across group subsequentmemory scenes participa...,T,images/8993/across-group_subsequentmemory-scen...,8993,459316,60,tsk_mFS3uwUMAhXxe,,across-groups,Memory encoding task,None / Other,8993-459316_across-group_subsequentmemory-scen...
3198,37713673,,10.1162/jocn_a_02055,,13693,"Neural Mechanisms Underlying Trust to Friends,...",neurovault,Positive T-test Whole brain Regression with Ge...,T,images/13693/spmT_0001_6.nii.gz,13693,795325,92,tsk_Ncknr0soiM4IV,,,social decision-making task,None / Other,13693-795325_spmT_0001_6.nii.gz
174,27989844,,10.1016/j.neuroimage.2016.10.041,,1848,In need of constraint: Understanding the role ...,neurovault,Seed_i9_Perseverance_NegativeContrast_zstat,Z,images/1848/zstat13_4.nii.gz,1848,28822,204,trm_56a91a92043bc,,,UPPS-P Impulsivity Scale,None / Other,1848-28822_zstat13_4.nii.gz
831,28957344,5619738.0,10.1371/journal.pone.0185152,,2853,Functional hemispheric asymmetries during the ...,neurovault,Planning lateralization right hand S,T,images/2853/spmT_0003_1.nii.gz,2853,53730,19,trm_550b53d7dd674,,,motor fMRI task paradigm,None / Other,2853-53730_spmT_0003_1.nii.gz


In [11]:
# Save the images picked by the heuristic, without the index. This needs
# image_selected_df to exist, so run it after the selection loop.
# NOTE(review): this writes image_selected_df rather than the combined
# coll_image_selected_df, which also holds the single-image collections —
# confirm which table is meant to be saved.
image_selected_df.to_csv(op.join(data_dir, "nv_collections_images_selected.csv"), index=False)

NameError: name 'image_selected_df' is not defined

In [12]:
# Brain mask used to extract the voxel data that gets correlated.
# NOTE(review): absolute path into one user's local NiMARE checkout; it will not
# resolve on another machine — consider a configurable or package-relative location.
mask_fn = "/Users/jperaza/Documents/GitHub/NiMARE/nimare/resources/templates/MNI152_2x2x2_brainmask.nii.gz"
masker = get_masker(mask_fn)

In [19]:
# Example of duplicated images: same map, different stats
# (judging by the file names, a T map and a Z map from collection 13137).
# NOTE(review): absolute local paths; they only resolve on the author's machine.
img1_fn = "/Users/jperaza/Documents/GitHub/large-scale-ibma/data/nv-data/images/13137-790833_speechRev_spmT_0001.nii.gz"
img2_fn = "/Users/jperaza/Documents/GitHub/large-scale-ibma/data/nv-data/images/13137-790848_speechRev_Zmap.nii.gz"

In [20]:
# Pairwise correlations between the two example images selected above.
_get_corr_matrix([img1_fn, img2_fn], masker)

[array([1.0000007, 0.9965031], dtype=float32),
 array([0.9965031, 1.0000005], dtype=float32)]

In [21]:
# Example of duplicated images: same contrast, sign inverted
# (two z-stat maps from collection 3192, judging by the file names).
# NOTE(review): absolute local paths; they only resolve on the author's machine.
img1_fn = (
    "/Users/jperaza/Documents/GitHub/large-scale-ibma/data/nv-data/images/3192-57498_zstat1.nii.gz"
)
img2_fn = "/Users/jperaza/Documents/GitHub/large-scale-ibma/data/nv-data/images/3192-57499_zstat1_1.nii.gz"

In [22]:
# Pairwise correlations between the two example images selected above.
_get_corr_matrix([img1_fn, img2_fn], masker)

[array([ 0.9999992, -0.9999992], dtype=float32),
 array([-0.9999992,  0.9999992], dtype=float32)]

In [None]:
# Should we drop images that have ICA or PCA in the file name?