# Test Datasets
Create some test datasets on DSA for developing NeuroTK.

In [6]:
# Imports
from dsa_helpers.girder_utils import login, get_items
import pandas as pd

In [29]:
# Rebuild a nested dictionary from its flattened form (source: ChatGPT)
def reconstruct_nested(flattened, sep):
    """Expand a flat mapping with delimited keys into nested dictionaries.

    Each key of ``flattened`` is split on ``sep``; every piece except the
    last names a dictionary level (created when absent) and the last piece
    is the key under which the value is stored.

    Args:
        flattened: Mapping whose string keys encode a path, e.g. ``"a:b:c"``.
        sep: Separator between the pieces of a path, e.g. ``":"``.

    Returns:
        A new dictionary holding the same values at their nested locations.
    """
    root = {}
    for path, leaf in flattened.items():
        *parents, last = path.split(sep)
        # Descend from the root, adding missing levels on the way down.
        node = root
        for name in parents:
            if name not in node:
                node[name] = {}
            node = node[name]
        node[last] = leaf
    return root

In [2]:
# Authenticate girder client.
# `login` is imported from dsa_helpers.girder_utils and is given only the API
# URL; presumably it asks for credentials and returns an authenticated girder
# client -- TODO confirm against dsa_helpers.
# NOTE(review): the endpoint is plain http -- confirm that is acceptable for
# sending credentials.
gc = login("http://bdsa.pathology.emory.edu:8080/api/v1")

In [None]:
# Collect the Tau-stained items that are verified, are not controls, and are
# explicitly flagged as not being bad images.
items_by_case = {}  # case ID -> list of the items kept for that case.
items = []  # every kept item, in the order it was found.

# Walk the year folders of the Emory ADRC collection.
for fld in gc.listFolder("673133b4900c0c05599779aa"):
    # Skip every year except 2020, 2022, 2023 and 2024, which I have verified.
    if fld['name'] not in ('2020', '2022', '2023', '2024'):
        continue

    for item in get_items(gc, fld['_id']):
        meta = item.get('meta', {})

        # Items without an 'npSchema' entry cannot be screened.
        if 'npSchema' not in meta:
            continue
        np_schema = meta['npSchema']

        # Drop controls, then anything not explicitly marked as a good image.
        if meta.get('control') == "yes":
            continue
        if meta.get("bad_image") != "no":
            continue

        # Keep only verified Tau stains.
        if np_schema.get('verified') != "yes":
            continue
        if np_schema.get('stainID') != "Tau":
            continue

        case_id = np_schema['caseID']
        items_by_case.setdefault(case_id, []).append(item)
        items.append(item)

print(f"Total number of cases: {len(items_by_case)}")
print(f"Total number of images: {len(items)}")

In [None]:
# Flatten the item dictionaries into one row per item, joining nested keys
# with ":" and replacing missing values with empty strings.
df = pd.json_normalize(items, sep=":")
df = df.fillna("")
df.head()

In [None]:
# Print the unique region names with their counts, skipping the blank name.
region_counts = df['meta:npSchema:regionName'].value_counts()
for name, count in region_counts.items():
    if name == "":
        continue
    print(f"{name} (n={count})")

# Map each region name that I want to include to the region it counts as.
regions = {
    "Amygdala": "Amygdala",
    "Temporal cortex": "Temporal cortex",
    "Hippocampus": "Hippocampus",
    "Occipital cortex": "Occipital cortex",
    "Left hippocampus": "Hippocampus",
    "Right hippocampus": "Hippocampus",
}

# Count the distinct regions that the names above collapse to.
n_regions = len({*regions.values()})
print(f"Total number of regions: {n_regions}")

In [None]:
"""Loop through the cases.
Only include cases that have images from all the selected regions.
Only include one image from each region.
"""
# Build the NeuroTK dataset: for every case that has images from all of the
# selected regions, keep the first image of each region.
REGION_COLUMN = 'meta:npSchema:regionName'
neurotk_items = []
neurotk_items_small = []  # items of the first two qualifying cases only.
case_count = 0  # number of cases added to the dataset so far.

for case_id, case_items in items_by_case.items():
    # Convert the case items to a dataframe.
    case_df = pd.json_normalize(case_items, sep=":").fillna("")

    # Add a region column, whose value is set by the regions dictionary ("" for
    # region names that are not selected). The region-name column only exists
    # when at least one item of the case carries that key, so handle its
    # absence explicitly instead of failing with a KeyError.
    if REGION_COLUMN in case_df.columns:
        case_df['region'] = [
            regions.get(name, "") for name in case_df[REGION_COLUMN]
        ]
    else:
        case_df['region'] = ""

    # Filter to regions available.
    case_df = case_df[case_df['region'] != ""]

    # Check that all regions are available.
    if case_df['region'].nunique() != n_regions:
        continue

    # For each unique region only take the first image; drop_duplicates keeps
    # the first row per region, in order of first appearance.
    for _, row in case_df.drop_duplicates(subset='region').iterrows():
        # NOTE: the row comes from the blank-filled dataframe, so keys that are
        # missing on this item but present on another item of the case come
        # back as "".
        record = reconstruct_nested(row.to_dict(), ":")
        record["meta"]["npSchema"]["region"] = record["region"]

        # Only add some keys.
        item = {
            "_id": record['_id'],
            "name": record['name'],
            "meta": record['meta'],
        }
        neurotk_items.append(item)

        # The small dataset holds only the first two qualifying cases.
        if case_count < 2:
            neurotk_items_small.append(item)

    case_count += 1

print("Total number of items to push to NeuroTK:", len(neurotk_items))
print("For the small dataset:", len(neurotk_items_small))

In [40]:
# Create the dataset items (full and small), both under the same parent ID.
# NOTE(review): with reuseExisting=True girder_client presumably returns an
# already-existing item as is, without applying `metadata` -- confirm, since a
# re-run would then not refresh the stored dataset.
for item_name, dataset in (
    ("Verified Tau 4 Regions", neurotk_items),
    ("Verified Tau 4 Regions (small)", neurotk_items_small),
):
    _ = gc.createItem(
        "673512de900c0c0559bf12f0",
        item_name,
        reuseExisting=True,
        metadata={"dataset": dataset, "filters": {}},
    )

**Done!**