This notebook aggregates each bone on a per-fish basis.
We aggregate using the following methods:
- Mean
- Sum
- Custom defined per feature

In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# input files: resolved with strict=True, so a missing file raises right here
unaggregated_fs_data_path, custom_aggregated_type_path = (
    pathlib.Path(input_file).resolve(strict=True)
    for input_file in (
        # processed data
        "../../data/5.converted_data/normalized_feature_selected_output.parquet",
        # custom aggregation type
        "../utils/morphology_aggregation_method.csv",
    )
)

# output files: resolved non-strictly because they may not exist yet
(
    mean_aggregated_data_path,
    sum_aggregated_data_path,
    custom_aggregated_data_path,
    unaggregated_data_path,
) = (
    pathlib.Path(output_file).resolve()
    for output_file in (
        # mean aggregated data
        "../../data/5.converted_data/mean_aggregated_data.parquet",
        # sum aggregated data
        "../../data/5.converted_data/sum_aggregated_data.parquet",
        # custom aggregation data
        "../../data/5.converted_data/custom_aggregated_data.parquet",
        # unaggregated data
        "../../data/5.converted_data/non_aggregated_data.parquet",
    )
)

In [3]:
# load the parquet file configured in the paths cell
unaggregated_fs_data = pd.read_parquet(path=unaggregated_fs_data_path)

# report the (rows, columns) dimensions, then preview the first five rows
print(f"unaggregated_fs_data shape: {unaggregated_fs_data.shape}")
unaggregated_fs_data.head(n=5)

unaggregated_fs_data shape: (136, 268)


Unnamed: 0,Metadata_Image_FileName_OP,Metadata_ObjectNumber,Metadata_Object_ConvertImageToObjects_Number_Object_Number,Metadata_Object_ConvertImageToObjects_AreaShape_BoundingBoxArea,Metadata_Object_ConvertImageToObjects_AreaShape_BoundingBoxMaximum_X,Metadata_Object_ConvertImageToObjects_AreaShape_BoundingBoxMaximum_Y,Metadata_Object_ConvertImageToObjects_AreaShape_BoundingBoxMinimum_X,Metadata_Object_ConvertImageToObjects_AreaShape_BoundingBoxMinimum_Y,Metadata_Object_ConvertImageToObjects_Location_CenterMassIntensity_X_OP,Metadata_Object_ConvertImageToObjects_Location_CenterMassIntensity_Y_OP,...,Texture_SumEntropy_OP_3_02_256,Texture_SumEntropy_OP_3_03_256,Texture_SumVariance_OP_3_00_256,Texture_SumVariance_OP_3_01_256,Texture_SumVariance_OP_3_02_256,Texture_SumVariance_OP_3_03_256,Texture_Variance_OP_3_00_256,Texture_Variance_OP_3_01_256,Texture_Variance_OP_3_02_256,Texture_Variance_OP_3_03_256
0,MAX_high_10_L.tiff,1,1,38250.0,269.0,182.0,44.0,12.0,131.546149,118.641091,...,0.841475,0.834574,1.149186,1.1522,1.078905,1.144848,1.040009,1.02574,1.045492,1.027617
1,MAX_high_10_R.tiff,1,1,34170.0,208.0,245.0,38.0,44.0,106.962058,162.30419,...,0.45248,0.436138,0.112178,0.131982,0.108602,0.1257,0.075717,0.071207,0.076477,0.067922
2,MAX_high_11_L.tiff,1,1,41736.0,250.0,267.0,62.0,45.0,131.359827,159.444463,...,0.494813,0.490844,-0.01614,0.001041,-0.021997,-0.020992,-0.033934,-0.042635,-0.034983,-0.038763
3,MAX_high_11_R.tiff,1,1,43616.0,212.0,272.0,24.0,40.0,101.069901,185.48643,...,0.878729,0.865617,1.120905,1.140604,1.074529,1.117941,1.031585,1.015172,1.03515,1.019188
4,MAX_high_12_L.tiff,2,2,25894.0,283.0,155.0,69.0,34.0,164.579054,105.266522,...,0.476768,0.482674,0.177886,0.147933,0.103313,0.156457,0.111765,0.107846,0.117614,0.108859


## Get the features for the fs data

In [4]:
# any column whose name contains "Metadata" is treated as a metadata column
is_metadata = unaggregated_fs_data.columns.str.contains("Metadata")
metadata_cols = unaggregated_fs_data.columns[is_metadata]

# keep only the feature columns ...
fs_data = unaggregated_fs_data.drop(columns=metadata_cols)
# ... then put the three grouping keys back in front. Each insert goes to
# position 0, so the final leading order is side, genotype, replicate.
for key in ("Metadata_replicate", "Metadata_genotype", "Metadata_side"):
    fs_data.insert(0, key, unaggregated_fs_data[key])
print(f"fs_data shape: {fs_data.shape}")
fs_data.head()

fs_data shape: (136, 247)


Unnamed: 0,Metadata_side,Metadata_genotype,Metadata_replicate,AreaShape_Area,AreaShape_CentralMoment_0_0,AreaShape_CentralMoment_0_1,AreaShape_CentralMoment_0_2,AreaShape_CentralMoment_0_3,AreaShape_CentralMoment_1_0,AreaShape_CentralMoment_1_1,...,Texture_SumEntropy_OP_3_02_256,Texture_SumEntropy_OP_3_03_256,Texture_SumVariance_OP_3_00_256,Texture_SumVariance_OP_3_01_256,Texture_SumVariance_OP_3_02_256,Texture_SumVariance_OP_3_03_256,Texture_Variance_OP_3_00_256,Texture_Variance_OP_3_01_256,Texture_Variance_OP_3_02_256,Texture_Variance_OP_3_03_256
0,L,high,10,1.706234,1.706234,2.652373,3.280425,-1.992966,0.18463,-1.404376,...,0.841475,0.834574,1.149186,1.1522,1.078905,1.144848,1.040009,1.02574,1.045492,1.027617
1,R,high,10,0.771674,0.771674,-1.332747,0.164304,0.244371,0.17376,-1.885144,...,0.45248,0.436138,0.112178,0.131982,0.108602,0.1257,0.075717,0.071207,0.076477,0.067922
2,L,high,11,2.180858,2.180858,2.142686,1.838038,-0.077515,0.00865,0.876597,...,0.494813,0.490844,-0.01614,0.001041,-0.021997,-0.020992,-0.033934,-0.042635,-0.034983,-0.038763
3,R,high,11,2.081557,2.081557,-0.880954,1.014901,0.660697,3.593763,-1.417922,...,0.878729,0.865617,1.120905,1.140604,1.074529,1.117941,1.031585,1.015172,1.03515,1.019188
4,L,high,12,0.659368,0.659368,-0.472511,1.810874,-1.929153,0.242599,-0.841314,...,0.476768,0.482674,0.177886,0.147933,0.103313,0.156457,0.111765,0.107846,0.117614,0.108859


### Mean

In [5]:
# average every feature within each genotype / replicate / side group
group_keys = ["Metadata_genotype", "Metadata_replicate", "Metadata_side"]
grouped_features = fs_data.groupby(group_keys)
mean_aggreated_data = grouped_features.mean().reset_index()
print(f"aggreated_data shape: {mean_aggreated_data.shape}")
# write the mean-aggregated table to its parquet output path
mean_aggreated_data.to_parquet(mean_aggregated_data_path)

aggreated_data shape: (83, 247)


### Sum

In [6]:
# total every feature within each genotype / replicate / side group
group_keys = ["Metadata_genotype", "Metadata_replicate", "Metadata_side"]
grouped_features = fs_data.groupby(group_keys)
sum_aggreated_data = grouped_features.sum().reset_index()
print(f"aggreated_data shape: {sum_aggreated_data.shape}")
# write the sum-aggregated table to its parquet output path
sum_aggreated_data.to_parquet(sum_aggregated_data_path)

aggreated_data shape: (83, 247)


### Custom defined per feature

In [7]:
# load the table that names an aggregation method for each feature
custom_aggregated_method = pd.read_csv(custom_aggregated_type_path)
print(custom_aggregated_method.shape)
# sanity check: every feature listed in the table must be a column of fs_data
available_columns = fs_data.columns
for feature in custom_aggregated_method["Feature"].tolist():
    assert feature in available_columns, f"{feature} not found in the data"

(244, 3)


In [8]:
# Aggregate every feature with the method assigned to it in
# custom_aggregated_method ("Mean" or "Sum"), using a single groupby pass.
#
# Previously each feature was grouped separately and the result concatenated
# onto a growing frame, which re-copied the frame on every iteration and
# needed a duplicate-column filter to strip the repeated group keys.

# define the metadata columns to aggregate by
metadata_cols = ["Metadata_genotype", "Metadata_replicate", "Metadata_side"]

# labels used in the aggregation table -> pandas reducer names
pandas_reducers = {"Mean": "mean", "Sum": "sum"}

# build {feature: reducer} in table order; if a feature is listed more than
# once, only its first row is used
feature_reducers = {}
for feature, agg_method in zip(
    custom_aggregated_method["Feature"], custom_aggregated_method["Aggregation"]
):
    if feature in feature_reducers:
        continue
    if agg_method not in pandas_reducers:
        raise ValueError(f"Aggregation method {agg_method} not recognized")
    feature_reducers[feature] = pandas_reducers[agg_method]

# an empty table would leave nothing to aggregate; fail with a clear message
if not feature_reducers:
    raise ValueError("No features listed in the custom aggregation table")

# one pass over the groups; reset_index() puts the group keys back as the
# leading columns, followed by the features in table order
custom_aggregated_data = (
    fs_data.groupby(metadata_cols).agg(feature_reducers).reset_index()
)
print(f"custom_aggregated_data shape: {custom_aggregated_data.shape}")

# save the data
custom_aggregated_data.to_parquet(custom_aggregated_data_path)

custom_aggregated_data shape: (83, 247)


## Non-aggregated features

In [9]:
# keep an independent copy of the per-object feature table, with no
# aggregation applied, and write it to its own parquet output
non_aggregated_data = fs_data.copy(deep=True)
non_aggregated_data.to_parquet(path=unaggregated_data_path)