## Data Transformation 
#### This notebook is used to transform the data from its original format into one better suited for the study

Importing Necessary Libraries and setting up logger

In [1]:
import re
import json
from ast import literal_eval
import logging
from pathlib import Path

from src.utils.data import *
from src.utils.memory_management import *
from src.configs.config_parser import PathConfigParser, data_config_file, project_root

# Set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Reuse the console handler when this cell has already been run in the current
# kernel: attaching a fresh StreamHandler on every re-run would make each log
# record print once per attached handler.
sh = next((h for h in logger.handlers if isinstance(h, logging.StreamHandler)), None)
if sh is None:
    # First run: create the console handler and add it to the logger
    sh = logging.StreamHandler()
    logger.addHandler(sh)
sh.setLevel(logging.INFO)

Loading Configs from Config Parser

In [2]:
# Read the data configuration file
parser = PathConfigParser(str(data_config_file))
parser.load()

# "data_paths" section of the configuration (empty dict when the section is absent)
data_paths_cfg = parser.get("data_paths", {})

# Data locations, resolved against the project root
RAW_DATA_PATH = project_root / data_paths_cfg.get("raw_data")
PROCESSED_DATA_PATH = project_root / data_paths_cfg.get("processed_data")
GE_PATH = data_paths_cfg.get("brain_regions_genes_ge")
# Directory holding the per-donor grouped gene expression files
PROCESSED_DONORS_GE_PATH = PROCESSED_DATA_PATH / GE_PATH

# Identifiers of the donors, as listed in the configuration
DONORS_IDS = parser.get("donors_ids")

### Downloading the Dataset

In [None]:
# Execute the project's download script inside this notebook's namespace.
# NOTE(review): exec() runs whatever the script file contains; keep it limited
# to the trusted file that ships with the repository.
download_script = project_root / Path("data/download_dataset.py")
with open(download_script) as script_file:
    script_source = script_file.read()
exec(script_source)

Helper Functions for the transformation

In [11]:
def transform_sample_annotations(donor_sa: pd.DataFrame, left_mask : pd.Series) -> pd.DataFrame:
    """
        Reduce a donor's sample annotations to the columns used by the study and
        to the rows selected by ``left_mask``.

        The selected rows are returned with a fresh 0..n-1 index. Both the input
        frame and the intermediate column subset are handed to ``deallocate_df``.
    """
    # Columns of SampleAnnot.csv that are kept
    kept_columns = [
        "structure_id", "structure_name", "well_id",
        "mri_voxel_x", "mri_voxel_y", "mri_voxel_z",
        "mni_x", "mni_y", "mni_z",
    ]
    annotations = keep_df_cols(donor_sa, kept_columns)
    deallocate_df(donor_sa)
    # Boolean row selection, then renumber the surviving rows
    selected = annotations[left_mask].reset_index(drop=True)
    deallocate_df(annotations)
    return selected

def transform_gene_expressions(donor_ge: pd.DataFrame, left_mask : pd.Series) -> pd.DataFrame:
    """
        Select the gene expression columns flagged by ``left_mask``.

        The first column of ``donor_ge`` is always kept; ``left_mask`` is applied,
        by position, to the columns that follow it.
    """
    # Prepend a True flag so the mask lines up with every column, the leading one included
    leading_flag = pd.Series([True], index=[0])
    column_mask = pd.concat([leading_flag, left_mask], ignore_index=True)
    # Positions of the columns whose flag is True
    selected_positions = column_mask[column_mask].index.tolist()
    selected_columns = donor_ge.iloc[:, selected_positions]
    deallocate_df(donor_ge)
    return selected_columns

def write_geneexpressions_to_json(df: pd.DataFrame, pth: Path) -> None:
    """
        Write gene expression records to a JSON file, grouped by brain region.

        The output maps each ``brain_region`` (converted to ``int``; JSON stores it
        as a string key) to a list of ``{"gene_id", "gene_expression_values"}``
        records, with the expression values kept as lists of numbers.

        Args:
            df: frame with ``brain_region``, ``gene_id`` and
                ``gene_expression_values`` columns. The values may be lists or
                their string representation. The frame is not modified.
            pth: destination of the JSON file.
    """
    # Parse stringified lists on a copy of the column so the caller's frame is untouched
    parsed_values = df["gene_expression_values"].apply(
        lambda x: literal_eval(x) if isinstance(x, str) else x
    )
    parsed = df.assign(gene_expression_values=parsed_values)

    # Iterate the groups directly instead of groupby(...).apply(...): the output keeps
    # the same sorted-by-region order, an empty frame yields an empty mapping, and
    # the grouping column is never handed to a user function.
    grouped = {
        int(region): rows[["gene_id", "gene_expression_values"]].to_dict(orient="records")
        for region, rows in parsed.groupby("brain_region")
    }

    # Write grouped data to JSON
    with open(pth, "w") as f:
        json.dump(grouped, f, indent=4)


### Creation of Grouped CSV Files for each Donor

Defining the regex used to select the donor directories to load

In [12]:
# Name pattern of a donor directory inside the raw data folder
donor_pattern = r"^normalized_microarray_donor\d+$"
# Keep only the sub-directories of the raw data folder whose name matches the pattern
donor_dirs = [
    entry
    for entry in RAW_DATA_PATH.iterdir()
    if entry.is_dir() and re.match(donor_pattern, entry.name)
]

Loading the files and processing them individually in `data/processed/`

In [None]:
# For every donor directory: keep the left-hemisphere samples, label each sample
# column with its brain region id, and write one list of expression values per
# (brain_region, gene_id) pair to "<donor id>_grouped.csv".
for donor_path in donor_dirs:
    # Processing path
    logger.info(f"Processing data of {donor_path}")

    # Processing SampleAnnot to get the brain region id ("structure id")
    donor_sa = load_df_from_csv(donor_path / "SampleAnnot.csv")
    # Create a mask for left hemisphere entries
    left_mask = mask_left_hemisphere(donor_sa)
    # Transform donor_sa
    donor_sa_filtered = transform_sample_annotations(donor_sa, left_mask)

    # Loading Gene Expressions
    donor_ge = load_df_from_csv(donor_path / "MicroarrayExpression.csv")
    # Transform Gene Expressions
    donor_ge_filtered = transform_gene_expressions(donor_ge, left_mask)
    # Load probes data
    donor_probes = load_df_from_csv(donor_path / "Probes.csv")
    # Use the brain region id of each sample as its column name; several samples
    # can share a region, so column names may repeat
    donor_ge_filtered.columns = ['probe_id'] + list(donor_sa_filtered["structure_id"])
    deallocate_df(donor_sa_filtered)
    # Add gene_id Column in the gene_expressions data
    # NOTE(review): values are aligned by row index, not joined on probe_id; this
    # presumes Probes.csv lists probes in the same order as MicroarrayExpression.csv
    donor_ge_filtered.insert(0, 'gene_id', donor_probes['gene_id'])

    # Drop probe_id column
    donor_ge_filtered = donor_ge_filtered.drop(columns=['probe_id'])

    # Melt to long format (one row per gene/sample value) to make grouping easier
    df_melted = donor_ge_filtered.melt(id_vars=["gene_id"], var_name="brain_region", value_name="gene_expression_values")
    # Group by brain_region and gene_id and aggregate the values into lists
    df_grouped = df_melted.groupby(["brain_region", "gene_id"])["gene_expression_values"].apply(list).reset_index()
    # Serialize the lists with json.dumps so they can be parsed back after reading the CSV
    df_grouped["gene_expression_values"] = df_grouped["gene_expression_values"].apply(json.dumps)
    # Save the grouped csv of this donor in the directory the later cells read from
    # (PROCESSED_DONORS_GE_PATH) instead of a hard-coded sub-directory name
    write_df_to_csv(df_grouped, PROCESSED_DONORS_GE_PATH / f"{get_donor_id_from_path(donor_path)}_grouped.csv")
    deallocate_df(df_melted)
    deallocate_df(donor_ge_filtered)



In [None]:
# Variant of the per-donor grouping that also stores the sample coordinates.
# It writes to the same "<donor id>_grouped.csv" files as the plain grouping loop,
# so running it replaces those files with versions that include the coordinate rows.

# Coordinate columns of SampleAnnot.csv appended as extra rows (one row per coordinate)
coordinate_columns = ["mri_voxel_x", "mri_voxel_y", "mri_voxel_z", "mni_x", "mni_y", "mni_z"]

for donor_path in donor_dirs:
    # Processing path
    logger.info(f"Processing data of {donor_path}")

    # Processing SampleAnnot to get the brain region id ("structure id")
    donor_sa = load_df_from_csv(donor_path / "SampleAnnot.csv")
    # Create a mask for left hemisphere entries
    left_mask = mask_left_hemisphere(donor_sa)
    # Transform donor_sa
    donor_sa_filtered = transform_sample_annotations(donor_sa, left_mask)

    # Loading Gene Expressions
    donor_ge = load_df_from_csv(donor_path / "MicroarrayExpression.csv")
    # Transform Gene Expressions
    donor_ge_filtered = transform_gene_expressions(donor_ge, left_mask)
    # Load probes data
    donor_probes = load_df_from_csv(donor_path / "Probes.csv")
    # Use the brain region id of each sample as its column name
    donor_ge_filtered.columns = ['probe_id'] + list(donor_sa_filtered["structure_id"])
    # Add gene_id Column in the gene_expressions data
    donor_ge_filtered.insert(0, 'gene_id', donor_probes['gene_id'])

    # Drop probe_id column
    donor_ge_filtered = donor_ge_filtered.drop(columns=['probe_id'])

    # Turn the coordinate columns into rows: after the transpose there is one row
    # per coordinate and one column per sample.
    coordinate_rows = donor_sa_filtered[coordinate_columns].T
    # Label the sample columns like the expression table. 'gene_id' is skipped here:
    # assigning all n+1 labels to the n sample columns raises a length mismatch.
    coordinate_rows.columns = donor_ge_filtered.columns[1:]
    # The coordinate name takes the place of the gene id for these rows
    coordinate_rows.insert(0, 'gene_id', coordinate_rows.index)
    # Release the annotations only now that nothing reads them any more
    deallocate_df(donor_sa_filtered)

    # Append the coordinate rows below the gene expression rows
    donor_ge_filtered = pd.concat([donor_ge_filtered, coordinate_rows], ignore_index=True)

    # Melt to long format (one row per gene/sample value) to make grouping easier
    df_melted = donor_ge_filtered.melt(id_vars=["gene_id"], var_name="brain_region", value_name="gene_expression_values")
    # Group by brain_region and gene_id and aggregate the values into lists
    df_grouped = df_melted.groupby(["brain_region", "gene_id"])["gene_expression_values"].apply(list).reset_index()
    # Serialize the lists with json.dumps so they can be parsed back after reading the CSV
    df_grouped["gene_expression_values"] = df_grouped["gene_expression_values"].apply(json.dumps)
    # Save the grouped csv of this donor in the directory the later cells read from
    # (PROCESSED_DONORS_GE_PATH) instead of a hard-coded sub-directory name
    write_df_to_csv(df_grouped, PROCESSED_DONORS_GE_PATH / f"{get_donor_id_from_path(donor_path)}_grouped.csv")
    deallocate_df(df_melted)
    deallocate_df(donor_ge_filtered)



### Meta Donor CSV Creation and Selecting only Common brain regions for the study

Creating `meta_donor.csv` which is the file having the combined data from all donors

In [None]:
# Load the grouped CSV of every configured donor and collect the frames in donor_ges
donor_ges = []
for donor in DONORS_IDS:
    # Load donor .csv from processed data
    donor_ge = load_df_from_csv(PROCESSED_DONORS_GE_PATH / Path(f"{donor}_grouped.csv"))
    logger.info(f"Donor Id: {str(donor)}")
    logger.info(f"Number of brain regions: {donor_ge['brain_region'].nunique()}")
    logger.info(f"Number of gene ids: {donor_ge['gene_id'].nunique()}")
    # The CSV stores each value list as a JSON string; parse it back into a list
    donor_ge["gene_expression_values"] = donor_ge["gene_expression_values"].apply(json.loads)
    # The frame stays referenced by donor_ges and is read by the following cells,
    # so it must not be passed to deallocate_df here.
    donor_ges.append(donor_ge)

In [None]:
# Brain regions present in at least one donor
region_sets = [set(frame["brain_region"]) for frame in donor_ges]
all_brain_region = set.union(*region_sets)
len(all_brain_region)

In [None]:
# Finding common brain regions and filtering the others out
common_brain_regions = set.intersection(*(set(donor_ge['brain_region']) for donor_ge in donor_ges))
logger.info(f"Number of common brain regions: {len(common_brain_regions)}")

# Keep only the common columns in each DataFrame
filtered_donors_ges = [donor_ge[donor_ge['brain_region'].isin(common_brain_regions)] for donor_ge in donor_ges]
set(filtered_donors_ges[3]['brain_region'])==common_brain_regions

# Concatenate all filtered DataFrames
meta_donor_df = pd.concat(filtered_donors_ges, ignore_index=True)
logger.info(f"meta_donor_df size:{len(meta_donor_df)}")
logger.info(f"Meta Donor DF has only list of common_brain_regions: {set(meta_donor_df['brain_region'])==common_brain_regions}")

concatenated_ges = meta_donor_df.groupby(["brain_region", "gene_id"])["gene_expression_values"].apply(lambda x: sum(x, [])).reset_index()

write_df_to_csv(concatenated_ges, PROCESSED_DONORS_GE_PATH / f"meta_donor.csv")

### [Optional] Transforming files to .json for a hierarchical structure.

In [None]:
# Creating .json files for Hierarchal Data
for donor in DONORS_IDS:
    logger.info(f"Creating donor id: {str(donor)} .json file")
    # Load donor .csv from processed data 
    donor_ge = load_df_from_csv(PROCESSED_DONORS_GE_PATH / Path(f"{donor}_grouped.csv"))
    write_geneexpressions_to_json(donor_ge, PROCESSED_DONORS_GE_PATH / Path(f"{donor}_grouped.json"))
# Creating Meta Donor Json File
logger.info(f"Creating meta_donor.json file")
meta_donor_ge = load_df_from_csv(PROCESSED_DONORS_GE_PATH / Path(f"meta_donor.csv"))
write_geneexpressions_to_json(meta_donor_ge, PROCESSED_DONORS_GE_PATH / Path(f"meta_donor.json"))