In [1]:
import argparse
import itertools
import pathlib

import numpy as np
import pandas as pd
import toml

In [2]:
argparser = argparse.ArgumentParser()
argparser.add_argument("--cell_type", default="all")

args = argparser.parse_args()

cell_type = args.cell_type

usage: ipykernel_launcher.py [-h] [--cell_type CELL_TYPE]
ipykernel_launcher.py: error: unrecognized arguments: --f=/home/lippincm/.local/share/jupyter/runtime/kernel-v2-269506aACVuRK4WtMt.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
# Parameters
aggregation = True
nomic = True

In [5]:
MODEL_TYPE = "regression"

In [6]:
# toml file path
TOML_PATH = pathlib.Path("../splits.toml")
# read toml file via toml
data_splits_by_treatments = toml.load(TOML_PATH)

# define the 100% test set data treatments
test_100_percent = data_splits_by_treatments["splits"]["data_splits_100"]
test_75_percent = data_splits_by_treatments["splits"]["data_splits_75"]

In [7]:
aggregate_and_nomic_path = pathlib.Path(
    f"../../../data/{cell_type}_preprocessed_sc_norm_aggregated_nomic.parquet"
).resolve(strict=True)
aggregate_path = pathlib.Path(
    f"../../../data/{cell_type}_preprocessed_sc_norm_aggregated.parquet"
).resolve(strict=True)
data_df = pd.read_parquet(aggregate_and_nomic_path)

data_df.head()

morphology_df = pd.read_parquet(aggregate_path)

In [8]:
# get the NSU columns
nsu_cols = [col for col in data_df.columns if "NSU" in col]
nomic_df = data_df[nsu_cols]
nomic_df.loc["Metadata_Well"] = data_df["Metadata_Well"]
nomic_df.loc["oneb_Treatment_Dose_Inhibitor_Dose"] = data_df[
    "oneb_Treatment_Dose_Inhibitor_Dose"
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomic_df.loc["Metadata_Well"] = data_df["Metadata_Well"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomic_df.loc["oneb_Treatment_Dose_Inhibitor_Dose"] = data_df[


In [9]:
# subset each column that contains metadata
metadata = data_df.filter(regex="Metadata")

# get all columns that are not metadata except for metadata_Well
data = data_df.drop(metadata.columns, axis=1)

# get the metadata_Well column
metadata_well = metadata[
    ["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
]

data_df = pd.merge(data, metadata_well, left_index=True, right_index=True)

In [10]:
# drop morphology metadata
morphology_df = morphology_df.drop(
    morphology_df.filter(regex="Metadata").columns, axis=1
)
morphology_df.head()

Unnamed: 0,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,Cytoplasm_AreaShape_Zernike_2_0,Cytoplasm_AreaShape_Zernike_2_2,Cytoplasm_AreaShape_Zernike_3_1,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_03_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrER_3_00_256,Nuclei_Texture_Variance_CorrGasdermin_3_00_256
0,-0.032252,0.060488,0.105783,-0.029521,0.00512,-0.052854,-0.039927,-0.004497,0.022885,-0.045796,...,0.06666,0.053661,-0.196861,-0.206514,-0.203146,-0.211731,0.211322,0.014317,-0.03597,-0.012618
1,-0.033178,0.054449,0.060468,-0.073702,-0.012549,-0.076747,-0.072953,0.003722,-0.001647,-0.075508,...,0.098272,0.215308,-0.080188,-0.095363,-0.092275,-0.096811,0.086683,0.0079,-0.037725,-0.025987
2,-0.095278,0.085873,-0.114237,-0.175646,-0.003159,-0.054675,-0.120368,0.084387,-0.047532,-0.086031,...,-0.045934,-0.530389,-0.0244,-0.023351,-0.022373,-0.027236,-0.008216,-0.037703,-0.002063,0.006181
3,-0.081,0.078124,-0.104225,-0.164719,0.001145,-0.070612,-0.131044,0.088143,-0.062731,-0.094268,...,-0.008374,-1.001879,-0.015672,-0.018851,-0.021075,-0.021581,-0.014699,-0.031968,-0.025621,-0.019639
4,0.022371,-0.022614,-0.039573,-0.0493,-0.013386,-0.035579,-0.026341,-0.034624,-0.015618,-0.018042,...,0.06117,0.279401,0.037046,0.029486,0.030131,0.033986,-0.032881,-0.010667,-0.02728,-0.026652


In [11]:
# define the list of the channels
channel_list = ["DNA", "Gasdermin", "ER", "Mito", "PM"]

In [12]:
# set up the LOO channel with recursion for dropping multiple channels


def channel_drop(df, channel):
    df = df.drop(df.filter(regex=channel).columns, axis=1)
    return df

In [13]:
# dictionary for each df to go into
results_dict = {}

In [14]:
# get all of the the channel combinations
for i in channel_list:
    tmp = channel_drop(morphology_df, i[0])
    tmp_df = tmp
    # get the remaining channels for indexing purposes
    channel_list_index = [x for x in channel_list if x not in i]
    channel_list_index = "_".join(channel_list_index)
    results_dict[channel_list_index] = tmp_df
else:
    print("channel length error")

channel length error


In [15]:
# set path to save
pathlib.Path(f"../indexes/{cell_type}/regression/channels").mkdir(
    parents=True, exist_ok=True
)

# loop through the dictionary and save each dataframe
for i in results_dict:
    print(i)
    print(results_dict[i].shape)
    # rename the dictionary keys
    # combine the metadata and morphology dataframes
    new_df = pd.merge(results_dict[i], metadata_well, left_index=True, right_index=True)
    # combine the cytokine dataframes
    new_df = pd.merge(new_df, nomic_df, left_index=True, right_index=True)
    # set file path
    file_path = pathlib.Path(f"../indexes/{cell_type}/regression/channels/{i}.parquet")
    # save the dataframe
    new_df.to_parquet(file_path)

Gasdermin_ER_Mito_PM
(154, 309)
DNA_ER_Mito_PM
(154, 1138)
DNA_Gasdermin_Mito_PM
(154, 955)
DNA_Gasdermin_ER_PM
(154, 416)
DNA_Gasdermin_ER_Mito
(154, 725)


In [16]:
# get the list of the dictionary keys
index_list = list(results_dict.keys())
index_list_new = []
for i in index_list:
    index_list_new.append(i + ".parquet")
# write the list to a text file
# file path
file_write_path = pathlib.Path(f"../cytokine_list/channel_splits.txt")
with open(file_write_path, "w") as f:
    for i in index_list_new:
        f.write("%s\n" % i)