# Process Data from Article for Pipeline

Separate the gene expression matrix into wild-type (WT) and mutant (MT) gene expression matrices.

Article: https://www.nature.com/articles/s41588-022-01179-9

GEO record (accession GSE158067): https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE158067

In [None]:
import pandas as pd

# Load the original expression matrix. The file is space-delimited and its
# first column becomes the row index (presumably gene identifiers -- confirm
# against the file); later cells select columns by cell ID.
expr_source = "../../resources/GSE158067/original/GSE158067_gene_exp_mtx.txt"
expr = pd.read_csv(expr_source, index_col=0, sep=" ")
expr

In [None]:
# Write the unfiltered expression matrix back out as tab-separated text.
expr.to_csv(
    path_or_buf="../../resources/GSE158067/GSE158067_gene_exp_mtx.txt",
    sep="\t",
)

In [None]:
# Load the per-cell metadata workbook. The first column becomes the row index.
# NOTE(review): skiprows=1 drops the first sheet row -- presumably a title
# line above the real header; confirm against the workbook.
meta_source = "../../resources/GSE158067/original/GSE158067_scRNA_cell_metadata.xlsx"
meta = pd.read_excel(meta_source, skiprows=1, index_col=0, engine="calamine")
meta

In [None]:
# Collect the metadata index labels (cell IDs) for each genotype group.
genotype = meta["Genotype"]
mt_cells = genotype.loc[genotype == "Mutant"].index.tolist()
wt_cells = genotype.loc[genotype == "WT"].index.tolist()
# Everything except cells explicitly marked as not genotyped.
genotyped_cells = genotype.loc[genotype != "not_genotyped"].index.tolist()
(len(mt_cells), len(wt_cells), len(genotyped_cells))

The data are balanced: the mutant and WT groups contain comparable numbers of cells.

In [None]:
# Keep only the expression-matrix columns listed in mt_cells (mutant cells).
expr_mt = expr.loc[:, mt_cells]
expr_mt

In [None]:
# Keep only the expression-matrix columns listed in wt_cells (WT cells).
expr_wt = expr.loc[:, wt_cells]
expr_wt

In [None]:
# Keep every column listed in genotyped_cells, i.e. drop the cells whose
# genotype is "not_genotyped".
expr_filtered = expr.loc[:, genotyped_cells]
expr_filtered

In [None]:
# Destination path -> matrix to write there. All three subsets are exported
# with identical settings: tab-separated, with header row and index column.
filtered_outputs = {
    "../../resources/GSE158067/GSE158067_gene_exp_mtx_filtered_mt.txt": expr_mt,
    "../../resources/GSE158067/GSE158067_gene_exp_mtx_filtered_wt.txt": expr_wt,
    "../../resources/GSE158067/GSE158067_gene_exp_mtx_filtered.txt": expr_filtered,
}
for out_path, matrix in filtered_outputs.items():
    matrix.to_csv(out_path, sep="\t", header=True, index=True)

In [None]:
# Short perturbation label for each genotype value found in the metadata.
genotype_labels = {"Mutant": "MT", "WT": "WT", "not_genotyped": "NG"}

# Work on a copy so `meta` is untouched; reset_index turns the row index into
# a regular column (selected below as "RNA_cell_ID").
cell_table = meta.copy().reset_index()
# Direct dict lookup: an unexpected genotype value raises KeyError rather
# than being silently passed through.
cell_table["Perturbation"] = cell_table["Genotype"].apply(
    genotype_labels.__getitem__
)
perturbations_list = cell_table[["RNA_cell_ID", "Perturbation"]]

In [None]:
# Save the cell -> perturbation table as TSV with a header row; the row index
# is not written.
perturbations_list.to_csv(
    path_or_buf="../../resources/GSE158067/GSE158067_cell_to_perturbation.tsv",
    index=False,
    header=True,
    sep="\t",
)
perturbations_list