# Process Data from Article for Pipeline

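Separate the gene expression matrix into wild-type (WT) and mutant (MT) matrices, keeping only genotyped cells. Note that "MT" here means mutant cells, not the mitochondrial `MT-` genes.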

Source article: https://www.nature.com/articles/s41588-022-01179-9

GEO dataset (GSE158067): https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE158067

In [10]:
import pandas as pd

# Load the original expression matrix. The file is space-delimited and its
# first column is used as the row index (gene symbols in the displayed
# output, with one column per cell ID).
expr_path = "../../resources/GSE158067/original/GSE158067_gene_exp_mtx.txt"
expr = pd.read_csv(expr_path, sep=" ", index_col=0)
expr

Unnamed: 0,CH02_EC_A10_S34,CH02_EC_A11_S35,CH02_EC_A12_S36,CH02_EC_A1_S25,CH02_EC_A2_S26,CH02_EC_A3_S27,CH02_EC_A4_S28,CH02_EC_A5_S29,CH02_EC_A6_S30,CH02_EC_A7_S31,...,CH04.3.cDNA.H12_S296,CH04.3.cDNA.H1_S285,CH04.3.cDNA.H2_S286,CH04.3.cDNA.H3_S287,CH04.3.cDNA.H4_S288,CH04.3.cDNA.H5_S289,CH04.3.cDNA.H6_S290,CH04.3.cDNA.H7_S291,CH04.3.cDNA.H8_S292,CH04.3.cDNA.H9_S293
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F29,2,2,3,0,0,3,0,0,9,4,...,0,0,0,0,0,0,0,0,0,0
OR4F16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4,12978,10108,5077,6704,5632,9842,8432,2586,8901,9382,...,5056,3403,4492,4588,1676,3413,4729,8041,3356,5452
MT-ND5,5584,1450,1403,697,1233,791,1197,1362,2417,2248,...,1680,1381,1087,1343,933,1917,2183,2498,2125,1731
MT-ND6,237,182,0,10,46,187,76,137,140,159,...,284,119,87,246,147,234,465,225,287,117
MT-CYB,12532,6580,2782,5710,3100,7376,5319,1796,6687,6578,...,3820,2530,4654,3590,1002,2712,3019,6890,2564,3836


In [11]:
# Load the per-cell metadata sheet with the calamine engine. The first row
# of the sheet is skipped and the first column becomes the index
# (RNA_cell_ID in the displayed output).
meta_path = "../../resources/GSE158067/original/GSE158067_scRNA_cell_metadata.xlsx"
meta = pd.read_excel(meta_path, engine="calamine", index_col=0, skiprows=1)
meta

Unnamed: 0_level_0,# Reads,# Genes,% Mitochondrial reads,% Ribosomal reads,No_mutant_reads,No_wt_reads,Genotype,Plate,Sample,Type
RNA_cell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CH02_EC_A10_S34,887866,4263,17.178155,40.434593,8,0,Mutant,CH02_1,CH02,CD34+
CH02_EC_A12_S36,362491,2590,11.259038,21.168250,0,0,not_genotyped,CH02_1,CH02,CD34+
CH02_EC_A2_S26,758769,4241,4.475670,32.569596,103,178,Mutant,CH02_1,CH02,CD34+
CH02_EC_A3_S27,837047,3825,7.897884,32.365686,13,10,Mutant,CH02_1,CH02,CD34+
CH02_EC_A4_S28,938404,3842,6.578616,30.265536,1,40,not_genotyped,CH02_1,CH02,CD34+
...,...,...,...,...,...,...,...,...,...,...
CH04.3.cDNA.H5_S289,725464,6306,3.768760,9.706340,22,87595,WT,CH04_3,CH04,CD34+
CH04.3.cDNA.H6_S290,763177,6541,4.842520,17.126040,24,74,Mutant,CH04_3,CH04,CD34+
CH04.3.cDNA.H7_S291,936236,7512,5.776321,10.382318,30,122165,WT,CH04_3,CH04,CD34+
CH04.3.cDNA.H8_S292,375044,5101,8.139845,7.903073,24,75,Mutant,CH04_3,CH04,CD34+


In [12]:
# Split the cell IDs (the metadata index) by their "Genotype" label.
genotype = meta["Genotype"]
mt_cells = meta.index[genotype == "Mutant"].tolist()
wt_cells = meta.index[genotype == "WT"].tolist()
# Everything that is not explicitly marked "not_genotyped" counts as genotyped.
genotyped_cells = meta.index[genotype != "not_genotyped"].tolist()
# Show the three group sizes as a quick sanity check.
len(mt_cells), len(wt_cells), len(genotyped_cells)

(240, 201, 441)

The two genotype groups are roughly balanced: 240 mutant cells vs. 201 WT cells (441 genotyped cells in total).

In [13]:
# Keep only the expression columns of the cells labelled "Mutant".
expr_mt = expr.loc[:, mt_cells]
expr_mt

Unnamed: 0,CH02_EC_A10_S34,CH02_EC_A2_S26,CH02_EC_A3_S27,CH02_EC_A6_S30,CH02_EC_B10_S82,CH02_EC_B11_S83,CH02_EC_B12_S84,CH02_EC_B1_S73,CH02_EC_B4_S76,CH02_EC_B5_S77,...,CH04.3.cDNA.F11_S271,CH04.3.cDNA.F3_S263,CH04.3.cDNA.G3_S275,CH04.3.cDNA.H11_S295,CH04.3.cDNA.H2_S286,CH04.3.cDNA.H3_S287,CH04.3.cDNA.H4_S288,CH04.3.cDNA.H6_S290,CH04.3.cDNA.H8_S292,CH04.3.cDNA.H9_S293
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F29,2,0,3,9,2,1,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0
OR4F16,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4,12978,5632,9842,8901,8148,5384,5891,4748,7320,4521,...,5725,1946,4517,2570,4492,4588,1676,4729,3356,5452
MT-ND5,5584,1233,791,2417,2478,1110,1086,808,1714,662,...,2120,1785,1247,743,1087,1343,933,2183,2125,1731
MT-ND6,237,46,187,140,110,241,21,238,305,148,...,151,141,194,71,87,246,147,465,287,117
MT-CYB,12532,3100,7376,6687,6357,3240,7271,1956,4041,3391,...,8263,2297,3002,1932,4654,3590,1002,3019,2564,3836


In [14]:
# Keep only the expression columns of the cells labelled "WT".
expr_wt = expr.loc[:, wt_cells]
expr_wt

Unnamed: 0,CH02_EC_A5_S29,CH02_EC_A8_S32,CH02_EC_A9_S33,CH02_EC_B9_S81,CH02_EC_C11_S131,CH02_EC_C3_S123,CH02_EC_C6_S126,CH02_EC_C9_S129,CH02_EC_D12_S180,CH02_EC_D9_S177,...,CH04.3.cDNA.G5_S277,CH04.3.cDNA.G6_S278,CH04.3.cDNA.G7_S279,CH04.3.cDNA.G8_S280,CH04.3.cDNA.G9_S281,CH04.3.cDNA.H10_S294,CH04.3.cDNA.H12_S296,CH04.3.cDNA.H1_S285,CH04.3.cDNA.H5_S289,CH04.3.cDNA.H7_S291
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F29,0,1,0,4,3,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
OR4F16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4,2586,13618,5384,3807,5532,5869,5068,4843,6548,4894,...,6801,3888,7542,3617,2796,6628,5056,3403,3413,8041
MT-ND5,1362,2080,1309,1472,835,1491,2036,980,2488,379,...,1416,1186,1659,1455,1727,3085,1680,1381,1917,2498
MT-ND6,137,59,0,29,110,106,153,210,26,111,...,162,99,105,95,255,523,284,119,234,225
MT-CYB,1796,7146,1986,2232,3562,3597,4024,2714,5668,3868,...,4801,2762,4882,3486,2396,5342,3820,2530,2712,6890


In [15]:
# Keep the expression columns of every genotyped cell (mutant and WT together).
expr_filtered = expr.loc[:, genotyped_cells]
expr_filtered

Unnamed: 0,CH02_EC_A10_S34,CH02_EC_A2_S26,CH02_EC_A3_S27,CH02_EC_A5_S29,CH02_EC_A6_S30,CH02_EC_A8_S32,CH02_EC_A9_S33,CH02_EC_B10_S82,CH02_EC_B11_S83,CH02_EC_B12_S84,...,CH04.3.cDNA.H12_S296,CH04.3.cDNA.H1_S285,CH04.3.cDNA.H2_S286,CH04.3.cDNA.H3_S287,CH04.3.cDNA.H4_S288,CH04.3.cDNA.H5_S289,CH04.3.cDNA.H6_S290,CH04.3.cDNA.H7_S291,CH04.3.cDNA.H8_S292,CH04.3.cDNA.H9_S293
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F29,2,0,3,0,9,1,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
OR4F16,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4,12978,5632,9842,2586,8901,13618,5384,8148,5384,5891,...,5056,3403,4492,4588,1676,3413,4729,8041,3356,5452
MT-ND5,5584,1233,791,1362,2417,2080,1309,2478,1110,1086,...,1680,1381,1087,1343,933,1917,2183,2498,2125,1731
MT-ND6,237,46,187,137,140,59,0,110,241,21,...,284,119,87,246,147,234,465,225,287,117
MT-CYB,12532,3100,7376,1796,6687,7146,1986,6357,3240,7271,...,3820,2530,4654,3590,1002,2712,3019,6890,2564,3836


In [16]:
# Write the three filtered matrices as tab-separated files, keeping both the
# column header (cell IDs) and the row index.
outputs = {
    "../../resources/GSE158067/GSE158067_gene_exp_mtx_filtered_mt.txt": expr_mt,
    "../../resources/GSE158067/GSE158067_gene_exp_mtx_filtered_wt.txt": expr_wt,
    "../../resources/GSE158067/GSE158067_gene_exp_mtx_filtered.txt": expr_filtered,
}
for out_path, frame in outputs.items():
    frame.to_csv(out_path, sep="\t", header=True, index=True)

In [17]:
# Build the cell -> perturbation table for all genotyped cells.
# reset_index() already returns a new frame, so no explicit copy of `meta`
# is needed; the former index becomes the "RNA_cell_ID" column.
genotyped_meta = meta.reset_index()
genotyped_meta = genotyped_meta[genotyped_meta["Genotype"] != "not_genotyped"]
# assign() returns a fresh frame, so the new column is never written onto a
# filtered slice (which would raise pandas' SettingWithCopyWarning).
# "Mutant" cells get the DNMT3A_R882C label; every other remaining cell is
# labelled "Control".
perturbations_list = genotyped_meta.assign(
    Perturbation=genotyped_meta["Genotype"].apply(
        lambda x: "DNMT3A_R882C" if x == "Mutant" else "Control"
    )
)[["RNA_cell_ID", "Perturbation"]]

In [None]:
# Write the cell -> perturbation table as a tab-separated file with a header
# row and without the positional index.
perturbation_path = "../../resources/GSE158067/GSE158067_cell_to_perturbation.tsv"
perturbations_list.to_csv(perturbation_path, sep="\t", header=True, index=False)