In [7]:
import numpy as np 
from matplotlib import pyplot as plt 
import scanpy as sc
import pandas as pd
import scipy.sparse as sp

In [2]:
%%time 
adata = sc.read_h5ad("../../perturb_cite_seq_data/SCP1064/ready/control/gene_filtered_adata.h5ad")

CPU times: user 115 ms, sys: 117 ms, total: 232 ms
Wall time: 15 s


In [12]:
def generate_nodags_data(adata_path):
    adata = sc.read_h5ad(adata_path)
    data = sp.csr_matrix.toarray(adata.X)
    
    final_genes = adata.var.index
    
    obs_data = data[adata.obs['targets'] == '',:]
    datasets = [obs_data]
    
    for gene in final_genes:
        datasets.append(
            data[adata.obs['targets'] == gene, :]
        )

    intervention_sets = [[None]] + [[i] for i in range(61)]
    
    return datasets, intervention_sets

    

In [13]:
control_datasets, control_interventions = generate_nodags_data("../../perturb_cite_seq_data/SCP1064/ready/control/gene_filtered_adata.h5ad")
cocult_datasets, cocult_interventions = generate_nodags_data("../../perturb_cite_seq_data/SCP1064/ready/cocult/gene_filtered_adata.h5ad")
ifn_datasets, ifn_interventions = generate_nodags_data("../../perturb_cite_seq_data/SCP1064/ready/ifn/gene_filtered_adata.h5ad")


In [14]:
control_training_data, control_training_interventions = control_datasets[:-6], control_interventions[:-6]
cocult_training_data, cocult_training_interventions = cocult_datasets[:-6], cocult_interventions[:-6]
ifn_training_data, ifn_training_interventions = ifn_datasets[:-6], ifn_interventions[:-6]

control_validation_data, control_validation_interventions = control_datasets[-6:], control_interventions[-6:]
cocult_validation_data, cocult_validation_interventions = cocult_datasets[-6:], cocult_interventions[-6:]
ifn_validation_data, ifn_validation_interventions = ifn_datasets[-6:], ifn_interventions[-6:]


In [17]:
# saving control data
for i, dataset in enumerate(control_training_data):
    np.save("../../perturb_cite_seq_data/nodags_data/control/training_data/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/control/training_data/intervention_sets.npy", control_training_interventions)

for i, dataset in enumerate(control_validation_data):
    np.save("../../perturb_cite_seq_data/nodags_data/control/validation_data/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/control/validation_data/intervention_sets.npy", control_validation_interventions)

In [18]:
# saving cocult data
for i, dataset in enumerate(cocult_training_data):
    np.save("../../perturb_cite_seq_data/nodags_data/cocult/training_data/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/cocult/training_data/intervention_sets.npy", cocult_training_interventions)

for i, dataset in enumerate(cocult_validation_data):
    np.save("../../perturb_cite_seq_data/nodags_data/cocult/validation_data/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/cocult/validation_data/intervention_sets.npy", cocult_validation_interventions)

In [19]:
# saving ifn data
for i, dataset in enumerate(ifn_training_data):
    np.save("../../perturb_cite_seq_data/nodags_data/ifn/training_data/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/ifn/training_data/intervention_sets.npy", ifn_training_interventions)

for i, dataset in enumerate(ifn_validation_data):
    np.save("../../perturb_cite_seq_data/nodags_data/ifn/validation_data/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/ifn/validation_data/intervention_sets.npy", ifn_validation_interventions)

In [8]:
# saving control data
for i, dataset in enumerate(control_datasets):
    np.save("../../perturb_cite_seq_data/nodags_data/control/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/control/intervention_sets.npy", control_interventions)

# saving cocult data
for i, dataset in enumerate(cocult_datasets):
    np.save("../../perturb_cite_seq_data/nodags_data/cocult/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/cocult/intervention_sets.npy", cocult_interventions)

# saving ifn data
for i, dataset in enumerate(ifn_datasets):
    np.save("../../perturb_cite_seq_data/nodags_data/ifn/dataset_{}.npy".format(i), dataset)
np.save("../../perturb_cite_seq_data/nodags_data/ifn/intervention_sets.npy", ifn_interventions)



In [20]:
np.save("../../perturb_cite_seq_data/nodags_data/control/training_data/weights.npy", np.eye(61))
np.save("../../perturb_cite_seq_data/nodags_data/cocult/training_data/weights.npy", np.eye(61))
np.save("../../perturb_cite_seq_data/nodags_data/ifn/training_data/weights.npy", np.eye(61))

np.save("../../perturb_cite_seq_data/nodags_data/control/validation_data/weights.npy", np.eye(61))
np.save("../../perturb_cite_seq_data/nodags_data/cocult/validation_data/weights.npy", np.eye(61))
np.save("../../perturb_cite_seq_data/nodags_data/ifn/validation_data/weights.npy", np.eye(61))

In [69]:
data = sp.csr_matrix.toarray(adata.X)

In [70]:
data.shape

(57523, 61)

In [None]:
adata.obs

In [9]:
adata.obs[adata.obs["condition"] == "Control"]["targets"].value_counts()

                                                        13142
TSC22D3                                                   216
CTSD                                                      212
NT5E                                                      211
RTP4                                                      208
                                                        ...  
CGAS,UQCRH                                                  1
ACSL3,CHCHD2,CTSD,DAG1,IDH2,PTMA,SMAD3,TIMP2,TSC22D3        1
CHCHD2,CGAS                                                 1
CHCHD2,CITED1,FRZB,FSTL3,IDH2,IFNGR2,MT2A                   1
DDR1,NME1                                                   1
Name: targets, Length: 13092, dtype: int64

In [10]:
final_genes = adata.var.index

gene_int_samples = {gene: 0 for gene in final_genes}
gene_int_samples[''] = 0

In [13]:
def checkTargetsinFinalGenes(targets, final_genes):
    targets_list = targets.split(",")
    ans = True
    for target in targets_list:
        if targets not in final_genes:
            ans = False
            
    return ans

In [12]:
useful_samples = 0
single_inter_cells = 0
for index, row in adata.obs.iterrows():
    if checkTargetsinFinalGenes(row['targets'], final_genes):
        useful_samples += 1
        gene_int_samples[row['targets']] += 1
    if row['MOI'] == 1 or row["MOI"] == 0:
        single_inter_cells += 1

print(useful_samples)
print(single_inter_cells)

8013
35428


In [74]:
obs_data = data[adata.obs['targets'] == '',:]
datasets = [obs_data]

for gene in final_genes:
    datasets.append(
        data[adata.obs['targets'] == gene, :]
    )

intervention_sets = [[None]] + [[i] for i in range(61)]

In [75]:
for dataset in datasets:
    print(len(dataset))

13142
84
197
168
108
166
164
193
151
78
182
161
112
68
63
134
131
129
144
121
204
81
195
122
115
146
161
190
40
154
174
187
175
152
81
36
83
50
61
87
139
54
78
33
169
174
71
165
200
158
110
139
169
195
133
179
62
179
170
52
167
169


In [76]:

gene_int_samples

{'ACSL3': 84,
 'ACTA2': 197,
 'B2M': 168,
 'CCND1': 108,
 'CD274': 166,
 'CD58': 164,
 'CD59': 193,
 'CDK4': 151,
 'CDK6': 78,
 'CDKN1A': 182,
 'CKS1B': 161,
 'CST3': 112,
 'CTPS1': 68,
 'DNMT1': 63,
 'EIF3K': 134,
 'EVA1A': 131,
 'FKBP4': 129,
 'FOS': 144,
 'GSEC': 121,
 'GSN': 204,
 'HASPIN': 81,
 'HLA-A': 195,
 'HLA-B': 122,
 'HLA-C': 115,
 'HLA-E': 146,
 'IFNGR1': 161,
 'IFNGR2': 190,
 'ILF2': 40,
 'IRF3': 154,
 'JAK1': 174,
 'JAK2': 187,
 'LAMP2': 175,
 'LGALS3': 152,
 'MRPL47': 81,
 'MYC': 36,
 'P2RX4': 83,
 'PABPC1': 50,
 'PAICS': 61,
 'PET100': 87,
 'PTMA': 139,
 'PUF60': 54,
 'RNASEH2A': 78,
 'RRS1': 33,
 'SAT1': 169,
 'SEC11C': 174,
 'SINHCAF': 71,
 'SMAD4': 165,
 'SOX4': 200,
 'SP100': 158,
 'SSR2': 110,
 'STAT1': 139,
 'STOM': 169,
 'TGFB1': 195,
 'TIMP2': 133,
 'TM4SF1': 179,
 'TMED10': 62,
 'TMEM173': 179,
 'TOP1MT': 170,
 'TPRKB': 52,
 'TXNDC17': 167,
 'VDAC2': 169,
 '': 13142}

In [60]:
obs_data = data[adata.obs['targets'] == '',:]
b2m_data = data[adata.obs['targets'] == 'B2M',:]

In [61]:
fig, axs = plt.subplots(2, 2)

axs[0, 0].hist(obs_data[:, 0])
axs[0, 1].hist(obs_data[:, 2])
axs[1, 0].hist(b2m_data[:, 0], color='red')
_ = axs[1, 1].hist(b2m_data[:, 2], color='red')

<IPython.core.display.Javascript object>

In [56]:
adata.var

Unnamed: 0,n_cells,highly_variable,highly_variable_rank,means,variances,variances_norm,targeted,in_final
ACSL3,161125,False,,1.733973,3.038900,0.882086,1.0,True
ACTA2,3754,True,337.0,0.019250,0.034226,1.669910,1.0,True
B2M,217428,True,402.0,46.534925,1835.067748,1.560129,1.0,True
CCND1,214664,True,933.0,12.715430,113.059342,1.193414,1.0,True
CD274,36150,True,814.0,0.216551,0.311365,1.231807,1.0,True
...,...,...,...,...,...,...,...,...
TMEM173,15183,False,,0.074009,0.077751,0.955152,1.0,True
TOP1MT,37052,False,,0.194714,0.212611,0.945510,1.0,True
TPRKB,158681,False,,1.773193,3.284436,0.922588,1.0,True
TXNDC17,185733,False,,2.762841,6.197372,0.887228,1.0,True
