In [44]:
import pandas as pd
import os
import multiprocessing
import get_chdir_signatures
import re
import multiprocessing_logging
import logging

In [45]:
# Configure the root logger so every INFO-or-higher record is timestamped.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Write logs to a file.
# The root logger outlives this cell, so re-running the cell in the same
# kernel would otherwise attach one more handler each time and every record
# would be written repeatedly. Reuse the handler for this file if it exists.
log_path = os.path.abspath('log.log')
file_handler = next(
    (
        handler for handler in logger.handlers
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler('log.log')
    logger.addHandler(file_handler)
file_handler.setFormatter(formatter)


In [14]:
# Execute get_chdir_signatures.py inside this notebook's namespace.
# NOTE(review): load_feather and get_signatures are called by later cells but
# are not defined in the visible cells; presumably they come from this script
# -- confirm.
%run get_chdir_signatures.py

In [53]:
# --- Input locations -------------------------------------------------------
# Directory with the measured L1000 batch files; the gene index lives here too.
folder = "../data/LINCS_CFDE/GSE92742_full_batches"
l1000_folder = "../data/LINCS_CFDE/GSE92742_full_batches"
gene_filename = f"{folder}/GSE92742_genes.txt"

# Directory with the predicted profiles (same batch file names are expected).
pred_folder = "../data/LINCS_CFDE/L1000_GSE92742_prediction_results_step2_batch_divided_38_35"

# Per-instance metadata table (tab-separated).
inst_filename = "../data/L1000/GSE92742_Broad_LINCS_inst_info.txt"

# --- Output location -------------------------------------------------------
output_folder = "../data/dexamethasone_v2"

In [16]:
# Read the gene index: one entry per line, surrounding whitespace removed.
with open(gene_filename, "r") as gene_file:
    gene_list = [line.strip() for line in gene_file]

In [17]:
# Load the per-instance metadata table (tab-separated).
inst_info = pd.read_csv(inst_filename, sep="\t")

# Derive each instance's batch name by deleting every "_X<digit>" occurrence
# from its RNA plate name.
inst_info["batch"] = inst_info["rna_plate"].replace(
    regex=True,
    to_replace='_X[0-9]',
    value='',
)

# Restrict the metadata to dexamethasone instances and collect the distinct
# perturbagen ids they use.
is_dexamethasone = inst_info["pert_iname"] == "dexamethasone"
dexamethasone_inst_info = inst_info.loc[is_dexamethasone]
pert_ids = pd.unique(dexamethasone_inst_info["pert_id"])

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
# Get the genes shared by the L1000 gene index and the prediction gene list.
prediction_gene_filename = "../output_step2/35/logs/prediction_gene_list.txt"
with open(prediction_gene_filename, "r") as f:
    prediction_gene_list = [x.strip() for x in f]

# Build the intersection in gene_list order (duplicates dropped) instead of
# via list(set(...)): set iteration order for strings changes between
# interpreter sessions, which made the gene order of every downstream table
# differ from run to run.
prediction_gene_set = set(prediction_gene_list)
common_gene_list = [
    gene for gene in dict.fromkeys(gene_list)
    if gene in prediction_gene_set
]

In [59]:
def chdir(batchfile):
    '''
    Compute and save characteristic-direction signatures for one batch file.

    {batchfile} contains all Level 3 instances belonging to a single batch.
    For every perturbagen of the batch that is listed in the global
    ``pert_ids`` (every perturbagen when ``pert_ids`` is None), the replicate
    instances of that perturbagen (cases) are compared against all other
    instances in the batch (controls). This is done once on the predicted
    profiles read from ``pred_folder`` and once on the L1000 profiles read
    from ``l1000_folder``; only the genes in ``common_gene_list`` are used.

    The results are written to ``output_folder`` (created if missing) as
    ``predicted_rnaseq_signature_<batch>.csv`` and
    ``l1000_signature_<batch>.csv``, with one column per processed
    perturbagen.

    Parameters
    ----------
    batchfile : str
        File name of the batch inside ``pred_folder`` / ``l1000_folder``;
        the batch name is the file name without its trailing ".f".

    Returns
    -------
    None
        Signatures are only written to disk; nothing is returned.
    '''

    # ignore this pesky file
    if batchfile == '.DS_Store':
        return None

    # Remove only the trailing ".f" extension; str.replace would also delete
    # a ".f" that happens to occur elsewhere in the file name.
    batch = batchfile[:-len(".f")] if batchfile.endswith(".f") else batchfile
    sig_df = inst_info[inst_info['batch'] == batch]
    if sig_df.shape[0] == 0:
        logger.info("Passed: "+batchfile)
        return None

    # Select the perturbagens to process before reading any data file, so a
    # batch without a requested perturbagen costs no file I/O.
    target_pert_ids = [
        batch_pert_id for batch_pert_id in sig_df["pert_id"].unique()
        if pert_ids is None or batch_pert_id in pert_ids
    ]
    if not target_pert_ids:
        return None

    # Predicted profiles: rows are addressed by inst_id, columns by gene.
    batch_data_df = load_feather(f"{pred_folder}/{batchfile}", None)
    batch_data_df = batch_data_df[common_gene_list]

    # L1000 profiles are transposed after loading to get the same layout.
    batch_data_df_l1000 = load_feather(f"{l1000_folder}/{batchfile}", gene_list).T
    batch_data_df_l1000 = batch_data_df_l1000[common_gene_list]

    signatures_list = list()
    signatures_list_l1000 = list()
    for batch_pert_id in target_pert_ids:
        logger.info(f"{batch_pert_id} in {batchfile}")

        # A single pert_id is the case; every other instance of the batch
        # serves as control.
        is_case = sig_df["pert_id"] == batch_pert_id
        case_inst_id = sig_df.loc[is_case, "inst_id"].tolist()
        control_inst_id = sig_df.loc[~is_case, "inst_id"].tolist()

        # get_signatures receives the frames transposed (genes as rows).
        signatures_list.append(
            get_signatures(
                batch_data_df.loc[control_inst_id, :].T,
                batch_data_df.loc[case_inst_id, :].T,
            )
        )
        signatures_list_l1000.append(
            get_signatures(
                batch_data_df_l1000.loc[control_inst_id, :].T,
                batch_data_df_l1000.loc[case_inst_id, :].T,
            )
        )

    signature_df = pd.concat(signatures_list, axis=1)
    signature_df.columns = target_pert_ids

    signature_df_l1000 = pd.concat(signatures_list_l1000, axis=1)
    signature_df_l1000.columns = target_pert_ids

    # Save both tables; create the output directory first so a fresh checkout
    # does not fail only after all signatures have been computed.
    os.makedirs(output_folder, exist_ok=True)
    signature_df.to_csv(f"{output_folder}/predicted_rnaseq_signature_{batch}.csv")
    signature_df_l1000.to_csv(f"{output_folder}/l1000_signature_{batch}.csv")
    logger.info(f"Saved!{output_folder}/predicted_rnaseq_signature_{batch}.csv")
    logger.info(f"Saved!{output_folder}/l1000_signature_{batch}.csv")
    return None


In [58]:
# Example: run chdir on a single batch file (batch name followed by ".f").
chdir("CPC004_A375_6H.f")

INFO:root:BRD-A10188456inCPC004_A375_6H.f
2021-06-10 00:21:12,813 - root - INFO - BRD-A10188456inCPC004_A375_6H.f
2021-06-10 00:21:12,813 - root - INFO - BRD-A10188456inCPC004_A375_6H.f
INFO:root:BRD-K47635719inCPC004_A375_6H.f
2021-06-10 00:21:22,880 - root - INFO - BRD-K47635719inCPC004_A375_6H.f
2021-06-10 00:21:22,880 - root - INFO - BRD-K47635719inCPC004_A375_6H.f
INFO:root:Saved!../data/dexamethasone_v2/predicted_rnaseq_signature_CPC004_A375_6H.csv
2021-06-10 00:21:32,923 - root - INFO - Saved!../data/dexamethasone_v2/predicted_rnaseq_signature_CPC004_A375_6H.csv
2021-06-10 00:21:32,923 - root - INFO - Saved!../data/dexamethasone_v2/predicted_rnaseq_signature_CPC004_A375_6H.csv
INFO:root:Saved!../data/dexamethasone_v2/l1000_signature_CPC004_A375_6H.csv
2021-06-10 00:21:32,925 - root - INFO - Saved!../data/dexamethasone_v2/l1000_signature_CPC004_A375_6H.csv
2021-06-10 00:21:32,925 - root - INFO - Saved!../data/dexamethasone_v2/l1000_signature_CPC004_A375_6H.csv


Unnamed: 0,BRD-A10188456,BRD-K47635719
CXCL8,0.076309,0.023644
HOXA10,0.062403,-0.014514
CXCL2,0.053610,0.024308
CXCL1,0.048457,0.035831
PPEF2,0.046383,-0.006764
...,...,...
ZNF302,-0.040048,-0.003634
EPHB2,-0.042114,-0.036061
COL4A1,-0.056763,0.007695
MEST,-0.066673,0.061537


In [62]:
# Process every batch file sequentially.
# batchfiles is defined here because it was previously only assigned in a
# later cell, so this loop failed on a fresh kernel (Restart & Run All).
# Sorting makes the processing order reproducible.
batchfiles = sorted(os.listdir(pred_folder))
for batchfile in batchfiles:
    chdir(batchfile)

INFO:root:BRD-K38775274inCPC020_HCC515_6H.f
2021-06-10 00:29:58,439 - root - INFO - BRD-K38775274inCPC020_HCC515_6H.f
2021-06-10 00:29:58,439 - root - INFO - BRD-K38775274inCPC020_HCC515_6H.f
INFO:root:Saved!../data/dexamethasone_v2/predicted_rnaseq_signature_CPC020_HCC515_6H.csv
2021-06-10 00:30:03,401 - root - INFO - Saved!../data/dexamethasone_v2/predicted_rnaseq_signature_CPC020_HCC515_6H.csv
2021-06-10 00:30:03,401 - root - INFO - Saved!../data/dexamethasone_v2/predicted_rnaseq_signature_CPC020_HCC515_6H.csv
INFO:root:Saved!../data/dexamethasone_v2/l1000_signature_CPC020_HCC515_6H.csv
2021-06-10 00:30:03,403 - root - INFO - Saved!../data/dexamethasone_v2/l1000_signature_CPC020_HCC515_6H.csv
2021-06-10 00:30:03,403 - root - INFO - Saved!../data/dexamethasone_v2/l1000_signature_CPC020_HCC515_6H.csv
INFO:root:BRD-A10188456inCPC004_HCC515_24H.f
2021-06-10 00:30:18,856 - root - INFO - BRD-A10188456inCPC004_HCC515_24H.f
2021-06-10 00:30:18,856 - root - INFO - BRD-A10188456inCPC004_HCC51

In [60]:
# Process all batch files with two worker processes.
batchfiles = os.listdir(pred_folder)

# The context manager terminates the workers when the block is left, including
# on an exception or KeyboardInterrupt. The previous close() without join()
# or terminate() left worker processes behind in that case. pool.map only
# returns once every batch file has been processed.
with multiprocessing.Pool(2) as pool:
    pool.map(chdir, batchfiles)

INFO:root:BRD-K38775274inCPC020_HCC515_6H.f
2021-06-10 00:24:55,426 - root - INFO - BRD-K38775274inCPC020_HCC515_6H.f
2021-06-10 00:24:55,426 - root - INFO - BRD-K38775274inCPC020_HCC515_6H.f
INFO:root:BRD-A10188456inCPC004_VCAP_24H.f
2021-06-10 00:24:58,546 - root - INFO - BRD-A10188456inCPC004_VCAP_24H.f
2021-06-10 00:24:58,546 - root - INFO - BRD-A10188456inCPC004_VCAP_24H.f


KeyboardInterrupt: 

In [197]:
# Collect rank-transformed signatures of every dexamethasone perturbagen,
# once from the L1000 profiles and once from the predicted RNA-seq profiles.
l1000_signature_list = list()
predicted_rnaseq_signature_list = list()

exp_plates = dexamethasone_inst_info["exp_plate"].unique()
for exp_plate in exp_plates:
    print(exp_plate)
    filename = exp_plate+".f"

    # Load the plate once instead of once per perturbagen. os.path.join is
    # required because the configured folders carry no trailing separator,
    # so plain concatenation would produce a non-existent path.
    l1000_data = load_feather(os.path.join(folder, filename), index=gene_list).loc[common_gene_list, :]
    predicted_rnaseq_data = load_feather(os.path.join(pred_folder, filename), index=None)[common_gene_list].T

    inst_info_batch = inst_info[inst_info["rna_plate"].str.startswith(exp_plate)]

    # Separate name on purpose: the global pert_ids is read by chdir and must
    # not be overwritten by this loop.
    plate_pert_ids = dexamethasone_inst_info.loc[dexamethasone_inst_info["exp_plate"]==exp_plate,"pert_id"].unique()
    for pert_id in plate_pert_ids:
        print(pert_id)

        # Cases are the instances of this perturbagen, controls are all other
        # instances on the plate (the two lists were previously swapped, which
        # disagreed with the case/control definition used by chdir).
        is_case = inst_info_batch["pert_id"]==pert_id
        case_inst_ids = inst_info_batch.loc[is_case, "inst_id"].tolist()
        control_inst_ids = inst_info_batch.loc[~is_case, "inst_id"].tolist()

        # get signatures from real L1000 profiles
        l1000_signature = get_signatures(l1000_data[control_inst_ids], l1000_data[case_inst_ids])
        l1000_signature = l1000_signature.rank(ascending=False)  # values to rank
        l1000_signature_list.append(l1000_signature)

        # get signatures from predicted RNA-seq profiles
        predicted_rnaseq_signature = get_signatures(predicted_rnaseq_data[control_inst_ids], predicted_rnaseq_data[case_inst_ids])
        predicted_rnaseq_signature = predicted_rnaseq_signature.rank(ascending=False)
        predicted_rnaseq_signature_list.append(predicted_rnaseq_signature)
    # NOTE(review): only the first plate is processed; remove this break to
    # cover every plate in exp_plates.
    break


CPC004_A375_6H
BRD-A10188456
BRD-K47635719


In [200]:
# Put the collected rank vectors side by side (one column per signature) and
# average them row-wise.
l1000_rank_table = pd.concat(l1000_signature_list, axis=1)
mean_l1000_signature = l1000_rank_table.mean(axis=1)

predicted_rnaseq_rank_table = pd.concat(predicted_rnaseq_signature_list, axis=1)
mean_predicted_rnaseq_signature = predicted_rnaseq_rank_table.mean(axis=1)

In [None]:
# bridge plot

# Compute Gene Signatures of Real L1000 Profiles

In [134]:
# Names of all entries in the L1000 batch folder, in ascending order.
filenames = os.listdir(folder)
filenames.sort()

In [159]:
# Smoke test: compute one signature from the first L1000 batch file.
for filename in filenames:
    if not filename.endswith(".f"):
        continue
    print(filename)
    # os.path.join is required because `folder` carries no trailing
    # separator, so plain concatenation would produce a non-existent path.
    data = load_feather(os.path.join(folder, filename), index=gene_list).loc[common_gene_list, :]
    inst_info_batch = inst_info[inst_info["rna_plate"].str.startswith(filename.replace(".f", ""))]
    # Separate name on purpose: the global pert_ids is read by chdir and must
    # not be overwritten by this loop.
    batch_pert_ids = sorted(inst_info_batch["pert_id"].unique())

    for pert_id in batch_pert_ids:
        print(pert_id)
        # Cases are the instances of this perturbagen, controls are all other
        # instances of the batch (the two lists were previously swapped, which
        # disagreed with the case/control definition used by chdir).
        is_case = inst_info_batch["pert_id"]==pert_id
        case_inst_ids = inst_info_batch.loc[is_case, "inst_id"].tolist()
        control_inst_ids = inst_info_batch.loc[~is_case, "inst_id"].tolist()
        signature = get_signatures(data[control_inst_ids], data[case_inst_ids])
        # NOTE(review): stops after the first perturbagen -- remove for a full run.
        break
    # NOTE(review): stops after the first batch file -- remove for a full run.
    break

AML001_CD34_24H.f
BRD-A03772856


# Compute Gene Signatures of Predicted RNA-seq Profiles

In [136]:
# Names of all entries in the prediction folder, in ascending order.
pred_filenames = os.listdir(pred_folder)
pred_filenames.sort()

In [160]:
# Smoke test: compute one signature from the first predicted batch file.
for filename in pred_filenames:
    if not filename.endswith(".f"):
        continue
    # os.path.join is required because `pred_folder` carries no trailing
    # separator, so plain concatenation would produce a non-existent path.
    pred_path = os.path.join(pred_folder, filename)
    print(pred_path)
    data = load_feather(pred_path, index=None)[common_gene_list].T

    inst_info_batch = inst_info[inst_info["rna_plate"].str.startswith(filename.replace(".f", ""))]
    # Separate name on purpose: the global pert_ids is read by chdir and must
    # not be overwritten by this loop.
    batch_pert_ids = sorted(inst_info_batch["pert_id"].unique())

    for pert_id in batch_pert_ids:
        print(pert_id)
        # Cases are the instances of this perturbagen, controls are all other
        # instances of the batch (the two lists were previously swapped, which
        # disagreed with the case/control definition used by chdir).
        is_case = inst_info_batch["pert_id"]==pert_id
        case_inst_ids = inst_info_batch.loc[is_case, "inst_id"].tolist()
        control_inst_ids = inst_info_batch.loc[~is_case, "inst_id"].tolist()
        signature = get_signatures(data[control_inst_ids], data[case_inst_ids])
        # NOTE(review): stops after the first perturbagen -- remove for a full run.
        break
    # NOTE(review): stops after the first batch file -- remove for a full run.
    break

../data/LINCS_CFDE/L1000_GSE92742_prediction_results_step2_batch_divided_38_35/AML001_CD34_24H.f
BRD-A03772856
