# Script for Processing L1000 Data

In [None]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import numpy_indexed as npi
import random

import sys, h5py, time
import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
import cmapPy.pandasGEXpress.parse_gct as parse_gct

from scipy import stats
from numpy.random import seed

import scipy.stats as ss
import warnings
import numpy as np
from maayanlab_bioinformatics.normalization import quantile_normalize
import pickle

# Single seed shared by NumPy's global RNG (`numpy.random.seed`, imported
# above as `seed`) and Python's built-in `random` module.
randomState = 123
seed(randomState)
random.seed(randomState)

## Initialize

Parameters

In [None]:
# Number of columns drawn from the landmark-filtered matrix in the
# "Random Sampling" cell further down.
n_sampling = 50000

Input Filenames

In [None]:
# GCTX expression matrix, parsed with cmapPy's parse_gctx below.
l1000_filename = "../data/L1000/GSE92742_Broad_LINCS_Level3_INF_mlr12k_n1319138x12328.gctx"
# Tab-separated gene table; the `pr_gene_id` and `pr_gene_symbol` columns are used.
l1000_geneinfo_filename = "../data/L1000/GSE92742_Broad_LINCS_gene_info.txt"
# Tab-separated instance table; `inst_id`, `cell_id`, `pert_id`, `pert_time`
# and `pert_dose` are used.
l1000_instinfo_filename = "../data/L1000/GSE92742_Broad_LINCS_inst_info.txt"
# Text file read line by line; each stripped line is matched against the
# gene symbols of the expression matrix.
overlap_landmark_gene_list = "../data/processed/overlap_landmark_gene_file.txt"

Output Filenames

In [None]:
# The two `{}` placeholders are filled with the row and column counts of the
# frame that gets written (see the to_feather cell below).
# NOTE(review): both feather templates are currently identical, and only
# `l1000_overlap_landmark_output_filename` is used in the cells shown here.
l1000_output_filename = "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{}x{}.f" # only landmark genes
l1000_overlap_landmark_output_filename = "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{}x{}.f" # n_sampling x 967
# CSV with the inst_id / cell_id / pert_id of the randomly sampled instances.
l1000_sampled_instinfo_filename = "../data/processed/L1000/GSE92742_Broad_LINCS_inst_info_sampled.csv"

## Load landmark genes

In [None]:
# Read the landmark gene list: one entry per line, surrounding whitespace
# (including the trailing newline) removed.
with open(overlap_landmark_gene_list, "r") as gene_file:
    landmark_gene = [line.strip() for line in gene_file]



## Load L1000 (GSE92742, Level 3) — takes about 4 minutes

Data source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92742 (files downloaded to `../data/L1000`)

In [None]:
# Parse the GCTX file with cmapPy and keep only its data matrix (`data_df`).
# `convert_neg_666=True` is forwarded to the parser — presumably it replaces
# the -666 missing-value marker with NaN; confirm against the cmapPy docs.
print('Loading L1000 data.....')
l1000_data = parse_gctx.parse(l1000_filename,convert_neg_666=True).data_df

In [None]:
# Gene annotation table: tab-separated, with the column names in the first row.
gene_info = pd.read_csv(
    l1000_geneinfo_filename,
    sep="\t",
    header=0,
)

In [None]:
# Map each `pr_gene_id` (as a string) to its `pr_gene_symbol`.
gene_dict = {
    str(gene_id): symbol
    for gene_id, symbol in zip(gene_info['pr_gene_id'], gene_info['pr_gene_symbol'])
}

# Relabel the matrix rows with gene symbols; a row id that is missing from
# the gene table raises KeyError.
l1000_data.index = [gene_dict[row_id] for row_id in l1000_data.index]


In [None]:
# filter landmark genes
filtered_l1000_data = l1000_data[l1000_data.index.isin(landmark_gene)]

In [None]:
# Show (rows, columns) of the landmark-filtered matrix as the cell output.
filtered_l1000_data.shape

In [None]:
# Drop the name bound to the full matrix; the remaining cells only use
# `filtered_l1000_data`.
del l1000_data

## Random Sampling

In [None]:
# Draw `n_sampling` columns at random (pandas' default: without replacement)
# and transpose, so each sampled column becomes a row of the result.
# The seed is passed explicitly: without it the draw depends on how much of
# the global NumPy RNG stream has been consumed, so re-running this cell or
# running cells out of order would silently select a different sample.
filtered_l1000_data_sampled = filtered_l1000_data.sample(
    n=n_sampling,
    axis="columns",
    random_state=randomState,
).T

In [None]:
# Display the sampled frame restricted to the landmark genes, with columns
# sorted by label. Raises KeyError if a landmark gene is not a column.
# NOTE(review): the result is not assigned, so `filtered_l1000_data_sampled`
# (and the file written below) keeps its original column order — confirm
# that display-only is the intent.
filtered_l1000_data_sampled.loc[:, landmark_gene].sort_index(axis=1)

In [None]:
# Build the output path once from the frame's (rows, columns) shape, write the
# frame with its index turned into a regular column, then report the path.
sampled_feather_path = l1000_overlap_landmark_output_filename.format(*filtered_l1000_data_sampled.shape)
filtered_l1000_data_sampled.reset_index().to_feather(sampled_feather_path)
print(sampled_feather_path)

## Save Cell line info of randomly sampled samples

In [None]:
# Per-instance metadata table (tab-separated).
inst_info = pd.read_csv(l1000_instinfo_filename, sep="\t")

# `exp_plate` is the plain concatenation (no separator) of pert_id, cell_id,
# pert_time and pert_dose. An earlier variant instead used the first three
# "_"-separated fields of `rna_plate`.
inst_info["exp_plate"] = (
    inst_info["pert_id"]
    + inst_info["cell_id"]
    + inst_info["pert_time"].map(str)
    + inst_info["pert_dose"].map(str)
)

In [None]:
# Metadata rows whose inst_id is among the sampled instances (the row labels
# of the sampled frame).
filtered_l1000_data_sampled_inst_info = inst_info.loc[
    inst_info["inst_id"].isin(filtered_l1000_data_sampled.index)
]

# Persist only the identifying columns, without the DataFrame index.
sampled_id_columns = ["inst_id", "cell_id", "pert_id"]
filtered_l1000_data_sampled_inst_info[sampled_id_columns].to_csv(
    l1000_sampled_instinfo_filename,
    index=None,
)

In [None]:
# Display the metadata of the sampled instances as the cell output.
filtered_l1000_data_sampled_inst_info

In [None]:
# Distinct cell_id values in the sample versus the full table
# (dropna=False so a missing value counts once, as with len(unique())).
print(
    "cell line size",
    filtered_l1000_data_sampled_inst_info["cell_id"].nunique(dropna=False),
    "out of",
    inst_info["cell_id"].nunique(dropna=False),
)

In [None]:
# Distinct pert_id values in the sample versus the full table
# (dropna=False so a missing value counts once, as with len(unique())).
print(
    "pert size",
    filtered_l1000_data_sampled_inst_info["pert_id"].nunique(dropna=False),
    "out of",
    inst_info["pert_id"].nunique(dropna=False),
)

In [None]:
# Distinct exp_plate keys in the sample versus the full table
# (dropna=False so a missing value counts once, as with len(unique())).
print(
    "exp_plate size",
    filtered_l1000_data_sampled_inst_info["exp_plate"].nunique(dropna=False),
    "out of",
    inst_info["exp_plate"].nunique(dropna=False),
)

In [None]:
# Display the sampled-instance metadata again; it now includes the
# `exp_plate` column.
filtered_l1000_data_sampled_inst_info

In [None]:
# Display the sampled instances whose exp_plate key already occurred earlier
# in the table (first occurrences are not flagged), ordered by that key.
(
    filtered_l1000_data_sampled_inst_info
    .loc[filtered_l1000_data_sampled_inst_info["exp_plate"].duplicated()]
    .sort_values(by="exp_plate")
)