# Script for Processing L1000 Data

In [13]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import numpy_indexed as npi
import random

import sys, h5py, time
import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
import cmapPy.pandasGEXpress.parse_gct as parse_gct

from scipy import stats
from numpy.random import seed

import scipy.stats as ss
import warnings
import numpy as np
from maayanlab_bioinformatics.normalization import quantile_normalize
import pickle

# One shared seed for both Python's `random` module and NumPy's global RNG,
# so the stochastic steps further down can be repeated.
randomState = 123
random.seed(randomState)
seed(randomState)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Initialize

Parameters

In [3]:
# Number of columns drawn at random from the filtered L1000 matrix.
n_sampling = 50_000

Input Filenames

In [4]:
# Expression matrix in GCTX format; read with cmapPy's parse_gctx further down.
l1000_filename = "../data/L1000/GSE92742_Broad_LINCS_Level3_INF_mlr12k_n1319138x12328.gctx"
# Tab-separated gene annotation table; supplies the pr_gene_id -> pr_gene_symbol mapping.
l1000_geneinfo_filename = "../data/L1000/GSE92742_Broad_LINCS_gene_info.txt"

# Plain-text list of the gene symbols to keep, one entry per line.
overlap_landmark_gene_list = "../data/processed/overlap_landmark_gene_file.txt"

Output Filenames

In [5]:
# Feather output path templates; the two `{}` slots are filled with (n_rows, n_cols).
# NOTE(review): both templates are the same string, so for equal shapes they resolve
# to the same path. Only the second one is used by the cells visible in this notebook.
l1000_output_filename = "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{}x{}.f" # only landmark genes
l1000_overlap_landmark_output_filename = "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{}x{}.f" # n_sampling x n_landmark_genes (the recorded run wrote 50000x962)


## Load landmark genes

In [6]:
# Read the gene list: one entry per line, with surrounding whitespace
# (including the trailing newline) stripped from each entry.
with open(overlap_landmark_gene_list, "r") as gene_file:
    landmark_gene = list(map(str.strip, gene_file))



## Load L1000 (GSE92742/Level 3) ~4 min

Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92742 (GEO accession GSE92742). The downloaded files are expected under `../data/L1000/`.

In [7]:
# Parse the GCTX file and keep only its expression DataFrame.
# The call is left chained on purpose: no extra reference to the parsed
# object is kept, so a later `del l1000_data` can actually release it.
# convert_neg_666=True is passed straight to cmapPy; presumably it maps the
# -666 missing-value sentinel to NaN -- TODO confirm against the cmapPy docs.
print('Loading L1000 data.....')
l1000_data = parse_gctx.parse(
    l1000_filename,
    convert_neg_666=True,
).data_df

Loading L1000 data.....


In [8]:
# Gene annotation table: tab-separated, column names taken from the first row.
gene_info = pd.read_csv(
    l1000_geneinfo_filename,
    sep='\t',
    header=0,
)

In [9]:
# Lookup table built from the annotation table:
# stringified pr_gene_id -> pr_gene_symbol.
gene_dict = dict(zip(map(str, gene_info['pr_gene_id']), gene_info['pr_gene_symbol']))

# Swap the row labels of the expression matrix for gene symbols.
# A row id that is missing from the lookup raises KeyError.
# NOTE(review): this overwrites the index in place, so running the cell a
# second time will look up symbols instead of ids and most likely fail.
l1000_data.index = [gene_dict[row_id] for row_id in l1000_data.index.values]


In [11]:
# Keep only the rows whose gene symbol appears in the landmark gene list.
filtered_l1000_data = l1000_data.loc[l1000_data.index.isin(landmark_gene)]

In [17]:
# Sanity check: (n_genes, n_samples) remaining after the landmark filter.
filtered_l1000_data.shape

(962, 1319138)

In [18]:
# Drop the reference to the full matrix so it can be garbage-collected;
# the remaining cells only use the filtered frame.
del l1000_data

## Random Sampling

In [19]:
# Draw n_sampling columns at random (pandas samples without replacement by
# default), then transpose so each sampled column becomes a row and the genes
# become columns.
# random_state is passed explicitly so that re-running this cell on its own
# returns the same sample; without it the draw depends on how far the global
# NumPy RNG has advanced since it was seeded at the top of the notebook.
filtered_l1000_data_sampled = filtered_l1000_data.sample(
    n=n_sampling,
    axis='columns',
    random_state=randomState,
).T

In [20]:
# Preview the sampled frame restricted to the landmark genes, with the
# columns sorted by label.
# NOTE(review): the result is only displayed, never assigned, so the frame
# written to disk below keeps its original column order -- confirm intended.
filtered_l1000_data_sampled[landmark_gene].sort_index(axis='columns')

Unnamed: 0_level_0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPC012_HT29_6H_X5_B5_DUO52HI53LO:D04,9.209949,5.75080,4.308750,13.440675,4.622875,5.19140,7.18615,6.47970,5.17415,10.9587,...,13.514800,4.995975,7.87965,5.59720,4.91680,15.0000,5.09905,7.02635,8.00165,11.113700
KDB009_HCC515_96H_X2_F1B5_DUO52HI53LO:G16,10.403400,6.45310,4.241800,12.498200,7.515600,5.61990,7.18070,7.70290,5.85860,11.1633,...,10.180600,7.015600,6.70020,7.26735,7.05015,15.0000,7.02680,6.49510,7.06650,8.621300
ERG012_VCAP_24H_X1.A2_B7_DUO52HI53LO:M15,9.008100,9.70340,5.993200,10.855900,6.824400,6.86080,6.70180,9.30120,8.43960,9.6219,...,9.790900,9.381300,8.38970,7.69920,6.86740,12.3333,8.77630,7.75080,7.88865,9.637700
KDB003_PC3_144H_X1_B1_DUO52HI53LO:H08,9.590100,7.94465,3.987100,9.209700,9.610900,8.00420,7.46010,7.27510,9.70110,9.6650,...,10.777349,9.638600,7.60975,7.42100,5.01470,10.3589,7.16915,7.21880,6.62430,9.972300
KDB010_VCAP_120H_X3_F1B4_DUO52HI53LO:H09,9.603950,10.38880,4.646700,11.107000,8.660601,6.59900,6.28720,5.16425,6.45430,5.3859,...,9.398350,9.022350,6.45430,7.41420,7.08405,13.8868,8.35270,11.37980,8.70010,8.768700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RAD001_MCF7_24H_X3_F1B5_DUO52HI53LO:E11,11.564250,10.25535,5.034800,11.748100,11.769800,6.15640,5.93450,6.48730,8.29680,9.2389,...,10.749600,11.998900,9.18330,7.92250,5.63655,13.7482,6.63240,5.71355,5.59950,9.722651
CYT001_A375_2H_X2_B7_DUO52HI53LO:B07,10.309200,8.54990,4.838800,11.140800,8.370350,5.58470,6.25640,9.10400,6.58430,9.2730,...,9.868851,8.608600,8.09070,7.33825,5.82690,11.5249,6.67700,7.05360,7.20355,9.138300
CVD001_PHH_24H_X3_F1B3_DUO52HI53LO:O21,10.088100,9.75205,9.042150,11.943300,8.692300,8.59400,9.22200,7.51105,8.57660,11.9849,...,9.937800,6.865950,6.79795,12.01450,7.72400,13.6431,7.52230,12.67740,7.22620,10.482700
KDD002_HA1E_96H_X1_F2B5_DUO52HI53LO:J04,10.313000,8.41290,5.256250,12.392900,9.361800,7.61315,6.58695,10.69220,7.16120,11.4972,...,10.504000,10.153900,10.87960,8.35675,6.78865,12.6318,7.63130,6.22670,7.33675,12.795300


In [23]:
# Build the output path once from the final (n_rows, n_cols) shape so the
# written file and the printed path cannot drift apart.
l1000_sampled_output_path = l1000_overlap_landmark_output_filename.format(
    *filtered_l1000_data_sampled.shape
)

# Create the target directory first so a missing directory (e.g. on a fresh
# checkout) does not abort the write after the long load above.
l1000_sampled_output_dir = os.path.dirname(l1000_sampled_output_path)
if l1000_sampled_output_dir:
    os.makedirs(l1000_sampled_output_dir, exist_ok=True)

# reset_index() moves the current row labels into a regular column before
# writing, leaving the default integer index that the Feather writer expects.
filtered_l1000_data_sampled.reset_index().to_feather(l1000_sampled_output_path)
print(l1000_sampled_output_path)

../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n50000x962.f
