In [3]:
import pandas as pd
import numpy as np
import h5py as h5
import re

In [27]:
# Species tag; interpolated into the input .h5 file name and the output CSV names.
species: str = 'human'
# Version string embedded in the .h5 file name (`<species>_gene_v<version>.h5`).
version: str = '2.2'
# Version tag written into the frequent-terms CSV file name.
gsm4sig_version: int = 2

# Samples whose single-cell probability is strictly greater than this value
# are flagged as single-cell when studies are filtered.
single_cell_prob_thresh: float = 0.5

In [5]:
# Read the per-sample metadata columns from the gene-level HDF5 file.
# The context manager releases the file handle even when a dataset lookup or
# the array construction raises, which the manual open/close pair did not.
with h5.File(f"../{species}_gene_v{version}.h5", "r") as f:
    # Stack the four datasets and transpose so that each row describes one
    # sample: series_id, geo_accession, singlecellprobability, title.
    gse_scprob = np.array([
        f["meta"]["samples"]["series_id"],
        f["meta"]["samples"]["geo_accession"],
        f["meta"]["samples"]["singlecellprobability"],
        f["meta"]["samples"]["title"]
    ]).T

In [6]:
# Normalise the text columns of the metadata table in place.
# Columns 0-1 (series_id, geo_accession) are cast to str.
gse_scprob[:, 0:2] = gse_scprob[:, 0:2].astype(str)
# Column 3 (title) is decoded from UTF-8 bytes. Only bytes values are decoded,
# so re-running this cell leaves already-decoded titles untouched; sending str
# values through `Series.str.decode` a second time does not return them
# unchanged (pandas yields missing values for non-bytes elements).
gse_scprob[:, 3] = [
    title.decode("utf-8") if isinstance(title, bytes) else title
    for title in gse_scprob[:, 3]
]
print(gse_scprob.shape, "\n", gse_scprob[:10])

(722425, 4) 
 [['GSE29282' 'GSM1000981' 0.007336067256529114
  'OCI-LY1_48hrs_mRNAseq_3x_siNT_R1']
 ['GSE29282' 'GSM1000982' -0.006492279198109079
  'OCI-LY1_48hrs_mRNAseq_3x_siNT_R2']
 ['GSE29282' 'GSM1000983' -0.006492279198109079
  'OCI-LY1_48hrs_mRNAseq_3x_siNT_R3']
 ['GSE29282' 'GSM1000984' 0.007336067256529114
  'OCI-LY1_48hrs_mRNAseq_3x_siBCL6_R1']
 ['GSE29282' 'GSM1000985' 0.03063220168783499
  'OCI-LY1_48hrs_mRNAseq_3x_siBCL6_R2']
 ['GSE29282' 'GSM1000986' 0.007095580518495443
  'OCI-LY1_48hrs_mRNAseq_3x_siBCL6_R3']
 ['GSE40819,GSE40820' 'GSM1002540' 0.1138180228552808 'mix1-rep1']
 ['GSE40819,GSE40820' 'GSM1002541' 0.08025961223571804 'mix2-rep1']
 ['GSE40819,GSE40820' 'GSM1002542' 0.25810680912829964 'mix3-rep1']
 ['GSE40819,GSE40820' 'GSM1002543' 0.05067398039179029 'mix4-rep1']]


In [7]:
# Row positions (as a column vector) of samples whose single-cell probability
# in column 2 is above the configured threshold.
single_cell_samp = np.argwhere(gse_scprob[:, 2] > single_cell_prob_thresh)

# De-duplicated series ids (column 0) of those samples.
# NOTE(review): series ids such as 'GSE40819,GSE40820' are matched below as
# whole strings, not per individual GSE accession -- confirm this is intended.
single_cell_study = np.unique(gse_scprob[single_cell_samp[:, 0], 0])

# Mask over all samples: True = series id not among the single-cell studies.
bulk_study_bool = np.isin(gse_scprob[:, 0], single_cell_study, invert=True)

# Original row position of every retained sample.
bulk_study_idx = np.arange(bulk_study_bool.size)[bulk_study_bool]

# Retained metadata rows with their original row position appended as a
# trailing fifth column.
bulk_study_meta = np.concatenate(
    (gse_scprob[bulk_study_bool], bulk_study_idx.reshape(-1, 1)),
    axis=1,
)

In [8]:
# Write the bulk-only metadata table to CSV, naming its five columns.
pd.DataFrame(bulk_study_meta).to_csv(
    f"data/{species}_bulk_study_meta.csv",
    header=[
        "series_id",
        "geo_accession",
        "singlecellprobability",
        "sample_title",
        "h5_idx",
    ],
)

In [9]:
# Table dimensions and the number of distinct series ids among retained samples.
(bulk_study_meta.shape, np.unique(bulk_study_meta[:, 0]).size)

((344273, 5), 19665)

In [19]:
# Count how often each token occurs across the sample titles of the retained
# (bulk) samples. Keys are lower-cased tokens, values are occurrence counts.
frequent_terms_dict = {}
for r in bulk_study_meta:
    label = r[3]  # column 3 holds the sample title
    # Drop surrounding square brackets, then split on common separators.
    terms = re.split(r'[_\-. ,()\[\]]', label.strip('[]'))
    for t in terms:
        # Tokens that parse as an integer are not counted. Only ValueError is
        # caught: a bare `except` would also swallow KeyboardInterrupt and
        # hide unrelated bugs inside the counting logic.
        try:
            int(t)
        except ValueError:
            pass
        else:
            continue
        t = t.lower()
        # Ignore empty and single-character tokens.
        if len(t.strip()) < 2:
            continue
        frequent_terms_dict[t] = frequent_terms_dict.get(t, 0) + 1

In [28]:
# Vocabulary of tokens treated as control-like (category 1).
_CONTROL_TERMS = frozenset([
    'wt', 'wildtype', 'control', 'cntrl', 'ctrl', 'uninfected', 'normal',
    'untreated', 'unstimulated', 'shctrl', 'ctl', 'healthy', 'sictrl',
    'sicontrol', 'ctr', 'wild',
])

# Vocabulary of tokens treated as perturbation-like (category 2).
# 'wild' used to be listed here as well, but that entry could never match
# because the control vocabulary is checked first; it has been removed.
_PERTURBATION_TERMS = frozenset([
    'ko', 'kd', 'knockout', 'infected', 'treatment', 'knockdown',
    'overexpression', 'overexpressed', 'case', 'crispr', 'mut', 'treated',
    'tumor', 'cancer',
])


def categorize_labels(t):
    """Map a title token to a coarse category code.

    Matching is exact and case-sensitive.

    Parameters
    ----------
    t : str
        Token to classify.

    Returns
    -------
    int
        1 if ``t`` is in the control vocabulary, 2 if it is in the
        perturbation vocabulary, 0 otherwise.
    """
    if t in _CONTROL_TERMS:
        return 1
    if t in _PERTURBATION_TERMS:
        return 2
    return 0

# Turn the token counts into a table sorted from most to least frequent.
frequent_terms = (
    pd.DataFrame(frequent_terms_dict, index=['Frequency'])
    .T
    .sort_values(by='Frequency', ascending=False)
    .reset_index()
)
frequent_terms.columns = ['Label', 'Frequency']

# Attach the category code returned by categorize_labels for each token.
frequent_terms['Category'] = frequent_terms['Label'].apply(categorize_labels)

# Persist only tokens that occur more than 100 times.
is_frequent = frequent_terms['Frequency'] > 100
frequent_terms.loc[is_frequent].to_csv(
    f"data/{species}v{gsm4sig_version}_frequent_terms.csv"
)

# Display the 100 most frequent tokens.
frequent_terms.head(100)

Unnamed: 0,Label,Frequency,Category
0,rna,32778,0
1,seq,25912,0
2,rep1,16422,0
3,rep2,16134,0
4,control,13200,1
...,...,...,...
95,baseline,1132,0
96,lps,1126,0
97,muscle,1115,0
98,s1,1107,0
