In [20]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm

In [4]:
# Run configuration: both values are interpolated into the data file names used in this notebook.
species = "human"        # organism prefix of the input/output CSV files
gsm4sig_version = 2      # version tag of the frequent-terms input and the gsm4sig output

In [7]:
# Load the per-sample study metadata and the manually categorised frequent title terms.
# Both paths are built with f-strings so the two reads follow the same convention;
# the resulting file names are unchanged.
bulk_study_meta = pd.read_csv(f"data/{species}_bulk_study_meta.csv", index_col=0)
frequent_terms = pd.read_csv(f"data/{species}v{gsm4sig_version}_frequent_terms.csv", index_col=0)

In [8]:
frequent_terms
# Legend for the "Category" column of frequent_terms:
# Category 0: Neutral
# Category 1: Control term
# Category 2: Perturbation term

Unnamed: 0,Label,Frequency,Category
0,rna,32778,0
1,seq,25912,0
2,rep1,16422,0
3,rep2,16134,0
4,control,13200,1
...,...,...,...
1344,lapc4,101,0
1345,trimester,101,0
1346,t39,101,0
1347,vec,101,0


In [9]:
# Reduce every sample title to the frozenset of its lower-case alphabetic tokens.
# Digits and punctuation act as separators, so titles that differ only by a
# replicate number (e.g. ..._R1 / ..._R2) end up with the same token set.
# NOTE(review): tokens never contain digits, so a Category 1/2 label that
# contains a digit (labels such as "rep1" exist in frequent_terms) could never
# match a title token - confirm this is intended.
bulk_study_meta["sample_title_set"] = (
    bulk_study_meta["sample_title"]
    .str.lower()
    .str.findall(r"[a-z]+")
    .apply(frozenset)
)

# Vocabularies of control (Category 1) and perturbation (Category 2) terms.
ctrl_set = set(frequent_terms.loc[frequent_terms["Category"].eq(1), "Label"])
pert_set = set(frequent_terms.loc[frequent_terms["Category"].eq(2), "Label"])

In [10]:
# Display the sample metadata table (pandas truncates the rendering of long frames).
bulk_study_meta

Unnamed: 0,series_id,geo_accession,singlecellprobability,sample_title,h5_idx,sample_title_set
0,GSE29282,GSM1000981,0.007336,OCI-LY1_48hrs_mRNAseq_3x_siNT_R1,0,"(hrs, r, ly, oci, x, mrnaseq, sint)"
1,GSE29282,GSM1000982,-0.006492,OCI-LY1_48hrs_mRNAseq_3x_siNT_R2,1,"(hrs, r, ly, oci, x, mrnaseq, sint)"
2,GSE29282,GSM1000983,-0.006492,OCI-LY1_48hrs_mRNAseq_3x_siNT_R3,2,"(hrs, r, ly, oci, x, mrnaseq, sint)"
3,GSE29282,GSM1000984,0.007336,OCI-LY1_48hrs_mRNAseq_3x_siBCL6_R1,3,"(hrs, r, sibcl, ly, oci, x, mrnaseq)"
4,GSE29282,GSM1000985,0.030632,OCI-LY1_48hrs_mRNAseq_3x_siBCL6_R2,4,"(hrs, r, sibcl, ly, oci, x, mrnaseq)"
...,...,...,...,...,...,...
344268,GSE40705,GSM999533,0.003373,PolyA,722387,(polya)
344269,GSE40705,GSM999534,0.004548,Total,722388,(total)
344270,GSE40705,GSM999535,0.006154,TruSeq,722389,(truseq)
344271,GSE40705,GSM999536,0.046501,NuGEN 1i,722390,"(nugen, i)"


In [11]:
def categorize_cp(test_set, ctrl_set = ctrl_set, pert_set = pert_set):
    '''
    Label one sample as control or perturbation from its title keywords.

    test_set : set-like of title tokens for a single sample
    ctrl_set : set of control terms (default bound when this function is defined)
    pert_set : set of perturbation terms (default bound when this function is defined)

    Returns
        0 - undetermined (equal number of control and perturbation hits, including none)
        1 - control group (more control hits)
        2 - perturbation group (more perturbation hits)
    '''
    n_ctrl_hits = len(test_set & ctrl_set)
    n_pert_hits = len(test_set & pert_set)
    if n_ctrl_hits > n_pert_hits:
        return 1
    if n_pert_hits > n_ctrl_hits:
        return 2
    return 0

In [12]:
# Element-wise label per sample: 0 = undetermined, 1 = control, 2 = perturbation.
bulk_study_meta["cp_group"] = bulk_study_meta["sample_title_set"].map(categorize_cp)

In [13]:
len(bulk_study_meta["series_id"].unique()) # number of distinct series_id values (studies) before filtering

19665

In [14]:
# Keep only the studies in which every distinct title-token set is shared by at
# least two samples, i.e. every title-defined subgroup has >= 2 members.
no_undet = [
    gse_id
    for gse_id, title_sets in bulk_study_meta.groupby("series_id")["sample_title_set"]
    if (title_sets.value_counts() >= 2).all()
]

# Independent copy of the retained rows so later edits do not touch bulk_study_meta.
no_undet_study = bulk_study_meta.loc[bulk_study_meta["series_id"].isin(no_undet)].copy()

In [15]:
len(no_undet)                            # number of studies kept by the >= 2 samples per title set filter

13007

In [None]:
#gse_per_term = []                                                                              # counts unique gse for each 
#def det_gse(test_set, term): return 1 if len(test_set & {term}) > 0 else 0                     # control or perturbation term
#for i in frequent_terms["Label"]:
#    bulk_study_meta["det_gse"] = bulk_study_meta["sample_title_set"].apply(det_gse, term=i)
#    gse_per_term.append(len(bulk_study_meta.loc[bulk_study_meta["det_gse"] == 1]["series_id"].unique()))
#    bulk_study_meta["det_gse"] = np.zeros(162372)
    
#frequent_terms["num_gse"] = gse_per_term
#frequent_terms.to_csv(species+"_frequent_terms_gse_count.csv", index=False)

In [None]:
#def gsm_title_cluster(series, cluster_esp = 0.01, cluster_min_samp = 2):                     # grouping gsm by clustering words
#    mlb = MultiLabelBinarizer()
#    term_mat = mlb.fit_transform(series["sample_title_set"])
#    if 1 in term_mat.shape: return np.zeros(len(series)).astype(int)
#    pca = PCA()
#    pca.fit(term_mat.T)
#    clustering = DBSCAN(eps=0.01, min_samples=2)
#    #print(list(zip(pca.components_[0], pca.components_[1])))
#    #clustering.fit(list(zip(pca.components_[0], pca.components_[1])))
#    clustering.fit(pca.components_[0].reshape(-1, 1))
#    return clustering.labels_


# for gse in no_undet_study["series_id"].unique():
#    series = no_undet_study.loc[no_undet_study["series_id"] == gse]
#    no_undet_study.loc[no_undet_study["series_id"] == gse, "gsm_cluster"] = gsm_title_cluster(series)

In [16]:
def count_series(series):
    '''
    Count the distinct sample subgroups of one study per cp_group label.

    A subgroup is a unique (cp_group, sample_title_set) combination among the
    rows of `series`. Each count is capped at 3, so 3 means "three or more".

    series : DataFrame holding the samples of one study; needs the columns
             "cp_group" (0/1/2) and "sample_title_set" (hashable token sets)

    Returns an (undet, ctrl, pert) tuple of capped subgroup counts.
    An empty input yields (0, 0, 0) instead of raising IndexError.
    '''
    if len(series) == 0:                   # nothing to group -> no subgroups of any kind
        return (0, 0, 0)

    # Each key is a (cp_group, sample_title_set) pair; only the label is tallied.
    subgroup_keys = series.groupby(["cp_group", "sample_title_set"]).groups.keys()
    series_counter = Counter(cp_group for cp_group, _ in subgroup_keys)

                                          # Subgroup counts for:
    return (min(series_counter[0], 3),    # undetermined group
            min(series_counter[1], 3),    # control group
            min(series_counter[2], 3))    # perturbation group

In [21]:
# Categorise each retained study as an (undet, ctrl, pert) tuple of subgroup counts.
# A single groupby pass hands every study its rows directly instead of re-scanning
# the whole frame with a boolean mask per study; sort=False keeps the studies in
# order of first appearance (the order .unique() produced), and passing the group
# count as `total` lets tqdm render a real progress bar.
gse_types = {}
study_groups = no_undet_study.groupby("series_id", sort=False)
for gse, series in tqdm(study_groups, total=study_groups.ngroups):
    gse_types[gse] = count_series(series)

categorized_gse = pd.DataFrame({"gse": list(gse_types.keys()),
                                "gse_type": list(gse_types.values())})

13007it [01:55, 112.52it/s]


In [22]:
# Frequency of each study category. The header names the tuple positions
# (? = undetermined, c = control, p = perturbation subgroups) and records that a
# count of 3 stands for "more than 2".
print("(?, c, p)    freq   3 := >2")
print(categorized_gse["gse_type"].value_counts().to_string())

(?, c, p)    freq   3 := >2
gse_type
(3, 0, 0)    3392
(2, 0, 0)    3149
(1, 0, 0)    1481
(1, 1, 0)    1294
(0, 1, 1)     491
(3, 1, 0)     410
(2, 1, 0)     394
(2, 2, 0)     341
(3, 3, 0)     197
(3, 2, 0)     197
(1, 0, 1)     181
(0, 0, 2)     131
(0, 2, 2)     129
(0, 0, 3)     116
(2, 0, 2)      73
(1, 1, 1)      70
(0, 1, 2)      59
(2, 0, 1)      57
(0, 0, 1)      51
(0, 1, 0)      50
(3, 0, 1)      49
(0, 3, 3)      48
(1, 2, 0)      43
(0, 2, 0)      42
(3, 0, 3)      40
(2, 1, 1)      34
(1, 0, 3)      33
(3, 0, 2)      33
(1, 0, 2)      31
(2, 3, 0)      30
(1, 1, 2)      27
(0, 1, 3)      27
(0, 3, 0)      24
(1, 3, 0)      23
(0, 2, 3)      22
(3, 1, 1)      22
(1, 2, 1)      21
(3, 3, 3)      21
(3, 2, 2)      15
(2, 2, 2)      14
(3, 2, 1)      13
(3, 1, 2)      12
(0, 2, 1)      11
(1, 1, 3)      11
(2, 0, 3)      10
(3, 3, 1)       8
(1, 3, 2)       8
(2, 1, 2)       8
(1, 2, 2)       7
(3, 2, 3)       7
(1, 3, 3)       6
(2, 2, 1)       6
(3, 3, 2)       6
(0, 3, 2)

In [23]:
def extrap_score(study_cat):
    '''
    Extrapolation score of a study category tuple (undet, ctrl, pert): one
    penalty point per property that has to be inferred (control group,
    perturbation group, direction). -1 marks combinations not accounted for.

      undet and pert both present:
        (1, 0, y)             ->  1   inferred ctrl
        (>1, 0, y)            -> -1
        (x, >=1, y)           -> -1   unaccounted for
      undet or pert absent:
        (0, 1, y)             ->  0   nothing inferred
        (y, 1, 0)             ->  1   inferred pert
        (0, 0, y)             ->  2   inferred ctrl + direction
        (y, 0, 0)             ->  3   inferred ctrl + direction and pert
        ctrl > 1              ->  0
    '''
    undet, ctrl, pert = study_cat
    has_undet = undet != 0

    # Undetermined and perturbation subgroups are both present: (x, c, y)
    if has_undet and pert != 0:
        if ctrl < 1:
            return 1 if undet == 1 else -1     # penalty for inferred ctrl
        return -1                              # unaccounted for

    # At least one of undet / pert is absent: (0, c, y) or (y, c, 0)
    if ctrl == 1:
        return 1 if has_undet else 0           # penalty for inferred pert
    if ctrl < 1:
        return 3 if has_undet else 2           # inferred ctrl, direction and/or pert
    return 0
        

def to_calc_sig_format(study_cat, studies = no_undet_study, categorized_gse = categorized_gse):
    '''
    Convert studies corresponding to an input study category type to 
    a signature-calculation-friendly dataframe format with rows: 
    gse, [ctrl_gsm0, ctrl_gsm1, ...], [pert_gsm0, pert_gsm1, ...], extrap_score

    study_cat       : (undet, ctrl, pert) tuple, matched against categorized_gse["gse_type"]
    studies         : sample-level frame; the columns series_id, cp_group,
                      sample_title_set and geo_accession are used
    categorized_gse : frame with the columns "gse" and "gse_type"
    (both frame defaults are the objects bound when this function is defined)

    Returns a frame indexed by series_id with the columns ctrl_gsm and pert_gsm
    (each cell a list of GSM accessions of one title subgroup) and extrap_score;
    a study contributes one row per ctrl-subgroup / pert-subgroup combination.
    '''
    undet, ctrl, pert = study_cat
    # Samples of every study whose category equals study_cat; copied because the
    # cp_group labels are rewritten in place below.
    study_group = studies.loc[studies.series_id.isin(
        categorized_gse.loc[categorized_gse["gse_type"] == study_cat]["gse"])].copy()
    if ctrl == 0 and undet == 1: #(1, 0, x): the single undetermined subgroup is taken as the control
        study_group.replace({'cp_group':{0:"ctrl_gsm", 2:"pert_gsm"}}, inplace=True)
    else:
        # otherwise labelled controls stay controls; undetermined and perturbation both become pert
        study_group.replace({'cp_group':{1:"ctrl_gsm", 0:"pert_gsm", 2:"pert_gsm"}}, inplace=True)
        
    # No labelled control and only one kind of subgroup (all undetermined or all
    # perturbation): relabel every other (series_id, sample_title_set) group,
    # starting with the first, as control.
    # NOTE(review): the [::2] stride runs over the whole grouped index rather than
    # per study, so it appears to rely on each study contributing exactly two
    # subgroups (as for (0, 0, 2) and (2, 0, 0)) - confirm before using other categories.
    if ctrl == 0 and (undet == 0 or pert == 0):
        study_cp_series = study_group.groupby(["series_id", "sample_title_set"])["cp_group"].unique()
        new_ctrl_idx = study_cp_series.index[::2]
        study_cp_series[study_cp_series.index.isin(new_ctrl_idx)] = "ctrl_gsm"
        # (series_id, sample_title_set) -> label, then mapped back onto every sample row
        study_group_map = dict(study_cp_series.explode())
        study_group = (study_group.assign(cp_group=study_group[["series_id","sample_title_set"]]
                                         .apply(tuple, axis=1).map(study_group_map)))
            
    return (study_group
            .groupby(["series_id", "cp_group","sample_title_set"])["geo_accession"]
            .aggregate(lambda x: list(x))                  # aggregates gsms as a list (ie [gsm1, gsm2, ...])
           #.aggregate(lambda x: "|".join(x))              # aggregates gsms separated by pipes (ie gsm1|gsm2|...)
            .unstack()                                     # innermost level (sample_title_set) -> columns
            .apply(lambda x: list(x.dropna()), axis=1)     # per (series_id, cp_group): list of the subgroup gsm lists
            .unstack()                                     # cp_group -> columns (ctrl_gsm, pert_gsm)
            .explode("ctrl_gsm")                           # one row per control subgroup
            .explode("pert_gsm")                           # ... and per perturbation subgroup
            .assign(extrap_score=extrap_score(study_cat))) # same score for every row of this category

In [24]:
# Study categories (undet, ctrl, pert) that are exported for signature calculation.
valid_types = [
    (0, 1, 1), (0, 1, 2), (0, 1, 3),
    (1, 1, 0), (2, 1, 0), (3, 1, 0),
    (0, 0, 2), (2, 0, 0),
    (1, 0, 1), (1, 0, 2), (1, 0, 3),
]

# Stack the per-category tables, turn the series_id index into a column and
# replace any tab inside a series id with "-" before writing the CSV.
gsm4sig = (
    pd.concat([to_calc_sig_format(study_type) for study_type in valid_types])
    .reset_index()
    .assign(series_id=lambda frame: frame["series_id"].replace(regex=r"\t", value="-"))
)
gsm4sig.to_csv(f"data/{species}_gsm4sig_v{gsm4sig_version}.csv")