AR signature in Python
Converted from Artem Sokolov and Vlado Uzunangelov's R code.

In [94]:
import pandas as pd
import numpy as np
# Suppress pandas' chained-assignment (copy-vs-view) warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)


In [33]:
# Input files; all three are read as tab-delimited tables further down.
# X_file: loaded into X_df (a "Gene Symbol" column plus per-sample columns is what data_prep expects).
X_file = "data/AR_sig_celline.tab"
# Y_file: loaded into Y_df (data_prep uses its "GSM_ID" and "ligand" columns).
Y_file = "data/ar_cell_line_clin.txt"
# X_WCDT_file: read inside data_prep; its "gene_id" column restricts which genes are kept.
X_WCDT_file = "data/filtered_mrna.tab"

In [43]:
# Load the two tab-separated input tables: expression values (X) and sample annotations (Y).
X_df = pd.read_csv(X_file, sep="\t")
Y_df = pd.read_csv(Y_file, sep="\t")

In [95]:
def data_prep(X_df, Y_df, wcdt_file=None):
    """Build matching expression (X) and ligand-label (Y) tables.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Expression table with a "Gene Symbol" column and one column per
        sample ID.
    Y_df : pandas.DataFrame
        Sample annotations; only the "GSM_ID" and "ligand" columns are used.
    wcdt_file : str or path-like, optional
        Tab-delimited file with a "gene_id" column; only genes listed there
        are kept. When omitted, the notebook-level ``X_WCDT_file`` is used.

    Returns
    -------
    X_orig_df : pandas.DataFrame
        Filtered expression values indexed by "Gene Symbol".
    X_df : pandas.DataFrame
        Same shape as ``X_orig_df``; every sample column is rank-transformed
        and mapped through ``-log(1 - (rank - 1) / n_genes)``.
    Y_df : pandas.DataFrame
        "GSM_ID" / "ligand" rows of the retained samples, with "on" recoded
        to 1 and "no ligand" to 0; other labels are left unchanged. Rows keep
        their input order, and the sample columns of both X frames follow
        that same order (one column per distinct GSM_ID).

    Neither input frame is modified.
    """
    if wcdt_file is None:
        # Fall back to the notebook-level path for callers that pass only X and Y.
        wcdt_file = X_WCDT_file

    # Work on an explicit copy so the recoding below never writes into a
    # slice of the caller's frame (no chained-assignment warning to silence).
    Y_df = Y_df[["GSM_ID", "ligand"]].copy()
    Y_df["ligand"] = Y_df["ligand"].replace({"on": 1, "no ligand": 0})

    # Drop sample GSM288299, GSM288300, GSM288301 (QA by Robert and Jack)
    Y_df = Y_df[~Y_df["GSM_ID"].isin(["GSM288299", "GSM288300", "GSM288301"])]

    # Keep only samples present in both tables. The X columns are laid out in
    # Y's row order so that column i of X and row i of Y are the same sample;
    # sorting the IDs would reorder X but not Y and misalign the two.
    sample_columns = set(X_df.columns) - {"Gene Symbol"}
    Y_df = Y_df[Y_df["GSM_ID"].isin(sample_columns)]
    common_samples = list(dict.fromkeys(Y_df["GSM_ID"]))  # ordered, de-duplicated
    X_df = X_df[["Gene Symbol"] + common_samples]

    # remove genes with NA
    X_df = X_df.dropna()

    # restrict to genes that are also listed in the WCDT file
    WCDT_df = pd.read_csv(wcdt_file, sep="\t")
    X_df = X_df[X_df["Gene Symbol"].isin(set(WCDT_df["gene_id"]))]

    # rank transformation + normalization (with quantile function of exponential distribution)
    X_orig_df = X_df.set_index("Gene Symbol")
    n_genes = len(X_orig_df)
    # (rank - 1) / n_genes is at most (n_genes - 1) / n_genes, so the log
    # argument stays strictly positive.
    scaled_rank = (X_orig_df.rank() - 1) / n_genes
    X_df = -np.log(1 - scaled_rank)

    return X_orig_df, X_df, Y_df

In [96]:
# Keep the normalized matrix (X) and the labels (Y); the un-normalized matrix is discarded.
_, X, Y = data_prep(X_df=X_df, Y_df=Y_df)
print(Y)

       GSM_ID  ligand
0    GSM63051       0
1    GSM64845       0
2    GSM64852       1
3    GSM64855       0
4    GSM64858       0
5    GSM64859       1
6    GSM64861       0
7    GSM64864       0
8    GSM64865       1
9   GSM560828       0
10  GSM560829       0
11  GSM560830       0
12  GSM560831       1
13  GSM560832       1
14  GSM560833       1
15  GSM560834       0
16  GSM560835       0
17  GSM560836       0
18  GSM560837       1
19  GSM560838       1
20  GSM560839       1
33  GSM801007       0
34  GSM801008       1
35  GSM801009       0
36  GSM801010       1
37  GSM823844       1
38  GSM823845       0
39  GSM823846       0
40  GSM823847       0
41  GSM823848       1
42  GSM823849       1
43  GSM823850       1
44  GSM823851       0
45  GSM823852       0
46  GSM823853       1
47  GSM426518       0
48  GSM426519       0
49  GSM426520       0
50  GSM426521       1
51  GSM426522       1
52  GSM426523       1
