In [1]:
# Import dependencies
import pandas as pd
import statistics
import math
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction

In [2]:
# Function for computing signatures with characteristic direction
def cd_signature(control, treatment, dataset, normalization, meta_class_column_name):
    tmp_normalization = normalization
    
    signatures = dict()
    
    signature_label = normalization
    
    cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==control, :].index.tolist() #control
    cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==treatment,:].index.tolist() #case
    
    signature = characteristic_direction(dataset[tmp_normalization].loc[:, cls1_sample_ids], dataset[normalization].loc[:, cls2_sample_ids], calculate_sig=True)
    signature = signature.sort_values("CD-coefficient", ascending=False)
    
    signatures[signature_label] = signature
    
    return signatures

In [3]:
# Function for computing signatures with characteristic direction
def logFC(control, treatment, dataset, normalization, meta_class_column_name):
    tmp_normalization = normalization
    
    signatures = dict()
    
    signature_label = normalization
    
    cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==control, :].index.tolist() #control
    cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==treatment,:].index.tolist() #case
    
    values = []
    for i in range(len(expr_df)):
        case_mean = statistics.mean(expr_df.iloc[i][cls2_sample_ids])
        control_mean = statistics.mean(expr_df.iloc[i][cls1_sample_ids])
        if case_mean == 0 or control_mean == 0:
            values.append('NA')
        else:
            values.append(math.log(case_mean/control_mean,2))

    signature = pd.DataFrame(values, columns = ['logFC'])
    signature.index = expr_df.index
    signature = signature[signature['logFC'] != 'NA']
    signature = signature.sort_values("logFC", ascending=False)
    
    signatures[signature_label] = signature
    
    return signatures

# GSE159084

In [7]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control'
treatment = 'dex'
meta_data_filename = 'GSE159084_metadata.txt'
rnaseq_data_filename = 'GSE159084_Gene_Expression_Profiling.xlsx'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_excel(rnaseq_data_filename).sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
expr_df = expr_df.set_index('gene_short_name')
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [5]:
expr_df

Sample_title,CON1_FPKM,CON2_FPKM,CON3_FPKM,CON4_FPKM,DEX1_FPKM,DEX2_FPKM,DEX3_FPKM,DEX4_FPKM
0,6.088300,4.693950,6.546880,5.157240,4.890360,3.303080,3.485380,0.823810
2,0.613928,0.189342,0.404009,0.097095,0.377022,0.629754,0.287443,0.609918
4,995.610000,571.503000,690.374000,637.734000,672.273000,678.245000,649.617000,354.162000
5,10.451900,16.212900,6.084100,7.135920,10.153800,13.068000,11.227300,10.321400
6,0.369604,0.896261,1.802320,0.072348,1.011430,1.205760,0.319378,0.890901
...,...,...,...,...,...,...,...,...
22551,0.575961,0.372559,0.342286,0.359761,2.561410,0.000000,0.197913,0.000000
22552,1.748870,1.560420,1.899630,2.088580,1.539570,1.206660,0.477881,1.144170
22555,1.957540,1.818660,2.289500,1.227500,2.730500,3.532190,2.087320,3.345870
22556,4.931240,4.676480,6.855870,5.807760,4.910870,5.717560,3.574900,1.565490


In [8]:
# compute signature
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE159084.txt', sep="\t")

# GSE163065

In [12]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control'
treatment = 'dex'
meta_data_filename = 'GSE163065_metadata.txt'
rnaseq_data_filename = 'GSE163065_counts_RPMs_RPKMs.xlsx'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_excel(rnaseq_data_filename).sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
expr_df = expr_df.set_index('GeneName')
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [13]:
expr_df

Sample_title,NA-SD-19,NA-SD-23,NA-SD-27,NA-SD-21,NA-SD-25,NA-SD-29
GeneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0610009B22Rik,10.249212,8.887646,8.911430,8.453046,9.798075,9.361375
0610009E02Rik,0.450035,0.629510,0.600657,0.473945,0.514389,0.626822
0610009L18Rik,6.792555,6.689296,7.051307,2.790350,2.573155,2.951682
0610010F05Rik,5.297814,5.220476,5.595217,15.850395,15.601677,16.558084
0610010K14Rik,1.024511,0.997721,1.038536,1.208253,1.456840,1.212021
...,...,...,...,...,...,...
mt-Ts1,8.196085,8.529308,43.756983,5.269952,3.798602,8.462547
mt-Tt,68.993909,62.114822,78.370716,84.665324,82.753702,105.425325
mt-Tw,225.698161,222.107544,274.787321,226.588286,235.622359,265.109552
mt-Ty,0.733978,0.000000,0.000000,0.542726,0.601845,0.562268


In [14]:
# compute signature
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE163065.txt', sep="\t")

# GSE186950

In [16]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control'
treatment = 'dex'
meta_data_filename = 'GSE186950_metadata.txt'
rnaseq_data_filename = 'GSE186950_NormCount.xlsx'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_excel(rnaseq_data_filename).sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
expr_df = expr_df.set_index('Gene_Name')
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [17]:
expr_df

Sample_title,AF25,AF26,AF29,AF25DEX,AF26DEX,AF29DEX
Gene_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1BG-AS1,101.238412,69.133212,85.371425,75.057577,76.011260,57.365282
A2M,947.239407,974.232499,1287.837024,792.393564,698.353451,510.029505
A2M-AS1,8.803340,24.560483,5.449240,3.216753,14.252111,0.000000
AAAS,572.217114,402.973854,455.011530,494.307758,597.400996,493.341423
AACS,492.106718,352.033592,316.055913,551.137066,477.445726,398.427957
...,...,...,...,...,...,...
ZXDA,47.538037,57.307794,35.420059,28.950780,45.131686,32.333159
ZXDB,85.392400,140.085719,103.535558,60.046062,67.697528,61.537302
ZXDC,450.731019,448.456230,479.533109,572.582088,516.639032,517.330541
ZYG11B,377.663295,441.179050,340.577492,580.087846,427.563337,470.395311


In [18]:
# compute signature
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE186950.txt', sep="\t")

# GSE189182

In [22]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control'
treatment = 'dex'
meta_data_filename = 'GSE189182_metadata.txt'
rnaseq_data_filename = 'GSE189182_readcount_in_vivo.xlsx'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_excel(rnaseq_data_filename).sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
expr_df = expr_df.set_index('gene_name')
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [31]:
expr_df = expr_df.iloc[21: , :]

In [32]:
# compute signature
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE189182.txt', sep="\t")

# GSE189305	

In [33]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control'
treatment = 'dex'
meta_data_filename = 'GSE189305_metadata.txt'
rnaseq_data_filename = 'GSE189305_readcount_in_vitro.xlsx'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_excel(rnaseq_data_filename).sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
expr_df = expr_df.set_index('gene_name')
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [37]:
expr_df = expr_df.iloc[19: , :]

In [38]:
# compute signature
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE189305.txt', sep="\t")

# GSE195617

In [55]:
control_counts = pd.read_csv('GSM5841784_Controlrawcounts.txt', sep=" ", header=None).sort_index()
control_counts = control_counts.rename(columns={0:'gene_id', 1:'GSM5841784'})
control_counts = control_counts.set_index('gene_id')
control_counts

Unnamed: 0_level_0,GSM5841784
gene_id,Unnamed: 1_level_1
ENSG00000223972,0
ENSG00000227232,892
ENSG00000243485,0
ENSG00000237613,0
ENSG00000268020,0
...,...
ENSG00000198695,1569
ENSG00000210194,7
ENSG00000198727,5022
ENSG00000210195,0


In [59]:
dex_counts = pd.read_csv('GSM5841787_Dexamethasone_500uMrawcounts.txt', sep=" ", header=None).sort_index()
dex_counts = dex_counts.rename(columns={0:'gene_id', 1:'GSM5841787'})
dex_counts = dex_counts.set_index('gene_id')
dex_counts

Unnamed: 0_level_0,GSM5841787
gene_id,Unnamed: 1_level_1
ENSG00000223972,0
ENSG00000227232,762
ENSG00000243485,0
ENSG00000237613,0
ENSG00000268020,0
...,...
ENSG00000198695,1103
ENSG00000210194,5
ENSG00000198727,2843
ENSG00000210195,0


In [61]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control'
treatment = 'dex'
meta_data_filename = 'GSE195617_metadata.txt'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
expr_df = pd.concat([control_counts, dex_counts], axis=1)
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [67]:
# compute signature
signature = logFC(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE195617.txt', sep="\t")

# GSE176277

In [8]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control mut'
treatment = 'dex mut'
meta_data_filename = 'GSE176277_metadata.txt'
rnaseq_data_filename = 'GSE176277_counts.txt'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [11]:
# signatures for mutated mice
control_name = 'control mut'
treatment = 'dex mut'
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE176277_mut.txt', sep="\t")

In [12]:
# signatures for wildtype mice
control_name = 'control wt'
treatment = 'dex wt'
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE176277_wt.txt', sep="\t")

# GSE159952

In [4]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control'
treatment = 'dex'
meta_data_filename = 'GSE159952_metadata.txt'
rnaseq_data_filename = 'GSE159952_all.tpm.txt'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [6]:
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE159952.txt', sep="\t")

# GSE151572

In [3]:
# load in data 
meta_class_column_name = 'Treatment'
control_name = 'control 24h'
treatment = 'dex 24h'
meta_data_filename = 'GSE151572_metadata.txt'
rnaseq_data_filename = 'GSE151572_expressed_gene_reads.txt'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [4]:
meta_df

Unnamed: 0_level_0,Treatment
Sample_title,Unnamed: 1_level_1
Con_24h_1,control 24h
Con_24h_2,control 24h
Con_24h_3,control 24h
Con_6h_1,control 6h
Con_6h_2,control 6h
Con_6h_3,control 6h
Dex_24h_1,dex 24h
Dex_24h_2,dex 24h
Dex_24h_3,dex 24h
Dex_6h_1,dex 6h


In [5]:
expr_df

Sample_title,Con_24h_1,Con_24h_2,Con_24h_3,Con_6h_1,Con_6h_2,Con_6h_3,Dex_24h_1,Dex_24h_2,Dex_24h_3,Dex_6h_1,Dex_6h_2,Dex_6h_3
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSRNOG00000000001,17,16,7,9,11,8,18,15,6,22,20,24
ENSRNOG00000000007,3,9,14,17,37,10,18,23,40,8,31,28
ENSRNOG00000000009,0,2,2,0,0,0,2,0,0,0,0,0
ENSRNOG00000000010,2,6,4,0,22,4,0,1,2,3,3,8
ENSRNOG00000000012,13,10,4,9,12,12,31,3,21,37,14,19
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSRNOG00000061925,44,15,25,16,21,21,35,25,16,71,34,37
ENSRNOG00000061928,1065,1422,1373,1371,1356,1430,1251,1139,1494,1321,1193,1810
ENSRNOG00000061934,1,2,0,0,0,1,0,3,1,13,6,5
ENSRNOT00000072207,383,65,31,82,141,70,175,99,89,505,85,130


In [11]:
# 24 hour signatures
control_name = 'control 24h'
treatment = 'dex 24h'
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE151572_24h.txt', sep="\t")

In [12]:
# 6 hour signatures
control_name = 'control 6h'
treatment = 'dex 6h'
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes'].to_csv('GSE151572_6h.txt', sep="\t")

# GSE149113

In [94]:
# load in data
meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'vehicle'
treatment = 'dexamethasone'
meta_data_filename = 'GSE149113_series_matrix.txt'
rnaseq_data_filename = 'GSE149113_NormCount.xlsx'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_excel(rnaseq_data_filename).sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
expr_df = expr_df.set_index('Gene_Name')
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [96]:
# compute signatures
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes']

Unnamed: 0_level_0,CD-coefficient,Significance
Gene_Name,Unnamed: 1_level_1,Unnamed: 2_level_1
RMRP,0.262771,-0.250352
FKBP5,0.222084,-0.202657
HIST1H1B,0.217820,-0.229088
MT-RNR2,0.203790,-0.272381
IL7R,0.190625,-0.241421
...,...,...
CTSW,-0.093907,-0.154090
CD44,-0.096249,-0.126348
BTG2,-0.105324,-0.104364
IL2RB,-0.132847,-0.241186


In [97]:
signature['rawdata+filter_genes'].to_csv("GSE149113.txt", sep="\t")

# GSE141967

In [85]:
# load in data
meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'treatment condition: VEH'
treatment = 'treatment condition: DEX'
meta_data_filename = 'GSE141967_series_matrix'
rnaseq_data_filename = 'GSE141967_wandler_murine_tall_fpkm'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

Error! Please load a metadata file in txt or tsv format
Error! Please load a RNA-seq expression file in txt or tsv format


KeyError: "None of ['Gene_Name'] are in the columns"

In [69]:
# compute signatures
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes']

Unnamed: 0,CD-coefficient,Significance
GM26870,0.503758,-0.099836
GM15662,0.276006,-0.091508
RNU3B3,0.243052,-0.102627
RNU3B1,0.243052,-0.102651
RMRP,0.160266,-0.103765
...,...,...
RNU2-10,-0.095053,-0.102675
GM23971,-0.095053,-0.097549
RNU3B2,-0.105754,-0.102639
RNU3B4,-0.105754,-0.102615


In [70]:
signature['rawdata+filter_genes'].to_csv("GSE141967.txt", sep="\t")

# GSE137535

In [61]:
# load in data
meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'treatment: No Drug'
treatment = 'treatment: DEX'
meta_data_filename = 'GSE137535_series_matrix.txt'
rnaseq_data_filename = 'GSE137535_MB231_DEXwSH454TreamtentsFeatureCountsedited_SH454wDEX.counts.txt'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [64]:
meta_df

Unnamed: 0_level_0,Sample_characteristics_ch1
Sample_geo_accession,Unnamed: 1_level_1
GSM4081451,treatment: No Drug
GSM4081452,treatment: No Drug
GSM4081453,treatment: No Drug
GSM4081442,treatment: SH5-54
GSM4081443,treatment: SH5-54
GSM4081444,treatment: SH5-54
GSM4081445,treatment: DEX
GSM4081446,treatment: DEX
GSM4081447,treatment: DEX
GSM4081448,treatment: DEX+SH4-54


In [65]:
# compute signatures
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes']

Unnamed: 0_level_0,CD-coefficient,Significance
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1
THBS1,0.689267,-0.000284
SERPINE1,0.231069,-0.015688
ANKRD1,0.182091,0.023851
ANXA2,0.116774,0.025859
ACTB,0.112534,0.009169
...,...,...
VIM,-0.070506,0.001158
PKM,-0.086295,-0.036769
ITGB1,-0.126496,-0.033229
FTL,-0.142021,-0.003442


In [66]:
signature['rawdata+filter_genes'].to_csv("GSE137535.txt", sep="\t")

# GSE119092

In [20]:
# load in data
meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'treatment: control siRNA'
treatment = 'treatment: control siRNA + DEX'
meta_data_filename = 'GSE119092_series_matrix.txt'
rnaseq_data_filename = 'GSE119092_gene-fpkm-table.txt'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [21]:
meta_df

Unnamed: 0_level_0,Sample_title,Sample_characteristics_ch1
Sample_geo_accession,Unnamed: 1_level_1,Unnamed: 2_level_1
GSM3357729,control siRNA [RNA-seq],treatment: control siRNA
GSM3357730,control siRNA + DEX [RNA-seq],treatment: control siRNA + DEX
GSM3357731,GR siRNA [RNA-seq],treatment: GR siRNA
GSM3357732,GR siRNA + DEX [RNA-seq],treatment: GR siRNA + DEX


In [22]:
expr_df

Sample_geo_accession,GSM3357729,GSM3357730,GSM3357731,GSM3357732
0610007P14Rik,34.51470,36.129600,31.53560,31.204600
0610009B22Rik,18.61930,18.948000,17.74630,22.355400
0610009L18Rik,1.12781,0.917989,1.30297,0.984707
0610009O20Rik,27.28910,25.834000,23.02970,22.621500
0610010F05Rik,7.39597,7.555420,7.45545,7.611530
...,...,...,...,...
Zyg11b,6.92010,6.446620,6.46096,6.257910
Zyx,158.55200,154.454000,186.03500,172.987000
Zzef1,8.35275,7.479040,9.22757,7.883160
Zzz3,12.25630,11.795000,10.57260,10.678900


In [30]:
# compute signatures for dex pertubation
signature = logFC(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes']

Unnamed: 0,logFC
Apoo-ps,4.892671
Mir6403,2.650563
Penk,2.338619
Defb25,2.287987
Sap25,2.247801
...,...
Mir6363,-1.841295
Hist1h4a,-2.104328
1810012K16Rik,-2.326717
Tmem190,-2.445362


In [31]:
signature['rawdata+filter_genes'].to_csv("GSE119092_dex.txt", sep="\t")

In [32]:
# signatures for GR KO
control_name = 'treatment: control siRNA'
treatment = 'treatment: GR siRNA'
signature = logFC(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes']

Unnamed: 0,logFC
Apoo-ps,5.924251
Rps27rt,5.148558
Defb25,3.124452
Penk,2.781566
Arxes2,2.682746
...,...
Lcn2,-3.274058
Rmrp,-3.276428
Hist2h4,-3.575981
Mir546,-3.75073


In [33]:
signature['rawdata+filter_genes'].to_csv("GSE119092_gr.txt", sep="\t")

# GSE186104

In [16]:
# load in data 
meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'treatment: standard cultivation medium'
treatment = 'treatment: standard cultivation medium + dexamethason'
meta_data_filename = 'GSE186104_series_matrix.txt'
rnaseq_data_filename = 'GSE186104_cross_tabulation_of_gene_expression.txt'
low_expression_threshold = 0.3

try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except:
    print("Error! Please load a metadata file in txt or tsv format")
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except:
    print("Error! Please load a RNA-seq expression file in txt or tsv format")
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    print(f"Error! Column '{meta_class_column_name}' is not in metadata")
    
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

# sort metadata by class labels; control first
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
try:
    assert(meta_df.shape[0]==expr_df.shape[1])
except:
    print("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [17]:
meta_df

Unnamed: 0_level_0,Sample_title,Sample_characteristics_ch1
Sample_geo_accession,Unnamed: 1_level_1,Unnamed: 2_level_1
GSM5632354,C1: Untreated control replicate 1,treatment: standard cultivation medium
GSM5632355,C2: Untreated control replicate 2,treatment: standard cultivation medium
GSM5632356,C3: Untreated control replicate 3,treatment: standard cultivation medium
GSM5632357,Dex1: Dexamethason treated sample replicate 1,treatment: standard cultivation medium + dexam...
GSM5632358,Dex2: Dexamethason treated sample replicate 2,treatment: standard cultivation medium + dexam...
GSM5632359,Dex3: Dexamethason treated sample replicate 3,treatment: standard cultivation medium + dexam...


In [18]:
expr_df

Sample_geo_accession,GSM5632354,GSM5632355,GSM5632356,GSM5632357,GSM5632358,GSM5632359
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1i3,160,60,196,328,274,287
A2m,32,15,19,62,65,36
A3galt2,80,36,75,68,70,47
A4galt,0,0,2,0,1,2
Aaas,541,306,506,295,298,209
...,...,...,...,...,...,...
l7Rn6,392,199,385,174,185,142
mrpl11,374,218,233,144,192,133
mrpl24,611,344,560,222,316,222
mrpl9,1048,598,922,505,596,399


In [19]:
# compute signatures
signature = cd_signature(control_name, treatment, dataset, 'rawdata+filter_genes', meta_class_column_name)
signature['rawdata+filter_genes']

Unnamed: 0_level_0,CD-coefficient,Significance
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
H19,0.540918,0.194621
Col6a3,0.355815,0.127919
Acan,0.233381,0.012220
Igf2,0.148825,0.197593
Col2a1,0.134666,0.127396
...,...,...
Serpinh1,-0.111309,0.056964
RGD1566401,-0.116253,0.076178
Col9a1,-0.131756,0.128181
Col27a1,-0.136371,0.127321


In [14]:
signature['rawdata+filter_genes'].to_csv("GSE186104.txt", sep="\t")