In [11]:
# Import dependencies
import pandas as pd
from maayanlab_bioinformatics.dge.characteristic_direction import characteristic_direction

In [1]:
# Function for computing signatures with characteristic direction
def cd_signature(control, treatment, dataset, normalization, meta_class_column_name):
    """Compute a characteristic-direction signature for treatment vs. control.

    Parameters
    ----------
    control
        Value in the ``meta_class_column_name`` column that marks control samples.
    treatment
        Value in the ``meta_class_column_name`` column that marks treated (case) samples.
    dataset : dict
        Must contain ``"dataset_metadata"`` (a DataFrame indexed by sample ID)
        and ``dataset[normalization]`` (an expression DataFrame whose columns
        are selected by those sample IDs).
    normalization : str
        Key of the expression matrix inside ``dataset``; also used as the key
        of the returned dictionary.
    meta_class_column_name : str
        Name of the metadata column holding the class labels.

    Returns
    -------
    dict
        ``{normalization: signature}`` where ``signature`` is the result of
        ``characteristic_direction`` sorted by ``"CD-coefficient"`` in
        descending order.

    Raises
    ------
    ValueError
        If no sample carries the ``control`` label or no sample carries the
        ``treatment`` label.
    """
    metadata = dataset["dataset_metadata"]
    expression = dataset[normalization]
    class_labels = metadata[meta_class_column_name]

    control_ids = metadata.loc[class_labels == control, :].index.tolist()
    treatment_ids = metadata.loc[class_labels == treatment, :].index.tolist()

    # An empty group would otherwise be handed to characteristic_direction as
    # an empty frame; fail early with a message that names the bad label.
    for group_name, group_label, group_ids in (
        ("control", control, control_ids),
        ("treatment", treatment, treatment_ids),
    ):
        if not group_ids:
            raise ValueError(
                f"No samples found for {group_name} label {group_label!r} "
                f"in metadata column {meta_class_column_name!r}"
            )

    # Control columns go first, treatment columns second.
    signature = characteristic_direction(
        expression.loc[:, control_ids],
        expression.loc[:, treatment_ids],
        calculate_sig=True,
    )
    signature = signature.sort_values("CD-coefficient", ascending=False)

    return {normalization: signature}

# GSE186104

In [6]:
# Load the metadata and expression tables, align their samples, and build the
# `dataset` dictionary consumed by cd_signature.
meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'treatment: standard cultivation medium'
treatment = 'treatment: standard cultivation medium + dexamethason'
meta_data_filename = 'GSE186104_series_matrix.txt'
rnaseq_data_filename = 'GSE186104_cross_tabulation_of_gene_expression.txt'
low_expression_threshold = 0.3

# Stop at the first failed load. Printing and continuing would either hit a
# NameError further down or silently reuse stale frames left in the kernel.
try:
    meta_df = pd.read_csv(meta_data_filename, sep="\t", index_col=0, dtype=str)
except Exception as err:
    raise RuntimeError("Error! Please load a metadata file in txt or tsv format") from err
try:
    expr_df = pd.read_csv(rnaseq_data_filename, index_col=0, sep="\t").sort_index()
except Exception as err:
    raise RuntimeError("Error! Please load a RNA-seq expression file in txt or tsv format") from err
meta_df.index = meta_df.index.map(str)

# Match samples between the metadata and the datasets
if meta_class_column_name not in meta_df.columns:
    raise KeyError(f"Error! Column '{meta_class_column_name}' is not in metadata")

# .copy() so the later column assignment works on an independent frame
# rather than on a slice of the original metadata.
meta_df = meta_df.loc[meta_df.index.isin(expr_df.columns)].copy()

# sort metadata by class labels; control first
# Missing labels are dropped because they cannot be used as categories.
classes = list(meta_df[meta_class_column_name].dropna().unique())
missing_labels = [lbl for lbl in (control_name, treatment) if lbl not in classes]
if missing_labels:
    raise ValueError(
        f"Error! Class label(s) {missing_labels} not found in metadata column "
        f"'{meta_class_column_name}' for the samples shared with the expression file"
    )
classes.remove(control_name)
classes.insert(0, control_name)
meta_df = (
    meta_df
    .assign(tmp_class=pd.Categorical(meta_df[meta_class_column_name], classes))
    # A stable sort keeps the original sample order within each class.
    .sort_values('tmp_class', kind='mergesort')
    .drop(columns='tmp_class')
)
expr_df = expr_df.loc[:, meta_df.index]
# Rows sharing the same gene identifier are summed into one row.
expr_df = expr_df.groupby(expr_df.index).sum()
if meta_df.shape[0] != expr_df.shape[1]:
    raise ValueError("Error! Input files are in a wrong format.")

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['dataset_metadata'] = meta_df

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
# Keep a gene only if it exceeds the threshold in more than 2 samples.
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [7]:
# Show the sample metadata after it was restricted to samples present in the
# expression table and sorted with the control class first.
meta_df

Unnamed: 0_level_0,Sample_title,Sample_characteristics_ch1
Sample_geo_accession,Unnamed: 1_level_1,Unnamed: 2_level_1
GSM5632354,C1: Untreated control replicate 1,treatment: standard cultivation medium
GSM5632355,C2: Untreated control replicate 2,treatment: standard cultivation medium
GSM5632356,C3: Untreated control replicate 3,treatment: standard cultivation medium
GSM5632357,Dex1: Dexamethason treated sample replicate 1,treatment: standard cultivation medium + dexam...
GSM5632358,Dex2: Dexamethason treated sample replicate 2,treatment: standard cultivation medium + dexam...
GSM5632359,Dex3: Dexamethason treated sample replicate 3,treatment: standard cultivation medium + dexam...


In [8]:
# Show the expression matrix that remains after the non-expressed and
# lowly-expressed gene filters.
expr_df

Sample_geo_accession,GSM5632354,GSM5632355,GSM5632356,GSM5632357,GSM5632358,GSM5632359
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A1i3,160,60,196,328,274,287
A2m,32,15,19,62,65,36
A3galt2,80,36,75,68,70,47
A4galt,0,0,2,0,1,2
Aaas,541,306,506,295,298,209
...,...,...,...,...,...,...
l7Rn6,392,199,385,174,185,142
mrpl11,374,218,233,144,192,133
mrpl24,611,344,560,222,316,222
mrpl9,1048,598,922,505,596,399


In [13]:
# Compute the characteristic-direction signature on the filtered expression
# matrix, then display the table stored under that matrix's key.
signature = cd_signature(
    control=control_name,
    treatment=treatment,
    dataset=dataset,
    normalization='rawdata+filter_genes',
    meta_class_column_name=meta_class_column_name,
)
signature['rawdata+filter_genes']

Unnamed: 0_level_0,CD-coefficient,Significance
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
H19,0.540918,0.064525
Col6a3,0.355815,0.055393
Acan,0.233381,0.007798
Igf2,0.148825,0.061754
Col2a1,0.134666,0.055251
...,...,...
Serpinh1,-0.111309,0.011136
RGD1566401,-0.116253,0.012981
Col9a1,-0.131756,0.055465
Col27a1,-0.136371,0.055230


In [14]:
# Write the signature table (sorted by CD-coefficient) to a tab-separated file
# named after the GEO series.
signature['rawdata+filter_genes'].to_csv("GSE186104.txt", sep="\t")