In [None]:
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt

### FPKM data file
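File format (CSV)

Columns: one column per gene, plus a last column named `label` holding the sample labels (the first column of the file is read as the row index)

Rows: samples

Values: FPKM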


In [None]:
# FPKM matrix: samples as rows, genes as columns, plus a `label` column.
datafile = '/path/to/FPKM_datafile.csv'
data = pd.read_csv(datafile, index_col=0)

# Fail early: the gene-signature selection below groups samples by `label`,
# so a file without that column would otherwise only error deep inside the loop.
assert 'label' in data.columns, "FPKM data file must contain a 'label' column"

data.head()

In [None]:
class Select_GS_by_FC:
    """Select gene signatures from per-model SHAP top-gene lists.

    A (label, gene) pair becomes a candidate when it appears in at least
    ``mfreq`` rows across the SHAP files. Each candidate is then scored by
    comparing the gene's median expression in its own label (the "control")
    against every other label.

    Parameters
    ----------
    data : pandas.DataFrame
        Expression table with one column per gene and a ``label`` column.
    shap_dir : str
        Directory containing the SHAP top-gene files.
    outdir : str
        Directory where ``IterX_GS_FC.csv`` is written (created if missing).
    mfreq : int
        Minimum number of rows (across all files) in which a (label, gene)
        pair must appear to be kept.
    fc : float
        log2 fold-change threshold; a label is counted when
        ``log2(median_label / median_control) >= fc``.
    prefix : str
        Only files whose name starts with this prefix are read.
    suffix : str
        Literal file-name ending stripped to obtain the model name.
    min_exp : float
        Minimum median expression for the control label and for any label
        that takes part in the fold-change comparison.
    max_count : int
        Candidates whose enrichment count exceeds this value are discarded.
    """

    def __init__(self, data, shap_dir, outdir, mfreq=5, fc = -2, prefix='DNN', suffix='_SHAP_Top_20.txt',
                 min_exp=10, max_count=5):
        self.data = data
        self.shap_dir = shap_dir
        self.outdir = outdir
        self.mfreq = mfreq
        self.fc = fc
        self.prefix = prefix
        self.suffix = suffix
        self.min_exp = min_exp
        self.max_count = max_count

    def get_dataframe(self):
        """Combine the top genes of all models and apply the frequency threshold.

        Returns
        -------
        pandas.DataFrame
            Columns ``label``, ``gene_id``, ``models`` (row count of the pair
            across all files), and the placeholders ``mexp`` (-1) and
            ``mcount`` (-10).

        Raises
        ------
        ValueError
            If no file in ``shap_dir`` starts with ``prefix``.
        """
        # sorted() only makes the read order deterministic; counts do not depend on it
        files = sorted(f for f in os.listdir(self.shap_dir) if f.startswith(self.prefix))
        if not files:
            raise ValueError(
                'No files starting with %r found in %r' % (self.prefix, self.shap_dir)
            )

        frames = []
        for f in files:
            # Strip the suffix literally: treating it as a regex would let '.'
            # match any character and fail on names containing '(' or '['.
            if self.suffix and f.endswith(self.suffix):
                model = f[:-len(self.suffix)]
            else:
                model = f
            df = pd.read_csv(os.path.join(self.shap_dir, f), index_col=0)
            df.columns = ['label','gene_id','shap']
            df['models'] = model
            frames.append(df)

        topN = pd.concat(frames, axis=0).groupby(['label', 'gene_id'])['models'].count()
        topN = topN.reset_index()

        # applying frequency threshold; copy so the assignments below act on an
        # independent frame rather than on a filtered view
        topN = topN[topN['models'] >= self.mfreq].copy()
        topN['mexp'] = -1
        topN['mcount'] = -10

        return topN

    def get_enrichment_count_by_fc(self, gene, control):
        """Count the labels in which ``gene`` passes the fold-change threshold.

        Parameters
        ----------
        gene : str
            Column of ``self.data`` holding the gene's expression values.
        control : str
            Label whose median expression is the fold-change denominator.

        Returns
        -------
        tuple
            ``(count, mexp)`` where ``mexp`` is the median expression of
            ``gene`` in ``control``. ``count`` is -1 when ``mexp`` is below
            ``min_exp`` or is NaN; otherwise it is 1 (for the control itself)
            plus the number of other labels with median >= ``min_exp`` and
            ``log2(median / mexp) >= fc``.
        """
        medians = self.data.groupby('label')[gene].median()

        # median expression in the control label
        mexp = medians.loc[control]

        # gene should have median expression of at least min_exp in control;
        # written as "not >=" so that a NaN median is rejected as well
        if not mexp >= self.min_exp:
            return -1, mexp

        # keep only the other labels that are themselves expressed
        others = medians[medians >= self.min_exp].drop(control)

        # log2 fold change of every remaining label against the control
        log_fc = np.log2(others / mexp)

        # add 1 for self enrichment
        fc_count = int((log_fc >= self.fc).sum()) + 1

        return fc_count, mexp

    def get_gene_signatures(self):
        """Score every candidate gene, filter, sort, save and return the table.

        Writes ``IterX_GS_FC.csv`` into ``outdir`` and prints the number of
        unique genes retained.

        Returns
        -------
        pandas.DataFrame
            Candidates with ``mcount`` between 1 and ``max_count``, sorted by
            label, ascending ``mcount`` and descending ``mexp``.
        """
        x = self.get_dataframe()

        scores = [
            self.get_enrichment_count_by_fc(gene_id, control)
            for control, gene_id in zip(x['label'], x['gene_id'])
        ]
        # Assign whole columns at once so each gets the right dtype, instead of
        # writing floats cell by cell into an integer placeholder column.
        x['mexp'] = np.asarray([mexp for _, mexp in scores], dtype=float)
        x['mcount'] = np.asarray([mcount for mcount, _ in scores], dtype=int)

        x = x[x.mcount != -1]
        x = x[x.mcount <= self.max_count]
        x = x.sort_values(['label','mcount', 'mexp'], ascending=[True, True, False])

        print('Gene Signatures: ', len(x.gene_id.unique()) )
        os.makedirs(self.outdir, exist_ok=True)
        x.to_csv(os.path.join(self.outdir, 'IterX_GS_FC.csv'))

        return x





In [None]:
# Input: folder with the top-20 gene files per cancer from all models
datadir = '/path/to/datadir'

# Output: folder where results are written
outdir = '/path/to/outputdir'

gs_selection = Select_GS_by_FC(
    data=data,
    shap_dir=datadir,
    outdir=outdir,
)


In [None]:
# Run the selection: prints the number of unique signature genes, writes
# 'IterX_GS_FC.csv' into `outdir`, and returns the filtered, sorted table.
mx = gs_selection.get_gene_signatures()