In [1]:
import qnorm
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from IPython.display import HTML, display, Markdown
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
import os
# Build the gene-identifier lookup callable once, up front. Later cells call
# `lookup(<row id with everything after the first '.' removed>)` and group
# expression rows by the returned value.
# NOTE(review): presumably maps identifiers/synonyms to official NCBI gene
# symbols and may fetch reference data on first use — confirm against the
# maayanlab_bioinformatics documentation.
lookup = ncbi_genes_lookup()

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest='/dev/null'):
    ''' Temporarily redirect ``sys.stdout`` and/or ``sys.stderr`` to a file.

    Usage:
    with suppress_output():
        print('hi')

    Parameters:
        stdout: when true, ``sys.stdout`` is replaced by the sink inside the block.
        stderr: when true, ``sys.stderr`` is replaced by the sink inside the block.
        dest:   path opened in append mode that receives the redirected text.

    Only the ``sys.stdout`` / ``sys.stderr`` objects are reassigned; the
    underlying OS file descriptors are not touched. The previous stream
    objects are put back when the block exits, including when it raises,
    and the sink file is closed afterwards.
    '''
    # Capture the current streams before anything is swapped so the restore
    # step never depends on which branches below were taken.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    # The `with` guarantees the sink is flushed and closed; previously the
    # handle was left open for the garbage collector to deal with.
    with open(dest, 'a') as sink:
        try:
            if stdout:
                sys.stdout = sink
            if stderr:
                sys.stderr = sink
            yield
        finally:
            # Restore before the sink closes so nothing can write to a
            # closed file through sys.stdout / sys.stderr.
            if stdout:
                sys.stdout = saved_stdout
            if stderr:
                sys.stderr = saved_stderr

# Target Identification


In [4]:
# Differential expression of the GSE92592 IPF samples against the GSE92592
# control samples, followed by selection of the strongest candidates.

# Common prefix of every path read or written in this cell.
project_dir = "/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions"

# Load background (control) dataset. Row ids are cut at the first '.'
# (presumably an Ensembl version suffix — confirm) before being passed to
# `lookup`; rows that resolve to the same key are collapsed by their median.
df_bg_expr = pd.read_csv(project_dir + "/data/IPF/GSE92592_gene_counts_ctrl.txt", sep='\t', index_col=0)
df_bg_expr_genes = df_bg_expr.index.astype(str).map(lambda idx: lookup(idx.partition('.')[0]))
df_bg_expr = df_bg_expr.groupby(df_bg_expr_genes, observed=True).median()

# Load the IPF RNA-seq expression data and harmonise its row ids the same way.
df_expr = pd.read_csv(project_dir + "/data/IPF/GSE92592_gene_counts_IPF.txt", sep='\t', index_col=0)
df_expr_genes = df_expr.index.astype(str).map(lambda idx: lookup(idx.partition('.')[0]))
df_expr = df_expr.groupby(df_expr_genes, observed=True).median()

# Distribution matching between the IPF samples & the background.
# The shared genes are sorted: iterating a set of strings yields an order that
# changes between interpreter runs (hash randomisation), which made the row
# order of every downstream table non-reproducible.
common_index = sorted(set(df_expr.index) & set(df_bg_expr.index))
if not common_index:
    raise ValueError("No genes are shared between the IPF samples and the control background")
# Per-gene median over the control samples is the target both matrices are normalised to.
target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
df_expr_norm = qnorm.quantile_normalize(df_expr.loc[common_index, :], target=target_distribution)
df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)

# Perform differential expression between samples & the background
# (background matrix first, IPF matrix second). Console output is silenced.
with suppress_output():
    dge = limma_voom_differential_expression(
        df_bg_expr_norm, df_expr_norm,
        voom_design=True,
    )

# Narrow down candidate set
dge['-log(adj.P.Val)'] = -np.log(dge['adj.P.Val'])  # natural log
# |t| * logFC keeps the sign of logFC, so only positive-logFC genes can pass
# the upper-tail cut below.
# NOTE(review): presumably positive logFC means higher in IPF than control — confirm.
prod = (np.abs(dge['t']) * dge['logFC'])
dge['is_deg'] = dge['adj.P.Val'] < 0.05
# Outlier rule: more than three standard deviations above the mean of `prod`.
dge['is_significant'] = prod > prod.mean() + 3 * prod.std()
dge['score1'] = dge['is_deg'].astype(int) + dge['is_significant'].astype(int)

# Save results: only genes satisfying both criteria (score1 == 2), highest t first.
path1 = project_dir + "/results/diseases/IPF_degs.csv"
dge[dge.score1 >= 2].sort_values(['score1', 't'], ascending=False).to_csv(path1)

In [5]:
# Differential expression of the GSE55296 HF samples against the GSE55296
# control samples, followed by selection of the strongest candidates.

# Common prefix of every path read or written in this cell.
project_dir = "/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions"

# Load background (control) dataset. Row ids are cut at the first '.'
# (presumably an Ensembl version suffix — confirm) before being passed to
# `lookup`; rows that resolve to the same key are collapsed by their median.
df_bg_expr = pd.read_csv(project_dir + "/data/HF/GSE55296_gene_counts_ctrl.txt", sep='\t', index_col=0)
df_bg_expr_genes = df_bg_expr.index.astype(str).map(lambda idx: lookup(idx.partition('.')[0]))
df_bg_expr = df_bg_expr.groupby(df_bg_expr_genes, observed=True).median()

# Load the HF RNA-seq expression data and harmonise its row ids the same way.
df_expr = pd.read_csv(project_dir + "/data/HF/GSE55296_gene_counts_HF.txt", sep='\t', index_col=0)
df_expr_genes = df_expr.index.astype(str).map(lambda idx: lookup(idx.partition('.')[0]))
df_expr = df_expr.groupby(df_expr_genes, observed=True).median()

# Distribution matching between the HF samples & the background.
# The shared genes are sorted: iterating a set of strings yields an order that
# changes between interpreter runs (hash randomisation), which made the row
# order of every downstream table non-reproducible.
common_index = sorted(set(df_expr.index) & set(df_bg_expr.index))
if not common_index:
    raise ValueError("No genes are shared between the HF samples and the control background")
# Per-gene median over the control samples is the target both matrices are normalised to.
target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
df_expr_norm = qnorm.quantile_normalize(df_expr.loc[common_index, :], target=target_distribution)
df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)

# Perform differential expression between samples & the background
# (background matrix first, HF matrix second). Console output is silenced.
with suppress_output():
    dge = limma_voom_differential_expression(
        df_bg_expr_norm, df_expr_norm,
        voom_design=True,
    )

# Narrow down candidate set
dge['-log(adj.P.Val)'] = -np.log(dge['adj.P.Val'])  # natural log
# |t| * logFC keeps the sign of logFC, so only positive-logFC genes can pass
# the upper-tail cut below.
# NOTE(review): presumably positive logFC means higher in HF than control — confirm.
prod = (np.abs(dge['t']) * dge['logFC'])
dge['is_deg'] = dge['adj.P.Val'] < 0.05
# Outlier rule: more than three standard deviations above the mean of `prod`.
dge['is_significant'] = prod > prod.mean() + 3 * prod.std()
dge['score1'] = dge['is_deg'].astype(int) + dge['is_significant'].astype(int)

# Save results: only genes satisfying both criteria (score1 == 2), highest t first.
path1 = project_dir + "/results/diseases/HF_degs.csv"
dge[dge.score1 >= 2].sort_values(['score1', 't'], ascending=False).to_csv(path1)

# Compile Results

In [2]:
# Gather every per-disease DEG table into a single CSV with one column per
# disease: the first row holds the disease label, the rows below its genes.
data = []
home = "/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions/results/diseases"
# os.listdir returns entries in arbitrary order, so sort them to make the
# column order reproducible.
for file in sorted(os.listdir(home)):
    # Skip hidden entries (e.g. OS metadata files) and anything that is not a
    # CSV; reading those would fail or add a bogus column.
    if file.startswith('.') or not file.endswith('.csv'):
        continue
    f = pd.read_csv("{}/{}".format(home, file), index_col=0)
    # Label = file name up to "_degs" (e.g. "IPF_degs.csv" -> "IPF").
    rs = file.split(sep="_degs")[0]
    # NOTE: an earlier variant excluded the gene 'DEXI' at this point; all
    # genes from the table are currently kept.
    data.append([ rs, *f.index ])

# Rows of `data` have different lengths; after transposing, shorter columns
# are padded with missing values, which are written as empty cells.
pd.DataFrame(data).T.to_csv("/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions/results/disease_all_genes.csv",
                            index=False, header=False)