In [1]:
import qnorm
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from IPython.display import HTML, display, Markdown
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
import os
# Build the gene lookup callable once. The cells below call it on each row
# label (after dropping everything from the first '.' onward) and group rows
# by whatever it returns.
# NOTE(review): presumably this maps gene identifiers/synonyms to official
# NCBI symbols and may fetch its table on first use -- confirm against the
# maayanlab_bioinformatics documentation.
lookup = ncbi_genes_lookup()

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest='/dev/null'):
    ''' Temporarily redirect ``sys.stdout`` and/or ``sys.stderr`` to a file.

    Usage:
    with suppress_output():
        print('hi')

    Parameters:
        stdout: when true, ``sys.stdout`` is replaced for the duration of the block.
        stderr: when true, ``sys.stderr`` is replaced for the duration of the block.
        dest:   path of the file that receives the redirected text; it is opened
                in append mode, so existing content is kept.

    The previous stream objects are put back when the block exits, whether it
    finishes normally or raises, and the destination file is then closed.
    Only the Python-level ``sys.stdout`` / ``sys.stderr`` objects are swapped.
    '''
    # Capture the current streams before touching anything so the ``finally``
    # clause always has something valid to restore.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    # ``with`` guarantees the sink is closed; previously the handle was left
    # open and leaked once per use of this context manager.
    with open(dest, 'a') as sink:
        if stdout:
            sys.stdout = sink
        if stderr:
            sys.stderr = sink
        try:
            yield
        finally:
            # Restore inside the ``with`` so sys.stdout/sys.stderr never point
            # at an already-closed file.
            if stdout:
                sys.stdout = saved_stdout
            if stderr:
                sys.stderr = saved_stderr

# Target Identification

Process all 13 replicatively senescent (RS) samples against a single background, saving the significantly differentially expressed candidates.

For the background dataset, use one of "gtex-gene-stats.tsv", "archs4-gene-stats.tsv", or "ts_10x_cell-ontology-class_donors_tissue-labels_v1.tsv".

In [28]:
# Directory holding one tab-separated expression matrix per RS sample.
home = "/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions/data/RS_genes"

# Directory receiving one "<sample>_candidates.csv" per RS sample.
# NOTE(review): the trailing "TabulaSapiens" matches the background file loaded
# below; change both together when switching to another background.
results_dir = "/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions/results/RS/genes/all/TabulaSapiens"
# Create the output directory up front so to_csv cannot fail on a missing folder
# after the expensive computation has already run.
os.makedirs(results_dir, exist_ok=True)

# Load background dataset
df_bg_stats = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/ts_10x_cell-ontology-class_donors_tissue-labels_v1.tsv", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', index_col=[0,1])
# Map every first-level row label (text before the first '.') through `lookup`,
# then collapse rows that share a mapped label by taking their median.
df_bg_genes = df_bg_stats.unstack().index.map(lambda idx: lookup(idx.partition('.')[0]))
df_bg_stats = df_bg_stats.unstack().groupby(df_bg_genes, observed=True).median().stack()
# Keep only the quartile rows ('25%', '50%', '75%') and spread them into columns,
# so each background column contributes three columns to the matrix.
df_bg_expr = df_bg_stats.loc[(slice(None), ['25%', '50%', '75%']), :].unstack()

# Sorted for a reproducible processing order (os.listdir order is arbitrary).
for file in sorted(os.listdir(home)):
    # Skip hidden files (e.g. macOS ".DS_Store") and anything that is not a
    # TSV export; feeding those to read_csv would break or pollute the run.
    if file.startswith(".") or ".tsv" not in file:
        continue

    # Load RS RNA-seq expression data
    df_expr = pd.read_csv("{}/{}".format(home, file), sep='\t', index_col=0)
    # Sample name = file name up to the first ".tsv".
    rs = file.split(sep=".tsv")[0]
    # Same label mapping and median-collapse as applied to the background.
    df_expr_genes = df_expr.index.astype(str).map(lambda idx: lookup(idx.partition('.')[0]))
    df_expr = df_expr.groupby(df_expr_genes, observed=True).median()

    # Distribution matching between RS samples & the background:
    # restrict both matrices to shared row labels and quantile-normalize each
    # against the per-row median of the background.
    common_index = list(set(df_expr.index) & set(df_bg_expr.index))
    target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
    df_expr_norm = qnorm.quantile_normalize(df_expr.loc[common_index, :], target=target_distribution)
    df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)

    # Perform differential expression between samples & the background.
    # The background matrix is passed first and the RS matrix second.
    # NOTE(review): presumably first = controls, second = cases, so a positive
    # logFC means higher in RS -- confirm against the library documentation.
    with suppress_output():
        dge = limma_voom_differential_expression(
            df_bg_expr_norm, df_expr_norm,
            voom_design=True,
        )

    # Narrow down candidate set
    # np.log is the natural logarithm.
    dge['-log(adj.P.Val)'] = -np.log(dge['adj.P.Val'])
    # |t| * logFC keeps the sign of logFC, so the outlier test below is
    # one-sided: only rows far above the mean are flagged.
    prod = (np.abs(dge['t']) * dge['logFC'])
    dge['is_deg'] = dge['adj.P.Val'] < 0.05
    dge['is_significant'] = prod > prod.mean() + 3 * prod.std()
    # Each flag contributes 0 or 1, so score1 == 2 means both criteria hold.
    dge['score1'] = dge['is_deg'].astype(int) + dge['is_significant'].astype(int)

    # Save results: rows satisfying both criteria, highest t first.
    path1 = "{}/{}_candidates.csv".format(results_dir, rs)
    dge[dge.score1 >= 2].sort_values(['score1', 't'], ascending=False).to_csv(path1)

# Compile Results

For each background, aggregate all the significant targets. For the GTEx targets, we filter out DEXI, which appears to be an artifact of the background data. 

In [None]:
# Directory of per-sample "<sample>_candidates.csv" files for ONE background.
# NOTE(review): the Target Identification cell as written saves under
# ".../all/TabulaSapiens" while this cell reads ".../all/ARCHS4" and writes
# "archs4_all_genes.csv"; presumably both cells are edited and re-run per
# background -- confirm the paths agree before running.
home = "/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions/results/RS/genes/all/ARCHS4"

# Genes to drop from every sample's candidate list. Empty keeps everything;
# use ['DEXI'] when compiling the GTEx results.
excluded_genes = []

# One row per sample: [sample name, candidate 1, candidate 2, ...].
data = []
# Sorted so the column order of the compiled table is reproducible.
for file in sorted(os.listdir(home)):
    # Skip hidden files (e.g. macOS ".DS_Store") and anything that is not a
    # candidates export; read_csv would fail or add a bogus column for those.
    if file.startswith(".") or "_candidates" not in file:
        continue
    f = pd.read_csv("{}/{}".format(home, file), index_col=0)
    # Sample name = file name up to the first "_candidates".
    rs = file.split(sep="_candidates")[0]
    data.append([ rs, *f.index[~f.index.isin(excluded_genes)] ])

# Transpose so each sample becomes a column headed by its name; shorter lists
# are padded with empty cells by pandas.
pd.DataFrame(data).T.to_csv("/Users/edendeng/MaayanLab/target-identifier-proteomics-lncrnas/final/cell-aging-revisions/results/archs4_all_genes.csv",
                            index=False, header=False)