In [2]:
import qnorm
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from IPython.display import HTML, display, Markdown
from matplotlib.gridspec import GridSpec
from matplotlib_venn import venn2
from maayanlab_bioinformatics.normalization import zscore_normalize, log2_normalize
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
import os
# Build the gene-name lookup once for the whole notebook. It is used as a
# callable (`lookup(name)`); a missing/falsy result is treated as "no match"
# by the callers further down (`.map(lookup).dropna()` and `lookup(g) or g`).
lookup = ncbi_genes_lookup()

import sys
import contextlib
@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest=os.devnull):
    '''Temporarily rebind ``sys.stdout`` and/or ``sys.stderr`` to a file.

    Usage:
    with suppress_output():
        print('hi')

    Parameters
    ----------
    stdout : bool
        When true, ``sys.stdout`` is replaced for the duration of the block.
    stderr : bool
        When true, ``sys.stderr`` is replaced for the duration of the block.
    dest : str
        Path opened in append mode to receive the redirected output.
        Defaults to the platform null device (``/dev/null`` on POSIX).

    The original stream objects are restored when the block exits, whether
    it finishes normally or raises, and the destination file is closed
    afterwards so the handle is not leaked and any buffered text is flushed.
    '''
    # Capture the current streams up front so restoration never depends on
    # which branches ran.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    with open(dest, 'a') as sink:
        if stdout:
            sys.stdout = sink
        if stderr:
            sys.stderr = sink
        try:
            yield
        finally:
            # Restore before the enclosing `with` closes `sink`, so nothing
            # is ever left pointing at a closed file.
            if stdout:
                sys.stdout = saved_stdout
            if stderr:
                sys.stderr = saved_stderr

# Target Identification

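Process GTEx samples from donors aged 70-79, one tissue at a time, against a single shared background. For each tissue, save the candidate transcripts that pass the significance filters (the code below applies score thresholds rather than a fixed top-500 cut-off).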

In [5]:
# Membrane proteins: Membranome entries restricted to human, with gene names
# passed through `lookup`; names that `lookup` cannot resolve are dropped.
proteins = pd.read_csv('https://lomize-group-membranome.herokuapp.com/proteins?fileFormat=csv')
proteins = proteins[proteins['species_name_cache'] == 'Homo sapiens']
membrane_proteins = proteins['genename'].map(lookup).dropna()

# Surface proteins: union of the CSPA surfaceome symbols and the third column
# (index 2) of the GO0009986 export.
# The GO series is deliberately NOT called `go`: that name is already bound to
# `plotly.graph_objects` by the imports, and rebinding it here would break any
# later plotting code that relies on the module alias.
cspa = pd.read_csv("/Users/edendeng/MaayanLab/pub/cell-aging-revisions/data/CSPA_surfaceome.csv")['ENTREZ gene symbol']
go_cell_surface = pd.read_csv("/Users/edendeng/MaayanLab/pub/cell-aging-revisions/data/GO0009986.tsv", sep='\t', header=None)[2]
surface_proteins = pd.concat([cspa, go_cell_surface])

# Transcript -> gene annotation table, indexed by its first column; the
# `gene_symbol` column is read from it during the per-tissue analysis.
df_transcript_gene_map = pd.read_csv("s3://storage/Tumor_Gene_Target_Screener/transcript-gene-map.tsv.gz", storage_options=dict(client_kwargs=dict(endpoint_url="https://appyters.maayanlab.cloud"), anon=True), sep='\t', header=0, index_col=0, compression='gzip')

# Row labels (second index level of the stats files) kept for the analysis.
QUARTILE_LABELS = ['25%', '50%', '75%']


def load_transcript_quantiles(stats_path):
    """Load a per-transcript summary-statistics table and extract its quartiles.

    The TSV at ``stats_path`` is read with its first two columns as a
    two-level index. The first level is treated as a transcript identifier
    whose version suffix (everything from the first ``.``) is stripped; rows
    that collapse onto the same stripped identifier are summed.

    Parameters
    ----------
    stats_path : str
        Path to the tab-separated statistics file.

    Returns
    -------
    tuple
        ``(transcripts, stats, expr)`` where ``transcripts`` is the index of
        version-stripped identifiers used for grouping, ``stats`` is the
        re-stacked table after summing over versions, and ``expr`` is the
        wide table restricted to the ``25%``/``50%``/``75%`` rows, with one
        row per transcript.
    """
    stats_wide = pd.read_csv(stats_path, sep='\t', index_col=[0, 1]).unstack()
    # e.g. "ENST00000000001.4" -> "ENST00000000001" (assumed identifier shape)
    transcripts = stats_wide.index.map(lambda idx: idx.partition('.')[0])
    stats = stats_wide.groupby(transcripts, observed=True).sum().stack()
    expr = stats.loc[(slice(None), QUARTILE_LABELS), :].unstack()
    return transcripts, stats, expr


# Load background dataset
df_bg_transcripts, df_bg_stats, df_bg_expr = load_transcript_quantiles(
    "/Users/edendeng/MaayanLab/pub/cell-aging-revisions/data/gtex_20s_transcripts_stats.tsv"
)

# Load aged RNA-seq expression data
df_expr_transcripts, df_expr_stats, df_expr_all = load_transcript_quantiles(
    "/Users/edendeng/MaayanLab/pub/cell-aging-revisions/data/gtex_70s_transcripts_stats.tsv"
)

# Tissues screened; each must be a top-level column key of `df_expr_all`.
TISSUES = [
    "Adipose - Subcutaneous", "Adipose - Visceral (Omentum)",
    "Cells - Cultured fibroblasts",
    "Heart - Atrial Appendage", "Heart - Left Ventricle",
    "Kidney - Cortex", "Liver", "Lung",
    "Skin - Not Sun Exposed (Suprapubic)",
    "Skin - Sun Exposed (Lower leg)",
]
RESULTS_DIR = "/Users/edendeng/MaayanLab/pub/cell-aging-revisions/results/aging"

# Make sure both output folders exist so `to_csv` cannot fail on a fresh checkout.
for subdir in ("all", "membrane"):
    os.makedirs(os.path.join(RESULTS_DIR, subdir), exist_ok=True)

# Everything derived only from the background is the same for every tissue,
# so it is computed once instead of once per loop iteration.
# `Index.intersection` gives a reproducible row order, unlike `list(set & set)`,
# whose order depends on string hashing and can change between kernel runs.
common_index = df_expr_all.index.intersection(df_bg_expr.index)
target_distribution = df_bg_expr.loc[common_index, :].median(axis=1)
df_bg_expr_norm = qnorm.quantile_normalize(df_bg_expr.loc[common_index, :], target=target_distribution)

for tissue in TISSUES:
    # Quartile columns for this tissue, prefixed with "q" ("q25%", "q50%", "q75%").
    df_expr = df_expr_all[tissue].add_prefix('q')

    # Distribution matching between RS samples & the background
    df_expr_norm = qnorm.quantile_normalize(df_expr.loc[common_index, :], target=target_distribution)

    # Perform differential expression between samples & the background
    with suppress_output():
        dge = limma_voom_differential_expression(
            df_bg_expr_norm, df_expr_norm,
            voom_design=True,
        )

        # Keep only transcripts that have a gene annotation; `.copy()` makes the
        # column assignments below operate on an independent frame.
        dge = dge.loc[dge.index.isin(df_transcript_gene_map.index)].copy()
        dge['ensembl_transcript_id'] = dge.index
        # Fall back to the raw symbol when `lookup` returns nothing for it.
        dge['gene_symbol'] = df_transcript_gene_map.loc[dge.index, 'gene_symbol'].apply(lambda g: lookup(g) or g)
        dge['label'] = dge.apply(lambda r: f"{r['ensembl_transcript_id']} - {r['gene_symbol']}", axis=1)

    # Narrow down candidate set
    dge['-log(adj.P.Val)'] = -np.log(dge['adj.P.Val'])  # natural log
    prod = (np.abs(dge['t']) * dge['logFC'])
    dge['is_deg'] = dge['adj.P.Val'] < 0.05
    # Outlier rule: more than three standard deviations above the mean of |t| * logFC,
    # which can only be met by transcripts with positive logFC of sufficient size.
    dge['is_significant'] = prod > prod.mean() + 3 * prod.std()
    dge['score1'] = dge['is_significant'].astype(int) + dge['is_deg'].astype(int)

    # Apply membrane filter
    # `np.isin` replaces the deprecated `np.in1d`; results are identical for 1-D input.
    dge['is_membrane'] = np.isin(dge['gene_symbol'], membrane_proteins)
    dge['is_surface'] = np.isin(dge['gene_symbol'], surface_proteins)
    # NOTE(review): `is_membrane` is recorded in the output but does not feed into
    # `score2`; only `is_surface` does. Confirm that this is intended.
    dge['score2'] = dge['score1'] + dge['is_surface'].astype(int)

    # Save results
    # score1 >= 2: both `is_deg` and `is_significant`; score2 >= 3: additionally `is_surface`.
    path1 = os.path.join(RESULTS_DIR, "all", "{}_candidates.csv".format(tissue))
    path2 = os.path.join(RESULTS_DIR, "membrane", "{}_candidates.csv".format(tissue))
    dge[dge.score1 >= 2].sort_values(['score1', 't'], ascending=False).to_csv(path1)
    dge[dge.score2 >= 3].sort_values(['score2', 't'], ascending=False).to_csv(path2)

# Compile Results

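Aggregate the significant targets from every tissue into one table per candidate set (all candidates, and surface-filtered candidates), with one column per tissue.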

In [6]:
def compile_candidate_labels(results_dir, output_path, suffix="_candidates.csv"):
    """Collect the `label` column of every per-tissue candidate file into one CSV.

    Parameters
    ----------
    results_dir : str
        Folder containing files named ``<tissue>_candidates.csv``.
    output_path : str
        Destination CSV. It is written without header or index; each column
        starts with the tissue name followed by that tissue's labels, and
        shorter columns are padded with empty cells.
    suffix : str
        File-name ending that identifies a candidate file. Anything else in
        the folder (e.g. ``.DS_Store`` or notebook checkpoints) is ignored
        instead of being parsed as CSV.
    """
    columns = []
    # Sorted so the column order of the output is reproducible; `os.listdir`
    # returns entries in arbitrary order.
    for file_name in sorted(os.listdir(results_dir)):
        if not file_name.endswith(suffix):
            continue
        candidates = pd.read_csv(os.path.join(results_dir, file_name), index_col=0)
        tissue = file_name[:-len(suffix)]
        columns.append([tissue, *candidates.label])

    # Each tissue is built as a row, then transposed so tissues become columns.
    pd.DataFrame(columns).T.to_csv(output_path, index=False, header=False)


compile_candidate_labels(
    "/Users/edendeng/MaayanLab/pub/cell-aging-revisions/results/aging/all",
    "/Users/edendeng/MaayanLab/pub/cell-aging-revisions/results/aging_all_transcripts.csv",
)

compile_candidate_labels(
    "/Users/edendeng/MaayanLab/pub/cell-aging-revisions/results/aging/membrane",
    "/Users/edendeng/MaayanLab/pub/cell-aging-revisions/results/aging_membrane_transcripts.csv",
)