In [1]:
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
# from os import listdir, path

## GCTX File

In [2]:
# Open the GCTX (HDF5) file read-only. The handle is intentionally left open:
# later cells read the row/column ids and the data matrix through `gctx`.
gctx = h5py.File('GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gctx', 'r')

In [3]:
gctx['0/META/COL/id'].shape, gctx['0/META/ROW/id'].shape, gctx['0/DATA/0/matrix'].shape

((17382,), (56200,), (17382, 56200))

In [4]:
gctx['0/META/COL/id'][:5]

array([b'GTEX-1117F-0226-SM-5GZZ7', b'GTEX-1117F-0426-SM-5EGHI',
       b'GTEX-1117F-0526-SM-5EGHJ', b'GTEX-1117F-0626-SM-5N9CS',
       b'GTEX-1117F-0726-SM-5GIEN'], dtype='|S32')

In [5]:
gctx['0/META/ROW/id'][:5]

array([b'ENSG00000223972.5', b'ENSG00000227232.5', b'ENSG00000278267.1',
       b'ENSG00000243485.5', b'ENSG00000237613.2'], dtype='|S27')

## Divide samples by tissue

In [6]:
def _subject_of(sample_id):
    """Return the first two dash-separated fields of a sample id, re-joined with '-'."""
    return '-'.join(sample_id.split('-')[:2])


# Per-sample annotations, reduced to the tissue label (SMTS) and the derived
# subject id (`sub`), indexed by sample id.
sample_meta = (
    pd.read_csv('GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', sep='\t')
    .assign(sub=lambda attrs: attrs['SAMPID'].apply(_subject_of))
    .loc[:, ['SAMPID', 'SMTS', 'sub']]
    .set_index('SAMPID')
)
sample_meta.head()

Unnamed: 0_level_0,SMTS,sub
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1
GTEX-1117F-0003-SM-58Q7G,Blood,GTEX-1117F
GTEX-1117F-0003-SM-5DWSB,Blood,GTEX-1117F
GTEX-1117F-0003-SM-6WBT7,Blood,GTEX-1117F
GTEX-1117F-0011-R10a-SM-AHZ7F,Brain,GTEX-1117F
GTEX-1117F-0011-R10b-SM-CYKQ8,Brain,GTEX-1117F


## Divide subjects by age

In [7]:
# Subject-level phenotypes, indexed by subject id so they can be looked up
# through each sample's `sub` column.
meta = pd.read_csv('GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt', sep='\t')
meta = meta.set_index('SUBJID')

# One vectorised label lookup instead of two Python-level passes with a scalar
# `.loc` per row. `.loc` raises KeyError if a subject is absent from `meta`;
# if SUBJID were duplicated the lookup would return extra rows and the
# assignments below would fail on length mismatch rather than silently
# storing Series objects in cells.
subject_pheno = meta.loc[sample_meta['sub'], ['AGE', 'SEX']]
# `.to_numpy()` assigns positionally: `subject_pheno` is indexed by subject id,
# whereas `sample_meta` is indexed by sample id.
sample_meta['age'] = subject_pheno['AGE'].to_numpy()
sample_meta['sex'] = subject_pheno['SEX'].to_numpy()
sample_meta.head()

Unnamed: 0_level_0,SMTS,sub,age,sex
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GTEX-1117F-0003-SM-58Q7G,Blood,GTEX-1117F,60-69,2
GTEX-1117F-0003-SM-5DWSB,Blood,GTEX-1117F,60-69,2
GTEX-1117F-0003-SM-6WBT7,Blood,GTEX-1117F,60-69,2
GTEX-1117F-0011-R10a-SM-AHZ7F,Brain,GTEX-1117F,60-69,2
GTEX-1117F-0011-R10b-SM-CYKQ8,Brain,GTEX-1117F,60-69,2


In [9]:
sample_meta.shape[0]

22951

In [11]:
# Restrict the metadata to the samples present in the expression file, in the
# same order as the file's column ids (drops metadata-only samples).
expression_sample_ids = gctx['0/META/COL/id'].asstr()[:]
sample_meta = sample_meta.loc[expression_sample_ids]
sample_meta.shape

(17382, 4)

## Map Ensembl genes

In [16]:
# map ensembl genes
gene_info = pd.read_csv('Homo_sapiens.gene_info', sep='\t')[['Symbol', 'dbXrefs']]
gene_info['ensembl'] = [row.dbXrefs.split('Ensembl:')[1] if 'Ensembl' in row.dbXrefs else np.nan for row in gene_info.itertuples()]
gene_info = gene_info[['Symbol', 'ensembl']].dropna(subset=['ensembl']).set_index('ensembl')
gene_info.shape

(35161, 1)

In [17]:
gene_info.head()

Unnamed: 0_level_0,Symbol
ensembl,Unnamed: 1_level_1
ENSG00000121410,A1BG
ENSG00000175899,A2M
ENSG00000256069,A2MP1
ENSG00000171428,NAT1
ENSG00000156006,NAT2


In [19]:
# Versioned row ids whose unversioned part (text before the first '.') is
# present in the gene_info index.
to_keep = [
    row_id
    for row_id in gctx['0/META/ROW/id'][()].astype(str)
    if row_id.split('.')[0] in gene_info.index
]

len(to_keep)

32024

In [20]:
def slice_matrix(gctx, rids):
    """Return the columns of the data matrix that correspond to ``rids``, in that order.

    Parameters
    ----------
    gctx : mapping
        Open file-like object exposing ``'/0/META/ROW/id'`` (supports
        ``.asstr()[:]``) and ``'/0/DATA/0/matrix'`` (supports ``[()]``).
    rids : sequence of str
        Row ids to extract. The i-th output column is the matrix column whose
        position equals the position of ``rids[i]`` in the file's row-id list
        (first occurrence if an id is repeated there).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(matrix.shape[0], len(rids))``. An empty ``rids``
        yields a zero-column array.

    Raises
    ------
    ValueError
        If any requested id is not present in the file's row-id list.
    """
    rids = list(rids)
    all_rids = gctx['/0/META/ROW/id'].asstr()[:]

    # Hash lookup id -> column position, replacing a linear `list.index` scan
    # per requested id. `setdefault` keeps the first occurrence of an id.
    position_of = {}
    for pos, rid in enumerate(all_rids.tolist()):
        position_of.setdefault(rid, pos)

    missing = [rid for rid in rids if rid not in position_of]
    if missing:
        raise ValueError(
            f"{len(missing)} requested row id(s) not found in the file, e.g. {missing[:5]}"
        )

    # Explicit integer dtype so that an empty request is still a valid indexer.
    col_positions = np.fromiter(
        (position_of[rid] for rid in rids), dtype=np.intp, count=len(rids)
    )

    # The whole matrix is read into memory, then gathered with a single
    # fancy-index copy.
    mat = gctx['/0/DATA/0/matrix'][()]
    return mat[:, col_positions]

In [21]:
# Sorted copy of the retained row ids; the extracted columns follow this order.
row_ids = list(to_keep)
row_ids.sort()
pruned_data = slice_matrix(gctx, row_ids)

In [22]:
pruned_data.shape

(17382, 32024)

In [32]:
# Label the extracted matrix: one row per column id of the file (sample),
# one column per retained row id (gene).
sample_ids = gctx['0/META/COL/id'].asstr()[:]
data_df = pd.DataFrame(pruned_data, index=sample_ids, columns=row_ids)
data_df.head()

Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,...,ENSG00000284328.1,ENSG00000284373.1,ENSG00000284377.1,ENSG00000284391.1,ENSG00000284395.1,ENSG00000284438.1,ENSG00000284471.1,ENSG00000284516.1,ENSG00000284523.1,ENSG00000284546.1
GTEX-1117F-0226-SM-5GZZ7,2257.0,434.0,1231.0,688.0,174.0,2175.0,21961.0,3438.0,1204.0,1296.0,...,0.0,0.0,0.0,1.0,11.0,0.0,13.0,0.0,24.0,0.0
GTEX-1117F-0426-SM-5EGHI,408.0,24.0,2447.0,222.0,112.0,216.0,834.0,281.0,414.0,632.0,...,0.0,0.0,0.0,70.0,6.0,0.0,4.0,0.0,15.0,0.0
GTEX-1117F-0526-SM-5EGHJ,1082.0,74.0,1052.0,447.0,118.0,933.0,14106.0,758.0,648.0,954.0,...,0.0,0.0,0.0,55.0,7.0,0.0,50.0,0.0,24.0,0.0
GTEX-1117F-0626-SM-5N9CS,2822.0,245.0,1347.0,766.0,214.0,1011.0,39533.0,1712.0,1091.0,1473.0,...,0.0,0.0,0.0,31.0,0.0,0.0,12.0,0.0,14.0,0.0
GTEX-1117F-0726-SM-5GIEN,778.0,2.0,905.0,335.0,113.0,388.0,62151.0,1245.0,521.0,1085.0,...,0.0,0.0,0.0,55.0,0.0,0.0,10.0,0.0,8.0,0.0


## Remove duplicates by variance

In [37]:
# Per-column variance across rows, alongside the unversioned id
# (text before the first '.') of each column label.
var_df = data_df.var(axis=0).rename('Var').to_frame()
var_df['Ens'] = [versioned_id.split('.')[0] for versioned_id in var_df.index]
var_df.head()

Unnamed: 0,Var,Ens
ENSG00000000003.14,4561226.0,ENSG00000000003
ENSG00000000005.5,278923.8,ENSG00000000005
ENSG00000000419.12,548077.6,ENSG00000000419
ENSG00000000457.13,186486.0,ENSG00000000457
ENSG00000000460.16,80666.38,ENSG00000000460


In [45]:
# For each unversioned id keep the versioned id with the largest variance:
# sort ascending by (Ens, Var) and retain the last row of every Ens group.
ranked_by_var = var_df.sort_values(by=['Ens', 'Var'], ascending=True)
is_top_of_group = ~ranked_by_var.duplicated(subset=['Ens'], keep='last')
keep = ranked_by_var.index[is_top_of_group]

In [62]:
# Select the retained columns and flip the frame so that the `keep` ids become the rows.
full_data_df = data_df.loc[:, keep].transpose()

## Map gene symbols

In [61]:
# Keep only the first row for each Ensembl id.
# Selecting with `.loc[<unique labels>]` on a non-unique index returns every
# row that matches each label, so it does not remove duplicates; a boolean
# mask from `Index.duplicated` does.
gene_info = gene_info[~gene_info.index.duplicated(keep='first')]

In [65]:
def _symbol_for(versioned_id):
    """Look up the gene_info 'Symbol' for the part of the id before the first '.'."""
    return gene_info.loc[versioned_id.split('.')[0], 'Symbol']


# Replace the row labels with gene symbols.
# NOTE: this rewrites the index in place, so the cell is meant to run once per
# freshly built `full_data_df`.
full_data_df.index = full_data_df.index.map(_symbol_for)

In [66]:
full_data_df.head()

Unnamed: 0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
TSPAN6,2257.0,408.0,1082.0,2822.0,778.0,3417.0,2507.0,2167.0,4622.0,852.0,...,926.0,2279.0,1474.0,1142.0,553.0,7515.0,2503.0,574.0,246.0,3502.0
TNMD,434.0,24.0,74.0,245.0,2.0,625.0,13.0,23.0,1837.0,60.0,...,22.0,0.0,8.0,6.0,5.0,83.0,0.0,4.0,309.0,1182.0
DPM1,1231.0,2447.0,1052.0,1347.0,905.0,820.0,1041.0,1615.0,1269.0,1313.0,...,1583.0,2101.0,1080.0,1286.0,1501.0,1340.0,1076.0,1448.0,1312.0,1337.0
SCYL3,688.0,222.0,447.0,766.0,335.0,295.0,635.0,1348.0,871.0,697.0,...,675.0,1252.0,1075.0,720.0,784.0,943.0,681.0,787.0,983.0,1235.0
C1orf112,174.0,112.0,118.0,214.0,113.0,117.0,282.0,312.0,303.0,205.0,...,146.0,279.0,262.0,129.0,241.0,323.0,300.0,184.0,100.0,382.0


In [67]:
full_data_df.shape

(31995, 17382)

## Compute filtered signatures

In [68]:
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression
from maayanlab_bioinformatics.normalization.filter import filter_by_expr

In [73]:
# For every tissue, compare each age group against the 20-29 group with
# limma-voom, write one TSV per comparison, and record which samples were used.
CONTROL_AGE = '20-29'
MIN_GROUP_SIZE = 3  # a group needs at least this many samples to be compared

# Create the output directory up front so the first `to_csv` cannot fail on a
# missing folder after the differential-expression step has already run.
out_dir = Path('GTEx_AgeComparison_Tissue_filtered')
out_dir.mkdir(parents=True, exist_ok=True)

comparisons = {}
for tissue in sample_meta['SMTS'].unique():
    sub_meta = sample_meta[sample_meta['SMTS'] == tissue]
    ctl_meta = sub_meta[sub_meta['age'] == CONTROL_AGE]
    if len(ctl_meta) < MIN_GROUP_SIZE:
        print(tissue, "not enough healthy samples")
        continue

    # Filter only for tissues that will actually be analysed. A dedicated name
    # is used so the samples-by-genes `data_df` built earlier in the notebook
    # is not overwritten.
    tissue_df = filter_by_expr(full_data_df[sub_meta.index.tolist()])
    tissue_tag = tissue.replace(' ', '')

    for agegrp in sub_meta['age'].unique():
        if agegrp == CONTROL_AGE:
            continue
        pert_meta = sub_meta[sub_meta['age'] == agegrp]
        if len(pert_meta) < MIN_GROUP_SIZE:
            continue

        # Balance the groups: draw the same number of samples from each side,
        # with a fixed seed so the selection is reproducible.
        min_samp = min(len(ctl_meta), len(pert_meta))
        ctl_ids = ctl_meta.sample(n=min_samp, random_state=1).index.tolist()
        pert_ids = pert_meta.sample(n=min_samp, random_state=1).index.tolist()

        comparison_name = f"GTEx_{tissue_tag}_{CONTROL_AGE}_vs_{agegrp}"
        signature = limma_voom_differential_expression(tissue_df[ctl_ids], tissue_df[pert_ids])
        signature.sort_index().to_csv(out_dir / f"{comparison_name}.tsv", sep='\t')
        comparisons[comparison_name] = {'controls': ctl_ids, 'cases': pert_ids}





Attaching package: ‘R.oo’



    throw



    getClasses, getMethods



    attach, detach, load, save



Attaching package: ‘R.utils’



    timestamp



    cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,


Attaching package: ‘RCurl’



    reset



    clone






Attaching package: ‘BiocGenerics’



    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB



    IQR, mad, sd, var, xtabs



    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


Attaching package: ‘S4Vectors’



    I, expand.grid, unname



Attachi

## Get metadata

In [74]:
# One row per comparison, listing the case and control sample ids that were used.
names = list(comparisons)
cases = [comparisons[name]['cases'] for name in names]
ctls = [comparisons[name]['controls'] for name in names]

# Each cell holds the full list of sample ids for that comparison.
comparison_table = pd.DataFrame(
    {'pert_samples': cases, 'ctl_samples': ctls},
    index=names,
)
comparison_table.to_csv('GTEx_AgeComparison_Tissue_metadata.tsv', sep='\t')