In [1]:
import glob

In [2]:
import os

In [3]:
import pandas as pd

In [4]:
import sys

In [5]:
sys.path.append('./../src/')

In [6]:
from aging_tools import inout

In [7]:
from access_science_shared import inout as rinout

In [8]:
# Load the GTEx v7 gene-level read-count table (tab-separated GCT file).
p = rinout.get_path(
        'gtex', 'GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct')

df_gtex = pd.read_table(   # This is kind of slow
    p,
    sep='\t',
    skiprows=2    # row 1: GTEX version; row 2: dimensions
).drop(
    'Description', axis=1    # gene symbols are discarded, only Ensembl ids are kept
).rename(
    columns={'Name': 'gene_ensembl.version'})

# Derive the version-less Ensembl id: the greedy group captures everything
# in front of the last '.', rows without a '.' end up as NaN.
# The pattern is a raw string so that '\.' reaches the regex engine as
# written instead of being an invalid string escape sequence.
df_gtex['gene_ensembl'] = df_gtex['gene_ensembl.version'].str.extract(
    r'(.*)\.[0-9]*', expand=False)
df_gtex = df_gtex.drop('gene_ensembl.version', axis=1)

In [9]:
# --- Donor (subject) annotations ---
p = rinout.get_path(
    'gtex', 'GTEx_v7_Annotations_SubjectPhenotypesDS.txt')

# A header spelled ' SUBJID' (leading blank) is normalised to 'SUBJID'.
df_sujects = pd.read_table(p).rename(columns={' SUBJID': 'SUBJID'})

# Recode numeric sex labels; the mapping was taken manually from the
# Excel file accompanying the GTEx release.
df_sujects['SEX'] = df_sujects['SEX'].replace({1: 'm', 2: 'f'})

# --- Specimen (sample) annotations ---
p = rinout.get_path(
    'gtex', 'GTEx_v7_Annotations_SampleAttributes.txt')
df_sample_attributes = pd.read_table(p)

# The donor id appears to be encoded in the sample id: keep everything up
# to (but excluding) the second '-'.
donor_of_sample = df_sample_attributes['SAMPID'].str.extract(
    '^([^-]*-[^-]*).*', expand=False)
df_sample_attributes.loc[:, 'SUBJID'] = donor_of_sample

# Attach donor information to every specimen and index by sample id;
# verify_integrity makes set_index raise if a sample id occurs twice.
df_meta = df_sample_attributes.merge(
    df_sujects, how='left', on='SUBJID'
).set_index('SAMPID', verify_integrity=True)

In [10]:
# Use the version-less Ensembl gene id as the row index of the count table.
df_gtex = df_gtex.set_index(keys='gene_ensembl')

In [11]:
# Guard: no column label of the count table may occur more than once.
max_column_occurrences = df_gtex.columns.value_counts().max()
if max_column_occurrences > 1:
    raise ValueError('Columns are ambiguous')

In [12]:
# Guard: no index label of the metadata table may occur more than once.
max_row_occurrences = df_meta.index.value_counts().max()
if max_row_occurrences > 1:
    raise ValueError('Rows are ambiguous')

In [13]:
# Identifiers present both as count-table columns and as metadata rows.
shared = set(df_gtex.columns) & set(df_meta.index)

In [14]:
# Fix a deterministic order (sorted() already returns a list).
shared = sorted(shared)

In [15]:
# Restrict the count table to the shared identifiers, in sorted order.
df_gtex = df_gtex[shared]

In [16]:
# Restrict the metadata to the same identifiers, in the same order.
df_meta = df_meta.loc[shared]

In [17]:
# Guard: every (donor, tissue) combination may be represented by one sample
# only. The check fires when a SUBJID/SMTSD pair is repeated, so the message
# names that condition (many donors per tissue are expected and fine).
if df_meta[['SUBJID', 'SMTSD']].duplicated().any():
    raise ValueError(
        'multiple samples for at least one donor-tissue combination')

In [18]:
# Export configuration: one (row mask over df_meta, output folder) pair per
# sex; each pair is consumed by the export loop.
settings = (
    (df_meta['SEX'] == 'm', 'dynamic/tstoeger/190427_gtex_m'),
    (df_meta['SEX'] == 'f', 'dynamic/tstoeger/190427_gtex_f'),
)

In [19]:
# Export one dataset per entry of `settings`: a sample sheet
# ('sample_meta.csv') plus one tab-separated counts file per sample.
for sample_mask, out_folder in settings:

    sub_meta = df_meta[sample_mask].copy()
    sub_data = df_gtex.loc[:, df_gtex.columns.isin(sub_meta.index)]

    sub_meta = sub_meta.loc[:, ['SMTSD', 'AGE']].rename(
        columns={'SMTSD': 'tissue', 'AGE': 'age'}).rename_axis('run_name')

    # Only the first character of AGE is kept and cast to int.
    # NOTE(review): presumably AGE is a decade bracket such as '20-29', so
    # this yields the decade digit -- confirm against the annotation file.
    sub_meta['age'] = sub_meta['age'].str[0].astype(int)
    sub_meta['pfu'] = 0
    sub_meta['recommend_to_discard'] = 0

    p_out = inout.get_internal_path(out_folder)

    p = os.path.join(p_out, 'sample_meta.csv')
    inout.ensure_presence_of_directory(p)
    sub_meta.to_csv(p)

    for c in sub_data.columns:
        # Write the index column with an empty header. rename_axis returns
        # a new frame, whereas assigning to `.index.name` would rename the
        # Index object that the one-column slice shares with its parent
        # frame(s), silently changing them as a side effect.
        d = sub_data[[c]].rename_axis('')
        p = os.path.join(p_out, 'counts', '{}.counts'.format(c))
        inout.ensure_presence_of_directory(p)
        d.to_csv(p, sep='\t')

       