In [167]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import math 
# import matplotlib.pyplot as plt
# import seaborn as sns

In [2]:
# List the cell types that have an input directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the printed listing (and `cell_types`) stable between runs.
cell_types = sorted(os.listdir('../../../ml_input/'))
for cell_type in cell_types:
    print(cell_type)

L23_IT_CTX_Glut
DG_Glut
CA1-ProS_Glut
Astro-TE_NN
Oligo_NN
L6_IT_CTX_Glut
L5_ET_CTX_Glut
CA3_Glut
Pvalb_Gaba
L6_CT_CTX_Glut
OPC_NN


In [364]:
# Cell type to process; every input and output path below is derived from it.
ct = "OPC_NN"

# Directory holding this cell type's input tables. The value ends with "/" and
# later cells join it with another "/", so the paths they build contain "//".
filepath = f"/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ml_input/{ct}/"

# Per-cell-type output directory; created (including parents) if it is missing.
outpath = Path(f"/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/{ct}/")
outpath.mkdir(exist_ok=True, parents=True)

# Keys into FEATURE_FILENAME_DICT for the two RNA tables.
l_rna_file = 'luisa_rna'
i_rna_file = 'inhouse_rna'

In [365]:
# Maps each feature key used in this notebook to the filename suffix of its
# input table. The full input path is built as f'{filepath}/{ct}.{suffix}'.
FEATURE_FILENAME_DICT = {'enhancer_DMR' : 'abc_enhancer.DMR_gene.csv',
                         'enhancer_peak' : 'abc_enhancer.peak_gene.csv', 
                         'DAR' : 'aDAR_gene.csv', 
                         'DMR' : 'aDMR_gene.csv',
                         'loops' : 'Loop_gene.csv.gz', 
                         'mcg_genebody' : 'mCG_genebody_gene.csv', 
                         'mch_genebody' : 'mCH_genebody_gene.csv', 
                         'atac' : 'peak_gene.csv', 
                         'luisa_rna' : 'luisa_RNA_DEG.csv', 
                         'inhouse_rna' : 'RNA_DEG.csv'}

In [366]:
# Copy the two RNA tables for this cell type into the output directory.
# Each table is read with its first column as the index and written back as-is.

# Luisa RNA table. It gets its own name so it is not silently replaced by the
# in-house table loaded below.
luisa_rna_csv = f"{outpath}/{ct}.luisa_rna.csv"
df_rna_luisa = pd.read_csv(f'{filepath}/{ct}.{FEATURE_FILENAME_DICT[l_rna_file]}', index_col=0)
print("Output for Luisa RNA")
print(luisa_rna_csv)
df_rna_luisa.to_csv(luisa_rna_csv)

# In-house RNA table. `df_rna` stays bound to this table after the cell runs;
# the Meta section below builds its gene index from it.
inhouse_rna_csv = f"{outpath}/{ct}.inhouse_rna.csv"
df_rna = pd.read_csv(f'{filepath}/{ct}.{FEATURE_FILENAME_DICT[i_rna_file]}', index_col=0)
print("Output for RNA")
print(inhouse_rna_csv)
df_rna.to_csv(inhouse_rna_csv)

Output for Luisa RNA
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.luisa_rna.csv
Output for RNA
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.inhouse_rna.csv


# Functions

## Meta

In [367]:
# Empty per-gene metadata table, indexed by the genes of the RNA table that is
# currently bound to `df_rna`.
df_meta = pd.DataFrame(index = df_rna.index)
    
# Gene length: mean `gene_length` per gene, taken from the mCG gene-body file.
rel_path = FEATURE_FILENAME_DICT['mcg_genebody']
fullpath = f'{filepath}/{ct}.{rel_path}'
df_feat = pd.read_csv(fullpath).set_index('gene_name')
df_meta['gene_length'] = df_feat.groupby('gene_name', observed=False)['gene_length'].mean()

# Gene type and chromosome: first value per gene, taken from the DAR file.
# NOTE(review): only the DAR file is read here (the DMR file is not), so genes
# absent from it presumably end up NaN in these two columns - confirm intended.
rel_path = FEATURE_FILENAME_DICT['DAR']
fullpath = f'{filepath}/{ct}.{rel_path}'
df_feat = pd.read_csv(fullpath).set_index('gene_name')
df_meta['gene_type'] = df_feat.groupby('gene_name', observed=False)['gene_type'].first()
df_meta['gene_chr'] = df_feat.groupby('gene_name', observed=False)['gene_chr'].first()

df_meta.index.name = 'gene_name'

In [368]:
# Fraction of genes with a missing value in each meta column.
df_meta.isna().mean()

gene_length    0.006685
gene_type      0.789646
gene_chr       0.789646
dtype: float64

In [369]:
# Write the per-gene metadata table next to the other outputs for this cell type.
meta_csv = f"{outpath}/{ct}.meta.csv"
print("Output for Meta")
print(meta_csv)

df_meta.to_csv(meta_csv)

Output for Meta
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.meta.csv


## ATAC

In [370]:
# Read the atac file (if it exists)
def gen_atac_features(_feat, fullpath):
    """Aggregate peak-level ATAC rows into per-gene mean/std features.

    Parameters
    ----------
    _feat : str
        Feature key of the table being processed; not used by the computation.
    fullpath : str, path-like or file-like
        CSV readable by ``pandas.read_csv`` with the columns ``gene_name``,
        ``2mo``, ``9mo``, ``18mo`` and ``distance``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene_name``. Holds ``<feature>.mean`` columns, then
        ``<feature>.std`` columns, then ``count`` (rows per gene). The features
        are the three pairwise log2 age ratios and ``log2_distance``.
    """
    peaks = pd.read_csv(fullpath).set_index('gene_name')

    # Small offset so log2 stays finite when a value is exactly zero.
    eps = 1e-10
    log2_signal = {age: np.log2(peaks[age] + eps) for age in ('2mo', '9mo', '18mo')}

    feature_cols = []
    for later, earlier in (('9mo', '2mo'), ('18mo', '9mo'), ('18mo', '2mo')):
        ratio_name = f'{later}.{earlier}.log_ratio'
        peaks[ratio_name] = log2_signal[later] - log2_signal[earlier]
        feature_cols.append(ratio_name)

    # The sign of the distance is discarded before taking the log.
    peaks['log2_distance'] = np.log2(peaks['distance'].abs().astype(np.float64) + eps)
    feature_cols.append('log2_distance')

    per_gene = peaks.groupby("gene_name", observed=True)

    gene_means = per_gene[feature_cols].mean()
    gene_means.columns = [f"{name}.mean" for name in gene_means.columns]

    gene_stds = per_gene[feature_cols].std()
    gene_stds.columns = [f"{name}.std" for name in gene_stds.columns]

    summary = gene_means.merge(gene_stds, left_on='gene_name', right_on='gene_name')
    summary['count'] = per_gene.size()
    return summary

In [371]:
_feat = 'atac'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_atac = gen_atac_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_atac.to_csv(out_csv)
else:
    # The input is optional (hence the existence check). Report the skip so a
    # `df_atac` left over from an earlier run is not mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for atac
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.atac.csv


## Genebody

In [372]:
def gen_genebody_features(_feat, fullpath):
    """Average the pairwise log2 age ratios of a gene-body table per gene.

    Parameters
    ----------
    _feat : str
        Feature key of the table being processed; not used by the computation.
    fullpath : str, path-like or file-like
        CSV readable by ``pandas.read_csv`` with the columns ``gene_name``,
        ``2mo``, ``9mo`` and ``18mo``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene_name`` with the columns ``9mo.2mo.log_ratio``,
        ``18mo.9mo.log_ratio`` and ``18mo.2mo.log_ratio`` (per-gene means).
    """
    genebody = pd.read_csv(fullpath).set_index('gene_name')

    # Small offset so log2 stays finite when a value is exactly zero.
    eps = 1e-10
    log2_signal = {age: np.log2(genebody[age] + eps) for age in ('2mo', '9mo', '18mo')}

    ratio_cols = []
    for later, earlier in (('9mo', '2mo'), ('18mo', '9mo'), ('18mo', '2mo')):
        ratio_name = f'{later}.{earlier}.log_ratio'
        genebody[ratio_name] = log2_signal[later] - log2_signal[earlier]
        ratio_cols.append(ratio_name)

    return genebody.groupby("gene_name", observed=True)[ratio_cols].mean()

In [373]:
_feat = 'mch_genebody'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_gb_mch = gen_genebody_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_gb_mch.to_csv(out_csv)
else:
    # Report the skip so a `df_gb_mch` left over from an earlier run is not
    # mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for mch_genebody
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.mch_genebody.csv


In [374]:
_feat = 'mcg_genebody'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_gb_mcg = gen_genebody_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_gb_mcg.to_csv(out_csv)
else:
    # Report the skip so a `df_gb_mcg` left over from an earlier run is not
    # mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for mcg_genebody
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.mcg_genebody.csv


## Loops

**TODO (open question):** Should loops be binned by loop size? That is, should the mean, std, skew, max and min be generated separately for different loop-size ranges?

In [375]:
def gen_loop_features(_feat, fullpath):
    """Summarise loop-level rows into per-gene statistics.

    Parameters
    ----------
    _feat : str
        Feature key of the table being processed; not used by the computation.
    fullpath : str, path-like or file-like
        CSV readable by ``pandas.read_csv`` with the columns ``gene_name``,
        ``Qanova``, ``Tanova``, ``{2mo,9mo,18mo}.Q``, ``{2mo,9mo,18mo}.T``,
        ``anchor1_distance`` and ``anchor2_distance``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene_name``. For every loop feature there is one column
        per statistic, named ``<feature>.<stat>`` with stat in mean, median,
        std, skew, max, min. A final ``count`` column holds the rows per gene.
    """
    loops = pd.read_csv(fullpath).set_index('gene_name')

    # Pairwise age differences for both the Q and the T columns.
    age_pairs = (('9mo', '2mo'), ('18mo', '9mo'), ('18mo', '2mo'))
    difference_cols = {'Q': [], 'T': []}
    for stat in ('Q', 'T'):
        for later, earlier in age_pairs:
            diff_name = f'{later}.{earlier}.{stat}'
            loops[diff_name] = loops[f'{later}.{stat}'] - loops[f'{earlier}.{stat}']
            difference_cols[stat].append(diff_name)

    # log2 of the absolute anchor separation plus 10000 (the loop resolution).
    anchor_gap = (loops['anchor2_distance'] - loops['anchor1_distance']).abs().astype(np.float64)
    loops['a_length.log'] = np.log2(anchor_gap + 10000)

    loop_features = ['Qanova', 'Tanova'] + difference_cols['Q'] + difference_cols['T'] + ['a_length.log']
    statistics = ['mean', 'median', 'std', 'skew', 'max', 'min']

    per_gene = loops.groupby("gene_name", observed=True)
    summary = per_gene[loop_features].agg(statistics)
    # Flatten the (feature, statistic) column pairs into "feature.statistic".
    summary.columns = ['.'.join(levels).strip('.') for levels in summary.columns]
    summary['count'] = per_gene.size()

    return summary

In [376]:
_feat = 'loops'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_loop = gen_loop_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_loop.to_csv(out_csv)
else:
    # Report the skip so a `df_loop` left over from an earlier run is not
    # mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for loops
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.loops.csv


## Differential Regions

In [377]:
def gen_DR_features(_feat, fullpath):
    """Average the pairwise log2 age ratios of a differential-region table per gene.

    Parameters
    ----------
    _feat : str
        Feature key of the table being processed; not used by the computation.
    fullpath : str, path-like or file-like
        CSV readable by ``pandas.read_csv`` with the columns ``gene_name``,
        ``2mo``, ``9mo`` and ``18mo``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene_name`` with the per-gene mean of
        ``9mo.2mo.log_ratio``, ``18mo.9mo.log_ratio`` and
        ``18mo.2mo.log_ratio``, plus ``count`` (rows per gene).
    """
    regions = pd.read_csv(fullpath).set_index('gene_name')

    # Region size and relative position are deliberately not generated: per the
    # author's note, some cell types lack the start/end/gene_start/gene_end
    # columns and others lack the distance column, so only the age ratios are
    # used for both DAR and DMR tables.

    # Small offset so log2 stays finite when a value is exactly zero.
    eps = 1e-10
    log2_signal = {age: np.log2(regions[age] + eps) for age in ('2mo', '9mo', '18mo')}

    ratio_cols = []
    for later, earlier in (('9mo', '2mo'), ('18mo', '9mo'), ('18mo', '2mo')):
        ratio_name = f'{later}.{earlier}.log_ratio'
        regions[ratio_name] = log2_signal[later] - log2_signal[earlier]
        ratio_cols.append(ratio_name)

    per_gene = regions.groupby("gene_name", observed=True)
    summary = per_gene[ratio_cols].mean()
    summary['count'] = per_gene.size()

    return summary

In [378]:
_feat = 'DMR'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_dmr = gen_DR_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_dmr.to_csv(out_csv)
else:
    # Report the skip so a `df_dmr` left over from an earlier run is not
    # mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for DMR
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.DMR.csv


In [379]:
_feat = 'DAR'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_dar = gen_DR_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_dar.to_csv(out_csv)
else:
    # Report the skip so a `df_dar` left over from an earlier run is not
    # mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for DAR
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.DAR.csv


## ABC Enhancers

In [380]:
def gen_abc_features(_feat, fullpath):
    """Summarise ABC enhancer-gene rows per gene, split by enhancer-gene distance.

    Each row's distance is the smaller of ``|start - gene_start|`` and
    ``|start - gene_end|``. Rows are assigned to one of four distance bins
    (0 - 10kb, 10kb - 20kb, 20kb - 1Mb, 1Mb - 5Mb) and the activity/contact
    log2 age ratios are aggregated per gene within each bin.

    Parameters
    ----------
    _feat : str
        Feature key of the table being processed; not used by the computation.
    fullpath : str, path-like or file-like
        CSV readable by ``pandas.read_csv`` with the columns ``gene_name``,
        ``start``, ``gene_start``, ``gene_end``, ``{2mo,9mo,18mo}.activity``
        and ``{2mo,9mo,18mo}.contact``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene_name`` in order of first appearance in the input.
        Columns are named ``<feature>.<stat>.<bin>`` where the bin label has
        its spaces removed (e.g. ``0-10kb``); a gene with no rows in a bin gets
        NaN there. The last column, ``count``, is the total number of input
        rows for the gene regardless of bin.

    Notes
    -----
    Rows whose distance is exactly 0 are placed in the ``0 - 10kb`` bin. Rows
    farther than 5 Mb fall outside every bin and only contribute to ``count``.
    """
    df_feat = pd.read_csv(fullpath)

    # Small offset so log2 stays finite when a value is exactly zero.
    pseudocount = 1e-10
    age_pairs = (('9mo', '2mo'), ('18mo', '2mo'), ('18mo', '9mo'))
    ENHANCER_ABC_FEATURES = []
    for measure in ('activity', 'contact'):
        for later, earlier in age_pairs:
            ratio_name = f'{later}.{earlier}.{measure}.log_ratio'
            df_feat[ratio_name] = (np.log2(df_feat[f'{later}.{measure}'] + pseudocount)
                                   - np.log2(df_feat[f'{earlier}.{measure}'] + pseudocount))
            ENHANCER_ABC_FEATURES.append(ratio_name)

    # Distance from the enhancer start to the nearer end of the gene.
    gene_edge_offsets = pd.DataFrame({
        'end_distance': (df_feat['start'] - df_feat['gene_end']).abs(),
        'start_distance': (df_feat['start'] - df_feat['gene_start']).abs(),
    })
    df_feat['e_distance'] = gene_edge_offsets.min(axis=1)

    # Bin by distance. The labels are passed to pd.cut directly rather than
    # looked up from the string form of each interval, which depends on how
    # pandas happens to format intervals. include_lowest=True keeps rows with a
    # distance of exactly 0 in the first bin instead of dropping them.
    eg_distance_cutoffs = [0, 10e3, 20e3, 1e6, 5e6]
    bin_labels = ['0 - 10kb', '10kb - 20kb', '20kb - 1Mb', '1Mb - 5Mb']
    df_feat['binned_distance'] = pd.cut(
        df_feat['e_distance'],
        bins=eg_distance_cutoffs,
        labels=bin_labels,
        include_lowest=True,
    )

    funct_names = ['mean', 'median', 'std', 'skew', 'max', 'min', 'count']
    df_res = df_feat.groupby(['gene_name', 'binned_distance'], observed=True)[ENHANCER_ABC_FEATURES].agg(funct_names)
    # Flatten the (feature, statistic) column pairs into "feature.statistic".
    df_res.columns = df_res.columns.map('.'.join).str.strip('.')
    df_res = df_res.reset_index()

    # One row per gene; each bin's statistics are attached as extra columns.
    df_res_dist = pd.DataFrame(index=df_feat['gene_name'].unique())
    df_res_dist.index.name = 'gene_name'

    for label in bin_labels:
        suffix = label.replace(" ", "")
        df_bin = df_res.loc[df_res['binned_distance'] == label]
        df_bin = df_bin.set_index('gene_name').drop(columns="binned_distance")
        df_bin.columns = [f"{c}.{suffix}" for c in df_bin.columns]
        # Left join on the gene index keeps genes that have no rows in this bin.
        df_res_dist = df_res_dist.join(df_bin, how='left')

    df_res_dist['count'] = df_feat.groupby("gene_name", observed=True).size()
    return df_res_dist

In [381]:
_feat = 'enhancer_peak'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_abc_peak = gen_abc_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_abc_peak.to_csv(out_csv)
else:
    # Report the skip so a `df_abc_peak` left over from an earlier run is not
    # mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for enhancer_peak
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.enhancer_peak.csv


In [382]:
_feat = 'enhancer_DMR'
rel_path = FEATURE_FILENAME_DICT[_feat]
fullpath = f'{filepath}/{ct}.{rel_path}'
if Path(fullpath).exists():
    df_abc_dmr = gen_abc_features(_feat, fullpath)

    # Build the destination once so the printed and the written path always agree.
    out_csv = f"{outpath}/{ct}.{_feat}.csv"
    print(f"Output for {_feat}")
    print(out_csv)
    df_abc_dmr.to_csv(out_csv)
else:
    # Report the skip so a `df_abc_dmr` left over from an earlier run is not
    # mistaken for fresh output.
    print(f"Skipping {_feat}: input file not found: {fullpath}")

Output for enhancer_DMR
/gale/ddn/aging/aklein/240903_aging_mice/aging_gene_prediction/aging_gene_prediction/ak_workspace/data/v2/OPC_NN/OPC_NN.enhancer_DMR.csv


## Old

In [268]:
    
    # # Mean 
    # df_feat_mean = pd.DataFrame(df_feat.groupby(['gene_name', 'binned_distance'], observed=True)[ENHANCER_ABC_FEATURES].mean()).reset_index()
    
    # df_feat_mean_dist = pd.DataFrame(index=df_feat['gene_name'].unique())
    # df_feat_mean_dist.index.name = 'gene_name'
    
    # for _cat in df_feat_mean['binned_distance'].cat.categories:
    #     # print(_cat, _cat.replace(" ", ""))
    #     cat = _cat.replace(" ", "")
        
    #     df_temp = df_feat_mean.loc[df_feat_mean['binned_distance'] == _cat]
    #     df_temp = df_temp.set_index('gene_name').drop(columns = "binned_distance")
    #     df_temp.columns = [f"{c}.{cat}" for c in df_temp.columns]
    #     df_feat_mean_dist = df_feat_mean_dist.merge(df_temp, left_on='gene_name', right_on = 'gene_name')
    
    # # Max 
    # df_feat_max = pd.DataFrame(df_feat.groupby(['gene_name', 'binned_distance'], observed=True)[ENHANCER_ABC_FEATURES].max()).reset_index()
    
    # df_feat_max_dist = pd.DataFrame(index=df_feat['gene_name'].unique())
    # df_feat_max_dist.index.name = 'gene_name'
    
    # for _cat in df_feat_max['binned_distance'].cat.categories:
    #     # print(_cat, _cat.replace(" ", ""))
    #     cat = _cat.replace(" ", "")
        
    #     df_temp = df_feat_max.loc[df_feat_max['binned_distance'] == _cat]
    #     df_temp = df_temp.set_index('gene_name').drop(columns = "binned_distance")
    #     df_temp.columns = [f"{c}.{cat}" for c in df_temp.columns]
    #     df_feat_max_dist = df_feat_max_dist.merge(df_temp, left_on='gene_name', right_on = 'gene_name')
    
    
    # # Min
    # df_feat_min = pd.DataFrame(df_feat.groupby(['gene_name', 'binned_distance'], observed=True)[ENHANCER_ABC_FEATURES].min()).reset_index()
    
    # df_feat_min_dist = pd.DataFrame(index=df_feat['gene_name'].unique())
    # df_feat_min_dist.index.name = 'gene_name'
    
    # for _cat in df_feat_min['binned_distance'].cat.categories:
    #     # print(_cat, _cat.replace(" ", ""))
    #     cat = _cat.replace(" ", "")
        
    #     df_temp = df_feat_min.loc[df_feat_min['binned_distance'] == _cat]
    #     df_temp = df_temp.set_index('gene_name').drop(columns = "binned_distance")
    #     df_temp.columns = [f"{c}.{cat}" for c in df_temp.columns]
    #     df_feat_min_dist = df_feat_min_dist.merge(df_temp, left_on='gene_name', right_on = 'gene_name')
    
    
    # # Merge
    # df_feat_res = df_feat_mean_dist.merge(df_feat_max_dist, left_on='gene_name', right_on = 'gene_name', suffixes=(".mean", ""))
    # df_feat_res = df_feat_res.merge(df_feat_min_dist, left_on='gene_name', right_on = 'gene_name', suffixes=(".max", ".min"))
    # # return df_feat_res