## Notebook to prep phenotype data for tensorQTL

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, read_hdf, DataFrame, read_pickle
import nb_util_funcs as nuf
from random import sample
import seaborn as sns
from seaborn import distplot , scatterplot, heatmap
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import ppscore as pps
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# cohort, version, target, varianttype and caller are interpolated into
# `set_name` in the naming cell, so they determine the output file names.
cohort = 'nabec'
version = 'July_2024'
target = 'rna_TPM' #'RNA'
# varianttype also selects the genotype prefix, genetic PCs file and
# covariate column list in the naming cell ('SNV', 'SV' or 'SNV_SV')
varianttype = 'SV'
# NOTE(review): an empty caller leaves a trailing underscore in `set_name`
caller = ''

In [None]:
# naming
modality = 'RNAB'
# NOTE(review): with caller == '' this name ends in a trailing underscore;
# kept unchanged because the output file names below are derived from it
set_name = f'{cohort}_{version}_{target}_{varianttype}_{caller}'
cohort_version_target = f'{cohort}_{version}_{target}'

# project directories
in_dir = '/data/CARDPB/data/NABEC/projects/QTL_paper_2024/SV-eQTL'
geno_dir = f'{in_dir}/genotypes'
quants_dir = f'{in_dir}/expression'
info_dir = f'{in_dir}/sample_info'
public_dir = f'{in_dir}/public'

# in files
quants_file = f'{quants_dir}/all_samples_salmon_genes_new.csv'
covariates_file = f'{info_dir}/nabec.aug2020.sample_info.txt'

# feature annotation file depends on the modality; fail fast on anything else
# so that `features_file` can never be left undefined
if modality == 'METH':
    features_file = f'{quants_dir}/EPIC_annotation_hg38.txt'
elif modality == 'RNAB':
    features_file = '/data/CARDPB/resources/hg38/gencode.v43.primary_assembly.annotation.pkl'
else:
    raise ValueError(f"unsupported modality {modality!r}; expected 'METH' or 'RNAB'")

# genotype prefix, genetic PCs file and covariate columns depend on the
# variant type; fail fast on anything else so none of them is left undefined
if varianttype == 'SNV':
    covs_columns_to_use = ['SNVPC1', 'SNVPC2', 'SNVPC3', 'SNVPC4', 'SNVPC5','female', 'Age', 'JHU', 'MIAMI', 'SH', 'UKY', 'UMARY','EXP_PCA1', 'EXP_PCA2',
       'EXP_PCA3', 'EXP_PCA4', 'EXP_PCA5']
    bfile_prefix_path = f'{geno_dir}/MERGED_MAF_GENO005_plink19_ONTsamples'
    genetic_pcs_file = f'{in_dir}/sample_info/MERGED_MAF_GENO005_plink19_ONTsamples_pca20.txt'
elif varianttype == 'SV':
    covs_columns_to_use = ['SVPC1', 'SVPC2', 'SVPC3', 'SVPC4', 'SVPC5','female', 'Age', 'JHU', 'MIAMI', 'SH', 'UKY', 'UMARY','EXP_PCA1', 'EXP_PCA2',
       'EXP_PCA3', 'EXP_PCA4', 'EXP_PCA5']
    bfile_prefix_path = f'{geno_dir}/NABEC_snifles2_2_multisample_biggerthan50bps.sorted_noBlacklist_noSuperDups_02092024_MAF_GENO_005_updateid'
    genetic_pcs_file = f'{in_dir}/sample_info/NABEC_snifles2_2_multisample_biggerthan50bps.sorted_noBlacklist_noSuperDups_02092024_MAF_GENO_005_updateid_pca20.txt'
elif varianttype == 'SNV_SV':
    # NOTE(review): this list mixes SVPC1-3 with SNVPC4-5 -- confirm that is
    # intended and matches the column names of the merged PCA file
    covs_columns_to_use = ['SVPC1', 'SVPC2', 'SVPC3', 'SNVPC4', 'SNVPC5','female', 'Age', 'JHU', 'MIAMI', 'SH', 'UKY', 'UMARY','EXP_PCA1', 'EXP_PCA2',
       'EXP_PCA3', 'EXP_PCA4', 'EXP_PCA5']
    bfile_prefix_path = f'{geno_dir}/SNV_sniffles_SV_merged'
    genetic_pcs_file = f'{in_dir}/sample_info/SNV_sniffles_SV_merged_pca20.txt'
else:
    raise ValueError(f"unsupported varianttype {varianttype!r}; "
                     "expected 'SNV', 'SV' or 'SNV_SV'")

# out files
umap_covs_file = f'{info_dir}/{set_name}.umap.covs.csv'
scaled_file = f'{quants_dir}/{set_name}.scaled.hdf5'
adj_quants_file = f'{quants_dir}/{set_name}.scaled.adj.hdf5'
tnsrqtl_pheno_file = f'{quants_dir}/{set_name}.scaled.adj.bed.gz'
tnsrqtl_pheno_non_adj_file = f'{quants_dir}/{cohort_version_target}.scaled.bed.gz'

# constants
# minimum detection rate passed to the bad-call-rate feature filter
if modality == 'METH':
    min_detection_rate = 0.75
else:
    min_detection_rate = 0.25

DEBUG = False
low_var_quartile = '75%'
dpi_value = 50

# samples removed from the quantification matrix right after loading
REMOVE_SAMPLE= ['UMARY-4915']

### load input data

#### load the quantified features matrix

In [None]:
%%time
# read the quantification table, then index it by its 'ID' column and
# transpose so that the former 'ID' values become the columns
quants_df = read_csv(quants_file, index_col=0)
quants_df = quants_df.set_index('ID').transpose()
print(quants_df.shape)

if DEBUG:
    display(quants_df.head())

In [9]:
# drop every row whose index label is listed in REMOVE_SAMPLE
quants_df = quants_df.loc[~quants_df.index.isin(REMOVE_SAMPLE)]

#### load covariates files

In [None]:
covs_df = read_csv(covariates_file, index_col=0)
print(covs_df.shape)
# when an index label occurs more than once, keep only its first row
covs_df = covs_df.loc[~covs_df.index.duplicated(keep='first')]
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.sample(5))

In [None]:
# Read the list of ONT sample ids, one id per line.
# strip() removes the newline as well as any '\r' or trailing blanks (which
# would otherwise make the later index.isin() filter silently miss the
# sample), and blank lines are skipped so no empty id ends up in the list.
with open('/data/CARDPB/data/NABEC/projects/QTL_paper_2024/SV-eQTL/notebooks/ONT_samples.txt', 'r') as f:
    ONT_samples = [line.strip() for line in f if line.strip()]

In [None]:
# keep only the rows whose index label appears in the ONT sample list
quants_df = quants_df.loc[quants_df.index.isin(ONT_samples)]

In [None]:
# index labels present in the quantifications but absent from the covariates;
# a non-empty result probably points to a sample-name formatting issue
set(quants_df.index).difference(set(covs_df.index))

#### for further analysis remove the ID columns

In [None]:
print(covs_df.shape)
# restrict the covariates to a fixed list of columns
cols_to_keep = ['Group', 'Ethnicity', 'PMI', 'Sex', 'Age', 'RIN_totalrna']
covs_df = covs_df.loc[:, cols_to_keep]
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### load and merge in the genetic PCs for the cohort

In [None]:
# Load the genetic principal components: a whitespace-delimited table whose
# second column becomes the index. The separator is a raw string because
# '\s' is not a valid escape sequence in a normal string literal.
genetic_components_df = pd.read_csv(genetic_pcs_file, sep=r'\s+', index_col=1)
# drop the first remaining column
# (presumably the family-id column of a plink eigenvec file -- TODO confirm)
genetic_components_df = genetic_components_df.iloc[:, 1:]
print(genetic_components_df.shape)
# right join on the index: every row of the genetic PCs table is kept, and
# rows without a match in covs_df get missing values for the covariates
covs_df = covs_df.merge(genetic_components_df, how='right', left_index=True, right_index=True)
print(covs_df.shape)

#### load feature annotations

In [None]:
%%time
if modality == 'METH':
    features_df = read_csv(features_file, sep='\t', header=None)
    features_df.columns = ['Chr', 'start', 'end', 'feature']

elif modality == 'RNAB':
    # NOTE(review): unpickling can execute arbitrary code; only load
    # annotation pickles from a trusted location
    features_df = read_pickle(features_file)
    # flag annotation columns whose name starts with one of these prefixes
    col_names = features_df.columns
    drop_mask = np.zeros(len(col_names), dtype=bool)
    for prefix in ('ont', 'tag', 'havana_', 'gene_alias', 'transcript_alias'):
        drop_mask = drop_mask | col_names.str.startswith(prefix)
    discard_cols = col_names[drop_mask]
    features_df = features_df.drop(columns=discard_cols)
    # keep the 'gene' records only, then free up the 'feature' column name
    is_gene = features_df['feature'] == 'gene'
    features_df = features_df.loc[is_gene].drop(columns=['feature'])
    # 'seqname' becomes 'chrom' and 'gene_id' takes over the 'feature' name
    features_df = features_df.rename(columns={'seqname': 'chrom', 'gene_id': 'feature'})

print(f'features shape {features_df.shape}')
if DEBUG:
    display(features_df.head())

#### find IDs for features on sex chromosomes, for dropping later

In [None]:
# Collect the ids of all features located on chrX or chrY.
# The chromosome column is named 'Chr' in the METH annotation table and
# 'chrom' in the RNAB one, so pick the name that matches the modality
# instead of assuming 'chrom'.
chrom_col = 'Chr' if modality == 'METH' else 'chrom'
on_sex_chr = features_df[chrom_col].isin(['chrX', 'chrY'])
sex_chr_feature_ids = features_df.loc[on_sex_chr, 'feature'].unique()
print(len(sex_chr_feature_ids))

### check expected sex of samples

In [None]:
#Vawter MP, Evans S, Choudary P et al. Gender-specific gene expression in 
#post-mortem human brain: localization to sex chromosomes. 
#Neuropsychopharmacology 2004;29:373–84.
sex_genes = ['XIST','RPS4Y1','RPS4Y2','KDM5D','UTY','DDX3Y','USP9Y']

if modality == 'METH':
    # METH: every feature annotated on chrX or chrY
    on_sex_chr = features_df['Chr'].isin(['chrX', 'chrY'])
    sex_specific_features = features_df.loc[on_sex_chr, 'feature'].unique()
elif modality == 'RNAB':
    # RNAB: only the genes named in the list above
    sex_features = features_df.loc[features_df['gene_name'].isin(sex_genes)]
    sex_specific_features = sex_features['gene_name'].to_list()

# restrict to the sex features that are columns of the quantification matrix
sex_features_present = list(
    set(sex_specific_features).intersection(set(quants_df.columns))
)
print(f'found {len(sex_features_present)} sex features: \n{sex_features_present}')
quants_sex_df = quants_df[sex_features_present].copy()
print(f'sex features matrix shape {quants_sex_df.shape}')

In [None]:
%%time
# Build a UMAP table from the sex-feature matrix and the covariates, then plot
# it coloured by 'Sex' and styled by 'Group'.
# NOTE(review): both helpers live in nb_util_funcs and are not visible here;
# presumably samples should cluster by their recorded sex -- confirm.
sex_umap_df = nuf.generate_umap_covs_df(quants_sex_df, covs_df)
nuf.plot_umap_clusters(sex_umap_df, hue_cov='Sex', style_cov='Group')

### calculate, plot detection rates and subset well detected features

In [None]:
%%time
# Compute per-trait and per-sample rates from the quantification matrix and
# plot them.
# NOTE(review): the variables are named *_miss_rates while the helper is named
# calculate_detection_rates -- confirm whether these are missing or detected
# fractions.
trait_miss_rates, sample_miss_rates = nuf.calculate_detection_rates(quants_df, modality)
nuf.plot_missing_rates(trait_miss_rates, sample_miss_rates)
# features failing the min_detection_rate threshold are identified and then
# passed to the subsetting helper to produce quants_wd_df
# (presumably they are dropped -- helper not visible here)
bad_call_rate_features = nuf.bad_callrate_features(trait_miss_rates, min_detection_rate)
quants_wd_df = nuf.subset_well_detected_features(quants_df, bad_call_rate_features)

In [None]:
%%time
# The detection-rate filter is computed in the previous cell; this cell used
# to be a verbatim copy that recomputed and replotted exactly the same thing.
# Report the effect of the filter instead of repeating the work.
print(f'features before detection filter {quants_df.shape[1]}')
print(f'features after detection filter {quants_wd_df.shape[1]}')
print(f'features flagged with a bad call rate {len(bad_call_rate_features)}')

In [None]:
### standardize the dataset using transform

In [None]:
%%time
# scale the well-detected feature matrix
# (helper is in nb_util_funcs; presumably a quantile transform, judging by the
# label used in the transform-effect plot -- TODO confirm)
traits_scaled_df = nuf.scale_dataframe(quants_wd_df)

In [None]:
# check transformation for random feature
# NOTE(review): the "before" frame passed here is the unfiltered quants_df,
# while traits_scaled_df was built from quants_wd_df; if the helper picks the
# feature from the first frame it may not exist in the second -- confirm.
nuf.plot_trnsfrm_effect_example(quants_df, traits_scaled_df,
                                bf_label=modality, 
                                af_label='quantile transformed')

### save scaled, well detected data

In [None]:
# write the scaled matrix to scaled_file (the '<set_name>.scaled.hdf5' path)
nuf.write_df_to_hdf(traits_scaled_df, scaled_file)

### write a single transcriptome-wide phenotype BED for tensorQTL (per-chromosome phenotype files are no longer needed)

In [None]:
%%time


# Build the tensorQTL phenotype BED from the scaled matrix.
# tensorQTL pheno bed is rows = features and columns = samples
# where first four columns are chr, start, end, phenotype_id, then sample1 ... sampleN

# get feature annots for present features
feature_present_df = features_df.loc[features_df['gene_name'].isin(traits_scaled_df.columns)]

# transpose the scaled df from sample x feature to feature x sample
tresiduals_df = traits_scaled_df.transpose()

# keep only the needed annotation columns; the original start/end are kept
# under new names because 'start'/'end' are redefined around the TSS below
feature_present_df = (
    feature_present_df[['chrom', 'start', 'end', 'gene_name', 'strand']]
    .rename(columns={'chrom': 'chr', 'start': 'fstart', 'end': 'fend'})
    .copy()
)
# for tensorQTL 'end' column is TSS so set appropriately:
# feature start on the '+' strand, feature end otherwise
feature_present_df['end'] = np.where(feature_present_df['strand'] == '+',
                                     feature_present_df['fstart'],
                                     feature_present_df['fend'])
feature_present_df['start'] = feature_present_df['end'] - 1

# a gene_name can have several annotation entries, so keep just the longest
feature_present_df['length'] = feature_present_df['fend'] - feature_present_df['fstart']
print(feature_present_df.shape)
feature_present_df = (
    feature_present_df
    .sort_values(by=['gene_name', 'length'], ascending=False)
    .drop_duplicates(subset=['gene_name'], keep='first', ignore_index=True)
    .set_index('gene_name', drop=False)
    # align the annotations to the row order of the phenotype matrix; features
    # without an annotation become all-NaN rows and are removed further down
    .reindex(tresiduals_df.index)
)
print(feature_present_df.shape)

# insert the feature annots as the four leading BED columns
tresiduals_df.insert(0, column='chr', value=feature_present_df['chr'])
tresiduals_df.insert(1, column='start', value=feature_present_df['start'])
tresiduals_df.insert(2, column='end', value=feature_present_df['end'])
tresiduals_df.insert(3, column='phenotype_id', value=feature_present_df['gene_name'])

# if there are any genes that were in quants but not feature annots
# remove these with missing positions; take an explicit copy so the column
# assignments below act on an independent frame rather than on a slice
tresiduals_df = tresiduals_df.loc[tresiduals_df['chr'].notna()].copy()
print(tresiduals_df.shape)
# make the positions ints instead of floats
tresiduals_df['start'] = tresiduals_df['start'].astype('int64')
tresiduals_df['end'] = tresiduals_df['end'].astype('int64')

# tensorQTL expects the BED rows grouped by chromosome and sorted by position
tresiduals_df = tresiduals_df.sort_values(by=['chr', 'start', 'end'])

# sample ids are written unchanged (no assay-id to genotype-id renaming is
# applied for this cohort)
tresiduals_df.to_csv(tnsrqtl_pheno_non_adj_file, index=False, sep='\t', compression='gzip')