## Notebook to prepare NABEC/HBCC methylation (METH) data

In [None]:
# print the current date/time (shell `date`) so the run is timestamped in the output
!date

#### import libraries

In [None]:
from pandas import read_csv, read_hdf, DataFrame, read_pickle
import nb_util_funcs as nuf
from random import sample
import seaborn as sns
from seaborn import distplot , scatterplot, heatmap
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import ppscore as pps
import pandas as pd
import os
import numpy as np

import warnings
# silence every Python warning for the rest of the session
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings
# that later cells could raise
warnings.filterwarnings('ignore')

# render matplotlib figures inline in the notebook output
%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# request high-resolution ("retina") inline figures
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# Parameters
# cohort label; used in the input quantification file name and in output names
cohort = "nabec"
# release label; only used to build output names
version = "Aug_2024"
# feature set; selects the input BED file and drives target-specific branches below
target = "tss_start_5cpg"
# genotype call set; selects the plink prefix and the genetic PCs file
varianttype_caller = "SV_sniffles"


In [None]:
# naming
modality = 'METH'
set_name = f'{cohort}_{version}_{target}_{varianttype_caller}'

# project directories
in_dir = '/data/CARDPB/data/NABEC/projects/QTL_paper_2024/SV-mQTL'
geno_dir = f'{in_dir}/genotypes'
quants_dir = f'{in_dir}/expression'
info_dir = f'{in_dir}/sample_info'

# in files
quants_file = f'{quants_dir}/{cohort}_{target}_avgmod.bed'
covariates_file = f'{info_dir}/nabec.aug2020.sample_info.txt'
# the feature annotation file depends on the data modality
if modality == 'METH':
    features_file = f'{quants_dir}/EPIC_annotation_hg38.txt'
elif modality == 'RNAB':
    features_file = '/data/CARDPB/resources/hg38/gencode.v43.primary_assembly.annotation.pkl'

# genotype prefix and genetic PCs file depend on the variant call set
if varianttype_caller == 'SNV_illumina':
    bfile_prefix_path = f'{geno_dir}/MERGED_MAF_GENO005_plink19_ONTsamples'
    genetic_pcs_file = f'{info_dir}/MERGED_MAF_GENO005_plink19_ONTsamples_pca20.txt'
elif varianttype_caller == 'SV_sniffles':
    bfile_prefix_path = f'{geno_dir}/NABEC_snifles2_2_multisample_biggerthan50bps.sorted_noBlacklist_noSuperDups_02092024_MAF_GENO_005_updateid'
    genetic_pcs_file = f'{info_dir}/NABEC_snifles2_2_multisample_biggerthan50bps.sorted_noBlacklist_noSuperDups_02092024_MAF_GENO_005_updateid_pca20.txt'
elif varianttype_caller == 'SNV_SV_sniffles':
    bfile_prefix_path = f'{geno_dir}/SNV_sniffles_SV_merged'
    genetic_pcs_file = f'{info_dir}/SNV_sniffles_SV_merged_pca20.txt'
else:
    # fail here rather than with a NameError when the PCs file is read later
    raise ValueError(f'unsupported varianttype_caller: {varianttype_caller!r}')

# out files
umap_covs_file = f'{info_dir}/{set_name}.umap.covs.csv'
scaled_file = f'{quants_dir}/{set_name}.scaled.hdf5'
tnsrqtl_pheno_nonadj_file = f'{quants_dir}/{cohort}_{version}_{target}.scaled.bed.gz'
percentage_pheno_file = f'{quants_dir}/{cohort}_{version}_{target}.raw_percentage.bed.gz'

# constants
# minimum detection rate handed to the bad-call-rate filter further down
if modality == 'METH':
    min_detection_rate = 1.00
else:
    min_detection_rate = 0.25


DEBUG = False
low_var_quartile = '75%'
dpi_value = 50

### load input data

#### load the quantified features matrix

In [None]:
%%time
quants_df = read_csv(quants_file,header=0,sep='\t')

#quants_df = read_csv(quants_file,index_col='phenotype_id', header=0,sep='\t').iloc[:,4:].transpose()
#.set_index('ID').T
print(quants_df.shape)

if DEBUG:
    display(quants_df.head())

In [None]:
# show (rows, columns) of the loaded quantification table
quants_df.shape

In [None]:
if target == "geneBodies":
    # 'name' is a ';'-separated list of key=value fields: take the 4th field,
    # then keep the part after its '='
    attribute_fields = quants_df['name'].str.split(';', expand=True)
    key_and_value = attribute_fields[3].str.split('=', expand=True)
    quants_df['name'] = key_and_value[1]

In [None]:
# Walk the columns left to right until the first one whose name contains
# "avgMod"; the columns before it are printed, and `n` is the position of
# that first "avgMod" column.
n = None
for col_pos, col_name in enumerate(quants_df.columns):
    if "avgMod" in col_name:
        n = col_pos
        break
    print(col_name)

# Without this guard a table with no "avgMod" column would leave `n` pointing
# at the last column and the later slices on `n` would silently be wrong.
if n is None:
    raise ValueError('no column containing "avgMod" found in quants_df')
print(f"first column num of phenotype is {n}")

In [None]:
# Annotation columns (everything left of the first per-sample column).
# Copy so edits here never touch quants_df or its column Index.
INFO = quants_df.iloc[:, :n].copy()

# Rename the 4th column to 'NAME' by assigning a new column list; writing into
# `columns.values` edits the Index buffer in place, which pandas does not support.
info_columns = INFO.columns.tolist()
info_columns[3] = 'NAME'
INFO.columns = info_columns

# feature ID = chrom_start_end_NAME
INFO.index = INFO['chrom'].astype('str')  + "_" + INFO['start'].astype('str')+ "_" + INFO['end'].astype('str')  + "_" + INFO['NAME']
INFO = INFO.rename(columns={"chrom":"chr"})

In [None]:
# keep the columns from position n onward and strip the fixed prefix/suffix
# from their names
sample_quants = quants_df.iloc[:, n:]
trimmed_names = sample_quants.columns.str.replace('avgMod_NABEC_', '')
sample_quants.columns = trimmed_names.str.replace('_FTX', '')

# label each feature row chr_start_end_NAME, built from the INFO columns
feature_ids = (
    INFO['chr'].astype('str') + "_"
    + INFO['start'].astype('str') + "_"
    + INFO['end'].astype('str') + "_"
    + INFO['NAME']
)
sample_quants.index = feature_ids

# transpose, then write the table to disk
quants_df = sample_quants.T
quants_df.to_csv(percentage_pheno_file)

#### load covariates files

In [None]:
# first column of the covariates file becomes the index
covs_df = read_csv(covariates_file, index_col=0)
print(covs_df.shape)

# drop any duplicated indices, keeping the first row seen for each one
repeated_ids = covs_df.index.duplicated(keep='first')
covs_df = covs_df.loc[~repeated_ids]
print(f'covariates shape {covs_df.shape}')

if DEBUG:
    display(covs_df.sample(5))

In [None]:
# one entry per line of the text file; strip the trailing newline from each
ont_samples_path = '/data/CARDPB/data/NABEC/projects/QTL_paper_2024/SV-eQTL/notebooks/ONT_samples.txt'
with open(ont_samples_path, 'r') as handle:
    ONT_samples = [line.rstrip('\n') for line in handle]

In [None]:
# keep only the rows whose index label is listed in ONT_samples
quants_df = quants_df[quants_df.index.isin(ONT_samples)]

In [None]:
# check for any unexpected samples; ie probably name frmt issue
set(quants_df.index) - set(covs_df.index)

In [None]:
# (rows, columns) after restricting to the ONT samples
quants_df.shape

In [None]:
# Fraction of rows per column whose value is the "." placeholder.
# Computed in one vectorized pass instead of a Python loop over every column.
SAMPLE_SIZE = len(quants_df)
Missing_rate = (quants_df == ".").sum(axis=0).div(SAMPLE_SIZE).tolist()

In [None]:
print(covs_df.shape)

# covariate columns carried forward; everything else is dropped
cols_to_keep = ['Group', 'Ethnicity', 'PMI', 'Sex', 'Age', 'RIN_totalrna']
covs_df = covs_df.loc[:, cols_to_keep]
print(f'covariates shape {covs_df.shape}')

if DEBUG:
    display(covs_df.head())

#### load and merge in the genetic PCs for the cohort

In [None]:
# Whitespace-delimited PCs table; the second column becomes the index.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
genetic_components_df = pd.read_csv(genetic_pcs_file, sep=r'\s+', index_col=1)
# drop the first remaining column (presumably the plink family ID - TODO confirm)
genetic_components_df = genetic_components_df.iloc[:,1:]
print(genetic_components_df.shape)

# right join: the result has one row per index label of the PCs table
covs_df = covs_df.merge(genetic_components_df, how='right', left_index=True, right_index=True)
print(covs_df.shape)

#### check chr X and chr Y

In [33]:
# feature IDs start with the chromosome label, so match on 'chrX' / 'chrY'
on_sex_chromosome = quants_df.columns.str.contains('chrX') | quants_df.columns.str.contains('chrY')
sex_chr_feature_ids = quants_df.columns[on_sex_chromosome].unique()

In [None]:
# display the feature IDs whose name contains 'chrX' or 'chrY'
sex_chr_feature_ids

### check expected sex of samples

In [None]:
#Vawter MP, Evans S, Choudary P et al. Gender-specific gene expression in 
#post-mortem human brain: localization to sex chromosomes. 
#Neuropsychopharmacology 2004;29:373–84.
sex_genes = ['XIST','RPS4Y1','RPS4Y2','KDM5D','UTY','DDX3Y','USP9Y']

if modality == 'METH':
    # methylation feature IDs embed the chromosome, so select chrX/chrY features
    sex_specific_features = quants_df.columns[(quants_df.columns.str.contains('chrX')) | (quants_df.columns.str.contains('chrY'))].unique()

elif modality == 'RNAB':
    # NOTE(review): features_df is not created in the cells visible here; it
    # must be loaded (e.g. from features_file) before this branch can run
    sex_features = features_df.loc[features_df.gene_name.isin(sex_genes)]
    sex_specific_features = sex_features.gene_name.to_list()

# Keep only candidates that are actually columns of quants_df. The name used
# below was previously never assigned, so this cell failed with NameError.
sex_features_present = [feat for feat in sex_specific_features if feat in quants_df.columns]

print(f'found {len(sex_features_present)} sex features: \n{sex_features_present}')
quants_sex_df = quants_df[sex_features_present].copy()
print(f'sex features matrix shape {quants_sex_df.shape}')

In [None]:
# Treat the '.' placeholder as 0 for the sex-check embedding, then cast to
# float: columns that held '.' were read as strings, and replacing the
# placeholder alone would leave the remaining values as text.
quants_sex_df = quants_sex_df.replace('.', 0).astype(float)

In [None]:
%%time
if quants_sex_df.shape[1] == 0:
    print("quants_sex_df is empty and cannot be processed.")
else:
    try:
        sex_umap_df = nuf.generate_umap_covs_df(quants_sex_df, covs_df)
        nuf.plot_umap_clusters(sex_umap_df, hue_cov='Sex', style_cov='Group')
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
# Step 1: Check for NaN or Infinite values
print("Checking for NaN values:")
print(quants_df.isna().sum().sum())  # Total number of NaN values

print("Checking for Infinite values:")
# np.isinf raises TypeError on an object-dtype frame, which is what quants_df
# is while it still holds '.' placeholders; test a float copy instead.
quants_numeric = quants_df.replace('.', np.nan).astype(float)
print(np.isinf(quants_numeric).sum().sum())  # Total number of Infinite values

In [None]:
# Replace `.` with `NaN` and make the whole matrix numeric
quants_df = quants_df.replace('.', np.nan).astype(float)

# Calculate the mean of each column (NaN values are skipped)
means = quants_df.mean()

# Replace `NaN` with the mean of the respective column in one call.
# fillna with a Series aligns its index on the column labels; the previous
# per-column `df[col].fillna(..., inplace=True)` acted on a temporary object,
# which is not guaranteed to write back to quants_df.
quants_df = quants_df.fillna(means)

# delete all-NaN columns (their mean is NaN, so they are still unfilled)
quants_df = quants_df.dropna(axis=1, how='all')

In [None]:
def calculate_detection_rates(this_df, modality, round_percision=1, 
                              min_quant_value=None):
    """Compute per-column and per-row missing rates of a quantification frame.

    A value counts as missing when, after rounding to `round_percision`
    decimals, it is less than or equal to `min_quant_value`. When
    `min_quant_value` is None the smallest rounded value in the frame is used.

    Args:
        this_df: DataFrame of quantified values.
        modality: label used only in the printed message.
        round_percision: number of decimals used for the comparison.
        min_quant_value: threshold at or below which a value is missing.

    Returns:
        (trait_missing_rates, sample_missing_rates): two Series rounded to two
        decimals, the first indexed by column, the second indexed by row.
    """
    rounded_df = this_df.round(round_percision)
    if min_quant_value is None:
        min_quant_value = rounded_df.min().min()

    print(f'minimun {modality} value is {min_quant_value}')

    # zero out every value at or below the threshold, then flag the zeros
    detected_df = this_df.mask(rounded_df <= min_quant_value, 0)
    is_missing = detected_df.isin({0})
    row_count, column_count = detected_df.shape

    # missing fraction down each column, and across each row
    trait_missing_rates = round(is_missing.sum(0)/row_count, 2)
    sample_missing_rates = round(is_missing.sum(1)/column_count, 2)

    print(f'{len(trait_missing_rates)} features with mean missing \
rate = {trait_missing_rates.mean()}')
    print(f'{len(sample_missing_rates)} samples with mean missing \
rate = {sample_missing_rates.mean()}')
    return trait_missing_rates, sample_missing_rates

### calculate, plot detection rates and subset well detected features

In [None]:
%%time
# NOTE(review): these calls use the nb_util_funcs (nuf) implementations, not the
# calculate_detection_rates defined locally in this notebook - confirm which is intended
trait_miss_rates, sample_miss_rates = nuf.calculate_detection_rates(quants_df, modality)
nuf.plot_missing_rates(trait_miss_rates, sample_miss_rates)
# presumably flags features whose missing rate is too high for min_detection_rate;
# verify against nb_util_funcs
bad_call_rate_features = nuf.bad_callrate_features(trait_miss_rates, min_detection_rate)
quants_wd_df = nuf.subset_well_detected_features(quants_df, bad_call_rate_features)

### standardize the dataset using transform

In [None]:
%%time
# scale the well-detected feature matrix; the transform itself lives in
# nb_util_funcs.scale_dataframe
traits_scaled_df = nuf.scale_dataframe(quants_wd_df)

In [None]:
# check transformation for random feature
# NOTE(review): the "before" frame passed here is quants_df, while the scaled
# frame was derived from quants_wd_df - confirm the helper only picks a
# feature present in both
nuf.plot_trnsfrm_effect_example(quants_df, traits_scaled_df,
                                bf_label=modality, 
                                af_label='quantile transformed')

### save scaled, well detected data for all days

In [None]:
# write the scaled matrix to scaled_file via the nb_util_funcs HDF writer
nuf.write_df_to_hdf(traits_scaled_df, scaled_file)

### make tensorQTL files

In [None]:
%%time
INFO
info_df = INFO.iloc[:,:3]
info_df['phenotype_id'] = info_df.index
print(info_df.shape)

In [None]:
# display the feature position table built in the previous cell
info_df

In [None]:
%%time
# transpose the scaled df from sample x feature to feature x sample
traits_scaled_df = traits_scaled_df.transpose()

if target in ['promoters2Kb','geneBodies','tss_start']:
    INFO = INFO[['chr','start','end','strand']]
    INFO['phenotype_id'] = INFO.index
    INFO['end'] = np.where(INFO['strand'] == '+',  
                                     INFO['start'], 
                                     INFO['end'])
    INFO['start'] = INFO['end'] - 1
    INFO = INFO[['chr','start','end','phenotype_id']]
    traits_scaled_df = INFO.merge(traits_scaled_df,right_index=True, left_index=True, how="inner")
    # for tensorQTL 'end' column is TSS so set appropriately
else:
    traits_scaled_df = info_df.merge(traits_scaled_df,right_index=True, left_index=True, how="inner")
    traits_scaled_df['end'] = traits_scaled_df['start'] + 1
display(traits_scaled_df.head())
print(traits_scaled_df.shape)


# Write the position + scaled-phenotype table as a gzipped, tab-delimited file.
# The frame built above is traits_scaled_df; the name used here before was
# never defined in this notebook, so the write failed with NameError.
traits_scaled_df.to_csv(tnsrqtl_pheno_nonadj_file, index=False, sep='\t', compression='gzip')