
# Preprocessing - Concatenate Diseased Samples
Adapted from Michael Sterr

2024-05-31


# Setup


In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import h5py
import scipy.sparse as sparse
import anndata as ad
import os

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib import cm
import seaborn as sb
from matplotlib import colors
import matplotlib.colors as mcolors

# Analysis
import muon as mu
import scanpy as sc
import scvelo as scv

#R
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

In [None]:
## Project directory (machine specific)
base_dir = '/mnt/hdd/Notebooks/Gut_project/' #tochange

## Scanpy settings
# Figures and cache are written to sub-folders of the project directory.
for setting, subdir in (('figdir', 'Figures'), ('cachedir', 'Cache')):
    setattr(sc.settings, setting, base_dir + subdir)
sc.settings.verbosity = 3

# Print the versions of the loaded packages.
sc.logging.print_versions()

In [None]:
# Color maps
# 20 grey shades stacked in front of the 128 steps of the 'Reds' colormap.
colors3 = plt.cm.Greys_r(np.linspace(0.7, 0.8, 20))
colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
colorsComb = np.concatenate((colors3, colors2), axis=0)
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)

In [None]:
# Plot settings
%matplotlib inline

## Plotting parameters
# NOTE(review): sc.set_figure_params(scanpy=True) below may reset
# 'figure.figsize' - confirm the 6x6 size is still active afterwards.
rcParams['figure.figsize']=(6,6) #rescale figures
#sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False, color_map='tab10' ,transparent=True, dpi=150, dpi_save=300)
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

# Disabled font configuration, kept as a bare string literal (it is evaluated but has no effect).
'''## Font
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Source Sans 3']'''

## Grid & Ticks
# Make grid lines fully transparent, draw ticks on the bottom and left axes.
rcParams['grid.alpha'] = 0
rcParams['xtick.bottom'] = True
rcParams['ytick.left'] = True

## Embed font
# Font type 42 for PDF output.
plt.rc('pdf', fonttype=42)

## Define new default settings
# This binds rcParamsDefault to the very same object as rcParams (no copy is made).
plt.rcParamsDefault = plt.rcParams

In [None]:
# Rebuild the grey-to-red colormap (same statements as in the 'Color maps' cell)
# and switch the figure fonts to a serif family.
from matplotlib import colors
colors2 = plt.cm.Reds(np.linspace(0, 1, 128)) 
colors3 = plt.cm.Greys_r(np.linspace(0.7,0.8,20)) 
colorsComb = np.vstack([colors3, colors2]) 
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)

# Render text with matplotlib itself (no LaTeX) using the NewCM10 serif font.
plt.rcParams.update({
    "text.usetex": False,
    "font.family": "serif",
    "font.serif": "NewCM10", #Computer Modern Roman fontsize 10
})
## Define new default settings
# Binds rcParamsDefault to the same object as rcParams (no copy is made).
plt.rcParamsDefault = plt.rcParams

# Setup R

In [None]:
%run utils.ipynb

In [None]:
# Initialise R with the given library path (machine specific).
# setup_R is not defined in this notebook - presumably provided by utils.ipynb; verify.
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R') #tochange

In [None]:
%%R

# Show the library paths the R session is using.
.libPaths()

In [None]:
%%R

# Parallelization
# Register 20 workers for each of the three parallel backends used from R.
library(BiocParallel)
register(MulticoreParam(20, progressbar = TRUE))

library(future)
plan(multicore, workers = 20)
options(future.globals.maxSize = 100 * 1024 ^ 3) # 100 * 1024^3 = 100 GiB (not 50 Gb); lower this on machines with less RAM
plan() # print the active plan

library(doParallel)
registerDoParallel(20)

# Record the R session for reproducibility.
sessionInfo()

# Functions

In [None]:
   
        
def get_feature_pos_from_gtf(gtf_path=None, random=True):
    """Return strand-aware gene positions from a GTF annotation file.

    The file is read as a header-less, tab-separated table after skipping
    its first five lines, and only rows whose third column equals 'gene'
    are kept.

    Parameters
    ----------
    gtf_path
        Path (or file-like object) passed to ``pandas.read_csv``.
    random
        If true, shuffle the rows with a fixed seed (420).

    Returns
    -------
    pandas.DataFrame
        Columns 'Chromosome', 'Start', 'End'. '+' strand genes come first,
        followed by '-' strand genes whose start and end are swapped.
    """
    gtf = pd.read_csv(gtf_path, header=None, skiprows=5, sep='\t')
    genes = gtf[gtf.iloc[:, 2] == 'gene']

    # Column picks per strand: for '-' genes the end coordinate acts as start.
    strand_columns = (('+', [0, 3, 4]), ('-', [0, 4, 3]))
    per_strand = []
    for strand, picked in strand_columns:
        subset = genes[genes.iloc[:, 6] == strand].iloc[:, picked]
        subset.columns = ['Chromosome', 'Start', 'End']
        per_strand.append(subset)

    features = pd.concat(per_strand, ignore_index=True)

    if not random:
        return features
    # Fixed seed keeps the shuffled order reproducible.
    return features.sample(frac=1, random_state=420)

###################################################################################################################
###################################################################################################################
###################################################################################################################
    

def get_top_feature_pos_from_gtf(gex_adata, gtf_path=None, n_top=2000):
    """Return strand-aware positions of the ``n_top`` most highly counted genes.

    The GTF file is read as a header-less, tab-separated table after skipping
    its first five lines and reduced to rows whose third column equals 'gene'.
    The first double-quoted value of the attribute column is used as gene
    identifier and matched against ``gex_adata.var['gene_ids']``.

    Parameters
    ----------
    gex_adata
        Object whose ``.var`` table has the columns 'gene_ids' and 'n_counts'.
    gtf_path
        Path (or file-like object) passed to ``pandas.read_csv``.
    n_top
        Number of genes to keep, ranked by descending 'n_counts'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Chromosome', 'Start', 'End'. '+' strand genes come first,
        followed by '-' strand genes whose start and end are swapped; within
        each strand the genes are ordered by descending 'n_counts'.
    """
    # load Ensembl annotation file and reduce to genes
    annotation = pd.read_csv(gtf_path, header=None, skiprows=5, sep='\t')
    # copy, so the new column below is not written into a slice of the raw table
    annotation = annotation.loc[annotation.iloc[:, 2] == 'gene', :].copy()

    # add gene identifiers (first quoted value of the attribute column)
    # The former `annotation.set_index = ...` assignment was dropped: it only
    # overwrote the DataFrame method on this object and never set an index.
    annotation[9] = annotation[8].str.split('"', expand=True)[1]

    # add counts and sort annotation
    counts = gex_adata.var.loc[:, ['gene_ids', 'n_counts']]
    annotation = annotation.merge(counts, left_on=9, right_on='gene_ids')
    annotation = annotation.sort_values(by=['n_counts'], ascending=False)

    # filter n_top genes
    annotation = annotation.iloc[0:n_top, :]

    # get positions of + features
    features_p = annotation.loc[annotation.iloc[:, 6] == '+', :].iloc[:, [0, 3, 4]]
    features_p.columns = ['Chromosome', 'Start', 'End']

    # get positions of - features and switch start and end
    features_m = annotation.loc[annotation.iloc[:, 6] == '-', :].iloc[:, [0, 4, 3]]
    features_m.columns = ['Chromosome', 'Start', 'End']

    # concatenate
    features = pd.concat([features_p, features_m], ignore_index=True)

    return features

    


# Load Data

In [None]:
# Input/output locations (machine specific).
base_path = '/mnt/hdd/data'             # root of all data; also used for velocyto input and for the saved .h5ad files
base_path1 = '/mnt/hdd/data/diseased'   # one folder per diseased sample
base_path2 = '/mnt/hdd/data/healthy'    # not referenced in the cells visible here
outs_path = '/count_matrices/'          # sub-folder of a sample folder holding the .h5ad files
cr_path = '/storage/scRNA-seq/scMultiome_Mouse_Crypts_FVF_P23033/cr_arc_rev8/cr_count'  # not referenced in the cells visible here
agg_path = '/storage/scRNA-seq/scMultiome_Mouse_Crypts_FVF_P23033/cr_arc_rev8/cr_aggr'  # not referenced in the cells visible here
velo_path = 'velocyto'                  # sub-folder of base_path holding one loom folder per sample
out_base_name = 'Diseased'              # sub-folder of base_path the results are written to
out_path = '/mnt/hdd/data/Files'        # not referenced in the cells visible here

In [None]:
# Collect the names of all sub-directories of base_path1, sorted alphabetically
folder_names = []
for entry in os.listdir(base_path1):
    if os.path.isdir(os.path.join(base_path1, entry)):
        folder_names.append(entry)
folder_names.sort()

In [None]:
# Show the sample folders that were found.
folder_names

In [None]:
# Samples to load; these names are used as folder names below base_path/velo_path.
samples = ['105_Gut_PF1',
 '106_Gut_PF2',
 '107_Gut_VSG3',
 '108_Gut_VSG5',
 '83_Gut_Sham_d7_1',
 '84_Gut_VSG_d7_1',
 '85_Gut_Sham_d7_2',
 '86_Gut_VSG_d7_2',
 'HFD_1',
 'HFD_2',
 'HFD_3',
 #'HFD_mtmg', # excluded: high ambient gene fraction, low counts and low gene numbers
 'MUC13635',
 'MUC13636',
 'MUC13643',
 'MUC13646',
 'MUC8397',
 'MUC8398',
 'MUC8400',
 'Control_2',
 'Mutant_2',
 'Control_4_FVR',
 'Mutant_4_FVR']

# Excluded because they are colon samples:
# MUC8396
# MUC8401
# MUC8402
# MUC8404


In [None]:
# Read the velocyto loom file of every sample.
# Two velocyto folders are stored under a different key than their folder name.
velo_renames = {'Control_2': 'Mutant_1', 'Control_4_FVR': 'Mutant_3_FVR'}

adatas_velo = {}
for i, sample in enumerate(samples):
    sample_name = velo_renames.get(sample, sample)
    print(sample)
    print(sample_name)
    # The first '.loom' file found in the sample's velocyto folder is used.
    loom_dir = '/'.join([base_path, velo_path, sample])
    target_file = [files for files in os.listdir(loom_dir) if files.endswith('.loom')]
    adatas_velo[sample_name] = sc.read_loom('/'.join([loom_dir, target_file[0]]))

In [None]:
# Sample names as used for the count-matrix folders below base_path1.
# TODO: verify that Control_2 really corresponds to Mutant_1 and Control_4_FVR to Mutant_3_FVR.
samples = ['105_Gut_PF1',
 '106_Gut_PF2',
 '107_Gut_VSG3',
 '108_Gut_VSG5',
 '83_Gut_Sham_d7_1',
 '84_Gut_VSG_d7_1',
 '85_Gut_Sham_d7_2',
 '86_Gut_VSG_d7_2',
 'HFD_1',
 'HFD_2',
 'HFD_3',
 #'HFD_mtmg', # excluded: high ambient gene fraction, low counts and low gene numbers
 'MUC13635',
 'MUC13636',
 'MUC13643',
 'MUC13646',
 'MUC8397',
 'MUC8398',
 'MUC8400',
 'Mutant_1',
 'Mutant_2',
 'Mutant_3_FVR',
 'Mutant_4_FVR']

# Load the doublet-annotated count matrix of every sample.
# Samples without such a file are skipped and their name is printed.
adatas = dict()
# Accept both string and boolean encodings of the ambient flag. Values that are
# not listed here (e.g. NaN) count as "not ambient"; previously they became NaN
# after mapping and were then silently converted to True.
d = {'True': True, 'False': False, True: True, False: False}
for i,sample in enumerate(samples):
    sample_dir = base_path1 + '/' + sample + outs_path
    try:
        target_file = [files for files in os.listdir(sample_dir) if files.endswith('Doublets_detected.h5ad')]
    except FileNotFoundError:
        print(sample)
        continue
    if not target_file:
        # Folder exists but holds no matching file (used to raise an uncaught IndexError).
        print(sample)
        continue
    adata_tmp = sc.read(sample_dir + '/' + target_file[0])
    adata_tmp.obs['sample'] = sample

    # Keep a per-sample copy of the ambient flag as a real boolean column; the
    # numbered suffix keeps the columns of different samples apart.
    ambient = adata_tmp.var['is_ambient']
    if ambient.dtype != bool:
        ambient = ambient.astype(object).map(d).eq(True)
    adata_tmp.var[f'is_ambient-{i}'] = ambient.to_numpy(dtype=bool)

    adata_tmp.var[f'genome-{i}']=adata_tmp.var['genome'] #to keep genome info
    adatas[sample] = adata_tmp

In [None]:
# Show the loaded count matrices.
adatas

In [None]:
# Show the loaded velocyto objects.
adatas_velo

# Add Velocyto Results

In [None]:
# Copy the velocyto layers into the matching count-matrix objects.
for key in adatas.keys():
    # Rename the velocyto cells: take the text after the first ':' (first 16
    # characters) and append '-1' - presumably to match the barcodes of the
    # count matrices; verify.
    try:
        adatas_velo[key].obs_names = [name.split(':')[1][0:16] + '-1' for name in adatas_velo[key].obs_names]
    except IndexError: #already changed barcodes
        # No ':' in the names: use the first 16 characters of the name itself.
        adatas_velo[key].obs_names = [name[0:16] + '-1' for name in adatas_velo[key].obs_names]
    adatas_velo[key].var_names_make_unique()
    # Subset the velocyto object to the cells and genes of the count matrix and take over its layers.
    try:
        adatas[key].layers = adatas_velo[key][adatas[key].obs_names,adatas[key].var_names].layers.copy()
    except KeyError:
        # Raised when cells or genes are missing in the velocyto object (the
        # message says "Value error" although a KeyError is caught); print both
        # name sets and their overlap for debugging.
        print(f' Value error in {key}.')
        print(adatas_velo[key].obs_names)
        print(adatas[key].obs_names)
        print(adatas[key].obs_names.intersection(adatas_velo[key].obs_names))

# Concatenate Samples & Filter Genes

In [None]:
# Concatenate RNA
# Outer join, because FVF-hi and FVF-low samples might otherwise lose interesting
# genes in case a min_cells filter was applied during QC.
# Cell names get '_<batch>' appended and the batch label is stored in obs['batch'].
import anndata as ad
rna = ad.concat([adatas[i] for i in adatas], merge="unique", label = 'batch', join='outer', index_unique="_")

In [None]:
# Show the cell annotations of one sample as a sanity check.
rna.obs[rna.obs['sample']=='105_Gut_PF1']

In [None]:
# Show the cell annotations of the concatenated object.
rna.obs

In [None]:
# Show a summary of the concatenated object.
rna

In [None]:
# Replace NaN/inf values in the expression matrix (the outer join can leave
# missing values for genes absent from a sample).
# np.nan_to_num does not look inside a scipy sparse matrix (the matrix is handed
# back unchanged), so for sparse data the stored values are cleaned in place.
expr = rna.X
if sparse.issparse(expr):
    np.nan_to_num(expr.data, copy=False)
else:
    expr = np.nan_to_num(expr)
rna.X = expr

In [None]:
# Ambient genes
# Per-sample flags are stored in the 'is_ambient-<n>' columns.
ambi_cols = rna.var.columns[[column.startswith('is_ambient-') for column in rna.var.columns]]

# A gene is ambient if it is flagged in at least one sample (union).
# eq(True) counts missing values (gene absent from a sample) as "not ambient"
# and, unlike an `is True` identity test, also recognises numpy booleans.
ambi_bool = rna.var[ambi_cols].eq(True).any(axis=1).to_numpy()

rna.var['is_ambient'] = ambi_bool

In [None]:
# Show the combined ambient flag per gene.
rna.var['is_ambient']

In [None]:
# Count ambient vs. non-ambient genes.
rna.var['is_ambient'].value_counts()

In [None]:
# The per-sample ambient columns are no longer needed; remove them in one call.
rna.var.drop(columns=ambi_cols, inplace=True)

In [None]:
# Free the per-sample objects now that everything is concatenated.
import gc  # this notebook never imports gc itself

del adatas
del adatas_velo
del adata_tmp
gc.collect()

In [None]:
# Compute the QC metrics on the concatenated object, including ambient genes.
# qc_metrics is not defined in this notebook - presumably provided by utils.ipynb; verify.
qc_metrics(rna, ambient=True)

In [None]:
# Save the concatenated object (intermediate version).
# The target folder base_path/out_base_name is not created here.
rna.write(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_diseased_v1.h5ad')

In [None]:
# Reload the intermediate file written by the previous cell.
adata = sc.read_h5ad(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_diseased_v1.h5ad')

In [None]:
# Per-sample genome columns are named 'genome-<n>'.
var_columns = adata.var.columns
is_genome_col = [name.startswith('genome-') for name in var_columns]
genome_cols = var_columns[is_genome_col]

In [None]:
# Use the genome of the first gene in 'genome-0' for all genes.
# .iloc[0] makes the positional access explicit; a plain [0] on a
# label-indexed Series is deprecated in recent pandas versions.
adata.var['genome'] = adata.var['genome-0'].iloc[0]

In [None]:
# The per-sample genome columns are no longer needed; remove them in one call.
adata.var.drop(columns=genome_cols, inplace=True)

In [None]:
# Show a summary of the reloaded object.
adata

In [None]:
# Show the cell annotations of the two FVR mutant samples as a sanity check.
adata.obs[adata.obs['sample'].isin([ 'Mutant_3_FVR',
 'Mutant_4_FVR'])]

In [None]:
# get_umap_leiden is not defined in this notebook - presumably provided by
# utils.ipynb and computing the UMAP plotted below; verify.
get_umap_leiden(adata)

In [None]:
# Filter genes: report the gene count before and after filtering.
print('Total number of genes: {:d}'.format(adata.n_vars))

# Min 20 cells - filters out 0 count genes
# (modifies adata in place)
sc.pp.filter_genes(adata, min_cells=20)
print('Number of genes after cell filter: {:d}'.format(adata.n_vars))

In [None]:
# UMAP overview of the QC covariates: obs column -> panel title.
qc_panels = {
    'sample': 'samples',
    'n_counts': 'total counts',
    'log_counts': 'log total counts',
    'n_genes': 'total genes',
    'log_genes': 'log total genes',
    'mt_frac': 'mt fraction',
    'rp_frac': 'rp fraction',
}
sc.pl.umap(
    adata,
    color=list(qc_panels.keys()),
    title=list(qc_panels.values()),
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=2,
    wspace=1,
)

# Doublets

In [None]:
# UMAP colored by the final doublet category and by the doublet calls.
sc.pl.umap(adata, color=['final_doublets_cat','doublet_calls'], size=10, add_outline=True, alpha=1, outline_width=(0.3, 0.0), ncols=2)

In [None]:
# Free memory held by the plots above.
import gc  # this notebook never imports gc itself

gc.collect()

In [None]:
# Number of cells per value of 'doublet_calls'
print(adata.obs['doublet_calls'].value_counts())

In [None]:
# Number of final doublets
print('Number of doublets:')
print(adata.obs['final_doublets'].value_counts())

# Percentage:
# Explicit doublet mask instead of value_counts()[1], which is ambiguous (label
# vs. position) and fails for a sample without any doublet.
# NOTE(review): assumes doublets are encoded as True/1 in 'final_doublets' - confirm.
is_doublet = adata.obs['final_doublets'].eq(1)
print('\nOverall doublet rate: ',is_doublet.sum()/len(is_doublet)*100,'%')

# unique() keeps the order of first appearance, so the output order is reproducible.
for sample in adata.obs['sample'].unique():
    in_sample = is_doublet[adata.obs['sample']==sample]
    print('\n',sample,'doublet rate: ',in_sample.sum()/len(in_sample)*100,'%')

In [None]:
# Number of cells per sample
print(adata.obs['sample'].value_counts())

# Checking the total size of the data set
adata.shape

In [None]:
# Show a summary of the object before cleaning up .obs.
adata

In [None]:
# Clean up .obs: keep only the listed columns, in this order
obs_keep = [
    'sample',
    'n_counts', 'log_counts', 'n_counts_rank',
    'n_genes', 'log_genes',
    'mt_frac', 'rp_frac', 'ambi_frac',
    'final_doublets', 'final_doublets_cat', 'doublet_calls',
    'cells_remain', 'is_paneth', 'batch',
]
adata.obs = adata.obs[obs_keep]

## Save

In [None]:
# Save the gene-filtered object with the cleaned-up .obs table.
adata.write(f'{base_path}/{out_base_name}/Dbtl_detected_velocyto_diseased.h5ad')

# Session Info

In [None]:
%%R
# Record the R session for reproducibility.
sessionInfo()

In [None]:
# Record the Python package versions for reproducibility.
sc.logging.print_versions()