In [None]:
import scanpy as sc
import math
import os
import anndata
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import harmonypy as hm
# Scanpy logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
# Print the scanpy header once so the run is traceable.
sc.logging.print_header()
# Figure defaults shared by every plot in this notebook.
sc.settings.set_figure_params(
    dpi=120,
    dpi_save=150,
    facecolor='white',
    color_map='Blues',
)

In [None]:
# Load the two per-sample AnnData files, tag each with its batch name, and
# stack them cell-wise into one object (`concat`).
full_path = 'C:/Users/julia/Project'

# Pass real path components to os.path.join; joining a single,
# already-concatenated string is a no-op.
BL_A = sc.read_h5ad(os.path.join(full_path, 'TEST', 'BL_A', 'AnnData_storage', 'BL_A.h5ad'))
BL_C = sc.read_h5ad(os.path.join(full_path, 'TEST', 'BL_C', 'AnnData_storage', 'BL_C.h5ad'))

# The batch label is set by hand; later cells read obs['batch'] to drive
# Harmony and to split the UMAP panels.
BL_A.obs['batch'] = 'BL_A'
BL_C.obs['batch'] = 'BL_C'

# join='inner' keeps only variables present in both samples;
# index_unique='_' is the delimiter used to make observation names unique.
concat = anndata.concat([BL_C, BL_A], index_unique='_', axis=0, join='inner',
                        merge=None, uns_merge=None, label=None, keys=None,
                        fill_value=None, pairwise=None)

In [None]:
# Harmony batch correction on the stacked PCA embedding, followed by
# neighbour graph, Leiden clustering and UMAP on the corrected embedding.
#
# NOTE(review): X_pca is whatever the per-sample .h5ad files carried, stacked
# by the concat cell; no PCA is computed on the combined object in this
# notebook. If each sample's PCA was fitted separately, the components are not
# a shared basis -- confirm before trusting the integration.
# NOTE: this cell overwrites obsm['X_pca'] with the corrected values, so
# re-running it feeds already-corrected PCs back into Harmony. Re-run the
# load/concat cell first.
data_mat = concat.obsm['X_pca']
meta_data = concat.obs
vars_use = ['batch']
ho = hm.run_harmony(data_mat, meta_data, vars_use,
                    theta=None, lamb=None, sigma=0.1,
                    nclust=None, tau=0, block_size=0.5,
                    max_iter_harmony=10, max_iter_kmeans=20,
                    epsilon_cluster=1e-5, epsilon_harmony=1e-4,
                    plot_convergence=True, verbose=True, reference_values=None,
                    cluster_prior=None, random_state=0)

# Orient the corrected matrix as cells x PCs by shape instead of transposing
# blindly. NOTE(review): the layout of Z_corr (PCs x cells vs cells x PCs) is
# assumed to depend on the installed harmonypy release -- confirm.
corrected = np.asarray(ho.Z_corr)
if corrected.shape[1] == concat.n_obs:
    corrected = corrected.T
if corrected.shape[0] != concat.n_obs:
    raise ValueError(
        f'Harmony output has shape {corrected.shape}, which does not match '
        f'the {concat.n_obs} cells in `concat`'
    )
adjusted_pcs = pd.DataFrame(corrected)
concat.obsm['X_pca'] = adjusted_pcs.values

# Downstream steps use the first 20 columns of the corrected embedding.
sc.pp.neighbors(concat, n_pcs=20)
sc.tl.leiden(concat, resolution=0.5)
sc.tl.umap(concat)

In [None]:
# 2x2 overview: batches, Leiden clusters, and the Leiden clusters per sample.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8), constrained_layout=True)
(ax_batches, ax_clusters), (ax_bl_a, ax_bl_c) = axs

# Per-sample subsets, taken before any plotting call.
is_bl_c = concat.obs['batch'] == 'BL_C'
is_bl_a = concat.obs['batch'] == 'BL_A'
Cocult = concat[is_bl_c]
Sample = concat[is_bl_a]

# Top row: all cells.
sc.pl.umap(concat, color="batch", title='samples', ax=ax_batches, show=False)
sc.pl.umap(concat, color="leiden", title="Leidenalg UMAP", ax=ax_clusters, show=False)
# Bottom row: one sample per panel.
sc.pl.umap(Sample, color="leiden", title="BL_A sample only", ax=ax_bl_a, show=False)
sc.pl.umap(Cocult, color="leiden", title="BL_C sample only", ax=ax_bl_c, show=False)
        

In [None]:
# Build the object used for differential expression from concat.raw.
# Fail with a clear message when .raw is absent instead of an AttributeError
# on None.
if concat.raw is None:
    raise ValueError(
        "`concat.raw` is not set, so there is nothing to build adata_DE from; "
        "check that both input samples carry a .raw layer"
    )
adata_DE = concat.raw.to_adata()
adata_DE

In [None]:
# Rank marker genes per Leiden cluster (Wilcoxon, Bonferroni-corrected) and
# record the fraction of expressing cells (pts=True).
# The former `key='wilcoxon'` argument was dropped: rank_genes_groups names its
# output slot with `key_added`, so `key=` only landed in **kwds and had no
# effect. Results therefore stay under the default key, which is also what the
# filter call below reads by default.
sc.tl.rank_genes_groups(adata_DE, 'leiden', method='wilcoxon', corr_method='bonferroni', pts=True)
# Keep genes expressed in at least 10% of the cluster's cells with fold change >= 1.
sc.tl.filter_rank_genes_groups(adata_DE, groupby='leiden', min_in_group_fraction=0.1, min_fold_change=1)
#sc.pl.rank_genes_groups(adata_DE,sharey=False)

In [None]:
# Display adata_DE again to inspect it after the ranking/filtering cell above.
adata_DE

In [None]:
# Bind a second name to the same object (no copy is made); the marker-gene
# cell below reads `adata`.
adata=adata_DE

**TODO:** make this step user-friendly — the marker genes are hardcoded for now.

In [None]:
# Keep every Leiden cluster in which enough cells express ALL marker genes.
# For each cluster two TSV files are written: the marker-positive cells and
# the marker expression of every cell in the cluster.
goede_anndatas = []                       # clusters that pass the threshold
marker_genes = ['VIM', 'FABP7', 'S100B']  # edit this list to change the markers
min_positive_pct = 20                     # minimum % of marker-positive cells per cluster

# Create the output directory up front; to_csv does not create it.
output_dir = full_path + '/dfjes'
os.makedirs(output_dir, exist_ok=True)

# Report unknown genes by name before any file is written.
missing_markers = [gene for gene in marker_genes if gene not in adata.var_names]
if missing_markers:
    raise KeyError(f'marker genes not found in adata.var_names: {missing_markers}')

for cluster in np.unique(adata.obs['leiden']):
    subcluster = adata[adata.obs['leiden'] == cluster]
    justmarkers = subcluster[:, marker_genes]
    df = justmarkers.to_df()
    # A cell is positive when every marker has expression > 0. This is driven
    # by marker_genes rather than by hard-coded column names.
    pos = df[(df[marker_genes] > 0).all(axis=1)]
    pos.to_csv(f'{output_dir}/{cluster}positive.tsv', sep='\t', encoding='utf-8')
    # NOTE: despite its name, this file holds marker expression per cell,
    # not rank_genes_groups output.
    df.to_csv(f'{output_dir}/{cluster}_rank_genes_df.tsv', sep='\t', encoding='utf-8')
    # Whole-number percentage via integer arithmetic, so float rounding cannot
    # drop an exact value by one (e.g. 29/100*100 truncating to 28).
    som = len(pos) * 100 // len(df)
    if som >= min_positive_pct:
        print('goed!')
        goede_anndatas.append(subcluster)
    else:
        print('niet zo goed!')
    print(len(pos), len(df), som)

In [None]:
# Stack the clusters that passed the marker threshold into one AnnData.
# Stop with a clear message when no cluster qualified, rather than letting
# the concatenation fail on an empty list.
if not goede_anndatas:
    raise ValueError(
        'no cluster passed the marker-gene threshold, so there is nothing to concatenate'
    )
concat2 = anndata.concat(goede_anndatas, index_unique='_', axis=0, join='inner',
                         merge=None, uns_merge=None, label=None, keys=None,
                         fill_value=None, pairwise=None)
concat2

In [None]:
# sc.pp.filter_cells(concat2, min_genes=700)
# sc.pp.filter_genes(concat2, min_cells=3)
# concat2.var['mt'] = concat2.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
# sc.pp.calculate_qc_metrics(concat2, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# concat2 = concat2[concat2.obs.pct_counts_mt < 20, :]
# sc.pp.normalize_total(concat2, target_sum=1e4) # still included highly expressed data for now
# sc.pp.log1p(concat2)
# sc.pp.highly_variable_genes(concat2, flavor='cell_ranger', subset=False) # EDIT: added this line for testing
# concat2 = concat2[:, concat2.var.highly_variable] # Actually do the slicing
# sc.pp.regress_out(concat2, ['total_counts', 'pct_counts_mt']) # regress out sequencing depth and % MT-RNA
# sc.pp.scale(concat2)
# sc.tl.pca(concat2, n_comps=50)

In [None]:
# Harmony batch correction on the marker-positive subset, followed by
# neighbour graph, Leiden clustering and UMAP on the corrected embedding.
#
# NOTE(review): no PCA is recomputed for concat2 (the preprocessing cell above
# is commented out), so X_pca here is presumably the embedding carried over
# from the full data set -- i.e. already Harmony-corrected once. Confirm; if
# so, recompute PCA on the subset before this step.
# NOTE: this cell overwrites obsm['X_pca'], so re-running it feeds the
# corrected PCs back into Harmony. Rebuild concat2 first.
data_mat = concat2.obsm['X_pca']
meta_data = concat2.obs
vars_use = ['batch']
ho = hm.run_harmony(data_mat, meta_data, vars_use,
                    theta=None, lamb=None, sigma=0.1,
                    nclust=None, tau=0, block_size=0.5,
                    max_iter_harmony=10, max_iter_kmeans=20,
                    epsilon_cluster=1e-5, epsilon_harmony=1e-4,
                    plot_convergence=True, verbose=True, reference_values=None,
                    cluster_prior=None, random_state=0)

# Orient the corrected matrix as cells x PCs by shape instead of transposing
# blindly. NOTE(review): the layout of Z_corr (PCs x cells vs cells x PCs) is
# assumed to depend on the installed harmonypy release -- confirm.
corrected = np.asarray(ho.Z_corr)
if corrected.shape[1] == concat2.n_obs:
    corrected = corrected.T
if corrected.shape[0] != concat2.n_obs:
    raise ValueError(
        f'Harmony output has shape {corrected.shape}, which does not match '
        f'the {concat2.n_obs} cells in `concat2`'
    )
adjusted_pcs = pd.DataFrame(corrected)
concat2.obsm['X_pca'] = adjusted_pcs.values

# Downstream steps use the first 20 columns of the corrected embedding.
sc.pp.neighbors(concat2, n_pcs=20)
sc.tl.leiden(concat2, resolution=0.5)
sc.tl.umap(concat2)

**Note:** the result for the marker-positive subset looks really bad — needs investigation.

In [None]:
# 2x2 overview of the marker-positive subset: batches, Leiden clusters, and
# the Leiden clusters per sample.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8), constrained_layout=True)
(ax_batches, ax_clusters), (ax_bl_a, ax_bl_c) = axs

# Per-sample subsets, taken before any plotting call.
is_bl_c = concat2.obs['batch'] == 'BL_C'
is_bl_a = concat2.obs['batch'] == 'BL_A'
Cocult = concat2[is_bl_c]
Sample = concat2[is_bl_a]

# Top row: all cells of the subset.
sc.pl.umap(concat2, color="batch", title='samples', ax=ax_batches, show=False)
sc.pl.umap(concat2, color="leiden", title="Leidenalg UMAP", ax=ax_clusters, show=False)
# Bottom row: one sample per panel.
sc.pl.umap(Sample, color="leiden", title="BL_A sample only", ax=ax_bl_a, show=False)
sc.pl.umap(Cocult, color="leiden", title="BL_C sample only", ax=ax_bl_c, show=False)

In [None]:
# clusteradatas = []
# teller = 0
# data = {}
# for cluster_label in np.unique(adata_DE.obs['leiden']):
#     cluster_adata = adata_DE[adata_DE.obs['leiden'] == cluster_label]
#     lfc_values = cluster_adata.uns['rank_genes_groups']['logfoldchanges']
#     names = cluster_adata.uns['rank_genes_groups']['names']
#     barcode_list = []
#     for barcode in cluster_adata.obs_names:
#         barcode_list.append(barcode)
#     data = {'names' : names.tolist(),
#             'lfc_values' : lfc_values.tolist()}
#     print(data)
    #print(pd.DataFrame(data, index=list(range(len(names)))))
    #df.to_csv(full_path+f'/dfjes/{cluster_label}_rank_genes_df.tsv', sep='\t', encoding='utf-8')
    # for cell_index, lfc in zip(cluster_adata.obs_names, lfc_values):
    #     # for all cells
    #     print(f"Cell {cell_index}: LFC = {lfc}")


In [None]:
# adata = adata_DE
# # pd.DataFrame(adata.uns['rank_genes_groups']['names'])
# result = adata.uns['rank_genes_groups']
# groups = result['names'].dtype.names
# pd.DataFrame(
#     {group + '_' + key[:1]: result[key][group]
#     for group in groups for key in ['names', 'logfoldchanges', 'pts']}).head(5)

In [None]:
    # # clusteradatas.append(cluster_adata)
    # # cluster_adata = cluster_adata[cluster_adata[: , 'VIM'].X > 0, :]
    # # cluster_adata = cluster_adata[cluster_adata[: , 'S100B'].X > 0, :]
    # # cluster_adata = cluster_adata[cluster_adata[: , 'GFAP'].X > 0, :]
    # # teller+=cluster_adata.n_obs
    # result = cluster_adata.uns['rank_genes_groups']['logfoldchanges'][cluster_label]
    # cluster_adata.layers['result'] = result
    # # groups = result['names'].dtype.names
    # # a = pd.DataFrame(
    # # {group + '_' + key[:1]: result[key][group]
    # # for group in groups for key in ['names', 'logfoldchanges', 'pts']})
    # #a.to_csv(full_path+f'/dfjes/{cluster_label}_rank_genes_df.tsv', sep='\t', encoding='utf-8')