In [None]:
# Mount Google Drive at /content/drive; every path used later in this notebook
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Recursively set mode 755 on the bin directory of the virtual environment stored on Drive.
!chmod 755 -R /content/drive/MyDrive/virtual_env/bin

In [None]:
import sys
# Add the virtual environment's site-packages directory to the Colab Python
# search path so that packages stored there can be imported below.
sys.path.append("/content/drive/MyDrive/virtual_env/lib/python3.10/site-packages")

In [None]:
from adjustText import adjust_text
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import os
import seaborn as sns
import sys
from glob import iglob
from scipy import sparse
from scipy.sparse import coo_matrix

In [None]:
# Configure scanpy logging and the default figure appearance for this session.
# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
# Print the scanpy header (version information) to the cell output.
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')


In [None]:
# Locations on Google Drive used throughout this notebook.
# input_dir:      AnnData files (.h5ad) produced by preprocessing; read-only here.
# output_fig_dir: where scanpy saves figures.
# output_dir:     where the exported .h5ad / .txt / .npz files are written.
# cluster_dir:    clustering results that provide the subclass / cluster labels.
input_dir="/content/drive/MyDrive/LeoDai/Private_Data/Preprocessing/chimp"
output_fig_dir="/content/drive/MyDrive/LeoDai/Private_Data/DE_genes/chimp"
output_dir="/content/drive/MyDrive/LeoDai/Private_Data/DE_genes/chimp"
cluster_dir="/content/drive/MyDrive/LeoDai/Private_Data/Clustering/Chimp"
# Point scanpy at the dedicated figure directory. It currently holds the same
# path as output_dir, so the effective location is unchanged, but figures now
# follow output_fig_dir if the two are ever separated.
sc.settings.figdir = output_fig_dir

In [None]:
# Load the full filtered dataset (chimp_all_raw_filt.h5ad) and print its summary.
# Later cells subset this object by cell barcode and export its X matrix as
# "raw counts" -- presumably X holds unnormalised counts; confirm against the
# preprocessing notebook.
adata = sc.read_h5ad(os.path.join(input_dir,"chimp_all_raw_filt.h5ad"))
print(adata)

In [None]:
# Write the variable (gene) names of `adata` to a tab-separated text file,
# one name per row, without the row index (a header line is still written).
pd.DataFrame(adata.var_names).to_csv(os.path.join(output_dir,"chimp_genes.txt"),sep="\t",index=False)

In [None]:
# Per-cell sparsity diagnostics (expectation: high zero rate, low depth).
# Assumes obs carries the `total_counts` and `n_genes_by_counts` QC columns
# (as produced by scanpy's QC metrics) -- TODO confirm in preprocessing.

# Total counts divided by n_genes_by_counts, per cell.
depth_per_gene = adata.obs.total_counts / adata.obs.n_genes_by_counts
print(depth_per_gene.head(5))
avg_depth = depth_per_gene.mean()
print(avg_depth)

# (n_vars - n_genes_by_counts) / n_vars, per cell.
zero_rate = (adata.n_vars - adata.obs.n_genes_by_counts) / adata.n_vars
print(zero_rate)
avg_zero_rate = zero_rate.mean()
print(avg_zero_rate)
# Use the vectorised Series reductions: the builtin min()/max() iterate over
# every cell in Python, which is needlessly slow for large datasets.
print(zero_rate.min())
print(zero_rate.max())

In [None]:
# Load chimp_all_norm_class.h5ad and print its summary.
# NOTE(review): `adata_norm` is not referenced by any other cell visible in
# this part of the notebook -- confirm whether this load is still needed.
adata_norm = sc.read_h5ad(os.path.join(input_dir,"chimp_all_norm_class.h5ad"))
print(adata_norm)

In [None]:
# Prepare GABAergic (inhibitory) cells for DEG analysis.
adata_gaba=sc.read_h5ad(os.path.join(cluster_dir,"Chimp_inh_level2_level3_cluster.h5ad"))
print(adata_gaba)

# Keep only the cells of `adata` that are present in the inhibitory clustering
# result. Take an explicit copy: the slice is otherwise a view of `adata`, and
# assigning to .obs on a view makes anndata copy implicitly with a warning.
adata_inh = adata[adata.obs_names.isin(adata_gaba.obs_names),:].copy()
# Series assignment aligns on the obs index (cell barcode).
adata_inh.obs["subclass"]=adata_gaba.obs["subclass"]

# Save the annotated subset as a compressed .h5ad file.
adata_inh.write_h5ad(os.path.join(output_dir,"chimp_inh_raw_filt_norm_subclass.h5ad"),compression='gzip')

# Per-cell metadata table: cell id, subclass and batch (the `sample` column).
cellinfo_inh=pd.DataFrame({"cell_id":adata_inh.obs_names,"subclass":adata_inh.obs['subclass'],"batch":adata_inh.obs["sample"]})
cellinfo_inh.to_csv(os.path.join(output_dir,"chimp_cellinfo_inh.txt"),sep="\t", index=False)

# Save the X matrix in sparse .npz format for the DE analysis.
sparse.save_npz(
     os.path.join(output_dir, "chimp_inh_raw_counts.npz"),
     sparse.csr_matrix(adata_inh.X)
     )


In [None]:
# Prepare glutamatergic (excitatory) cells for DEG analysis.
adata_glut=sc.read_h5ad(os.path.join(cluster_dir,"Chimp_exc_level2_level3_cluster.h5ad"))

# Keep only the cells of `adata` that are present in the excitatory clustering
# result. Take an explicit copy: the slice is otherwise a view of `adata`, and
# assigning to .obs on a view makes anndata copy implicitly with a warning.
adata_exc = adata[adata.obs_names.isin(adata_glut.obs_names),:].copy()
# Series assignment aligns on the obs index (cell barcode).
adata_exc.obs["subclass"]=adata_glut.obs["subclass"]

# Save the annotated subset as a compressed .h5ad file.
adata_exc.write_h5ad(os.path.join(output_dir,"chimp_exc_raw_filt_norm_subclass.h5ad"),compression='gzip')

# Per-cell metadata table: cell id, subclass and batch (the `sample` column).
cellinfo_exc=pd.DataFrame({"cell_id":adata_exc.obs_names,"subclass":adata_exc.obs['subclass'],"batch":adata_exc.obs["sample"]})
cellinfo_exc.to_csv(os.path.join(output_dir,"chimp_cellinfo_exc.txt"),sep="\t", index=False)

# Save the X matrix in sparse .npz format for the DE analysis.
sparse.save_npz(
     os.path.join(output_dir, "chimp_exc_raw_counts.npz"),
     sparse.csr_matrix(adata_exc.X)
     )

In [None]:
# Prepare non-neuronal (glial) cells for DEG analysis.
adata_glial=sc.read_h5ad(os.path.join(cluster_dir,"Chimp_glial_level2_level3_cluster.h5ad"))

# Keep only the cells of `adata` that are present in the glial clustering
# result. Take an explicit copy: the slice is otherwise a view of `adata`, and
# assigning to .obs on a view makes anndata copy implicitly with a warning.
adata_nn = adata[adata.obs_names.isin(adata_glial.obs_names),:].copy()
# Series assignments align on the obs index (cell barcode).
adata_nn.obs["subclass"]=adata_glial.obs["subclass"]
adata_nn.obs['cluster_label'] = adata_glial.obs['cluster_label']

# Save the annotated subset as a compressed .h5ad file.
adata_nn.write_h5ad(os.path.join(output_dir,"chimp_nn_raw_filt_norm_subclass.h5ad"),compression='gzip')

# Per-cell metadata table: cell id, subclass and batch (the `sample` column).
cellinfo_nn=pd.DataFrame({"cell_id":adata_nn.obs_names,"subclass":adata_nn.obs['subclass'],"batch":adata_nn.obs["sample"]})
cellinfo_nn.to_csv(os.path.join(output_dir,"chimp_cellinfo_nn.txt"),sep="\t", index=False)

# Save the X matrix in sparse .npz format for the DE analysis.
sparse.save_npz(
     os.path.join(output_dir, "chimp_nn_raw_counts.npz"),
     sparse.csr_matrix(adata_nn.X)
     )

In [None]:
# Split the non-neuronal cells into one subset per glial subclass.
# The subsets are taken from `adata_nn`, not from `adata`: `subclass` and
# `cluster_label` are attached to `adata_nn.obs` in the previous cell, and the
# export cells below read `cluster_label` from these subsets.
adata_nn_astro = adata_nn[adata_nn.obs.subclass == "Astro",:]
adata_nn_opc = adata_nn[adata_nn.obs.subclass == "OPC",:]

adata_nn_oligo = adata_nn[adata_nn.obs.subclass == "Oligo",:]

adata_nn_VLMC = adata_nn[adata_nn.obs.subclass == "VLMC",:]


In [None]:
# Export the VLMC subset: AnnData file, per-cell metadata table and X matrix.
adata_nn_VLMC.write_h5ad(
    os.path.join(output_dir, "chimp_nn_VLMC_raw_filt_norm_subclass.h5ad"),
    compression='gzip',
)

# One row per cell: id, cell type (`cluster_label`) and batch (`sample`).
cellinfo_nn_VLMC = pd.DataFrame(
    {
        "cell_id": adata_nn_VLMC.obs_names,
        "cell_type": adata_nn_VLMC.obs['cluster_label'],
        "batch": adata_nn_VLMC.obs["sample"],
    }
)
cellinfo_nn_VLMC.to_csv(
    os.path.join(output_dir, "chimp_cellinfo_nn_VLMC.txt"),
    sep="\t",
    index=False,
)

# Store X as a CSR matrix in .npz format.
sparse.save_npz(
    os.path.join(output_dir, "chimp_nn_VLMC_raw_counts.npz"),
    sparse.csr_matrix(adata_nn_VLMC.X),
)

In [None]:
# Export the astrocyte subset: AnnData file, per-cell metadata table and X matrix.
adata_nn_astro.write_h5ad(
    os.path.join(output_dir, "chimp_nn_astro_raw_filt_norm_subclass.h5ad"),
    compression='gzip',
)

# One row per cell: id, cell type (`cluster_label`) and batch (`sample`).
cellinfo_nn_astro = pd.DataFrame(
    {
        "cell_id": adata_nn_astro.obs_names,
        "cell_type": adata_nn_astro.obs['cluster_label'],
        "batch": adata_nn_astro.obs["sample"],
    }
)
cellinfo_nn_astro.to_csv(
    os.path.join(output_dir, "chimp_cellinfo_nn_astro.txt"),
    sep="\t",
    index=False,
)

# Store X as a CSR matrix in .npz format.
sparse.save_npz(
    os.path.join(output_dir, "chimp_nn_astro_raw_counts.npz"),
    sparse.csr_matrix(adata_nn_astro.X),
)

In [None]:
# Export the oligodendrocyte subset: AnnData file, per-cell metadata table and X matrix.
adata_nn_oligo.write_h5ad(
    os.path.join(output_dir, "chimp_nn_oligo_raw_filt_norm_subclass.h5ad"),
    compression='gzip',
)

# One row per cell: id, cell type (`cluster_label`) and batch (`sample`).
cellinfo_nn_oligo = pd.DataFrame(
    {
        "cell_id": adata_nn_oligo.obs_names,
        "cell_type": adata_nn_oligo.obs['cluster_label'],
        "batch": adata_nn_oligo.obs["sample"],
    }
)
cellinfo_nn_oligo.to_csv(
    os.path.join(output_dir, "chimp_cellinfo_nn_oligo.txt"),
    sep="\t",
    index=False,
)

# Store X as a CSR matrix in .npz format.
sparse.save_npz(
    os.path.join(output_dir, "chimp_nn_oligo_raw_counts.npz"),
    sparse.csr_matrix(adata_nn_oligo.X),
)

In [None]:
# Export the OPC subset: AnnData file, per-cell metadata table and X matrix.
# NOTE(review): the .h5ad name uses "OPC" while the .txt and .npz names use
# "opc"; kept as-is in case downstream scripts depend on these exact names.
adata_nn_opc.write_h5ad(
    os.path.join(output_dir, "chimp_nn_OPC_raw_filt_norm_subclass.h5ad"),
    compression='gzip',
)

# One row per cell: id, cell type (`cluster_label`) and batch (`sample`).
cellinfo_nn_opc = pd.DataFrame(
    {
        "cell_id": adata_nn_opc.obs_names,
        "cell_type": adata_nn_opc.obs['cluster_label'],
        "batch": adata_nn_opc.obs["sample"],
    }
)
cellinfo_nn_opc.to_csv(
    os.path.join(output_dir, "chimp_cellinfo_nn_opc.txt"),
    sep="\t",
    index=False,
)

# Store X as a CSR matrix in .npz format.
sparse.save_npz(
    os.path.join(output_dir, "chimp_nn_opc_raw_counts.npz"),
    sparse.csr_matrix(adata_nn_opc.X),
)

In [None]:
# Export excitatory cells with cell-type (cluster) labels, first as one
# combined dataset and then once per subclass.
# NOTE(review): this rebinds the notebook-wide name `adata`, which earlier
# cells use for the full dataset; re-running those cells after this one will
# operate on the excitatory subset instead.
adata=sc.read_h5ad(os.path.join(output_dir,"chimp_exc_raw_filt_norm_subclass.h5ad"))
adata_mask=sc.read_h5ad(os.path.join(cluster_dir,"Chimp_exc_level2_level3_cluster.h5ad"))
# Series assignment aligns on the obs index (cell barcode).
adata.obs['cluster_label'] = adata_mask.obs['cluster_label']
adata.write_h5ad(os.path.join(output_dir,"chimp_exc_cell_types_raw_filt_norm_subclass.h5ad"),compression='gzip')

# Per-cell metadata table: cell id, subclass, cell type and batch (`sample`).
cellinfo_exc_cell_types=pd.DataFrame({"cell_id":adata.obs_names,"subclass":adata.obs["subclass"],"cell_type":adata.obs['cluster_label'],"batch":adata.obs["sample"]})
cellinfo_exc_cell_types.to_csv(os.path.join(output_dir,"chimp_cellinfo_exc_cell_types.txt"),sep="\t", index=False)

# Store X as a CSR matrix in .npz format.
sparse.save_npz(
     os.path.join(output_dir, "chimp_exc_cell_types_raw_counts.npz"),
     sparse.csr_matrix(adata.X)
     )

# Repeat the three exports for every subclass. Missing labels are skipped
# because a NaN cannot be concatenated into a file name.
for subclass_name in adata.obs['subclass'].dropna().unique():
    # A label containing "/" (e.g. a layer range written like "L2/3") would be
    # read as a sub-directory by the file system; replace it so the file lands
    # in output_dir. Labels without "/" keep exactly the same file names.
    file_tag = str(subclass_name).replace("/", "-")
    adata_exc_subclass = adata[adata.obs.subclass == subclass_name,:]
    adata_exc_subclass.write_h5ad(os.path.join(output_dir,"chimp_exc_" + file_tag + "_raw_filt_norm_subclass.h5ad"),compression='gzip')
    cellinfo_exc_subclass=pd.DataFrame({"cell_id":adata_exc_subclass.obs_names,"cell_type":adata_exc_subclass.obs['cluster_label'],"batch":adata_exc_subclass.obs["sample"]})
    cellinfo_exc_subclass.to_csv(os.path.join(output_dir,"chimp_cellinfo_exc_" + file_tag + ".txt"),sep="\t", index=False)
    sparse.save_npz(
        os.path.join(output_dir, "chimp_exc_" + file_tag + "_raw_counts.npz"),
        sparse.csr_matrix(adata_exc_subclass.X)
        )

In [None]:
# Reload the excitatory subset and attach the cluster labels from the
# clustering result (Series assignment aligns on the obs index).
# NOTE(review): these three lines repeat the start of the previous cell and
# nothing in this cell uses the result -- looks like a leftover; confirm
# whether it can be removed.
adata=sc.read_h5ad(os.path.join(output_dir,"chimp_exc_raw_filt_norm_subclass.h5ad"))
adata_mask=sc.read_h5ad(os.path.join(cluster_dir,"Chimp_exc_level2_level3_cluster.h5ad"))
adata.obs['cluster_label'] = adata_mask.obs['cluster_label']

In [None]:
# Export inhibitory cells with cell-type (cluster) labels, first as one
# combined dataset and then once per subclass.
# NOTE(review): this rebinds the notebook-wide name `adata`, which earlier
# cells use for the full dataset; re-running those cells after this one will
# operate on the inhibitory subset instead.
adata=sc.read_h5ad(os.path.join(output_dir,"chimp_inh_raw_filt_norm_subclass.h5ad"))
adata_mask=sc.read_h5ad(os.path.join(cluster_dir,"Chimp_inh_level2_level3_cluster.h5ad"))
# Series assignment aligns on the obs index (cell barcode).
adata.obs['cluster_label'] = adata_mask.obs['cluster_label']
adata.write_h5ad(os.path.join(output_dir,"chimp_inh_cell_types_raw_filt_norm_subclass.h5ad"),compression='gzip')

# Per-cell metadata table: cell id, subclass, cell type and batch (`sample`).
# Named *_inh_* so it no longer overwrites the excitatory table built earlier.
cellinfo_inh_cell_types=pd.DataFrame({"cell_id":adata.obs_names,"subclass":adata.obs["subclass"],"cell_type":adata.obs['cluster_label'],"batch":adata.obs["sample"]})
cellinfo_inh_cell_types.to_csv(os.path.join(output_dir,"chimp_cellinfo_inh_cell_types.txt"),sep="\t", index=False)

# Store X as a CSR matrix in .npz format.
sparse.save_npz(
     os.path.join(output_dir, "chimp_inh_cell_types_raw_counts.npz"),
     sparse.csr_matrix(adata.X)
     )

# Repeat the three exports for every subclass. Missing labels are skipped
# because a NaN cannot be concatenated into a file name.
for subclass_name in adata.obs['subclass'].dropna().unique():
    # A label containing "/" would be read as a sub-directory by the file
    # system; replace it so the file lands in output_dir. Labels without "/"
    # keep exactly the same file names.
    file_tag = str(subclass_name).replace("/", "-")
    adata_inh_subclass = adata[adata.obs.subclass == subclass_name,:]
    adata_inh_subclass.write_h5ad(os.path.join(output_dir,"chimp_inh_" + file_tag + "_raw_filt_norm_subclass.h5ad"),compression='gzip')
    cellinfo_inh_subclass=pd.DataFrame({"cell_id":adata_inh_subclass.obs_names,"cell_type":adata_inh_subclass.obs['cluster_label'],"batch":adata_inh_subclass.obs["sample"]})
    cellinfo_inh_subclass.to_csv(os.path.join(output_dir,"chimp_cellinfo_inh_" + file_tag + ".txt"),sep="\t", index=False)
    sparse.save_npz(
        os.path.join(output_dir, "chimp_inh_" + file_tag + "_raw_counts.npz"),
        sparse.csr_matrix(adata_inh_subclass.X)
        )