# Prep Environment

In [None]:
#Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc
import anndata as ad

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns
from statannot import add_stat_annotation

In [None]:
# Switch the process working directory to the lab's shared Python folder,
# so any relative path used after this point resolves from there.
os.chdir('/hpc/group/goldsteinlab/Python')

In [None]:
# Let pandas print up to 100 rows and 100 columns before it truncates a DataFrame repr
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [None]:
# Default font size for every matplotlib figure drawn in this notebook
plt.rcParams['font.size'] = 15

In [2]:
# Render matplotlib figures directly in the notebook output cells
%matplotlib inline

# Read data

In [None]:
# Load the bulk RNA-seq atlas table from CSV.
# The first column carries the gene names; it is promoted to the index in the next cell.
full_counts = pd.read_csv(
    filepath_or_buffer='/hpc/group/goldsteinlab/R/Working_directory/bulk_seq_atlas/1000hvgs_604sample_bulk_RNA_Seq_atlas.csv',
)

In [None]:
# Preview the loaded table (the repr is truncated according to the pandas display options set earlier)
full_counts

In [None]:
# Promote the first column (gene names) to the row index.
# The column itself is still present afterwards; it is removed in the following cell.
genes = full_counts[full_counts.columns[0]]
full_counts = full_counts.set_index(genes)

In [None]:
# The gene names already live in the index, so remove the leftover first column
full_counts = full_counts.drop(full_counts.columns[0], axis=1)

In [None]:
# Flip the table so that the former columns become rows and the gene index becomes the columns
full_counts = full_counts.transpose()

In [None]:
# Wrap the transposed counts DataFrame in an AnnData object (the frame becomes adata.X)
adata = ad.AnnData(full_counts)

In [None]:
# Load the per-sample condition table; its 'x' column is copied into adata.obs further down
conds = pd.read_csv(
    filepath_or_buffer='/hpc/group/goldsteinlab/R/Working_directory/bulk_seq_atlas/updated_sample_order_604_samples.csv',
)

In [None]:
# Give the condition table the same row labels as the counts frame that adata was
# built from, so its columns can be assigned into adata.obs by label.
# NOTE(review): the labels are attached purely by position — this assumes the rows of
# the conditions CSV are in the same order as the rows of full_counts; confirm.
row_indices = full_counts.index  # index of the transposed counts frame used to create adata
conds = conds.set_index(row_indices)

In [None]:
# Attach the condition labels (column 'x') to the observations as 'tumor_type'
adata.obs['tumor_type'] = conds.loc[:, 'x']

In [None]:
# check to make sure obs match up
# (visual inspection: each row label of adata.obs should carry the expected tumor_type)
adata.obs

# Generate clustering and UMAPs

In [None]:
# Principal component analysis

# Number of principal components to compute; the same value is passed to the
# neighbour-graph construction in the next cell.
pcs = 60

# Compute the PCA, then plot the variance ratio of each component on a log scale
sc.pp.pca(adata, n_comps=pcs)
sc.pl.pca_variance_ratio(adata, n_pcs=pcs, log=True)

In [None]:
# Neighbour graph on the first `pcs` principal components, 10 neighbours per sample
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=pcs)

# Leiden clustering at resolution 0.8; the labels are read back later from adata.obs['leiden']
sc.tl.leiden(adata, resolution=0.8)

# UMAP embedding used by the plots below
sc.tl.umap(adata)

In [None]:
# UMAP coloured by Leiden cluster, with the cluster labels drawn on the embedding
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(
    adata,
    color="leiden",
    ax=ax,
    legend_loc='on data',
    legend_fontoutline=2,
    frameon=False,
    save=False,
)

In [None]:
# UMAP coloured by tumor type on a larger canvas, labels drawn on the embedding
fig, ax = plt.subplots(figsize=(10, 10))
sc.pl.umap(
    adata,
    color="tumor_type",
    ax=ax,
    legend_loc='on data',
    legend_fontoutline=2,
    frameon=False,
    save=False,
)

# Perform DE

In [None]:
# Find marker genes for the Leiden clusters (Wilcoxon test).
#
# The call below reads the expression values from adata.layers['counts'], but no
# earlier cell creates that layer: adata is built from a plain matrix, so only
# adata.X exists and a fresh "Restart & Run All" fails here with a KeyError.
# Register the loaded matrix under that name when it is missing; an existing
# 'counts' layer is left untouched.
if 'counts' not in adata.layers:
    adata.layers['counts'] = adata.X.copy()

sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', layer='counts', use_raw=False)

# Top 50 ranked gene names, one column per cluster
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(50)

# ONB, SCLC, PNEC in leiden cluster 1
# neuroblastoma in leiden cluster 18
# Medulloblastoma cluster 9
# glioma cluster 5

In [None]:
# Find drivers of neural vs. neuroendocrine tumor clusters

# Leiden cluster -> coarse tumor class. Clusters that are not listed keep their
# cluster number as label.
cluster_to_category = {
    '9': 'neural tumors',
    '5': 'neural tumors',
    '18': 'neural tumors',
    '1': 'neuroendocrine tumors',
}

# The Leiden labels are stored as a categorical column, and Series.replace on a
# categorical is deprecated (pandas >= 2.2) when the replacement changes the set
# of categories. Relabel on plain strings in a single pass instead and convert
# the result back to a categorical.
# The guard keeps the cell re-runnable: 'leiden' is removed at the end of this
# cell, so without it a second execution would stop with a KeyError.
if 'leiden' in adata.obs:
    adata.obs['new_category'] = (
        adata.obs['leiden']
        .astype(str)
        .replace(cluster_to_category)
        .astype('category')
    )

# Visual check of the merged categories
sc.pl.umap(adata, color='new_category', legend_loc='on data')

# Remove the original per-cluster labels (same end state as before); note that
# the Leiden clustering cell has to be re-run to get them back.
if 'leiden' in adata.obs:
    adata.obs = adata.obs.drop(columns='leiden')

In [None]:
# Exclude the samples annotated as 'Normal OE' before the tumor-vs-tumor comparison
bad_clust = ['Normal OE']

# Boolean mask over the observations: True for every sample whose tumor_type is not excluded
to_keep = ~adata.obs['tumor_type'].isin(bad_clust)

# Materialise the retained samples as an independent AnnData object
adata_hvg_no_OE = adata[to_keep, :].copy()

In [None]:
# Differential expression: 'neural tumors' against 'neuroendocrine tumors' (Wilcoxon test).
#
# The call reads the expression values from the 'counts' layer, which no earlier
# cell creates (the AnnData object is built from a plain matrix, so only X exists);
# without it the call fails with a KeyError. Register the matrix under that name
# when it is missing; an existing 'counts' layer is left untouched.
if 'counts' not in adata_hvg_no_OE.layers:
    adata_hvg_no_OE.layers['counts'] = adata_hvg_no_OE.X.copy()

sc.tl.rank_genes_groups(
    adata_hvg_no_OE,
    'new_category',
    groups=['neural tumors'],
    reference="neuroendocrine tumors",
    method='wilcoxon',
    layer='counts',
    use_raw=False,
)

# Top 50 ranked gene names for the comparison
pd.DataFrame(adata_hvg_no_OE.uns['rank_genes_groups']['names']).head(50)

In [None]:
# Collect the ranked gene names and adjusted p-values of every tested group into
# one table, with columns named '<group>_names' and '<group>_pvals_adj'.
result = adata_hvg_no_OE.uns['rank_genes_groups']
groups = result['names'].dtype.names  # field names of the structured 'names' array, one per group
df_neural_tumors = pd.DataFrame(
    {
        f'{group}_{key[:10]}': result[key][group]
        for group in groups
        for key in ('names', 'pvals_adj')
    }
)