<a href="https://colab.research.google.com/github/Ken-Lau-Lab/single-cell-lectures/blob/main/section01_scRNAseq_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## __Section 1:__ Introduction to scRNA-seq

February 9, 2022

In [None]:
!git clone git://github.com/Ken-Lau-Lab/single-cell-lectures  # for Colab users

In [None]:
!pip install scanpy

In [None]:
!pip install leidenalg

In [None]:
import scanpy as sc; sc.set_figure_params(dpi=200)
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

In [None]:
# Location of the example dataset; keep the line that matches your environment.
h5ad_path = 'single-cell-lectures/data/GSM3305227_Tumor_1_Full.h5ad'  # for Colab users, path to repository
#h5ad_path = 'data/GSM3305227_Tumor_1_Full.h5ad'  # for local users, read from data directory

# Load the .h5ad file into an AnnData object
adata = sc.read_h5ad(h5ad_path)

In [None]:
# Save the un-manipulated state to the .raw attribute of the AnnData object.
# Later cells that pass use_raw=True read from this snapshot.
adata.raw = adata

In [None]:
adata.X  # working array of counts data, shown here before any normalization is applied

In [None]:
# Normalize, transform, and scale counts.
# NOTE: this cell overwrites adata.X, so it is not idempotent. Re-load the data
# before running it a second time, or the transforms will be applied twice.
sc.pp.normalize_total(adata)   # total-count normalization with scanpy defaults
adata.X = np.arcsinh(adata.X)  # arcsinh returns a new array, so an extra .copy() is unnecessary
sc.pp.scale(adata)             # scaling with scanpy defaults

In [None]:
# Display the working array again, now after normalization, arcsinh transform, and scaling
adata.X

In [None]:
# Names of the variables (features) in this dataset
adata.var_names

In [None]:
# Flag variables whose name starts with 'mt-' and store the boolean mask in .var
adata.var['Mitochondrial'] = adata.var.index.str.startswith('mt-')

# Compute QC metrics from the .raw values, passing the flag as a QC variable;
# inplace=True stores the results on adata instead of returning them
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['Mitochondrial'],
    use_raw=True,
    inplace=True,
)

In [None]:
# Principal component analysis of adata.X; random_state is fixed for reproducibility
sc.pp.pca(adata,random_state=0)

In [None]:
# We have found it effective to set K to the square root of the total number of cells (adata.n_obs)
neighborhood_k = np.sqrt(adata.n_obs).astype(int)
# Build the KNN graph from distances in PCA space (use_rep='X_pca')
sc.pp.neighbors(adata,n_neighbors=neighborhood_k,use_rep='X_pca',random_state=0)

In [None]:
# Leiden clustering at resolution 0.5 (per the scanpy docs, higher resolution values lead to more clusters).
# This step may take a while.
sc.tl.leiden(adata,resolution=0.5,random_state=0)

In [None]:
# Differential gene expression testing between Leiden clusters.
# Uses the .raw values with a Wilcoxon test and reports 200 genes per group.
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden',
    method='wilcoxon',
    n_genes=200,
    use_raw=True,
)

In [None]:
# Dot plot of the top 5 ranked genes per group, drawn from adata.X rather than .raw (use_raw=False)
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, use_raw=False)

In [None]:
# Heatmap of the top 5 ranked genes per group with gene labels, drawn from adata.X rather than .raw (use_raw=False)
sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, show_gene_labels=True, use_raw=False)

In [None]:
# Violin plots of per-cell QC columns, one panel each
# (these columns are expected to come from the calculate_qc_metrics call earlier in the notebook)
qc_columns = ['total_counts', 'total_counts_Mitochondrial', 'pct_counts_Mitochondrial']
sc.pl.violin(adata, keys=qc_columns, jitter=0.4, multi_panel=True)

In [None]:
# Show the differential gene expression results stored by sc.tl.rank_genes_groups
sc.pl.rank_genes_groups(adata)

In [None]:
# Violin plots of three selected genes across all cells, using adata.X (use_raw=False)
sc.pl.violin(adata, keys=['Krt20','Myc','Lgr5'], use_raw=False)

In [None]:
# Same three genes, now split by Leiden cluster (groupby='leiden'), using adata.X (use_raw=False)
sc.pl.violin(adata, keys=['Krt20','Myc','Lgr5'], groupby='leiden', use_raw=False)

In [None]:
# Display the AnnData object to see what has been stored on it so far
adata

In [None]:
# First 10 rows of the ranked gene-name table stored by sc.tl.rank_genes_groups
ranked_names = adata.uns['rank_genes_groups']['names']
pd.DataFrame(ranked_names).head(10)

In [None]:
#Subsetting and slicing

In [None]:
#Cell-wise / Obs-wise subsetting

In [None]:
adata[:10]  # first 10 cells (positions 0 through 9)

In [None]:
adata[10:100]  # cells at positions 10 through 99 (the slice end is exclusive)

In [None]:
# Choose 500 distinct cell positions from the range 0-999.
# Sampling without replacement avoids selecting the same cell twice, and the
# fixed seed makes the subset reproducible across runs.
random_idx = np.random.default_rng(0).choice(1000, size=500, replace=False)
# Subset the AnnData object to the cells at the chosen positions
adata[random_idx]

In [None]:
#Subsetting feature/variable metadata

In [None]:
# Look at just the rows of .var flagged as mitochondrial
mito_mask = adata.var['Mitochondrial']
adata.var.loc[mito_mask]

In [None]:
# Get the values of one variable/feature ('mt-Atp6') across all cells
adata.obs_vector('mt-Atp6')