# Session 2: Intro to scRNA-seq preprocessing and QC with Python

Exercise 1: Load a small count matrix, evaluate the dimensions and sparsity

Exercise 2: Calculate QC metrics on a real dataset

Exercise 3: Assess whether a z-score threshold is appropriate for `percent_mito`

Exercise 4: Evaluate the effect of different QC thresholds on a real dataset

In [None]:
### Load libraries

import re # regular expressions

import scanpy as sc # single-cell analysis
import pandas as pd # data manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # plotting
import seaborn as sns # visualization

## Exercise 1: Load and evaluate a dummy count matrix

1. Load the (dummy) count matrix.
2. Evaluate the dimensions: how many drops/samples? How many genes/features?
3. How sparse is the matrix?

In [None]:
### Load the count matrix

# NOTE: the count matrix was formatted as a dense pandas DataFrame for this example,
# and has labeled indices / columns.
# In practice, the count matrix is stored as a sparse matrix.

df_counts = pd.read_csv("./files_for_session2/ex1_dummy_count_matrix.csv", index_col=0)
df_counts

In [None]:
### Evaluate the dimensions: how many drops/cells/samples? How many genes/features?

print(f"Number of cells: {df_counts.shape[0]}")
print(f"Number of genes: {df_counts.shape[1]}")

In [None]:
### How sparse is the df_counts matrix?

# 1. How many zero entries are present for each gene?
zero_entries_per_gene = (df_counts == 0).astype(int).sum(axis=0)
print(zero_entries_per_gene)

# 2. How many zero entries are present in total?
print(f"Total number of zero entries in the dummy matrix: {zero_entries_per_gene.sum()}")
print(f"Percentage of zero entries in the dummy matrix: {zero_entries_per_gene.sum() / df_counts.size * 100:.2f}%")

In [None]:
### Plot a histogram of the number of zero entries per gene

fig, ax = plt.subplots(figsize=(6, 4))

sns.histplot(zero_entries_per_gene, bins=30, kde=True, ax=ax)

ax.set_title("Histogram of zero entries per gene")
ax.set_xlabel("Number of zero entries per gene")
ax.set_ylabel("Number of genes")

## Exercise 2: Calculate QC metrics on the count matrix

An anndata object containing two samples (rd10-M, wt-M) is provided.

Load the count matrix to answer the following questions:

1. How many cells are present in the matrix? How many genes?
2. How many transcripts were detected per cell? How many genes?
3. How many cells is each gene expressed in?

**Advanced**    
1. Calculate the percent of mitochondrial transcripts expressed in each cell:    
    a. Define mitochondrial genes by using a regular expression      
    b. Calculate the % of transcripts that belong to a mitochondrial gene

In [None]:
### Load dataset using scanpy
# NOTE(review): Exercise 1 reads its input from "./files_for_session2/", whereas this
# cell reads from "./session2/" — confirm that this is the intended directory.
adata = sc.read_h5ad("./session2/ex2_adata_object.h5ad")

# The three attributes below are the main parts of the object; uncomment a line to inspect it.

# The count matrix can be accessed via the `.X` attribute
# adata.X

# The .obs attribute contains metadata about each cell
# adata.obs

# The .var attribute contains metadata about each gene
# adata.var

In [None]:
### How many cells and genes are present in the object?

n_cells, n_genes = adata.shape
print(f"Number of cells: {n_cells}, Number of genes: {n_genes}")

# Do they make sense to you, based on your knowledge of 10X, for two samples?

In [None]:
### How many transcripts / genes were detected per cell?

n_transcripts_per_cell = np. ... (adata.X, axis=1).A1

# What's the average number of transcripts detected per cell? Fill in the missing numpy operation.
print(f"Average number of transcripts per cell: {np. ... (n_transcripts_per_cell)}")

# How would you find out how many genes were detected in each cell, using the count matrix?
# Think of what it means for a gene to be detected in a cell,
# and how to express that notion in programming terms.

n_genes_per_cell = np.sum( (...), axis=1).A1

# What's the average number of genes detected per cell?
print(f"Average number of genes detected per cell: {np. ... (n_genes_per_cell)}")

In [None]:
### How many cells express each gene?

# You will use a similar trick as the n_genes calculation,
# but in a different dimension. 

n_cells_per_gene = np.sum( (...), axis= ... ).A1
print(f"Average number of cells expressing a gene: {np. ... (n_cells_per_gene)}")

### Update the Anndata object with the calculated QC metrics

In [None]:
### Add n_transcripts_per_cell, n_genes_per_cell, and n_cells_per_gene to the adata object as QC metrics associated with each cell/gene.

# For this, we add our calculated arrays to the .obs and .var attributes of the adata object.

adata.obs["n_transcripts_per_cell"] = n_transcripts_per_cell
adata.obs["n_genes_per_cell"] = n_genes_per_cell
adata.var["n_cells_per_gene"] = n_cells_per_gene

### Plot the calculated QC values using matplotlib or seaborn.

Generate histograms and violin plots.

For cellwise measures, can you plot each sample's values separately? (You can try the hue option.)

In [None]:
### Plot histograms of QC metrics, grouped by sample_id

# Use the dataframe found in the adata.obs attribute to access both the QC metrics and sample_id.

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

sns.histplot(adata.obs, x= ... , hue= ... , bins=50, ax=axes[0]) # the hue attribute allows us to print two colored histograms on top of each other
sns.histplot(adata.obs, x= ... , hue= ... , bins=50, ax=axes[1])
sns.histplot(adata.var, x= ... , bins=50, ax=axes[2])

axes[0].set_title("Transcripts per cell")
axes[1].set_title("Genes per cell")
axes[2].set_title("Cells per gene")
plt.tight_layout()
plt.show()

In [None]:
### Plot violin plots

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

sns.violinplot(data=adata.obs, x= ... , y= ... , cut=0, ax=axes[0]) # cut=0 prevents the violin from extending beyond the data range
sns.violinplot(data=adata.obs, x= ... , y= ... ,  cut=0, ax=axes[1])
sns.violinplot(data=adata.var, y= ... , cut=0, ax=axes[2])

axes[0].set_title("Transcripts per cell")
axes[1].set_title("Genes per cell")
axes[2].set_title("Cells per gene")

plt.tight_layout()
plt.show()

### Annotating mitochondrial genes using regex

A regular expression (regex) is a sequence of characters that forms a pattern, which can be used to match strings. Regexes are written as strings, and as such are enclosed in quotes. The characters within a regex are termed "tokens".

For example, the regex string 'hi' would match:

- 'hi'
- 'ghibli'
- 'hiya'
- 'mahi mahi'

but would not match 'hahaha', etc. 

We can use special characters in regex strings to go beyond exact text matches. These characters are incorporated into the regex string to perform their functions:

- '^hi': the matched string should start with 'hi': 'hi' and 'hiya' would match, but 'ghibli', 'hahaha' and 'mahi mahi' would not.
- 'hi$': the matched string should end with 'hi': 'hi' and 'mahi mahi' would match, but 'ghibli', 'hahaha' and 'hiya' would not.
- '.': represents any character.
- 'hi*': the asterisk modifies the token preceding it, allowing that token to repeat zero or more times. Here the pattern is an 'h' followed by zero or more 'i' characters, so a lone 'h' is already enough for a match - this would match all five strings above, including 'hahaha'.

We can use regex patterns to find the indices of genes that we are looking for. If we wanted to find the gene *CNGB3*, we would use the regex string '^CNGB3' -- if we are interested in finding the indices for all CNG channel members, we can simply use the string '^CNG'. Be careful, though: if there is another gene whose name starts the same way, say, 'CNGPH', it would also match!

You can build and test Python regexes at [regex101](https://regex101.com/).

**The bottom section is already filled, except for the regex to detect mouse mitochondrial transcripts.**

In [None]:
# Write down the regex that would match mitochondrial genes IN THE MOUSE.
mito_regex = ...
# Gene names as strings, so that the `.str` accessor can be used on them.
gene_names = adata.var_names.astype(str)

# How many mitochondrial genes are there in the matrix?
# `.str.match` tests the regex against the start of every gene name in one vectorized call.
# (A list comprehension over gene_names using `re.match` would be an equivalent alternative.)
adata.var["is_mito"] = gene_names.str.match(mito_regex)
mito_genes = adata.var[adata.var["is_mito"]].index
print(f"Number of mitochondrial genes: {len(mito_genes)}")

# Build a boolean mask (one True/False per gene) marking the mitochondrial genes,
# so that you can select their columns in the count matrix and check their counts.
mito_indices = gene_names.str.match(mito_regex)

# Ensure that the mito_indices are boolean (True/False) values.
assert mito_indices.dtype == "bool", "mito_indices should be boolean values"

# Ensure that the number of mito_genes matches the number of True values in mito_indices.
assert len(mito_genes) == mito_indices.sum(), "Mismatch in number of mitochondrial genes and True values in mito_indices"

In [None]:
# Now that you found the positions in the count matrix where mito genes are,
# You can calculate the total counts of mitochondrial genes per cell.
mito_counts_per_cell = np.sum(adata.X[ ... ], axis=1).A1 # How would you select the mito genes from the count matrix?

# and you can also calculate the percentage of mitochondrial counts per cell.
mito_percentage_per_cell = ( ... / n_transcripts_per_cell) * 100

# What is the average percentage of mitochondrial transcripts per cell?
average_mito_percentage = np. ... (mito_percentage_per_cell)
print(f"Average percentage of mitochondrial transcripts per cell: {average_mito_percentage:.2f}%")

# How many cells have more than 5% of their transcripts coming from mitochondrial genes?
high_mito_cells = np. ... (mito_percentage_per_cell > 5)
print(f"Number of cells with more than 5% mitochondrial transcripts: {high_mito_cells}")

# What is the percentage of cells with more than 5% mitochondrial transcripts?
percentage_high_mito_cells = ( ... / ... ) * 100
print(f"Percentage of cells with more than 5% mitochondrial transcripts: {percentage_high_mito_cells:.2f}%")

In [None]:
# Add percent_mito to the adata object
adata.obs["percent_mito"] = mito_percentage_per_cell

# Plot the percentage of mitochondrial transcripts per cell
# Plot two samples separately on the x axis
plt.figure(figsize=(6, 4))
sns.boxplot(data=adata.obs, x= ... , y= ...)
plt.title("Percentage of mitochondrial transcripts per cell")
plt.xlabel("Sample ID")
plt.ylabel("Percentage of mitochondrial transcripts")
plt.tight_layout()

In [None]:
### Plot QC scatterplots
# Plot 1: n_transcripts_per_cell vs. n_genes_per_cell
# Plot 2: n_genes_per_cell vs. percent_mito

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
axes = axes.flatten() # guarantees a 1D array of axes; a 1x2 grid is already 1D, but this keeps indexing uniform if the grid changes

# Make a dataframe with the adata.obs for plotting (a copy, so adata.obs itself is not modified)
df_obs = adata.obs.copy()

# Randomly select half of the cells for plotting; the fixed random_state makes the selection reproducible
# NOTE(review): the Exercise 3 histogram cell also reads df_obs, so it sees only this
# 50% subsample rather than all cells — confirm that this is intended.
df_obs = df_obs.sample(frac=0.5, random_state=42)

# Scatterplot of n_transcripts_per_cell vs. n_genes_per_cell, colored by sample
sns.scatterplot(df_obs, x="n_transcripts_per_cell", y="n_genes_per_cell", hue="sample_id", alpha=0.5, s=10, ax=axes[0])
axes[0].set_xlabel("Number of transcripts per cell")
axes[0].set_ylabel("Number of genes per cell")

# Scatterplot of n_genes_per_cell vs. percent_mito, colored by sample
sns.scatterplot(df_obs, x="n_genes_per_cell", y="percent_mito", hue="sample_id", alpha=0.5, s=10, ax=axes[1])
axes[1].set_xlabel("Number of genes per cell")
axes[1].set_ylabel("Percentage of mitochondrial transcripts")


## Exercise 3: Could we use a z-score to threshold `percent_mito`?

We discussed in class that z-scores can be used to determine how "normal" a given value is compared to the rest of the distribution. For normally distributed data, the z-score is calculated by standardizing each value $x$ with the mean $\mu$ and the standard deviation $\sigma$ of the distribution:

$z = \frac{x - \mu}{\sigma}$

Can we use this adaptable threshold on `percent_mito`? We will visualize the distribution to answer this question.


In [None]:
# Plot a histogram of percent_mito to visualize the distribution. Is it normal?
# Plot separately for each sample_id, so that the two distributions do not confound each other's assessment.

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(data=df_obs[ ... ], x= ... , hue="sample_id", bins=50, kde=True, ax=axes[0]) # the kde parameter adds a line that represents the density estimate of the distribution
axes[0].set_title("Histogram of percent_mito, wt")

sns.histplot(data=df_obs[ ... ], x= ... , hue="sample_id", bins=50, kde=True, ax=axes[1])
axes[1].set_title("Histogram of percent_mito, rd10")

## Exercise 4: Test hard vs. adaptable thresholds on percent_mito

Many papers use hardcoded, manual thresholds on QC metrics such as percent_mito, either opting for the field convention for the sample type (e.g. 5%) or visually inspecting QC scatterplots. This method is easy to use and reproduce, but the one-size-fits-all approach is likely to fail beyond small experiments with few samples of a similar makeup (e.g. generating an atlas of healthy mouse retinas at the same age).

Another option, which may be beneficial in complex experiments, is to use an adaptable threshold learned from the sample itself. Because QC metrics often violate common distributional assumptions, we usually use a robust statistical approach that does not rely on the data being normally distributed. This method allows us to set a bespoke threshold for every sample and account for differences due to biology or batch, e.g., a higher mitochondrial threshold for degenerating samples. However, it still assumes a single underlying distribution for the QC metric, which may be unsuitable for highly heterogeneous samples that instead exhibit, e.g., a bimodal distribution.

For adaptable thresholds, we often use median absolute deviation (MAD) based thresholding, as the median is a more robust statistic than the mean. This involves calculating a threshold as follows:

$MAD = median( |X_{i} - median(X)| )$    
$threshold = median \pm multiplier * MAD * 1.4826$

The factor of 1.4826 rescales the MAD so that, for normally distributed data, it estimates the standard deviation; a distance of `multiplier` scaled MADs from the median is therefore comparable to a z-score. Often, a multiplier of 3 is used to mimic the commonly used z-score threshold of 3. Note that, as with the z-score, the "harshness" of an adaptable threshold can be adjusted by changing the multiplier.

A mix of hard and adaptable thresholds can be used - setting hard thresholds on the number of genes and reads (taking splicing into account) while setting an adaptable mitochondrial read threshold is the default setting of `sc_pipe_iob`.

In this exercise, we will set manual thresholds for `n_transcripts_per_cell` and `n_genes_per_cell`, while testing both a manual threshold and a MAD-based threshold for `percent_mito`. For the first two QC metrics, we are interested in a lower bound, to filter out empty droplets and dying cells with low values. For the latter, we want to construct an upper bound, above which we filter out cells that may be dying.

In [None]:
### Test some hard thresholds for QC
# Lines will adapt to the thresholds you set

n_transcripts_per_cell_thr = ...
n_genes_per_cell_thr = ...
percent_mito_thr = ...

### Plot QC scatterplots with thresholds

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
axes = axes.flatten() # makes the axes a 1D array for easier indexing

# Make a dataframe with the adata.obs for plotting
df_obs = adata.obs.copy()

# Randomly select half of the cells for plotting - easier to visualize
df_obs = df_obs.sample(frac=0.5, random_state=42)

# Scatterplot of n_transcripts_per_cell vs. n_genes_per_cell
sns. ... (df_obs, x="n_transcripts_per_cell", y="n_genes_per_cell", hue="sample_id", alpha=0.5, s=10, ax=axes[0])
axes[0].set_xlabel("Number of transcripts per cell")
axes[0].set_ylabel("Number of genes per cell")

# Add horizontal and vertical lines for the thresholds
axes[0]. ... (n_genes_per_cell_thr, color="red", linestyle="--", label="Genes threshold")
axes[0]. ... (n_transcripts_per_cell_thr, color="blue", linestyle="--", label="Transcripts threshold")
axes[0].legend()

# scatterplot of n_genes_per_cell vs. percent_mito
sns. ... (df_obs, x="n_genes_per_cell", y="percent_mito", hue="sample_id", alpha=0.5, s=10, ax=axes[1])
axes[1].set_xlabel("Number of genes per cell")
axes[1].set_ylabel("Percentage of mitochondrial transcripts")

# Add horizontal line for the percent_mito threshold
axes[1]. ... (percent_mito_thr, color="blue", linestyle="--", label="Mitochondrial threshold")
axes[1]. ... (n_genes_per_cell_thr, color="red", linestyle="--", label="Genes threshold")
axes[1].legend()

fig.suptitle("QC scatterplots with manual thresholds")
plt.tight_layout()
plt.show()

In [None]:
### Count how many cells pass the thresholds

# Create a boolean mask for cells that pass the thresholds
mask = (
    (adata.obs["n_transcripts_per_cell"] > n_transcripts_per_cell_thr) & # transcripts larger than the threshold
    ( ...) & # and n_genes larger than the threshold
    ( ... ) # and percent_mito smaller than the threshold
)

# Apply the mask to the adata.obs dataframe to create a new column indicating whether each cell passes the QC thresholds.
adata.obs["pass_qc_hard_thresholds"] = ...

# Count the number of cells that pass the QC thresholds, separately for each sample
adata.obs.groupby( ... )[ ... ].value_counts()

In [None]:
### Compute adaptable threshold for mitochondrial gene filter based on MAD
# Should be calculated for each sample separately
def compute_mad_threshold(data, multiplier=3, upper=True):
    """Compute a MAD-based threshold for a given data series.

    The threshold is ``median ± multiplier * MAD * 1.4826``, where MAD is the
    median absolute deviation of ``data`` from its median.

    Args:
        data: 1-D numeric array-like (e.g. a numpy array or pandas Series).
        multiplier: How many scaled MADs away from the median the threshold lies.
        upper: If True, return the upper bound (median + ...). If False, return
            the lower bound (median - ...), clamped so it is never negative.

    Returns:
        The threshold value.
    """
    # 1.4826 rescales the MAD so that it estimates the standard deviation
    # for normally distributed data.
    mad_scale = 1.4826

    median = np.median(data)
    mad = np.median(np.abs(data - median))

    if upper:
        # For upper threshold, we add the scaled MAD to the median
        return median + multiplier * mad * mad_scale

    # For lower threshold, we subtract the scaled MAD
    thr = median - multiplier * mad * mad_scale
    if thr < 0:
        # Ensure the threshold is not negative since we deal with nonnegative data
        thr = 0.0
    return thr

# Compute thresholds separately for each sample, and store them in a dictionary
dict_thresholds = {}

# how to loop through each sample_id in the adata.obs dataframe?
for sample_id in adata.obs["sample_id"]. ... ():
    # subset the data for the current sample
    sample_data = adata.obs[adata.obs["sample_id"] == sample_id]

    # compute the MAD threshold for percent_mito
    percent_mito_thr_mad = compute_mad_threshold(sample_data["percent_mito"])

    # store the threshold in the dictionary
    dict_thresholds[sample_id] = percent_mito_thr_mad

# Print the computed thresholds
for k, v in dict_thresholds.items():
    print(f"Sample {k}:")
    print(f"  percent_mito threshold: {v:.2f}")

In [None]:
### Plot the QC scatterplots with MAD threshold for percent_mito
# One panel per sample, each drawn with that sample's own cutoff


def _draw_mad_panel(panel, sample_name, panel_title, cutoff):
    """Scatter one sample's cells on `panel`, mark its MAD cutoff, and return the legend."""
    cells = df_obs[df_obs["sample_id"] == sample_name]
    sns.scatterplot(cells, x="n_genes_per_cell", y="percent_mito", hue="sample_id", alpha=0.5, s=10, ax=panel)

    panel.axhline(cutoff, color="blue", linestyle="--", label="MAD threshold")
    panel.set_title(panel_title)
    panel.set_xlabel("Number of genes per cell")
    panel.set_ylabel("Percentage of mitochondrial transcripts")
    return panel.legend()


fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: the wt sample (the threshold name is kept; the mask cell that follows reads it)
percent_mito_thr_wt = dict_thresholds["wt"]
_draw_mad_panel(axes[0], "wt", "WT Sample", percent_mito_thr_wt)

# Right panel: the rd10 sample
percent_mito_thr_rd10 = dict_thresholds["rd10"]
_draw_mad_panel(axes[1], "rd10", "rd10 Sample", percent_mito_thr_rd10)

In [None]:
### Count how many cells pass the hard n_transcripts + n_genes thresholds alongside the MAD-based threshold for percent_mito
# Each mask is evaluated over ALL cells using ONE sample's percent_mito threshold,
# so a mask is only meaningful for the rows that belong to that sample.
mask_mad_wt = (
    (adata.obs["n_transcripts_per_cell"] > n_transcripts_per_cell_thr) &
    (adata.obs["n_genes_per_cell"] > n_genes_per_cell_thr) &
    (adata.obs["percent_mito"] < percent_mito_thr_wt)
)
mask_mad_rd10 = (
    (adata.obs["n_transcripts_per_cell"] > n_transcripts_per_cell_thr) &
    (adata.obs["n_genes_per_cell"] > n_genes_per_cell_thr) &
    (adata.obs["percent_mito"] < percent_mito_thr_rd10)
)

# Add the masks to the adata object
adata.obs["pass_qc_mad_thresholds_wt"] = mask_mad_wt
adata.obs["pass_qc_mad_thresholds_rd10"] = mask_mad_rd10

# Count the number of cells that pass the QC thresholds, separately for each sample
print("Number of cells passing QC thresholds (hard + MAD):")
print("Sample wt:")
# Filter to wt first (its mask is only valid for wt cells), then count how many entries pass or fail the QC thresholds
print(adata.obs[adata.obs["sample_id"] == "wt"][ ... ]. ... ())
print("Sample rd10:")
# Same for rd10, using the rd10-specific mask column
print(adata.obs[adata.obs["sample_id"] == "rd10"][ ... ]. ... ())