# Tutorial 'QC, topographical analysis and segmentation-free analysis of Spot-based transcriptomics data'

This is the fast lane of the tutorial. Just start a session and click 'run the whole notebook' on the top to start executing.

UMAP calculation takes a few minutes, which we can use in the main notebook to get familiar with the general exploratory workflow.

In [None]:
# Notebook setup: enable IPython's autoreload extension, widen the notebook
# container, initialise plotly's offline notebook mode and extend sys.path.

%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
# widens the screen:
display(HTML("<style>.container { width:95% !important; }</style>"))
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)  
import sys
import os

# Append the directory two levels above the working directory to the import
# path (presumably so the local `plankton` package can be imported - confirm).
sys.path.append(os.path.join(os.path.abspath('.'),'../..'))

In [None]:
# imports, define a handy figure function:

import plankton.plankton as pl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc

def figure(width=8, height=8):
    """Open a new matplotlib figure of ``width`` x ``height`` inches.

    The new figure becomes the current pyplot figure; nothing is returned.
    """
    size_inches = (width, height)
    plt.figure(figsize=size_inches)


In [None]:
# Load the molecule table and the background stain image.

# Factor applied to the raw pixel coordinates further below.
um_p_px = 0.325

coordinates = pd.read_csv('../data/in_situ_sequencing/S2T1_pcw6.csv')

# Average over the last image axis (presumably the colour channels), flip the
# sign and rescale the result linearly to the range [0, 1].
bg = -plt.imread('../data/in_situ_sequencing/background.jpg').mean(-1)
bg_low, bg_high = bg.min(), bg.max()
bg = (bg - bg_low) / (bg_high - bg_low)

bg_map = pl.PixelMap(
    pixel_data=bg,
    cmap='Greys',
    px_p_um=0.504 / um_p_px,
)

# The raw array is not needed any more once it is wrapped in the PixelMap.
del bg, bg_low, bg_high

In [None]:
# Optional synthetic noise: uniformly distributed random molecules that are
# appended to the measured data. With n_noise_points = 0 nothing is added.
n_noise_points = 0

# Random positions inside the bounding box spanned by the measured molecules.
max_xy = np.array([coordinates.Global_x_pos.values.max(),
                   coordinates.Global_y_pos.values.max()])
rands = np.random.rand(n_noise_points, 2) * max_xy

x = np.hstack([coordinates.Global_x_pos.values, rands[:, 0]])
y = np.hstack([coordinates.Global_y_pos.values, rands[:, 1]])

# Draw the noise labels uniformly from the distinct gene names. Indexing the
# per-molecule gene array with an index bounded by the number of distinct
# genes would only ever pick labels from its first few rows.
unique_genes = np.asarray(coordinates.Gene.unique())
rand_genes = unique_genes[np.random.randint(len(unique_genes), size=rands.shape[0])]
g = np.hstack([coordinates.Gene.values, rand_genes])

In [None]:
# Build the SpatialData object: coordinates are multiplied by um_p_px and the
# background stain map is registered under the key 'DAPI'.
sdata = pl.SpatialData(
    x_coordinates=x * um_p_px,
    y_coordinates=y * um_p_px,
    genes=g,
    pixel_maps={'DAPI': bg_map},
)

In [None]:
# (disabled) flag the synthetic noise molecules appended at the end:
# sdata['noise']=False
# sdata.loc[len(sdata)-rands.shape[0]:,'noise']=True

# Index sdata with the result of stats.progressive_sample, then crop the
# selection to a spatial window.
sample_index = sdata.stats.progressive_sample(1.05)
sdata = sdata[sample_index].spatial[100:2800, 1000:]
del sample_index

In [None]:
# Parameterization for data cleaning/artefact removal:

# bw 100: segmentation

import time

knn_neighbors = 100
bandwidth = 30
n_neighbors = 60
metric = 'euclidean'
min_dist = 0.02
random_state = 0
zero_weight = 0.0
n_ica_components = 14

# Update the neighbourhood graph; the return value is discarded.
_ = sdata.graph.update_knn(n_neighbors=knn_neighbors)

# Time the UMAP run; `t` holds the elapsed seconds afterwards and is reused
# in the title of the overview plot.
umap_start = time.perf_counter()
sdata.graph.run_umap(
    bandwidth=bandwidth,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    random_state=random_state,
    zero_weight=zero_weight,
    cutoff=n_ica_components,
)
t = time.perf_counter() - umap_start
print(t)

### Saving:

The save function saves the sdata object with all its dependencies:

In [None]:
# Save the calculated umap:

# sdata.save('tutorial-umap.pl')
# sdata = pl.load('tutorial-umap.pl')

### UMAP overview:

We can plot the UMAP embedding and the in-situ coordinates at the same time with identical coloration:

In [None]:
# Tissue map and UMAP embedding drawn together, both coloured by
# sdata.graph.umap_0; the title records the run parameters and run time.
figure(20, 15)
sdata.graph.map_and_umap(alpha=0.3, c=sdata.graph.umap_0)

run_label = f"nbrs:{n_neighbors}-cutoff:{n_ica_components}-bw:{bandwidth}-t:{int(t)}"
plt.suptitle(run_label)

In [None]:
# Deliberate stop: the always-failing assertion raises AssertionError and so
# interrupts a 'run the whole notebook' pass at this cell (presumably because
# the following cells require manual, interactive input - confirm).
assert False

# stop execution for now : P

In [None]:
# Show `sdata.var` as the cell output.
# NOTE(review): if `var` is a method rather than a property this only prints
# the bound method - confirm against the SpatialData API.
sdata.var

### Defining clusters in embedding space:

We can use an interactive javascript-based function to investigate a combined representation of the data in physical space and in local-gene-composition space.

Use the function to define: 

- A background noise signal
- Two major epithelial clusters (called epithelial_central/epithelial_distal)
- The mesothelium that encapsulates the sample (called mesothelial)
- Two clusters of mesothelium encapsulating the ducts (called mesothelial_duct_central/mesothelial_duct_distal)
- Two clusters of the remaining tissue (called submucosa_central/submucosa_distal)

In [None]:
# Use the javascript renderer to define tissue clusters.
# The names must be spelled exactly as listed, because later cells access the
# clusters under these names (e.g. sdata.bg_noise, sdata.epithelial_central):

# -bg_noise
# -epithelial_central
# -epithelial_distal
# -mesothelial
# -mesothelial_duct_central
# -mesothelial_duct_distal
# -submucosa_central
# -submucosa_distal


sdata.graph.umap_js()

In [None]:
# Show `sdata.graph.umap` as the cell output.
sdata.graph.umap

In [None]:
# sdata.save('tissue_clusters.pl')
# sdata = pl.load('tissue_clusters.pl')

### Plot the cleaned signal:

The tilde operator inverts the boolean 'bg_noise' column.

In [None]:
# Scatter every molecule that is not flagged as background noise.
figure()

signal_mask = ~sdata.bg_noise
sdata[signal_mask].scatter()

Environment noise can actually be an interesting QC indicator: In a way, it introduces a lower bound of the noise we can expect to exist throughout the sample.

A way to visualize the noise composition is to plot it against the observed signal:

In [None]:
from plankton.utils import hbar_compare

# Compare the stats of the non-noise molecules ('signal') with those of the
# molecules flagged as background noise ('noise').
figure(6, 10)

signal_stats = sdata[~sdata.bg_noise].stats
noise_stats = sdata[sdata.bg_noise].stats
hbar_compare(signal_stats, noise_stats, ['signal', 'noise'])

In [None]:
# Ratio of noise counts to signal counts, shown as a sorted bar chart.
noise_counts = sdata[sdata.bg_noise].counts
signal_counts = sdata[~sdata.bg_noise].counts
count_ratios_noise = noise_counts / signal_counts

figure(25, 5)
count_ratios_noise.sort_values().plot.bar()
plt.title('Noise-to-count ratio:')

The next plot shows the six 'noisiest' genes plotted on top of DAPI:

In [None]:
# Scatter the molecules of the six genes with the largest noise-to-count ratio.
figure(10, 10)

noisiest_genes = count_ratios_noise.sort_values()[-6:].index
sdata[sdata.g.isin(noisiest_genes)].scatter(legend=True, alpha=0.6)
plt.title('Six genes with the highest noise-to-count-ratio:')

### Remove noise

The background noise signal is sliced from sdata and the column 'bg_noise' is removed:

In [None]:
# Keep only the molecules not flagged as noise, then drop the now redundant
# 'bg_noise' column.
sdata = sdata[~sdata.bg_noise].drop('bg_noise')

In [None]:
# Show the cleaned data set as the cell output.
sdata

In [None]:
# Collect the tissue-label columns and combine them into a single 'tissues'
# column. No figure is opened here because nothing is plotted in this cell.
# NOTE(review): assumes the first four columns are not tissue labels and that
# every remaining column is one - confirm against the column layout.
tissues = sdata.columns[4:]
sdata['tissues'] = sdata.unite_columns(tissues)

# Show the updated data set as the cell output.
sdata

In [None]:
# Colour map for the tissue classes. pyplot's get_cmap is used because
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9.
accent = plt.get_cmap('nipy_spectral')

figure(9, 9)
out = sdata.scatter(c=sdata.tissues.cat.codes, cmap=accent)

# Empty scatter artists serve as legend handles, one per tissue.
# NOTE(review): the handle colours are sampled from 0.2..1 of the colour map;
# verify that this matches the colours the category codes are mapped to.
handlers = [plt.scatter([], [], color=accent(f))
            for f in np.linspace(0.2, 1, len(tissues))]

plt.legend(handlers, tissues)


## Analyse DEGs

We can analyse the differential expression/local molecule occurrence between different assigned contexts:

### Ducts

Try to identify genes that are expressed in all ductal cells:

In [None]:
from plankton.stats import mor_normalize

# Boolean mask selecting every molecule assigned to a duct-related cluster.
mask_ducts = (
    sdata.epithelial_central
    | sdata.epithelial_distal
    | sdata.mesothelial_duct_central
    | sdata.mesothelial_duct_distal
)

# Median-of-ratios normalisation: ductal molecules first, all others second.
# NOTE(review): other cells pass `.stats` objects to mor_normalize while this
# one passes the sliced data sets - confirm that both forms are accepted.
ductal, non_ductal = sdata[mask_ducts], sdata[~mask_ducts]
c1, c2 = mor_normalize(ductal, non_ductal)

figure(25, 5)

# log2 ratio ductal/other per gene, sorted ascending, drawn as a bar chart.
lfc_ducts = np.log2(c1 / c2).sort_values()
lfc_ducts.plot.bar()

plt.title('log-count-ratios for <- other vs. ductal -> genes')

This plot answers the question: *Which genes are overrepresented in the ductal regions of tissue compared to the submucosa*?

We can plot the five most significant genes:

In [None]:
# Scatter the five genes with the highest ductal log-count-ratio.
figure(8, 8)

top_duct_genes = lfc_ducts[-5:].index
sdata[sdata.g.isin(top_duct_genes)].scatter(alpha=0.5, legend=True)

plt.title('Five prominent ductal indicator genes:')

### Distinguish epithelial/mesothelial cells in the ducts:

The ducts are composed of epithelial and mesothelial cells. We can attempt to identify markers for both:

In [None]:
# Normalise the epithelial molecules against the duct-mesothelial molecules.
epithelial = sdata[sdata.epithelial_central | sdata.epithelial_distal]
duct_mesothelial = sdata[sdata.mesothelial_duct_central | sdata.mesothelial_duct_distal]
c1, c2 = mor_normalize(epithelial, duct_mesothelial)

figure(25, 5)

# log2 ratio epithelial/mesothelial per gene, sorted ascending.
lfc_epi = np.log2(c1 / c2).sort_values()
lfc_epi.plot.bar()

plt.title('log-count-ratios  <- mesothelial vs. epithelial -> ')

In [None]:
# Marker genes within the ducts: lfc_epi is sorted ascending, so its first
# entries lean towards the mesothelial side and its last entries towards the
# epithelial side (see the title of the log-count-ratio bar chart).

figure(8, 8)
sdata[sdata.g.isin(lfc_epi[:5].index)].scatter(alpha=0.2, color='lime')
sdata[sdata.g.isin(lfc_epi[-5:].index)].scatter(alpha=0.2, color='magenta')

# Empty scatter artists are used as legend handles. The labels follow the
# plotting order above: lime = lowest ratios = mesothel,
# magenta = highest ratios = epithel.
handlers = [plt.scatter([], [], color=c) for c in ['lime', 'magenta']]
plt.legend(handlers, ['mesothel', 'epithel'])

plt.title('The five most prominent indicators of epithel/mesothel:')

### Combine plots:

This plot shows the ductal-specificity, with bars sorted by the values recovered from the analysis for epithelial <-> mesothelial specificity.

The question is:

Is there a common ductal marker that is not specific for any individual ductal cell?

In [None]:
# Ductal log-count-ratios, reordered to follow the gene order of lfc_epi.
figure(25, 5)

epi_sorted_genes = lfc_epi.index
lfc_ducts[epi_sorted_genes].plot.bar()

plt.title('log-count-ratios for <- other vs. ductal -> genes, sorted by epithelial affinity')

'DNAH12' seems out of line as an epithelial marker without ductal affinity:

In [None]:
# Scatter only the molecules whose gene label is 'DNAH12'.
sdata[sdata.g=='DNAH12'].scatter()

... but this is probably just an effect of the low overall count and some inherent noise...

## Distinguishing epithelial cells:

We have discovered multiple epithelial tissue contexts. What are the genes that determine the individual clusters:

In [None]:
from plankton.stats import mor_normalize

# Normalise the stats of the central epithelial cluster against the distal one.
central_stats = sdata[sdata.epithelial_central].stats
distal_stats = sdata[sdata.epithelial_distal].stats
c1, c2 = mor_normalize(central_stats, distal_stats)

figure(25, 5)

# log2 ratio central/distal per gene, sorted ascending.
lfc_epi_01 = np.log2(c1 / c2).sort_values()
lfc_epi_01.plot.bar()
plt.title('log-count-ratios  <- epithelial_distal vs. epithelial_central -> ')

In [None]:
# Marker genes of the two epithelial clusters, restricted to molecules that
# belong to either epithelial cluster.

figure(20, 8)

epithelial_mask = sdata.epithelial_central | sdata.epithelial_distal

# Left panel: the five genes with the lowest central/distal ratio.
plt.subplot(121)
plt.title('distal:')
sdata[sdata.g.isin(lfc_epi_01[:5].index) & epithelial_mask].scatter(alpha=0.2, legend=True)

# Right panel: the five genes with the highest central/distal ratio.
plt.subplot(122)
plt.title('central')
sdata[sdata.g.isin(lfc_epi_01[-5:].index) & epithelial_mask].scatter(alpha=0.2, legend=True)

In [None]:
# Overlay of distal (lime) and central (magenta) epithelial marker genes,
# restricted to molecules that belong to either epithelial cluster.
# NOTE(review): five distal genes but ten central genes are drawn - confirm
# that the asymmetry is intentional.

figure(8, 8)

epithelial_mask = sdata.epithelial_central | sdata.epithelial_distal
distal_genes = lfc_epi_01[:5].index
central_genes = lfc_epi_01[-10:].index

sdata[sdata.g.isin(distal_genes) & epithelial_mask].scatter(alpha=0.1, color='lime')

sdata[sdata.g.isin(central_genes) & epithelial_mask].scatter(alpha=0.1, color='magenta')




### 'Inner' vs. 'peripheral' clusters

We discovered a number of clusters that are expressed more in the center of the sample, and we can try to identify the respective genes:

In [None]:
# One scatter panel per central/distal tissue cluster, laid out in one row.
panel_labels = [
    'epithelial_central',
    'epithelial_distal',
    'mesothelial_duct_central',
    'mesothelial_duct_distal',
    'submucosa_central',
    'submucosa_distal',
]

figure(25, 5)

# The number of columns follows the label list, so no panel slot stays empty.
for position, label in enumerate(panel_labels, start=1):
    plt.subplot(1, len(panel_labels), position)

    plt.title(label)
    sdata[sdata[label]].scatter()

In [None]:
# Group the tissue classes by their position in the sample.
central_tissues = [
    'epithelial_central',
    'mesothelial_duct_central',
    'submucosa_central',
]
distal_tissues = [
    'epithelial_distal',
    'mesothelial_duct_distal',
    'submucosa_distal',
]

# Molecule-wise masks derived from the united 'tissues' column.
mask_central = sdata.tissues.isin(central_tissues)
mask_distal = sdata.tissues.isin(distal_tissues)

# Distal molecules in red, central molecules drawn on top in yellow.
figure()
for group_mask, colour in ((mask_distal, 'red'), (mask_central, 'yellow')):
    sdata[group_mask].scatter(color=colour, alpha=0.2)
plt.title('Central vs. distal molecules:')

We can perform differential expression analysis again to identify genes with central<->distal affinity:

In [None]:

# Normalise the stats of the central molecules against the distal ones.
central_stats = sdata[mask_central].stats
distal_stats = sdata[mask_distal].stats
c1, c2 = mor_normalize(central_stats, distal_stats)

figure(25, 5)

# log2 ratio central/distal per gene, sorted ascending.
lfc_central = np.log2(c1 / c2).sort_values()
lfc_central.plot.bar()

plt.title('log-count-ratios  <- distal vs. central -> ')

In [None]:
# Two panels: the seven genes with the highest (central) and the seven genes
# with the lowest (distal) central/distal log-count-ratio.
figure(16, 16)

central_genes = lfc_central[-7:].index
distal_genes = lfc_central[:7].index

plt.subplot(121)
plt.title('central genes:')
sdata[sdata.g.isin(central_genes)].scatter(alpha=0.2, legend=True)

plt.subplot(122)
plt.title('distal genes:')
sdata[sdata.g.isin(distal_genes)].scatter(alpha=0.2, legend=True)

It is difficult to say whether this effect is biological or not. My estimate would be that it is a technical artefact, especially given the fact that it coincides with the expression-free 'hole' in the center of the tissue. Also, it is visible across indicator genes for different tissues at the same time. It might be due to differences in thickness of the frozen tissue sample, which resulted in different degrees of permeation by the fluids utilized in the ISS protocol. Anyway, this shows that claims concerning the tissue composition along the central-distal axis should be formulated relatively conservatively.

# unsupervised approach using SSAM-denovo

Our lab has a segmentation-free, unsupervised celltype calling algorithm called SSAM (Park, 2020).

It uses KDE to model an expression density of molecules in space, samples from the estimated density and creates clusters of cell types, which can then be projected onto a pixelized map of the sample:

In [None]:
from plankton.utils import localmax_sampling,ssam

# Build a signature matrix through local-max sampling, one column per gene.
sampled_signatures = localmax_sampling(sdata, n_clusters=11, bandwidth=6)
signatures = pd.DataFrame(sampled_signatures, columns=sdata.genes)

# Work on a plain array: shift so the global minimum is zero, scale every
# column by its maximum, then scale every row by its maximum.
signatures = np.array(signatures)
signatures -= signatures.min()
signatures /= signatures.max(0)
signatures /= signatures.max(1)[:, None]

ct_map = ssam(
    sdata,
    signatures=signatures,
    kernel_bandwidth=6,
    threshold_exp=0.9,
)

In [None]:
# Draw the map returned by ssam() without interpolation between pixels.
figure(9,9)
ct_map.imshow(cmap='nipy_spectral',interpolation='none')

We can verify the clusters by plotting them against our self-defined tissue subclasses:

In [None]:
figure(8, 45)

# Count profile of every manually defined tissue class.
tissue_counts = [sdata[sdata[tissue]].counts for tissue in tissues]

# Loop invariants, computed once: the number of plotted clusters and the map
# value underneath every molecule.
# NOTE(review): range(n_plotted) stops at max-1; if the map labels run from 0
# to max inclusive, the last cluster is never shown - confirm.
n_plotted = ct_map.data.max()
molecule_clusters = ct_map.get_value(sdata.x, sdata.y)

for i in range(n_plotted):

    # Right column: spatial footprint of cluster i.
    plt.subplot(n_plotted, 2, i * 2 + 2)
    plt.title('cluster' + str(i))
    (ct_map == i).imshow(cmap='Reds')

    # Left column: correlation of the counts of the molecules located on
    # cluster i with the counts of every tissue class.
    plt.subplot(n_plotted, 2, i * 2 + 1)
    sampled = sdata[molecule_clusters == i]

    correlations = pd.Series(
        [sampled.counts.corr(counts) for counts in tissue_counts],
        index=tissues,
    )
    correlations.plot.barh()
    
    


We can also compare the identified cluster with the signatures obtained in the 'supervised' analysis:

In [None]:
# Signatures of the 'supervised' analysis, one row per signature.
# NOTE(review): 'signatures.csv' is not written anywhere in the visible part
# of this notebook - it has to exist in the working directory.
signatures = pd.read_csv('signatures.csv', index_col=0)


figure(15, 75)

# Loop invariants, computed once: the number of plotted clusters and the map
# value underneath every molecule.
# NOTE(review): range(n_plotted) stops at max-1; if the map labels run from 0
# to max inclusive, the last cluster is never shown - confirm.
n_plotted = ct_map.data.max()
molecule_clusters = ct_map.get_value(sdata.x, sdata.y)

for i in range(n_plotted):

    # Right column: spatial footprint of cluster i.
    plt.subplot(n_plotted, 2, i * 2 + 2)
    plt.title('cluster' + str(i))
    (ct_map == i).imshow(cmap='Reds')

    # Left column: correlation of the counts of the molecules located on
    # cluster i with every supervised signature. The row label is unused
    # here, so it is not bound to the loop index name.
    plt.subplot(n_plotted, 2, i * 2 + 1)
    sampled = sdata[molecule_clusters == i]

    correlations = pd.Series(
        [sampled.counts.corr(signature) for _, signature in signatures.iterrows()],
        index=signatures.index,
    )
    correlations.plot.barh()
    
    
