# Imports

In [None]:
from os import path, listdir
from copy import deepcopy
import stlearn as st
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import matplotlib.pyplot as plt
import cv2

%load_ext autoreload
%autoreload 2

from scanpy_stlearn_loaders import StlearnLoader

# Load Data 

In [None]:
# Name of the dataset directory under /data; the load and image cells build their paths from it.
dataset_name = 'Visium_Mouse_Olfactory_Bulb'

## Genes-Spots Expression Values 

In [None]:
# Load the sample stored in /data/<dataset_name> through the project loader.
obj = StlearnLoader().load_local_visum(path=path.join('/', 'data', dataset_name),
                                      count_file='filtered_feature_bc_matrix.h5')
# Dense copy of the expression matrix; the unpacking below names its rows spots and its columns genes.
# NOTE(review): toarray() suggests obj.X is sparse -- the dense copy may be large for big samples.
x = obj.X.toarray()
n_spots, n_genes = x.shape
print(f'# spots: {n_spots} | # genes: {n_genes}')
# Bare expression so the notebook renders the object summary as the cell output.
obj

## Spatial Images 

In [None]:
# Low resolution images: show every .jpg/.png found in the dataset's `spatial` directory.
spatial_dir = path.join('/', 'data', dataset_name, 'spatial')
# Sort the listing so the figures appear in a stable order (listdir order is arbitrary).
for img in sorted(k for k in listdir(spatial_dir) if k.endswith(('.jpg', '.png'))):
    # cv2.imread returns None instead of raising when a file cannot be read or decoded.
    bgr_image = cv2.imread(path.join(spatial_dir, img))
    if bgr_image is None:
        print(f'Could not read image: {img}')
        continue
    plt.title(img)
    # OpenCV loads colour images in BGR channel order while matplotlib expects RGB;
    # convert first so the tissue is drawn with its real colours.
    plt.imshow(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))
    plt.show()

print('Spots:')
display(obj.obs.head())

## Full Resolution Image 

In [None]:
# Full resolution image

from PIL import Image
image_path = path.join('/', 'data', dataset_name, 'image.tif')
im = Image.open(image_path)
# Decode the image into a NumPy array.
image_array = np.array(im)
# NOTE(review): Image.show() normally hands the image to an external viewer rather than
# rendering inline -- confirm it displays anything in this notebook environment.
im.show()

## Kmeans Clustering

In [None]:
# Normalise and log-transform the expression values. Neither call captures a return
# value, so obj is presumably modified in place -- the order of the steps below matters.
st.pp.normalize_total(obj)
st.pp.log1p(obj)
# run PCA for gene expression data
st.em.run_pca(obj, n_comps=50)
# K-means clustering with 7 clusters on the "X_pca" data; labels are stored under
# "X_pca_kmeans", the same key the plot below reads.
st.tl.clustering.kmeans(obj, n_clusters=7, use_data="X_pca", key_added="X_pca_kmeans")
st.pl.cluster_plot(obj, use_label="X_pca_kmeans")

# Filter Genes
- min_cells = Keep genes that are expressed (non-zero) in at least X spots (number of spots with x[spot, gene] > 0 is >= X)
- min_counts = Keep genes whose total expression over all spots is at least X (sum(x[:, gene]) >= X)

In [None]:
# Reload the dataset so the gene filters below start from a fresh object: the K-means
# section above ran normalize_total/log1p on the previous obj (presumably in place).
obj = StlearnLoader().load_local_visum(path=path.join('/', 'data', dataset_name),
                                      count_file='filtered_feature_bc_matrix.h5')

In [None]:
# Keep genes with at least 15% non zero spots
# n_spots is the spot count from the initial load cell; the reload above does not reassign it.
min_cells = int(n_spots * 0.15)
print(f'Keep genes with at least {min_cells} non zero spots')
st.pp.filter_genes(obj, min_cells=min_cells)
# Refresh the dense matrix and its dimensions from the filtered object.
x = obj.X.toarray()
n_spots, n_genes = x.shape
print(f'# spots: {n_spots} | # genes: {n_genes}')

In [None]:
# Second filter: threshold on each gene's expression summed over all spots.
# NOTE(review): if obj.X holds integer counts, every gene that passed the min_cells filter
# already totals at least min_cells, so this threshold may remove nothing -- confirm.
min_counts = 10
print(f'Keep genes with total expression of at least {min_counts} over all spots')
st.pp.filter_genes(obj, min_counts=min_counts)
# Refresh the dense matrix and its dimensions from the filtered object.
x = obj.X.toarray()
n_spots, n_genes = x.shape
print(f'# spots: {n_spots} | # genes: {n_genes}')

## Filtered Kmeans Clustering

In [None]:
# Same normalise -> log1p -> PCA -> K-means -> plot sequence as the unfiltered run,
# now applied to the gene-filtered obj. The calls capture no return value, so obj is
# presumably modified in place; x was densified earlier and is not refreshed here.
st.pp.normalize_total(obj)
st.pp.log1p(obj)
# run PCA for gene expression data
st.em.run_pca(obj, n_comps=50)
# K-means clustering with 7 clusters on the "X_pca" data; labels are stored under
# "X_pca_kmeans", the same key the plot below reads.
st.tl.clustering.kmeans(obj, n_clusters=7, use_data="X_pca", key_added="X_pca_kmeans")
st.pl.cluster_plot(obj, use_label="X_pca_kmeans")

# EDA 

## Genes - Spots Info

In [None]:
# Preview the first rows of the per-spot (obs) and per-gene (var) tables, each under its caption.
for caption, table in (('Spots information', obj.obs), ('Genes information', obj.var)):
    print(caption)
    display(table.head())

In [None]:
# The statements printed here hold only when the reported number of unique values is 1;
# the count is printed next to each claim so it can be checked by eye.
print(f'All the spots in the matrix are in the tissue: "in_tissue" # unique values = {obj.obs.in_tissue.nunique()}')
print(f'All the genes are from the same genome: "genome" # unique values = {obj.var.genome.nunique()}')

## Expression over spots 

In [None]:
def total_exp_over_spots(x):
    """Plot a histogram of the total expression of every spot.

    Args:
        x: Expression matrix indexed as x[spot, gene]; it is summed along
            axis 1, giving one total per row.
    """
    per_spot_totals = x.sum(axis=1)

    plt.figure(figsize=(15, 6))
    sns.histplot(per_spot_totals)
    plt.xlabel('Total Expression')
    plt.ylabel('# of spots')
    plt.title('Total expression in spots (Histogram)')
    plt.show()

In [None]:
# Histogram of per-spot totals for the dense matrix x produced by the gene-filter cells.
total_exp_over_spots(x)

In [None]:
def spots_sparsity(x):
    """Plot a histogram of per-spot sparsity, expressed in percent.

    The sparsity of a spot is the percentage (0-100) of entries in its row
    of ``x`` that are exactly zero.

    Args:
        x: 2-D NumPy array of expression values indexed as x[spot, gene].
    """
    plt.figure(figsize=(15, 6))
    n_genes_total = x.shape[1]
    zero_genes_per_spot = n_genes_total - np.count_nonzero(x, axis=1)
    # Scale the fraction to 0-100 so the plotted values match the "% Sparsity" axis label.
    spots_exp_sparsity = 100.0 * zero_genes_per_spot / n_genes_total
    sns.histplot(spots_exp_sparsity)
    plt.title('Spots Expression Sparsity (Histogram)')
    plt.xlabel('% Sparsity (% of genes with zero expression)')
    plt.ylabel('# of spots')
    plt.show()

In [None]:
# Histogram of per-spot sparsity for the dense matrix x produced by the gene-filter cells.
spots_sparsity(x)

## Expression over genes

In [None]:
def total_exp_over_genes(x, logx=True, logy=True):
    """Plot a histogram of the total expression of every gene.

    Args:
        x: Expression matrix indexed as x[spot, gene]; it is summed along
            axis 0, giving one total per column.
        logx: When true, switch the x axis to a log scale.
        logy: When true, switch the y axis to a log scale.
    """
    per_gene_totals = x.sum(axis=0)

    plt.figure(figsize=(15, 6))
    sns.histplot(per_gene_totals)
    plt.xlabel('Total Expression')
    plt.ylabel('# of genes')
    plt.title('Total expression in genes (Histogram)')

    # The axis scales are switched only after the histogram has been drawn.
    # NOTE(review): the bins are therefore presumably computed on the linear values;
    # seaborn's `log_scale` option may be what is wanted here -- confirm.
    for enabled, set_scale in ((logy, plt.yscale), (logx, plt.xscale)):
        if enabled:
            set_scale('log')
    plt.show()

In [None]:
# Histogram of per-gene totals for x, with both axes on a log scale (the function defaults).
total_exp_over_genes(x)

In [None]:
def genes_sparsity(x):
    """Plot a histogram of per-gene sparsity, expressed in percent.

    The sparsity of a gene is the percentage (0-100) of entries in its
    column of ``x`` that are exactly zero.

    Args:
        x: 2-D NumPy array of expression values indexed as x[spot, gene].
    """
    plt.figure(figsize=(15, 6))
    n_spots_total = x.shape[0]
    zero_spots_per_gene = n_spots_total - np.count_nonzero(x, axis=0)
    # Scale the fraction to 0-100 so the plotted values match the "% Sparsity" axis label.
    genes_exp_sparsity = 100.0 * zero_spots_per_gene / n_spots_total
    sns.histplot(genes_exp_sparsity)
    plt.title('Genes Expression Sparsity (Histogram)')
    plt.xlabel('% Sparsity (% of spots with zero expression)')
    plt.ylabel('# of genes')
    plt.show()

In [None]:
# Histogram of per-gene sparsity for the dense matrix x produced by the gene-filter cells.
genes_sparsity(x)

## Unstructured

In [None]:
# Per-sample entry of obj.uns['spatial']; the next cell reads its 'scalefactors' and 'images'.
# Indexed with dataset_name (currently the same string) instead of repeating the literal,
# so the notebook keeps a single place where the dataset is named.
# NOTE(review): assumes the loader registers the sample under the dataset directory name -- confirm.
obj_unstructred = obj.uns['spatial'][dataset_name]

In [None]:
# Print the stored scale factors, then draw the two stored image resolutions side by side.
print(f"Scale factors: \n{obj_unstructred['scalefactors']}")
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(19, 8))
for axis, resolution, caption in ((ax1, 'hires', 'Hires Image'), (ax2, 'lowres', 'Lowres Image')):
    axis.imshow(obj_unstructred['images'][resolution])
    axis.set_title(caption)
plt.show()

## Genes Expressions 

### Top N Expressed Genes 

In [None]:
N = 10
# Score each gene by its summed expression divided by (number of non-zero spots + 1);
# the +1 keeps the ratio finite for a gene with no non-zero spot.
spots_expressing = np.count_nonzero(x, axis=0)
genes_expressed = x.sum(axis=0) / (spots_expressing + 1)
# Ascending argsort reversed: the N highest-scoring genes, best first.
top_genes_indices = np.argsort(genes_expressed)[::-1][:N]
top_genes_names = obj.var.index[top_genes_indices]
print(top_genes_names)
# Columns of x for the selected genes, in ranking order.
top_genes_expression = x[:, top_genes_indices]
top_genes_expression.shape

In [None]:
# One column per top gene (named by gene), one row per spot; drawn as one box per gene
# with the tick labels rotated by 90 degrees.
tmp = pd.DataFrame(top_genes_expression, columns=top_genes_names)
tmp.plot.box(figsize=(15, 9), title='Top expressed genes', rot=90)
plt.show()

In [None]:
# Draw one gene plot per top gene, each shown as its own figure.
for gene_symbol in top_genes_names:
    st.pl.gene_plot(obj, gene_symbols=gene_symbol, size=20, figsize=(12, 9))
    plt.show()

## Log expression 

In [None]:
# obj was already passed through st.pp.log1p in the "Filtered Kmeans Clustering" cell,
# so calling it again unconditionally would log-transform the values twice.
# scanpy's log1p records the transform in obj.uns['log1p']; only transform when that
# marker is absent.
# NOTE(review): relies on st.pp.log1p delegating to scanpy.pp.log1p, which sets
# uns['log1p'] -- confirm for the installed versions. Without the marker this behaves
# exactly as before.
if 'log1p' not in obj.uns:
    st.pp.log1p(obj)
# Dense copy of the log-scaled matrix for the histograms in the next cell.
scaled_x = obj.X.toarray()
# Bare expression so the notebook renders the object summary as the cell output.
obj

In [None]:
# Repeat the per-spot and per-gene total-expression histograms on the log-scaled matrix.
# The log axes are switched off for the gene plot, presumably because the values are
# already log-scaled.
total_exp_over_spots(scaled_x)
total_exp_over_genes(scaled_x, logx=False, logy=False)