# Cluster Analysis of Reprogramming Data

Description: Compares KNN purity and silhouette coefficient between unperturbed and perturbed fibroblast gene expression data, using reprogramming results from scGPT.

In [1]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Score Functions

### KNN Purity

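Purity is an external evaluation criterion of cluster quality: it is the fraction of all data points that belong to the most common true class of the cluster they were assigned to. The KNN variant implemented below applies the same idea locally: for each cell, it is the fraction of the cell's $k$ nearest neighbors that share the cell's label, averaged over all cells.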

Let $N$ be the number of objects (data points), $C$ the number of clusters, $c_i$ the set of points in the $i$-th cluster, and $t_j$ the set of points whose true class is $j$, so that $\max_{j} |c_i \cap t_j|$ is the size of the most common class in cluster $c_i$. Then,

$Purity = \displaystyle{\frac{1}{N}\sum^{C}_{i = 1} \max_{j} |c_i \cap t_j|}$

In [2]:
from sklearn.neighbors import NearestNeighbors

def knn_purity(adata, k=10, label_key='cell_type', embedding_key=None):
    """
    Compute the KNN purity for single-cell data.

    For every cell, purity is the fraction of its k nearest neighbors (in the
    chosen embedding) that carry the same label as the cell itself.

    Parameters:
    adata: AnnData object with an embedding (e.g., PCA, UMAP) in adata.obsm
    k: Number of nearest neighbors to consider for KNN (must be smaller than
       the number of cells, since a cell is never its own neighbor)
    label_key: The key in adata.obs containing the labels (e.g., cell types)
    embedding_key: The key in adata.obsm holding the embedding. If None
       (default), 'X_pca' is used when present, otherwise 'X_umap'.

    Returns:
    purity_scores: List with the purity score of each cell, in adata order
    average_purity: The average KNN purity for the dataset

    Raises:
    KeyError: if the requested embedding or label column does not exist
    ValueError: raised by scikit-learn when k is not smaller than the number
       of cells
    """
    # Resolve the embedding; the default keeps the historical preference of
    # PCA over UMAP.
    if embedding_key is None:
        embedding_key = 'X_pca' if 'X_pca' in adata.obsm else 'X_umap'
    embedding = adata.obsm[embedding_key]

    # Plain ndarray so labels can be fancy-indexed with a 2-D index array
    # (a pandas Categorical cannot).
    labels = np.asarray(adata.obs[label_key])

    # Querying without X asks for the neighbors of the training points
    # themselves; scikit-learn then excludes each point from its own neighbor
    # list. This is safer than fitting k + 1 neighbors and dropping column 0,
    # which silently drops the wrong entry when cells share identical
    # coordinates.
    nbrs = NearestNeighbors(n_neighbors=k).fit(embedding)
    indices = nbrs.kneighbors(return_distance=False)

    # Row i: which of cell i's k neighbors share its label.
    same_label = labels[indices] == labels[:, np.newaxis]
    purity_scores = list(same_label.mean(axis=1))

    average_purity = np.mean(purity_scores)

    return purity_scores, average_purity

### Silhouette Coefficient

The silhouette coefficient is used to evaluate the quality of clustering by considering cohesion (how close data points in the same cluster are to each other) and separation (how far apart different clusters are from one another). 

Let $a$ be the average distance from one point to other points in its own cluster, and $b$ be the average distance from one point to points in the nearest neighboring cluster. Then,

$Silhouette = \displaystyle\frac{b-a}{\max(a,b)}$

In [3]:
from sklearn.metrics import silhouette_score

def compute_silhouette(adata, label_key='cell_type', embedding_key='X_umap'):
    """
    Return the mean silhouette coefficient of the labelled cells.

    Parameters:
    adata: AnnData object whose adata.obsm holds the embedding to score
    label_key: Column of adata.obs with the group label of every cell
    embedding_key: Entry of adata.obsm with the low-dimensional coordinates

    Returns:
    The silhouette coefficient averaged over all cells, as computed by
    sklearn.metrics.silhouette_score
    """
    # Coordinates first, labels second: the same lookup order as before.
    return silhouette_score(
        adata.obsm[embedding_key],
        adata.obs[label_key].values,
    )

In [4]:
# Path to data, original and reprogrammed
# data_directory is the root folder of the one-shot in-silico reprogramming
# experiment; unpreturbed_file is the file name, relative to data_directory,
# of the unperturbed fibroblast data (the two are joined when it is loaded).
data_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot"
unpreturbed_file = "fibroblast.h5ad"

In [5]:
# Print the README stored in the data directory.
# os.path.join avoids hand-built path separators, and the explicit encoding
# keeps the read independent of the platform's default locale.
with open(os.path.join(data_directory, 'README.md'), 'r', encoding='utf-8') as file:
    content = file.read()

print(content)

# One-Shot Perturbation Experiment

Auth: Joshua Pickard
      jpic@umich.edu
Date: August 4, 2024

This directory and subdirectories contain data related to the in-silico reprogramming experiment. This experiment uses single cell Firboblasts from Tabula Sapiens, and increases the expression of Transcription Factors that are expected to reprogram cells, as found in the literature.

This directory is organized as follows:
- fibroblast.h5ad contains the unperturbed single cell data used in the experiment
- perturbed/ contains the raw single cell data with the upregulated Transcription Factors, i.e. adata.var is a table of gene names
- scGPT/ contains the scGPT of the single cell data from perturbed/
- geneformer/ is similar to scGPT/ but using a different foundation model
- files all have the same adata.obs table, which lets you map individual cells accross different TF perturbations and embeddings
- all files are nameda according to the TFs that are upregulated
- note, the indices in ad

# Unperturbed Cell Data

In [None]:
# Load the unperturbed dataset and show its summary
adata = sc.read_h5ad(os.path.join(data_directory, unpreturbed_file))
print(adata)

# Gene symbols of all variables, as a plain Python list
gene_list = adata.var['gene_symbol'].values.tolist()

# Peek at both ends of the gene list (header and entries on separate lines)
print("First 5 entries:", gene_list[:5], sep="\n")
print("Last 5 entries:", gene_list[-5:], sep="\n")

In [None]:
# Display the 'free_annotation' column of the per-cell metadata table
adata.obs['free_annotation']

In [None]:
# KNN purity (k=10) of the loaded cells, labelled by 'cell_ontology_class'.
# NOTE(review): knn_purity scores 'X_pca' when it exists and only falls back
# to 'X_umap', whereas the silhouette cell always scores 'X_umap' — confirm
# both metrics are meant to be computed on the same embedding.
purity_scores, average_purity = knn_purity(adata, k=10, label_key='cell_ontology_class')
print("Average KNN Purity:", average_purity)

In [None]:
# Average silhouette coefficient of the loaded cells, computed on the
# 'X_umap' embedding with 'cell_ontology_class' as the grouping label.
silhouette_avg = compute_silhouette(adata, label_key='cell_ontology_class', embedding_key='X_umap')
print("Average Silhouette Coefficient:", silhouette_avg)

In [None]:
# UMAP scatter plot of the cells, coloured by 'cell_ontology_class'
sc.pl.umap(adata, color="cell_ontology_class")

# Perturbed Cell Data (scGPT)

In [None]:
# Change into the experiment directory. os.chdir replaces the bare IPython
# `cd` automagic, which is not valid Python outside an interactive session,
# and reuses data_directory instead of repeating the hard-coded path.
os.chdir(data_directory)

In [None]:
# Folder inside data_directory whose files are scored in the loop below
scgpt_embed_data = os.path.join(data_directory, "scGPT")

In [None]:
# Score every file in the scGPT directory with both cluster-quality metrics.
# Sorting makes the report order reproducible (os.listdir order is arbitrary).
for file_name in sorted(os.listdir(scgpt_embed_data)):
    # Construct the full file path
    file_path = os.path.join(scgpt_embed_data, file_name)

    # Skip sub-directories and other non-file entries entirely; previously
    # only the print was guarded, so read_h5ad was still attempted on them.
    if not os.path.isfile(file_path):
        continue

    # Print the file name without extension
    print(os.path.splitext(file_name)[0])

    # NOTE: this rebinds the notebook-level `adata`; after the loop it holds
    # the last file read, not the unperturbed data.
    adata = sc.read_h5ad(file_path)

    purity_scores, average_purity = knn_purity(adata, k=10, label_key='cell_ontology_class')
    print("Average KNN Purity:", average_purity)

    silhouette_avg = compute_silhouette(adata, label_key='cell_ontology_class', embedding_key='X_umap')
    print("Average Silhouette Coefficient:", silhouette_avg)

# To-Do
- Redo embedding (UMAP) on fibroblast data
- Write file to directory under same name
- Try a few clustering methods, then cluster in embedded space post-perturbation
- Check scores for data-space perturbations