In [1]:
import math
import os
import warnings
from collections import Counter, defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

In [2]:
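## Optional step:
## remove cell types whose cell count is lower than a given amount.
## A lower bound is always applied: pass `threshold` explicitly, or pass None to use the default of (number of unique ages + 1) * 100.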
def filter_low_counts(celltype_df, age_df, celltype_col, threshold):
    """Drop every cell type that has fewer cells than a minimum count.

    Parameters:
    celltype_df: DataFrame
        One row per cell; ``celltype_col`` holds the cell type label.
    age_df: DataFrame
        Ages of the cells. Only used when ``threshold`` is None, to count the
        number of distinct ages.
    celltype_col: str
        Name of the label column in ``celltype_df``.
    threshold: int or None
        Minimum number of cells a cell type needs in order to be kept. When
        None, the minimum is (number of unique ages + 1) * 100.

    Returns:
    DataFrame
        The rows of ``celltype_df`` whose cell type met the minimum. The input
        frame itself is not modified.
    """
    print("Checking low count cell types...")

    # The minimum does not depend on the cell type, so compute it once up front
    # instead of once per loop iteration.
    if threshold is None:
        min_count = (len(np.unique(age_df)) + 1) * 100
    else:
        min_count = threshold

    celltype_count = Counter(celltype_df[celltype_col])
    for key, count in celltype_count.items():
        if count < min_count:
            print(key, " has too low counts")
            celltype_df = celltype_df[celltype_df[celltype_col] != key]
    return celltype_df

In [3]:
def get_skewed_count_info(adata, class_col, age_col, age_threshold):
    """Find the classes whose cells are dominated by a single age group.

    Parameters:
    adata: object with an ``obs`` DataFrame (an AnnData object in this notebook)
    class_col: str
        Column of ``adata.obs`` holding the class label.
    age_col: str
        Column of ``adata.obs`` holding the age label.
    age_threshold: float
        A class is reported when the fraction of its cells belonging to any one
        age group is strictly greater than this value.

    Returns:
    Index
        The unique class labels that exceed the threshold.
    """
    print("Checking skewed count cell types...")

    obs = adata.obs

    # Cells per class, and cells per (class, age) pair
    cells_per_class = obs.groupby([class_col]).size()
    cells_per_class_age = obs.groupby([class_col, age_col]).size()

    # Share of each age group inside its class; the division is aligned on the
    # class level of the (class, age) index
    age_share = cells_per_class_age / cells_per_class

    # Keep the class label (index level 0) of every over-represented pair
    is_dominant = age_share > age_threshold
    return age_share[is_dominant].index.get_level_values(0).unique()

In [4]:
## Read two h5ad files, concatenate them,
## and filter cell types based on age distribution and cell count thresholds.
def read_and_filter_h5ad(filepath_1, filepath_2, class_col="celltype", age_col="age", age_threshold=0.8, count_threshold=None):
    """Load two h5ad files, concatenate them, and drop unsuitable cell types.

    Cell types are removed in two passes: first those with too few cells
    (via ``filter_low_counts``), then those in which a single age group
    dominates (via ``get_skewed_count_info``).

    Parameters:
    filepath_1: str
        Path to the first h5ad file.
    filepath_2: str
        Path to the second h5ad file; it is concatenated onto the first.
    class_col: str, optional (default: 'celltype')
        The column name in adata.obs representing the cell ontology class.
    age_col: str, optional (default: 'age')
        The column name in adata.obs representing the age of the cells.
    age_threshold: float, optional (default: 0.8)
        The threshold fraction for filtering based on age distribution. If one age group has more than this
        fraction of cells in a class, the class will be filtered out.
    count_threshold: int or None, optional (default: None)
        Minimum number of cells a cell type must have to be kept. When None,
        ``filter_low_counts`` uses (number of unique ages + 1) * 100.

    Returns:
    final_filtered_adata: AnnData object
        A copy of the concatenated data with the low-count and age-skewed
        classes removed.

    Raises:
    Any exception raised while reading or filtering propagates unchanged.
    """
    adata1 = sc.read_h5ad(filepath_1)
    adata2 = sc.read_h5ad(filepath_2)
    # NOTE(review): this call appears to emit a warning that points to the
    # anndata concat tutorial. It is kept as-is because switching APIs may
    # change obs names and var column suffixes that later cells rely on.
    adata = adata1.concatenate(adata2)
    celltype_df = adata.obs[[class_col]].copy()
    age_df = adata.obs[[age_col]].copy()

    # Apply the cell count threshold filtering
    celltype_df = filter_low_counts(celltype_df, age_df, class_col, count_threshold)

    # Create a filtered AnnData object based on cell count filtering
    filtered_adata = adata[celltype_df.index].copy()

    # Identify the skewed classes to filter based on age distribution
    classes_to_filter = get_skewed_count_info(filtered_adata, class_col, age_col, age_threshold)

    # Report every class that is about to be removed, not only the first one
    for skewed_class in classes_to_filter:
        print(skewed_class, " has skewed cell counts")

    # Further filter the AnnData object based on age distribution
    final_filtered_adata = filtered_adata[~filtered_adata.obs[class_col].isin(classes_to_filter)].copy()

    return final_filtered_adata

In [13]:
# Resolve both input files relative to the directory the notebook is started
# from, so no OS-specific path separators are needed.
current_dir = Path.cwd()
print(f"Current working directory: {current_dir}")

# Build the full paths with Path.joinpath
file1 = current_dir.joinpath("tabula-muris-senis-facs-processed-official-annotations-Brain_Myeloid.h5ad")
file2 = current_dir.joinpath("tabula-muris-senis-facs-processed-official-annotations-Brain_Non-Myeloid.h5ad")

# Echo the paths for verification
print(f"File 1 path: {file1}")
print(f"File 2 path: {file2}")

# Fail early with a readable message if either file is missing
assert file1.is_file(), f"File not found: {file1}"
assert file2.is_file(), f"File not found: {file2}"

# Read, concatenate and filter the two datasets
adata = read_and_filter_h5ad(
    str(file1),
    str(file2),
    class_col="cell_ontology_class",
    age_col="age",
)

# Alternative kept for reference: the same two files can instead be loaded from
# the sibling directory "../Mouse Tabula Muris/" by passing those paths directly.

Current working directory: c:\Users\Hang Yu\Desktop\SC_Ageing_Prediction
File 1 path: c:\Users\Hang Yu\Desktop\SC_Ageing_Prediction\Brain_Myeloid_facs.h5ad
File 2 path: c:\Users\Hang Yu\Desktop\SC_Ageing_Prediction\Brain_Non-Myeloid_facs.h5ad



This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


Checking low count cell types...
macrophage  has too low counts
brain pericyte  has too low counts
interneuron  has too low counts
CD8-positive, alpha-beta T cell  has too low counts
Bergmann glial cell  has too low counts
ependymal cell  has too low counts
neuronal stem cell  has too low counts
oligodendrocyte precursor cell  has too low counts
Il6 expressing cells  has too low counts
medium spiny neuron  has too low counts
Checking skewed count cell types...
astrocyte  has skewed cell counts


In [6]:
# Quick structural overview of the filtered AnnData object.
obs_table = adata.obs

# Container type, followed by the AnnData summary
print(type(adata), adata, sep="\n")
# Type of the per-cell annotation table, followed by all of its column names
print(type(obs_table), obs_table.columns, sep="\n")
# Shape of the stored t-SNE embedding
print(adata.obsm['X_tsne'].shape)
# Transposed expression matrix
print(adata.X.T)

<class 'anndata._core.anndata.AnnData'>
AnnData object with n_obs × n_vars = 9500 × 22899
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cellid', 'free_annotation', 'method', 'mouse.id', 'plate', 'sex', 'subtissue', 'tissue', 'well', 'n_genes', 'n_counts', 'louvain', 'cluster_names', 'leiden'
    var: 'n_cells', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'highly_variable-0', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'highly_variable-1'
    obsm: 'X_pca', 'X_umap', 'X_tsne'
<class 'pandas.core.frame.DataFrame'>
Index(['FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class',
       'cell_ontology_id', 'cellid', 'free_annotation', 'method', 'mouse.id',
       'plate', 'sex', 'subtissue', 'tissue', 'well', 'n_genes', 'n_counts',
       'louvain', 'cluster_names', 'leiden'],
      dtype='object')
(9500, 2)
  (22884, 0)	1.7220321
  (22868, 0)	4.901944
  (22839, 0)	1.5224471
  (22825, 0)	2.8334746
  (22821, 0)	1.4039102
  

In [7]:
# Show the value distribution of every obs column, one block per column.
obs_table = adata.obs
for column_name in obs_table.columns:
    print(
        obs_table[column_name].value_counts(),
        "***************************************************",
        "\n",
        sep="\n",
    )

# To inspect a single column instead: adata.obs['age'].value_counts()

# TODO(question): why is there no 18m age group here?

FACS.selection
nan          4840
Microglia    4268
Neurons       392
Name: count, dtype: int64
***************************************************


age
3m     4840
24m    4660
Name: count, dtype: int64
***************************************************


batch
0    8756
1     744
Name: count, dtype: int64
***************************************************


cell
nan                         4660
L12.B001176.3_56_F.1.1         1
L12.B000825.3_38_F.1.1         1
L11.MAA000638.3_9_M.1.1        1
L11.MAA000617.3_10_M.1.1       1
                            ... 
G2.B001689.3_38_F.1.1          1
G2.B001176.3_56_F.1.1          1
G2.B000825.3_38_F.1.1          1
G1.MAA000617.3_10_M.1.1        1
I21.B003279.3_38_F.1.1         1
Name: count, Length: 4841, dtype: int64
***************************************************


cell_ontology_class
microglial cell    8756
nan                 395
neuron              349
Name: count, dtype: int64
***************************************************


cel

In [8]:
# data preprocessing

# Quality control: drop genes detected in fewer than 5 cells and cells with
# fewer than 500 detected genes.
sc.pp.filter_genes(adata, min_cells=5)
sc.pp.filter_cells(adata, min_genes=500)

# Per-cell total of X. `.A1` flattens the np.matrix that summing a scipy sparse
# matrix returns (assumes adata.X is sparse, as the printed output suggests).
adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1

# Boolean indexing returns a view of the AnnData object; .copy() materialises it
# so the in-place steps below do not have to modify a view.
adata = adata[adata.obs['n_counts']>=3000].copy()
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4) #simple lib size normalization?

# subset=False together with copy=True: the result is assigned back to `adata`.
# NOTE(review): presumably this only flags highly variable genes and keeps all
# genes; confirm against the scanpy documentation.
adata = sc.pp.filter_genes_dispersion(adata, subset = False, min_disp=.5, max_disp=None, 
                              min_mean=.0125, max_mean=10, n_bins=20, n_top_genes=None, 
                              log=True, copy=True)
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=10, zero_center=False)

# Dimensionality reduction, neighbour graph, clustering and embedding
sc.tl.pca(adata,use_highly_variable=True)
sc.pp.neighbors(adata, n_neighbors=18)
sc.tl.louvain(adata, resolution = 1)
# sc.tl.umap stores the embedding in adata.obsm['X_umap'], so no recompute
# fallback for a missing 'X_umap' entry is needed after this call.
sc.tl.umap(adata)

  adata.obs[key_n_counts] = counts_per_cell
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# One-column frames that share adata's cell index:
# the cell type label (renamed to "celltype") and the age.
celltype_df = adata.obs[["cell_ontology_class"]].rename(
    columns={"cell_ontology_class": "celltype"}
)
age_df = adata.obs[["age"]].copy()

In [10]:
def clean_age(age_df, substring):
    """Convert the values of ``age_df["age"]`` to integers.

    Each value is converted to text, the characters in ``substring`` are
    stripped from both ends (``str.strip`` semantics), and the remainder is
    parsed with ``int``. A value that cannot be parsed triggers a warning and
    is stored as NaN, so the column always keeps one entry per row.

    Parameters:
    age_df: DataFrame
        Must contain an "age" column. The column is replaced in place.
    substring: str
        Characters to strip from each value, e.g. "m" for values like "24m".

    Returns:
    DataFrame
        The same ``age_df`` object, with the "age" column replaced. The column
        holds integers when every value parsed; otherwise it contains NaN for
        the values that failed.
    """
    values = []
    for x in age_df["age"]:
        try:
            # str() lets already-converted integers pass through unchanged,
            # which makes calling the function a second time harmless.
            values.append(int(str(x).strip(substring)))
        except ValueError:
            # Keep going so that `values` stays aligned with the rows of the
            # frame; stopping early would make the assignment below fail.
            warnings.warn(f"Warning: '{x}' could not be converted to an integer.")
            values.append(float("nan"))
    age_df["age"] = values
    return age_df

def get_raw_counts(adata, celltype_df):
    """Build a genes-by-cells integer table for the cells in ``celltype_df``.

    Parameters:
    adata: object exposing ``X``, ``var_names`` and ``obs_names``
        ``X`` is transposed so that rows are ``var_names`` and columns are
        ``obs_names``.
    celltype_df: DataFrame
        Its index selects, and orders, the cell columns that are returned.

    Returns:
    DataFrame
        Values of ``adata.X`` cast to int, indexed by ``var_names``, with one
        column per entry of ``celltype_df.index``.

    NOTE(review): the values are whatever ``adata.X`` holds when this is called,
    cast with ``astype(int)``; whether those are still raw counts is not
    visible from this function.
    """
    genes_by_cells = adata.X.T
    counts = pd.DataFrame.sparse.from_spmatrix(
        genes_by_cells,
        index=adata.var_names,
        columns=adata.obs_names,
    )
    counts = counts.astype(int)

    # Restrict to the requested cells, in the order given by celltype_df
    selected_cells = list(celltype_df.index)
    return counts[selected_cells]

In [11]:
# Strip the "m" suffix (presumably months) so the ages become integers.
cleaned_age_df = clean_age(age_df, substring="m")

# Display the pairwise "distances" matrix stored on adata.obsp
adata.obsp["distances"]

<8816x8816 sparse matrix of type '<class 'numpy.float64'>'
	with 149872 stored elements in Compressed Sparse Row format>