## PCA

In [2]:
import scanpy as sc
import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler

# Load .h5ad file (scRNA)
ad_sc = sc.read('../data/processed/processed_scrna_adata.h5ad')  # Use scanpy to read the file

# Check the shape of the data
print(f"Original shape: {ad_sc.X.shape}")

# Convert to dense format if necessary
if hasattr(ad_sc.X, 'toarray'):
    ad_sc_dense = ad_sc.X.toarray()
else:
    ad_sc_dense = ad_sc.X

# Ensure the data is in the correct numerical format
ad_sc_dense = ad_sc_dense.astype(np.float32)

# Standardize the data
scaler = StandardScaler()
ad_sc_dense = scaler.fit_transform(ad_sc_dense)

# Select a subset of samples (e.g., the first 2500 samples)
subset_data = ad_sc_dense[:2500, :]  # Adjust this as needed

# Initialize Incremental PCA to reduce features
n_components = 50  # Specify the number of components you want to keep
ipca = IncrementalPCA(n_components=n_components, batch_size=100)  # Use batch_size to control memory usage

# Fit and transform the subset data in batches
for i in range(0, subset_data.shape[0], 100):  # Adjust batch size as needed
    ipca.partial_fit(subset_data[i:i + 100, :])

# Transform the data
reduced_data = ipca.transform(subset_data)

# Print the shape of the reduced data
print(f"Reduced shape: {reduced_data.shape}")

# Create a new AnnData object to store the PCA results
ad_sp_pca = sc.AnnData(X=reduced_data)

# Optionally, you can add the original observation metadata
# ad_sp_pca.obs = ad_sc.obs.iloc[:2500]  # If you used the first 2500 samples

# Save the PCA results to a new .h5ad file
ad_sp_pca.write('ad_sc_pca.h5ad')

print("PCA results saved to 'ad_sc_pca.h5ad'")

Original shape: (737280, 33694)


MemoryError: Unable to allocate 92.5 GiB for an array with shape (737280, 33694) and data type float32

## PCA on Single cell Data

In [6]:
import scanpy as sc
import numpy as np
from sklearn.decomposition import TruncatedSVD
import psutil

# Load .h5ad file (scRNA)
ad_sc = sc.read('../data/processed/processed_scrna_adata.h5ad')
print(f"Original shape: {ad_sc.X.shape}")

# Check if the matrix is sparse
is_sparse = hasattr(ad_sc.X, 'toarray')

if is_sparse:
    print("Using sparse matrix format...")

# Select a subset of samples (first 2500 samples)
subset_data = ad_sc.X[:3500, :]  # This will keep the data sparse if it's sparse

# Memory check
available_memory = psutil.virtual_memory().available / (1024 ** 3)
print(f"Available memory: {available_memory:.2f} GB")

# Initialize TruncatedSVD for sparse data
n_components = 50
svd = TruncatedSVD(n_components=n_components)

# Fit and transform the subset data
reduced_data = svd.fit_transform(subset_data)

# Print the shape of the reduced data
print(f"Reduced shape: {reduced_data.shape}")

# Create a new AnnData object to store the PCA results
ad_sc_pca = sc.AnnData(X=reduced_data)

# Save the PCA results to a new .h5ad file
ad_sc_pca.write('../data/processed/ad_sc_pca.h5ad')
print("PCA results saved to 'ad_sc_pca.h5ad'")


Original shape: (737280, 33694)
Using sparse matrix format...
Available memory: 18.10 GB
Reduced shape: (3500, 50)
PCA results saved to 'ad_sc_pca.h5ad'


## PCA on Visium spatial data

In [7]:
import scanpy as sc
import numpy as np
from sklearn.decomposition import TruncatedSVD
import psutil

In [8]:


# Load .h5ad file (scRNA)
ad_sp = sc.read('../data/processed/processed_visium_adata.h5ad')
print(f"Original shape: {ad_sp.X.shape}")

# Check if the matrix is sparse
is_sparse = hasattr(ad_sp.X, 'toarray')

if is_sparse:
    print("Using sparse matrix format...")

# Select a subset of samples (first 2500 samples)
subset_data = ad_sp.X[:3500, :]  # This will keep the data sparse if it's sparse

# Memory check
available_memory = psutil.virtual_memory().available / (1024 ** 3)
print(f"Available memory: {available_memory:.2f} GB")

# Initialize TruncatedSVD for sparse data
n_components = 50
svd = TruncatedSVD(n_components=n_components)

# Fit and transform the subset data
reduced_data = svd.fit_transform(subset_data)

# Print the shape of the reduced data
print(f"Reduced shape: {reduced_data.shape}")

# Create a new AnnData object to store the PCA results
ad_sp_pca = sc.AnnData(X=reduced_data)

# Save the PCA results to a new .h5ad file
ad_sp_pca.write('../data/processed/ad_sp_pca.h5ad')

print("PCA results saved to 'ad_sp_pca.h5ad'")


Original shape: (3858, 18085)
Using sparse matrix format...
Available memory: 17.98 GB
Reduced shape: (3500, 50)
PCA results saved to 'ad_sp_pca.h5ad'
