In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import squidpy as sq
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from anndata import AnnData
import scipy.sparse as sp

import os
# Set TensorFlow's C++ log-level environment variable.
# NOTE(review): this presumably only takes effect if it runs before TensorFlow
# is first imported by any dependency — confirm the ordering is sufficient.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


# Load the 8 µm binned count matrix for one sample
sample_id = "IMMUNEX001"
binned_dir = f"/scratch/IMMUNEX/OUTPUT/Visium_NSCLC_{sample_id}/outs/binned_outputs/square_008um"
adata = sc.read_10x_h5(f"{binned_dir}/filtered_feature_bc_matrix.h5")

# De-duplicate gene names *before* taking the raw snapshot so that
# adata.raw.var_names and adata.var_names stay identical.
adata.var_names_make_unique()
adata.raw = adata
adata.uns['sample_id'] = sample_id
adata


In [None]:
import pandas as pd

# Load bin coordinates from the Space Ranger output (adjust path as needed)
parquet_path = f"/scratch/IMMUNEX/OUTPUT/Visium_NSCLC_{sample_id}/outs/binned_outputs/square_008um/spatial/tissue_positions.parquet"
coords = pd.read_parquet(parquet_path)
display(coords.head())

# Index by barcode so the table aligns with adata.obs_names
coords = coords.set_index("barcode")

# Drop coordinate columns left over from a previous run of this cell so the
# join cannot fail on overlapping column names, then attach the coordinates.
adata.obs = adata.obs.drop(columns=coords.columns, errors="ignore").join(coords, how="left")
display(adata.obs)

# Keep only bins with spatial coordinates
adata = adata[adata.obs["pxl_row_in_fullres"].notnull()].copy()

# scanpy/squidpy interpret obsm["spatial"] as (x, y), i.e. (column pixel,
# row pixel); storing (row, col) would transpose every spatial plot.
adata.obsm["spatial"] = adata.obs[["pxl_col_in_fullres", "pxl_row_in_fullres"]].to_numpy()

# Image-free placeholder metadata, keyed by the same id written to obs["library_id"].
adata.obs["library_id"] = sample_id
adata.uns["spatial"] = {
    sample_id: {
        "images": {"hires": None},
        "scalefactors": {
            "tissue_hires_scalef": 1.0,
            "spot_diameter_fullres": 1.0
        }
    }
}

adata.obs


In [None]:

import pandas as pd
import numpy as np

# TLS annotation table: one row per annotated bin (barcode, TLS label).
tls_annotation = pd.read_csv(f'/scratch/IMMUNEX/data/TLS_ANNOTATIONS/TLS{sample_id}.csv')
tls_annotation.columns = ['barcode_8um','TLS']

# Parse the two integer fields encoded in each 8 µm barcode
# (s_008um_<row>_<col>...). Done once; astype(int) raises if a barcode
# does not match the pattern.
tls_annotation[['row_8', 'col_8']] = (
    tls_annotation['barcode_8um']
    .str.extract(r's_008um_(\d+)_(\d+)', expand=True)
    .astype(int)
)

display(tls_annotation.head())

# Index by barcode (keeping the column) so the join aligns on adata.obs_names.
tls_annotation = tls_annotation.set_index('barcode_8um', drop=False)

# Remove columns from an earlier run of this cell so re-executing it does not
# raise on overlapping column names. The annotation barcodes are 8 µm bins,
# so they join directly onto adata.obs.
adata.obs = adata.obs.drop(columns=tls_annotation.columns, errors="ignore").join(tls_annotation, how="left")

adata.obs.head()


In [None]:
# Bins with no entry in the TLS annotation table get an explicit 'NoTLS' label.
unannotated = adata.obs['TLS'].isna()
adata.obs.loc[unannotated, 'TLS'] = 'NoTLS'

In [None]:
# Minimal, image-free spatial metadata for plotting functions that read
# adata.uns["spatial"]. The entry is keyed by the same identifier stored in
# adata.obs["library_id"], so lookups by library id resolve.
adata.obs["library_id"] = sample_id
adata.uns["spatial"] = {
    sample_id: {
        "images": {"hires": None},
        # Placeholder scale factors (no tissue image is attached).
        "scalefactors": {
            "tissue_hires_scalef": 1.0,
            "spot_diameter_fullres": 1.0
        }
    }
}


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of every bin on the array grid, coloured by its TLS label.
df = adata.obs.reset_index()

plt.figure(figsize=(8, 8))
sns.scatterplot(
    data=df,
    x="array_col",
    y="array_row",
    hue="TLS",
    palette="viridis",
    s=10,
    linewidth=0,
    alpha=0.8
)

# Flip the y axis so array row 0 is drawn at the top.
plt.gca().invert_yaxis()
plt.axis("equal")
# The title names the variable actually mapped to colour (hue="TLS").
plt.title("Visium HD 8µm bin layout (colored by TLS annotation)")
plt.xlabel("Array column")
plt.ylabel("Array row")
plt.tight_layout()
plt.show()


In [None]:
# Flag mitochondrial genes (names starting with "MT-") and compute QC metrics,
# including the mitochondrial share of counts.
adata.var_names_make_unique()
is_mito = adata.var_names.str.startswith("MT-")
adata.var["mt"] = is_mito
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt"],
    inplace=True,
)


In [None]:
# QC scatter: mitochondrial percentage against total counts
sc.pl.scatter(
    adata,
    x='total_counts',
    y='pct_counts_mt',
)

In [None]:
adata.var_names_make_unique()
# Keep a copy of the object as it stands at this point in .raw
adata.raw = adata.copy()

# Violin panels for the three main QC metrics
qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(
    adata,
    qc_keys,
    jitter=1,
    multi_panel=True,
    stripplot=False,
)


In [None]:
# sc.pl.spatial(
#     adata,
#     color=["TLS", "total_counts", "pct_counts_mt"],
#     size=0.1,
#     cmap="viridis",  # or specify list of colormaps for each
#     ncols=3,         # number of panels per row
#     show=True
# )


In [None]:
# Lenient filtering: drop bins with fewer than 10 counts, then genes
# detected in fewer than 10 bins.
sc.pp.filter_cells(
    adata,
    min_counts=10,
)
sc.pp.filter_genes(
    adata,
    min_cells=10,
)
adata

In [None]:
# Scale every bin to 10,000 total counts, then apply log1p
sc.pp.normalize_total(
    adata,
    target_sum=1e4,
)
sc.pp.log1p(adata)

# Show the 20 highest-expressed genes after normalization
sc.pl.highest_expr_genes(
    adata,
    n_top=20,
)

In [None]:
# Distribution of each QC metric across bins. The data loaded in this notebook
# is the square_008um binning, so the titles refer to 8 µm bins.
qc_histograms = {
    'pct_counts_mt': "pct_counts_mt per 8µm bin",
    'total_counts': "Total Counts per 8µm bin",
    'n_genes_by_counts': "Detected Genes per 8µm bin",
}

for column, title in qc_histograms.items():
    sns.histplot(adata.obs[column], bins=100)
    plt.title(title)
    plt.show()


In [34]:
# Display the AnnData summary (dimensions and the obs / var / uns / obsm keys).
adata

AnnData object with n_obs × n_vars = 672507 × 18207
    obs: 'in_tissue', 'array_row', 'array_col', 'pxl_row_in_fullres', 'pxl_col_in_fullres', 'library_id', 'barcode_8um', 'TLS', 'row_8', 'col_8', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'
    uns: 'sample_id', 'spatial', 'TLS_colors', 'log1p'
    obsm: 'spatial'

In [None]:
# Select the 2000 most variable genes (seurat flavour) and inspect the selection
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000)
sc.pl.highly_variable_genes(adata)

# Slicing returns a view; take an explicit copy so the in-place scaling below
# operates on a real AnnData object instead of relying on an implicit copy.
adata = adata[:, adata.var.highly_variable].copy()

# Scale each gene (values clipped at 10), then run PCA
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver="arpack")
sc.pl.pca_variance_ratio(adata, log=True)



In [None]:
# Neighbors and clustering
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=7)

In [None]:
# Leiden clustering at resolution 0.5
sc.tl.leiden(
    adata,
    resolution=0.5,
)

In [None]:
# UMAP embedding with a fixed seed
sc.tl.umap(
    adata,
    min_dist=0.5,
    spread=1.0,
    random_state=42,
)

In [None]:

# UMAP coloured by Leiden cluster
sc.pl.umap(
    adata,
    color=["leiden"],
    title="Leiden clusters",
)

In [None]:
# Leiden clusters drawn at the coordinates stored in adata.obsm["spatial"]
sq.pl.spatial_scatter(
    adata,
    color="leiden",
    size=1.0,
    title="Leiden clusters (spatial)",
)


In [None]:
# Spatial neighbour graph between bins, using grid-type coordinates.
# NOTE(review): obsm["spatial"] holds full-resolution pixel coordinates here —
# confirm coord_type="grid" with default settings yields the intended neighbourhood.
sq.gr.spatial_neighbors(
    adata,
    coord_type="grid",
)


In [None]:

# Spatial autocorrelation (Moran's I) for the genes in adata
sq.gr.spatial_autocorr(adata, mode="moran")

# squidpy stores the result in adata.uns["moranI"], indexed by gene name.
# Rank by the I statistic (NaNs sort last) and show the top of the table.
moran_scores = adata.uns["moranI"].sort_values("I", ascending=False)
display(moran_scores.head(10))

# Plot the six most spatially autocorrelated genes. Gene *names* are needed
# here; adata.var.highly_variable[:6] would only give a list of booleans.
top_spatial_genes = moran_scores.index[:6].tolist()
sq.pl.spatial_scatter(adata, color=top_spatial_genes, size=1.0, ncols=3)
