In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri
from glob import glob

# Enable automatic conversion between R and numpy objects.
numpy2ri.activate()

# Handle to R's readRDS(), used to deserialize the .rds files.
readRDS = ro.r['readRDS']

# Get list of .rds files.
# glob() returns matches in arbitrary (filesystem-dependent) order, so sort
# them: the processing order -- and with it the floating-point accumulation
# order of the second pass -- is then the same on every run and machine.
file_list = sorted(glob(r"D:\Macaque\cellFiles\*.rds"))

def load_rds_as_sparse(file_path):
    """
    Read one .rds file holding a dgCMatrix-style S4 object.

    Parameters
    ----------
    file_path : str
        Path passed straight to R's ``readRDS``.

    Returns
    -------
    tuple
        ``(sparse_mat, genes, metadata)``:
          - ``sparse_mat``: ``scipy.sparse.csc_matrix`` assembled from the
            "x", "i", "p" and "Dim" slots,
          - ``genes``: first element of the "Dimnames" slot (row names) as a list,
          - ``metadata``: the "metadata" slot wrapped unchanged in a
            ``pandas.DataFrame`` (no column names are assigned here).

    The R object is expected to expose all of the slots named above.
    """
    r_obj = readRDS(file_path)
    get_slot = ro.r['slot']

    def slot_as_array(slot_name):
        # Fetch one S4 slot and hand it back as a numpy array.
        return np.array(get_slot(r_obj, slot_name))

    # Compressed-sparse-column pieces, read in the order i, p, x, Dim.
    row_indices = slot_as_array('i')
    col_pointers = slot_as_array('p')
    values = slot_as_array('x')
    dims = slot_as_array('Dim')
    sparse_mat = csc_matrix(
        (values, row_indices, col_pointers),
        shape=(dims[0], dims[1]),
    )

    # Row names (gene symbols) live in the first entry of Dimnames.
    genes = list(get_slot(r_obj, 'Dimnames')[0])

    # Per-cell information, converted as-is; callers fix orientation/labels.
    metadata = pd.DataFrame(slot_as_array('metadata'))

    return sparse_mat, genes, metadata

# --- Lookup tables ---
# global_region_id -> atlas_number (if an id occurs twice, the last row wins).
region_lookup = pd.read_csv('global_region_id_to_atlas_number.csv')
region_lookup_dict = pd.Series(
    region_lookup.atlas_number.values,
    index=region_lookup.global_region_id,
).to_dict()

# slide key -> DataFrame holding that slide's cell_id / celltype_index rows.
# NOTE: pickle.load can execute arbitrary code -- only open pickle files that
# were produced by this project.
with open("CellID_Per_Slide_Mapping.pkl", "rb") as f:
    slide_lookup = pickle.load(f)

# --- Gene axis ---
# The gene universe is read from a prepared CSV rather than by scanning every
# .rds file for the union of its row names.
all_genes = pd.read_csv(r"D:\Macaque\macaque_genes.csv")['gene_name'].to_list()
n_genes = len(all_genes)

# --- Atlas axis ---
unique_atlas = sorted(region_lookup.atlas_number.unique())
n_atlas = len(unique_atlas)

# --- Cell-type axis: union of celltype_index over every slide's table ---
all_celltypes = set()
for slide_df in slide_lookup.values():
    all_celltypes |= set(slide_df['celltype_index'].unique())
all_celltype = sorted(all_celltypes)
n_celltype = len(all_celltype)

# Value -> position along the matching axis of the aggregated arrays.
atlas_to_index = dict(zip(unique_atlas, range(n_atlas)))
# Identity mapping only if the celltype_index values are exactly
# 0..n_celltype-1 -- TODO confirm.
celltype_to_index = dict(zip(all_celltype, range(n_celltype)))
gene_to_global_idx = dict(zip(all_genes, range(n_genes)))

# --- Accumulators, indexed as [gene, atlas, celltype] ---
array_shape = (n_genes, n_atlas, n_celltype)
cumulative_sum = np.zeros(array_shape, dtype=np.float64)
cumulative_count = np.zeros(array_shape, dtype=np.int64)
print('Initialized global cumulative arrays.')


# --- SECOND PASS: Process each file and update global cumulative arrays ---
# For every slide file: label each cell with its (atlas_idx, celltype_idx),
# sum the expression of the cells in each such group, and add the per-gene
# sums and cell counts into cumulative_sum / cumulative_count.
for file_path in file_list:
    print(f"Processing {file_path}...")

    # Extract slide key from file name.
    # For example, if filename is "total_gene_T25.rds", extract "T25".
    # This happens before the .rds file is read so that files which are
    # going to be skipped never pay the cost of loading.
    file_name = os.path.basename(file_path)
    match = re.search(r"T\d+", file_name)
    if not match:
        print(f"Could not extract slide key from {file_name}. Skipping.")
        continue
    slide_key = match.group(0)

    # Get the file-specific cell type mapping dataframe.
    if slide_key not in slide_lookup:
        print(f"Slide key {slide_key} not found in slide_lookup. Skipping {file_name}.")
        continue
    file_mapping_df = slide_lookup[slide_key]
    # Dictionary mapping cell_id -> celltype_index for this slide only.
    file_cell_lookup = dict(zip(file_mapping_df['cell_id'], file_mapping_df['celltype_index']))

    sparse_mat, local_genes, metadata = load_rds_as_sparse(file_path)

    # Global row of every local gene, resolved once per file instead of once
    # per (group, gene). A gene missing from the global list is still a
    # KeyError, but now it names the offending file.
    try:
        local_to_global = np.array(
            [gene_to_global_idx[gene] for gene in local_genes], dtype=np.intp
        )
    except KeyError as err:
        raise KeyError(
            f"Gene {err.args[0]!r} from {file_name} is not in the global gene list."
        ) from err

    # Because of R conversion need to transpose metadata and add column names
    metadata_T = metadata.transpose()
    metadata_T.columns = ['cell_id', 'gene_area', 'x', 'y', 'ry', 'rx']
    metadata = metadata_T[['cell_id', 'gene_area']].copy()

    # Map each cell's cell_id to its celltype_index using the per-file mapping.
    metadata['celltype_index'] = metadata['cell_id'].map(file_cell_lookup)

    # Map each cell's 'gene_area' (global_region_id) to its atlas_number using region_lookup.
    metadata['atlas_number'] = metadata['gene_area'].map(region_lookup_dict)

    # Convert the lookup values to positions in the global arrays
    # (an atlas number such as 981 cannot be used directly as index 981).
    metadata['celltype_idx'] = metadata['celltype_index'].map(celltype_to_index.get)
    metadata['atlas_idx'] = metadata['atlas_number'].map(atlas_to_index.get)

    # Group cells by their (atlas_idx, celltype_idx).
    # .indices maps each key pair to the row positions in `metadata` of the
    # cells in that group. No rows were dropped above, so those positions are
    # also the column positions of the same cells in sparse_mat. Cells with a
    # missing key (unmapped cell_id or region) are left out by groupby.
    groups = metadata.groupby(['atlas_idx', 'celltype_idx']).indices
    for (atlas_idx, celltype_idx), cell_indices in groups.items():
        # A single unmapped cell turns the key columns into floats (NaN needs
        # a float dtype), so the group keys may arrive as 3.0 rather than 3;
        # numpy refuses float indices, hence the explicit cast.
        atlas_idx, celltype_idx = int(atlas_idx), int(celltype_idx)

        # sparse_mat[:, cell_indices] has shape (n_local_genes, n_group_cells);
        # summing over axis=1 adds up the cells and leaves one total per gene.
        # The sparse sum is an (n_local_genes, 1) matrix, so flatten it.
        group_sum = np.asarray(sparse_mat[:, cell_indices].sum(axis=1)).ravel()
        group_cell_count = len(cell_indices)

        # Scatter-add into the global arrays. np.add.at accumulates repeated
        # target rows (duplicate gene names), matching a gene-by-gene `+=`.
        target = (local_to_global, atlas_idx, celltype_idx)
        np.add.at(cumulative_sum, target, group_sum)
        np.add.at(cumulative_count, target, group_cell_count)

# --- Compute overall mean expression ---
# Entries with no contributing cells cannot be divided and stay NaN.
valid = cumulative_count > 0
mean_array = np.full((n_genes, n_atlas, n_celltype), np.nan, dtype=np.float64)
np.divide(cumulative_sum, cumulative_count, out=mean_array, where=valid)

# --- Save the aggregated data ---
# Keyword order fixes the order of the keys inside the archive.
aggregated = dict(
    mean=mean_array,
    sum=cumulative_sum,
    count=cumulative_count,
    genes=all_genes,
    atlas_numbers=unique_atlas,
    celltype_indices=all_celltype,
)
np.savez('new_aggregated_data.npz', **aggregated)
print("Aggregated data saved to new_aggregated_data.npz")

Initialized global cumulative arrays.
Processing D:\Macaque\cellFiles\total_gene_T100.rds...


R[write to console]: Loading required package: Matrix



Processing D:\Macaque\cellFiles\total_gene_T101.rds...
Processing D:\Macaque\cellFiles\total_gene_T102.rds...
Processing D:\Macaque\cellFiles\total_gene_T103.rds...
Processing D:\Macaque\cellFiles\total_gene_T104.rds...
Processing D:\Macaque\cellFiles\total_gene_T105.rds...
Processing D:\Macaque\cellFiles\total_gene_T106.rds...
Processing D:\Macaque\cellFiles\total_gene_T107.rds...
Processing D:\Macaque\cellFiles\total_gene_T108.rds...
Processing D:\Macaque\cellFiles\total_gene_T109.rds...
Processing D:\Macaque\cellFiles\total_gene_T110.rds...
Processing D:\Macaque\cellFiles\total_gene_T111.rds...
Processing D:\Macaque\cellFiles\total_gene_T112.rds...
Processing D:\Macaque\cellFiles\total_gene_T113.rds...
Processing D:\Macaque\cellFiles\total_gene_T114.rds...
Processing D:\Macaque\cellFiles\total_gene_T115.rds...
Processing D:\Macaque\cellFiles\total_gene_T116.rds...
Processing D:\Macaque\cellFiles\total_gene_T117.rds...
Processing D:\Macaque\cellFiles\total_gene_T118.rds...
Processing

In [1]:
import numpy as np
# Load the aggregated data.
# np.load on an .npz archive returns an NpzFile that keeps the file open, so
# use it as a context manager; every array that is needed must be read inside
# the `with` block, before the archive is closed.
with np.load('new_aggregated_data.npz') as data:
    # You can inspect available keys
    print("Keys:", data.files)

    # Access the stored arrays and lists (indexing reads the array into memory)
    mean_array = data['mean']
    # genes = data['genes']
    # sum_array = data['sum']
    # count_array = data['count']
    # atlas_numbers = data['atlas_numbers']
    # celltype_indices = data['celltype_indices']

# # For example, print shapes or contents
# print("Mean shape:", mean_array.shape)
# print("Number of genes:", len(genes))

# mean_array was saved as (n_genes, n_atlas, n_celltype). Move the atlas axis
# last and merge the first two axes, so that each row is one (gene, celltype)
# pair and each column is one atlas region.
mean_data_permuted = np.transpose(mean_array, (0, 2, 1))
samples = mean_data_permuted.reshape(-1, mean_array.shape[1])
print("Samples shape:", samples.shape)  # Expected: (n_genes * n_celltype, n_atlas)
np.save("samples141Regions.npy", samples)

Keys: ['mean', 'sum', 'count', 'genes', 'atlas_numbers', 'celltype_indices']
Samples shape: (4108908, 141)


# Old way:
The old approach did not account for the fact that the same cell_ids are reused across different slides (T25 etc.).
It also did not account for the metadata coming out of the R conversion transposed: the code added columns such as 'celltype_index' when, in the transposed layout, it should have been adding rows. Accessing those rows later in the code would have been a nightmare anyway!

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri
numpy2ri.activate()  # enables automatic conversion between R and numpy objects
from glob import glob

# Import the R function for reading RDS files.
readRDS = ro.r['readRDS']

# glob() returns matches in arbitrary (filesystem-dependent) order; sort them
# so the files are always processed in the same order.
file_list = sorted(glob(r"D:\Macaque\cellFiles\*.rds"))

def load_rds_as_sparse(file_path):
    """
    Read one .rds file holding a dgCMatrix-style S4 object.

    Parameters
    ----------
    file_path : str
        Path passed straight to R's ``readRDS``.

    Returns
    -------
    tuple
        ``(sparse_mat, genes, metadata)``:
          - ``sparse_mat``: ``scipy.sparse.csc_matrix`` assembled from the
            "x", "i", "p" and "Dim" slots,
          - ``genes``: first element of the "Dimnames" slot (row names) as a list,
          - ``metadata``: the "metadata" slot wrapped unchanged in a
            ``pandas.DataFrame``; no column names are assigned, so callers may
            need to label or re-orient it themselves.

    The R object is expected to expose all of the slots named above.
    """
    r_obj = readRDS(file_path)
    get_slot = ro.r['slot']

    def slot_as_array(slot_name):
        # Fetch one S4 slot and hand it back as a numpy array.
        return np.array(get_slot(r_obj, slot_name))

    # Compressed-sparse-column pieces, read in the order i, p, x, Dim.
    row_indices = slot_as_array('i')
    col_pointers = slot_as_array('p')
    values = slot_as_array('x')
    dims = slot_as_array('Dim')
    sparse_mat = csc_matrix(
        (values, row_indices, col_pointers),
        shape=(dims[0], dims[1]),
    )

    # Row names (gene symbols) live in the first entry of Dimnames.
    genes = list(get_slot(r_obj, 'Dimnames')[0])

    # Per-cell information, converted as-is.
    metadata = pd.DataFrame(slot_as_array('metadata'))

    return sparse_mat, genes, metadata

# --- Old approach: lookup tables ---
cell_lookup = pd.read_csv('cell_id_to_celltype_lookup_with_index.csv')
region_lookup = pd.read_csv('global_region_id_to_atlas_number.csv')
print('Load lookup CSVs')

# global_region_id -> atlas_number.
# NOTE(review): no cell_id -> celltype_index dictionary (cell_lookup_dict) is
# built in this cell, although the second-pass cell of this old approach
# reads one.
region_lookup_dict = pd.Series(
    region_lookup.atlas_number.values,
    index=region_lookup.global_region_id,
).to_dict()
print('Create lookup dictionaries')
print(region_lookup_dict)

# --- Gene axis ---
# The gene universe is read from a prepared CSV rather than by scanning every
# .rds file for the union of its row names.
all_genes = pd.read_csv(r"D:\Macaque\macaque_genes.csv")['gene_name'].to_list()
n_genes = len(all_genes)

# --- Atlas and cell-type axes ---
unique_atlas = sorted(region_lookup.atlas_number.unique())
n_atlas = len(unique_atlas)
n_celltype = 258  # hard-coded size of the cell-type axis
print('Define global atlas and celltype dimensions')

# Value -> position along the matching axis of the aggregated arrays.
# NOTE(review): celltype_to_index and unique_celltype are not assigned in this
# cell either, yet the second-pass cell of this old approach uses both.
atlas_to_index = dict(zip(unique_atlas, range(n_atlas)))
gene_to_global_idx = dict(zip(all_genes, range(n_genes)))
print('Create mappings from atlas number / celltype to indices in aggregated arrays.')

# --- Accumulators, indexed as [gene, atlas, celltype] ---
array_shape = (n_genes, n_atlas, n_celltype)
cumulative_sum = np.zeros(array_shape, dtype=np.float64)
cumulative_count = np.zeros(array_shape, dtype=np.int64)
print('Initialize global cumulative arrays:')


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# --- SECOND PASS: Process each file and update global cumulative arrays ---
# OLD APPROACH, kept for reference only (see the "Old way" note above).
# NOTE(review), before re-running this cell:
#   * cell_lookup_dict, celltype_to_index and unique_celltype are read below
#     but only appear in commented-out lines of the preceding setup cell, so
#     they must already exist in the kernel.
#   * metadata is used un-transposed: .iloc[0] / .iloc[1] are rows holding
#     one value per cell, while metadata['...'] = ... adds columns.
#   * rows are dropped from metadata before .groupby(...).indices is taken,
#     and those positions are then used as column positions of sparse_mat --
#     confirm that the two still line up.
#   * the result is written to 'new_aggregated_data.npz', the same file name
#     the current approach saves to.
for file_path in file_list:
    print(f"Processing {file_path}...")
    sparse_mat, local_genes, metadata = load_rds_as_sparse(file_path)
    # Assume that metadata rows correspond to the columns of sparse_mat.
    # NOTE(review): this contradicts the shape remark just below, which says
    # metadata arrives as (6, n_cells).

    # Map each cell's 'cell_id' to its celltype_index using the lookup dictionary.
    # Because of conversion metadata of shape (6, 313454), cell_id is first row
    metadata['celltype_index'] = metadata.iloc[0].map(cell_lookup_dict)
    # Map each cell's 'gene_area' (global_region_id) to its atlas_number.
    # Gene area is the second row
    metadata['atlas_number'] = metadata.iloc[1].map(region_lookup_dict)

    # Drop cells that could not be mapped.
    metadata = metadata.dropna(subset=['celltype_index', 'atlas_number'])

    # Convert the lookup values to indices in our global arrays.
    metadata['celltype_idx'] = metadata['celltype_index'].map(lambda x: celltype_to_index.get(x))
    metadata['atlas_idx'] = metadata['atlas_number'].map(lambda x: atlas_to_index.get(x))
    # Drop rows whose values have no global index, then cast to int so the
    # group keys can be used to index the numpy arrays.
    metadata = metadata.dropna(subset=['celltype_idx', 'atlas_idx'])
    metadata['celltype_idx'] = metadata['celltype_idx'].astype(int)
    metadata['atlas_idx'] = metadata['atlas_idx'].astype(int)

    # Group cells by their (atlas_idx, celltype_idx)
    groups = metadata.groupby(['atlas_idx', 'celltype_idx']).indices
    for (atlas_idx, celltype_idx), cell_indices in groups.items():
        # Sum expression over the cells in this group.
        # sparse_mat[:, cell_indices] has shape (n_local_genes, n_group_cells)
        group_sum = sparse_mat[:, cell_indices].sum(axis=1)  # result is a matrix with shape (n_local_genes, 1)
        group_sum = np.array(group_sum).flatten()  # convert to 1D array
        group_cell_count = len(cell_indices)

        # For each gene in this file (local order), update the corresponding global entry.
        # A gene name missing from gene_to_global_idx raises KeyError here.
        for local_idx, gene in enumerate(local_genes):
            global_idx = gene_to_global_idx[gene]
            cumulative_sum[global_idx, atlas_idx, celltype_idx] += group_sum[local_idx]
            cumulative_count[global_idx, atlas_idx, celltype_idx] += group_cell_count

# --- Compute overall mean expression ---
# Entries with a zero count keep NaN; all others get sum / count.
mean_array = np.full((n_genes, n_atlas, n_celltype), np.nan, dtype=np.float64)
valid = cumulative_count > 0
mean_array[valid] = cumulative_sum[valid] / cumulative_count[valid]

# --- Save the aggregated data ---
# NOTE(review): unique_celltype is not defined in the visible old-approach
# cells, and this overwrites 'new_aggregated_data.npz'.
np.savez('new_aggregated_data.npz',
         mean=mean_array,
         sum=cumulative_sum,
         count=cumulative_count,
         genes=all_genes,
         atlas_numbers=unique_atlas,
         celltype_indices=unique_celltype)
print("Aggregated data saved to new_aggregated_data.npz")

Processing D:\Macaque\cellFiles\total_gene_T100.rds...
Processing D:\Macaque\cellFiles\total_gene_T101.rds...
Processing D:\Macaque\cellFiles\total_gene_T102.rds...
Processing D:\Macaque\cellFiles\total_gene_T103.rds...
Processing D:\Macaque\cellFiles\total_gene_T104.rds...
Processing D:\Macaque\cellFiles\total_gene_T105.rds...
Processing D:\Macaque\cellFiles\total_gene_T106.rds...
Processing D:\Macaque\cellFiles\total_gene_T107.rds...
Processing D:\Macaque\cellFiles\total_gene_T108.rds...
Processing D:\Macaque\cellFiles\total_gene_T109.rds...
Processing D:\Macaque\cellFiles\total_gene_T110.rds...
Processing D:\Macaque\cellFiles\total_gene_T111.rds...
Processing D:\Macaque\cellFiles\total_gene_T112.rds...
Processing D:\Macaque\cellFiles\total_gene_T113.rds...
Processing D:\Macaque\cellFiles\total_gene_T114.rds...
Processing D:\Macaque\cellFiles\total_gene_T115.rds...
Processing D:\Macaque\cellFiles\total_gene_T116.rds...
Processing D:\Macaque\cellFiles\total_gene_T117.rds...
Processing

# Creating the region mappings

In [None]:
import pandas as pd
import json

# Read the macaque1 regions table (uses the 'origin_name', 'global_region_id'
# and 'chip' columns) and the table of valid region names ('Regions' column).
df = pd.read_csv(r"D:\Macaque\regions-macaque1.csv")
regions = pd.read_csv(r"D:\Macaque\MacaqueRegionMappings.csv")

# Helper function to extract the region from the origin_name
def extract_region(origin_name):
    """Return the second '-'-separated field of ``origin_name``.

    Returns None when ``origin_name`` is not a string or when it has fewer
    than three '-'-separated fields.
    """
    if isinstance(origin_name, str):
        fields = origin_name.split('-')
        if len(fields) >= 3:
            return fields[1]
    return None

import json

# Tag every row with the region parsed from its 'origin_name'.
df['region'] = df['origin_name'].apply(extract_region)

# Keep only the rows whose region is one of the valid region names.
regions_list = regions['Regions'].to_list()
is_valid_region = df['region'].isin(regions_list)
df = df.loc[is_valid_region]

# region -> {'global_region_id': [...], 'chip': [...]}, one list entry per row.
per_region_lists = df.groupby('region')[['global_region_id', 'chip']].agg(list)
region_mapping = per_region_lists.to_dict(orient='index')

# Save the region_mapping dictionary to a JSON file.
with open('region_mapping.json', 'w') as outfile:
    outfile.write(json.dumps(region_mapping, indent=4))

print("Region mapping saved to region_mapping.json")

Region mapping saved to region_mapping.json


In [None]:
# Check whether any atlas number is shared by more than one region.
df = pd.read_csv(r"D:\Macaque\MacaqueRegionMappings.csv")
atlas_numbers = df['Atlas #']

print(atlas_numbers.shape)                      # number of rows
print(atlas_numbers.nunique(dropna=False))      # number of distinct atlas numbers
print(df[atlas_numbers.duplicated(keep=False)]) # every row whose atlas number repeats

(143,)
141
   Regions  Atlas #
87     Opt      419
92      PF      389
93     PFG      389
94      PG      419


# Creating Cell_Id Mappings New:

In [None]:
# Load the region mapping from its JSON file:
# region -> {'global_region_id': [...], 'chip': [...]}.
with open('region_mapping.json', 'r') as infile:
    region_mapping = json.load(infile)

# Load the CSV that maps region names to atlas numbers.
# The columns used here are 'Regions' and 'Atlas #'.
region_atlas_df = pd.read_csv(r"D:\Macaque\MacaqueRegionMappings.csv")

# Region name -> atlas number, built once instead of filtering the DataFrame
# again for every region. keep='first' takes the first row of a region that
# is listed more than once.
atlas_by_region = (
    region_atlas_df
    .drop_duplicates(subset='Regions', keep='first')
    .set_index('Regions')['Atlas #']
    .to_dict()
)

# One record per (region, global_region_id) pair. A region missing from the
# atlas table gets atlas_number None.
flat_mapping = [
    {
        'global_region_id': global_id,
        'region': region,
        'atlas_number': atlas_by_region.get(region),
    }
    for region, mapping in region_mapping.items()
    for global_id in mapping['global_region_id']
]

# Convert the flat mapping to a DataFrame and drop exact duplicate rows.
lookup_df = pd.DataFrame(flat_mapping)
lookup_df = lookup_df.drop_duplicates()

# Save this flattened lookup table to a CSV file.
lookup_df.to_csv('global_region_id_to_atlas_number.csv', index=False)
print("Lookup table saved to global_region_id_to_atlas_number.csv")


Lookup table saved to global_region_id_to_atlas_number.csv


In [13]:
import pandas as pd
import pickle # to interact with the dataframes!
# Read only the four columns needed and keep the rows belonging to macaque1.
cell_types = pd.read_csv(
    r"D:\Macaque\ST.CellID.CellType.3monkeys.all.tsv",
    sep='\t',
    usecols=['macaque', 'slide', 'cell_id', 'celltype'],
)
is_macaque1 = cell_types['macaque'] == 'macaque1'
cell_types_macaque1 = cell_types.loc[is_macaque1].copy()

# celltype -> integer index, numbered in order of first appearance.
unique_celltypes = cell_types_macaque1['celltype'].unique()
celltype_mapping = dict(zip(unique_celltypes, range(len(unique_celltypes))))
cell_types_macaque1['celltype_index'] = cell_types_macaque1['celltype'].map(celltype_mapping)

# slide -> DataFrame with that slide's cell_id / celltype / celltype_index rows.
lookup_columns = ['cell_id', 'celltype', 'celltype_index']
slide_lookup = {
    slide: group[lookup_columns]
    for slide, group in cell_types_macaque1.groupby('slide')
}

# Show the table of the first slide as an example.
example_slide = list(slide_lookup)[0]
print("Slide:", example_slide)
print(slide_lookup[example_slide])

# Persist the per-slide lookup for the aggregation cell.
with open('CellID_Per_Slide_Mapping.pkl', 'wb') as outfile:
    pickle.dump(slide_lookup, outfile)

print("Lookup saved as a pickle file.")

Slide: T100
           cell_id celltype  celltype_index
25035597       1.0     L4.3              97
25035598       3.0     L6.7             122
25035599       5.0     L2.2              96
25035600      12.0   L2/3.8               4
25035601      14.0     L4.2               0
...            ...      ...             ...
25349046  995833.0  PVALB.9              31
25349047  995836.0     L6.5              54
25349048  995837.0   L5/6.3              84
25349049  995839.0   L5/6.4              83
25349050  995842.0    ASC.7              55

[313454 rows x 3 columns]
Lookup saved as a pickle file.


# Creating the Cell_Id Mappings (OLD): does not take into account the different slides (T100 etc.) or the different macaques!
(95 of the 32 million cells have no cell ID)

In [None]:
# Load just the cell_id and celltype columns of the TSV.
df = pd.read_csv(
    r"D:\Macaque\ST.CellID.CellType.3monkeys.all.tsv",
    sep='\t',
    usecols=['cell_id', 'celltype'],
)

# One row per distinct (cell_id, celltype) pair.
lookup_df = df.loc[:, ['cell_id', 'celltype']].drop_duplicates()

# Write the de-duplicated lookup table to CSV.
lookup_df.to_csv('cell_id_to_celltype_lookup.csv', index=False)
print("Unique lookup table saved to cell_id_to_celltype_lookup.csv")

Unique lookup table saved to cell_id_to_celltype_lookup.csv


In [None]:
# Inspect the lookup table: contents, number of distinct cell types
# (a missing value counts as one), and number of rows without a cell type.
df = pd.read_csv(r"D:\Macaque\cell_id_to_celltype_lookup.csv")
print(df)
print(df['celltype'].nunique(dropna=False))
nan_rows = df.loc[df['celltype'].isna()]
print(len(nan_rows))

   cell_id celltype
0      3.0     L4.2
1      5.0     L6.4
2      8.0    OLG.8
3     14.0  L4/5.12
4     15.0   L2/3.8


In [15]:
# Start from the existing cell_id -> celltype lookup table.
lookup_df = pd.read_csv('cell_id_to_celltype_lookup.csv')

# Distinct cell types in order of first appearance, numbered from 0.
unique_celltypes = lookup_df['celltype'].drop_duplicates().reset_index(drop=True)
celltype_to_index = dict(zip(unique_celltypes, range(len(unique_celltypes))))

# Attach the index of each row's cell type as a new 'celltype_index' column.
lookup_df = lookup_df.assign(
    celltype_index=lookup_df['celltype'].map(celltype_to_index)
)

# Write the extended lookup table to a new CSV file.
lookup_df.to_csv('cell_id_to_celltype_lookup_with_index.csv', index=False)
print("Updated lookup table saved to cell_id_to_celltype_lookup_with_index.csv")

Updated lookup table saved to cell_id_to_celltype_lookup_with_index.csv


In [18]:
# First rows of the lookup table, then the number of distinct celltype_index values.
first_rows = lookup_df.head()
print(first_rows)
print(lookup_df['celltype_index'].nunique(dropna=False))

   cell_id celltype  celltype_index
0      3.0     L4.2               0
1      5.0     L6.4               1
2      8.0    OLG.8               2
3     14.0  L4/5.12               3
4     15.0   L2/3.8               4
258
