#### Prediction of Cell Types with Autogated Results

In [16]:
import pandas as pd
import numpy as np
import sklearn
import anndata as ad

#### Prediction Steps
1. Map autogated predictions to anndata
2. Split data by regions
3. Group ~90% of cells as training (wrt region, so whole regions stay together) and the remaining ~10% as testing
4. Apply a two-layer MLP

#### Step 1: Data Cleaning - Merging Autogated Predictions with Anndata

In [30]:
# Read the auto-gating results, rename the 'CellID' column to 'cell_id',
# and use that column as the DataFrame index.
autogated = (
    pd.read_csv('/mnt/disks/ecdyer-disk1/cell_type_assignments/scimap_auto_gating.csv')
    .rename(columns={'CellID': 'cell_id'})
    .set_index('cell_id')
)

# Read the AnnData object holding the embeddings.
adata = ad.read_h5ad('/mnt/disks/ecdyer-disk1/data/mel01_3_1_embeddings.h5ad')

# Re-index adata.obs in place by its own 'cell_id' column so both tables are
# keyed the same way.
# NOTE(review): nothing in this cell copies any column from `autogated` into
# `adata`; presumably the actual mapping is done elsewhere -- confirm.
adata.obs.set_index('cell_id', inplace=True)

In [36]:
# Write the current state of `adata` to an .h5ad file on disk.
updated_adata_path = '/mnt/disks/ecdyer-disk1/data/updated_mel01_3_1_embeddings.h5ad'
adata.write(updated_adata_path)

In [37]:
# Load the AnnData object used for every later step of this notebook.
# NOTE(review): this reads 'autogated_mel01_3_1_embeddings.h5ad', whereas the
# preceding cell writes 'updated_mel01_3_1_embeddings.h5ad'; no visible cell
# produces this file -- confirm where it comes from.
matched_adata_path = '/mnt/disks/ecdyer-disk1/data/autogated_mel01_3_1_embeddings.h5ad'
matched_adata = ad.read_h5ad(matched_adata_path)



In [34]:
# Display the per-observation metadata table (`obs`) of the loaded AnnData object.
matched_adata.obs

Unnamed: 0,cell_id,roi_im_fnames,region_names,region_id,row_index,col_index,image_sample_name,htan_image_id,patient_outcome,image_histology,patch_centroids_row_orig,patch_centroids_col_orig,patch_centroids_row_adj,patch_centroids_col_adj,patch_fname,boundary_status,updated_phenotypes
0,182870,MEL01-3-1_roi_532_row_8800_col_37440.ome.tif,MEL01-3-1_roi_532_row_8800_col_37440,532,8800,37440,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,8802.828025,38665.305732,2.828025,1225.305732,MEL01-3-1_roi_532_row_8800_col_37440_patch_0.tiff,border,Tumor
1,182906,MEL01-3-1_roi_532_row_8800_col_37440.ome.tif,MEL01-3-1_roi_532_row_8800_col_37440,532,8800,37440,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,8800.314961,37795.598425,0.314961,355.598425,MEL01-3-1_roi_532_row_8800_col_37440_patch_1.tiff,border,Tumor
2,182908,MEL01-3-1_roi_532_row_8800_col_37440.ome.tif,MEL01-3-1_roi_532_row_8800_col_37440,532,8800,37440,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,8800.876847,38026.689655,0.876847,586.689655,MEL01-3-1_roi_532_row_8800_col_37440_patch_2.tiff,border,Dendritic cell
3,182946,MEL01-3-1_roi_532_row_8800_col_37440.ome.tif,MEL01-3-1_roi_532_row_8800_col_37440,532,8800,37440,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,8800.463964,37564.418919,0.463964,124.418919,MEL01-3-1_roi_532_row_8800_col_37440_patch_3.tiff,border,Dendritic cell
4,182947,MEL01-3-1_roi_532_row_8800_col_37440.ome.tif,MEL01-3-1_roi_532_row_8800_col_37440,532,8800,37440,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,8800.573991,37587.973094,0.573991,147.973094,MEL01-3-1_roi_532_row_8800_col_37440_patch_4.tiff,border,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563166,397358,MEL01-3-1_roi_849_row_14400_col_39520.ome.tif,MEL01-3-1_roi_849_row_14400_col_39520,849,14400,39520,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,15197.607287,40045.615385,797.607287,525.615385,MEL01-3-1_roi_849_row_14400_col_39520_patch_19...,non-border,Tumor
563167,397359,MEL01-3-1_roi_849_row_14400_col_39520.ome.tif,MEL01-3-1_roi_849_row_14400_col_39520,849,14400,39520,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,15198.424812,40437.992481,798.424812,917.992481,MEL01-3-1_roi_849_row_14400_col_39520_patch_19...,non-border,Tumor
563168,397413,MEL01-3-1_roi_849_row_14400_col_39520.ome.tif,MEL01-3-1_roi_849_row_14400_col_39520,849,14400,39520,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,15197.411321,39790.943396,797.411321,270.943396,MEL01-3-1_roi_849_row_14400_col_39520_patch_19...,non-border,Tumor
563169,397459,MEL01-3-1_roi_849_row_14400_col_39520.ome.tif,MEL01-3-1_roi_849_row_14400_col_39520,849,14400,39520,MEL01-3-1,HTA7_1_9,Recurrence,Superficial Spreading Melanoma,15199.081301,40400.731707,799.081301,880.731707,MEL01-3-1_roi_849_row_14400_col_39520_patch_19...,non-border,Tumor


#### Step 2: Split Data by Regions and Create Test/Train Splits

In [None]:
# Hold out one region (used for visualization) and save it as its own file.
validation_region = 'MEL01-3-1_roi_573_row_9600_col_33280'

# Metadata rows whose 'region_names' value equals the held-out region.
validation_obs = matched_adata.obs.loc[
    matched_adata.obs['region_names'].eq(validation_region)
]

# Select those observations from the AnnData by their obs index labels and
# take an independent copy.
filtered_anndata = matched_adata[validation_obs.index].copy()

# Write the validation-region subset to disk.
filtered_anndata.write('/mnt/disks/ecdyer-disk1/data/validation-viz-region.h5ad')

In [40]:
# Keep every observation whose region differs from the held-out validation
# region, as an independent copy.
outside_validation = matched_adata.obs['region_names'].ne(validation_region)
no_val_adata = matched_adata[outside_validation].copy()

In [41]:
# Region-level split: whole regions are assigned to one side, so no region
# contributes cells to both adata_90 and adata_10.

# Target fraction of cells for the larger split and the shuffle seed.
TRAIN_FRACTION = 0.9
SPLIT_SEED = 42

# Step 1: Calculate the total number of cells
total_cells = no_val_adata.n_obs

# Step 2: Calculate the target number of cells (90% of total)
target_cells = total_cells * TRAIN_FRACTION

# Step 3: Group by 'region_names' and count the number of cells in each region.
# observed=True is passed explicitly so that, if 'region_names' is categorical,
# only regions actually present are counted and the result does not depend on
# pandas' changing default for `observed`.
region_counts = (
    no_val_adata.obs.groupby('region_names', observed=True)
    .size()
    .reset_index(name='cell_count')
)

# Step 4: Shuffle the regions (fixed seed for reproducibility)
shuffled_regions = region_counts.sample(
    frac=1, random_state=SPLIT_SEED
).reset_index(drop=True)

# Step 5: Accumulate regions until reaching at least 90% of total cells.
# The region that crosses the threshold is included, so the larger split can
# slightly exceed the target.
cumulative_cells = 0
selected_regions = []

for region_name, cell_count in zip(
    shuffled_regions['region_names'], shuffled_regions['cell_count']
):
    cumulative_cells += cell_count
    selected_regions.append(region_name)
    if cumulative_cells >= target_cells:
        break

# Create a boolean mask for the selected regions
mask = no_val_adata.obs['region_names'].isin(selected_regions)

# Step 6: Split the AnnData object into two datasets
adata_90 = no_val_adata[mask].copy()
adata_10 = no_val_adata[~mask].copy()

# Sanity checks: the two splits partition the data, and the larger split holds
# exactly the cells of the selected regions.
assert adata_90.n_obs + adata_10.n_obs == total_cells, "splits do not cover all cells"
assert adata_90.n_obs == cumulative_cells, "selected-region cell count mismatch"


  no_val_adata.obs.groupby('region_names')


In [42]:
# Write each split to its own .h5ad file.
split_outputs = (
    (adata_90, '/mnt/disks/ecdyer-disk1/data/90_split.h5ad'),
    (adata_10, '/mnt/disks/ecdyer-disk1/data/10_split.h5ad'),
)
for split_adata, split_path in split_outputs:
    split_adata.write(split_path)

#### Step 3: Train MLP