In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob
import os
import re
# Setting MAX_IMAGE_PIXELS to None turns off Pillow's pixel-count limit
# (its decompression-bomb check) so very large images can be opened.
# Only appropriate when every image opened by this notebook is trusted.
Image.MAX_IMAGE_PIXELS = None

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Marker names to quantify. Each name is matched against image file names and
# also becomes a column name in the output table, so order matters.
marker_list = [
    'CD11B',
    'CD20',
    'CD3d',
    'CD45',
    'CD4',
    'CD68',
    'CD8',
    'CgA',
    'Lysozyme',
    'NaKATPase',
    'PanCK',
    'SMA',
    'Sox9',
    'Vimentin',
    'OLFM4',
]


In [3]:
# Directory that is searched for the per-marker image files.
image_dir = '/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed'

In [4]:
# Map each marker name to the list of image files found for it in image_dir.
marker_files = {}

for marker in marker_list:
    # The glob is only a loose substring match: '*CD4*' also matches CD45 files.
    files = glob.glob(os.path.join(image_dir, f"*{marker}*"))

    # Keep only files whose base name contains the marker as a complete
    # underscore-delimited token. os.path.basename replaces manual '/'
    # splitting. glob returns matches in arbitrary order, so sort to make the
    # "first file" picked downstream reproducible between runs.
    files = sorted(
        path for path in files
        if f"GCA020TIB_TISSUE01_{marker}_" in os.path.basename(path)
    )

    marker_files[marker] = files

In [5]:
# Display the mapping to check which files were matched for each marker.
marker_files

{'CD11B': ['/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed/ROUND_04_CY2_GCA020TIB_TISSUE01_CD11B_normalized_corrected.tif'],
 'CD20': ['/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed/ROUND_04_CY5_GCA020TIB_TISSUE01_CD20_normalized_corrected.tif'],
 'CD3d': ['/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed/ROUND_10_CY3_GCA020TIB_TISSUE01_CD3d_normalized_corrected.tif'],
 'CD45': ['/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed/ROUND_02_CY5_GCA020TIB_TISSUE01_CD45_normalized_corrected.tif'],
 'CD4': ['/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed/ROUND_08_CY5_GCA020TIB_TISSUE01_CD4_normalized_corrected.tif'],
 'CD68': ['/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed/ROUND_14_CY3_GCA020TIB_TISSUE01_CD68_normalized_corrected.tif'],
 'CD8': ['/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/AF_Removed/ROUND_12_CY5_GCA020TIB_TISSUE01_CD8_normaliz

In [6]:
# Placeholder for the per-instance rows.
# NOTE(review): `data` is rebound in the later processing cell, so this
# standalone initialisation appears to be redundant.
data=[]

In [7]:
# Read the label mask. The context manager releases the underlying file handle
# as soon as the pixels have been copied into a numpy array, instead of leaving
# it open for the lifetime of the kernel.
with Image.open('/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA020TIB_TISSUE01/mask.tif') as mask:
    mask_np = np.array(mask)

# Distinct label values present in the mask, excluding 0
# (presumably the background label; confirm against how the mask was produced).
unique_instances = np.unique(mask_np)
unique_instances = unique_instances[unique_instances != 0]

In [9]:
# Number of distinct non-zero labels found in the mask.
len(unique_instances)

18443

In [8]:
import concurrent.futures
from tqdm import tqdm

# Decoded marker images keyed by file path. Shared by every call to
# process_instance so that each file is read and decoded only once rather
# than once per instance. Re-running this cell resets the cache.
_marker_image_cache = {}


def _load_marker_image(path):
    """Return the image stored at ``path`` as a numpy array, decoding it at most once.

    The decoded array is kept in ``_marker_image_cache``. When called from
    several threads at once, two threads may both decode the same file before
    either stores it; that only costs a duplicate read, the result is the same.
    """
    image_np = _marker_image_cache.get(path)
    if image_np is None:
        # Close the file handle as soon as the pixel data has been copied out.
        with Image.open(path) as image:
            image_np = np.array(image)
        _marker_image_cache[path] = image_np
    return image_np


def process_instance(instance):
    """Build one output row for a single labelled instance of ``mask_np``.

    Parameters
    ----------
    instance
        Label value to look up in the module-level ``mask_np`` array.

    Returns
    -------
    list
        ``[instance, centroid_x, centroid_y, mean_1, ..., mean_k]`` where the
        centroid is the mean column / row index of the pixels equal to
        ``instance`` and ``mean_i`` is the mean pixel value of the i-th marker
        image (in ``marker_files`` iteration order) over those same pixels.

    Raises
    ------
    IndexError
        If a marker in ``marker_files`` has an empty file list.

    Notes
    -----
    All marker images stay in memory after first use. This trades memory for
    not re-decoding every image on every call.
    """
    # Boolean mask selecting the pixels that belong to this instance.
    instance_mask = mask_np == instance

    # np.where returns (row indices, column indices); x is the column axis.
    y_indices, x_indices = np.where(instance_mask)
    row = [instance, np.mean(x_indices), np.mean(y_indices)]

    for marker, files in marker_files.items():
        if not files:
            # Same exception type as indexing an empty list, but names the marker.
            raise IndexError(f"no image file found for marker {marker!r}")

        # Only the first file listed for a marker is used.
        image_np = _load_marker_image(files[0])

        # Mean intensity of this marker over the instance's pixels.
        row.append(np.mean(image_np[instance_mask]))

    return row

# Start from an empty result so `data` is defined even if the pool fails.
data = []

# Fan the per-instance work out over a thread pool. executor.map yields the
# rows in the same order as unique_instances; tqdm wraps that iterator to
# report progress against the known total.
with concurrent.futures.ThreadPoolExecutor() as executor:
    row_iter = executor.map(process_instance, unique_instances)
    progress = tqdm(row_iter, total=len(unique_instances))
    data = list(progress)

In [66]:
# One row per instance: label, centroid, then one mean-intensity column per
# marker in marker_list order.
df1 = pd.DataFrame(
    data,
    columns=['Instance', 'Centroid_X', 'Centroid_Y', *marker_list],
)

In [68]:
# Preview the last rows of the assembled table.
df1.tail()

Unnamed: 0,Instance,Centroid_X,Centroid_Y,CD11B,CD20,CD3D,CD45,CD4,CD68,CD8,CGA,LYSOZYME,NAKATPASE,PANCK,SMA,SOX9,VIMENTIN,OLFM4
2020,2021,1699.250597,2793.434368,22.262529,32.264915,14.360382,12.94272,13.940334,39.455849,9.28401,55.56802,28.980907,4.627685,15.983294,2.584726,30.51074,10.594272,6.272077
2021,2022,1695.616114,2805.753555,7.312796,7.592417,7.255924,8.270143,8.587678,16.78673,6.379147,16.047394,10.118484,3.156398,6.241706,1.620853,11.549763,2.312796,1.772512
2022,2023,1754.052933,2863.974249,9.117311,12.030043,5.463519,8.658083,10.321888,16.856939,5.37196,27.243204,16.828325,2.117311,5.357654,3.889843,16.719599,6.477826,2.018598
2023,2024,1680.765625,2869.058036,24.90625,24.131697,15.752233,16.859375,15.296875,32.745537,8.669642,67.216515,27.316965,3.566964,15.392858,4.28125,27.727678,13.973214,4.095982
2024,2025,1687.907595,2889.232911,17.944304,22.192406,10.601266,10.473417,12.175949,27.079746,6.670886,39.589874,24.096203,3.079747,8.650633,2.872152,22.970886,27.127848,3.520253


In [72]:
# Write the table to CSV without the DataFrame index.
# NOTE(review): the output path is under GCA112TIA/Unregistered and is named
# "unregistered_...", whereas the images and mask read in this notebook come
# from MxIF_Vxm_Registered/GCA020TIB_TISSUE01. Confirm this destination is
# intended before re-running, to avoid overwriting another sample's results.
df1.to_csv('/fs5/p_masi/rudravg/MxIF_Vxm_Registered/GCA112TIA/Unregistered/unregistered_instance_data_allMarkers2.csv', index=False)