# Assigning Track IDs to well index

Each run has a CSV file that records the last image captured before an object was dispensed into a given well. This can be used to identify the track that was dispensed.

In [7]:
import tifffile as tiff
import numpy as np
import tifffile as tiff
from skimage.measure import label, regionprops
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from PIL import Image

def load_resize_combine_display(tif_path, png_path, output_path=None):
    """
    Render a TIFF image and a PNG image side by side, optionally saving the figure.

    The PNG is resized (LANCZOS) to the TIFF's width and height before plotting.
    The figure is closed without being shown, so the only visible result is the
    file written to `output_path` (when one is given).

    Parameters:
    - tif_path: str, path to the TIFF image file.
    - png_path: str, path to the PNG image file.
    - output_path: str, path to save the combined image (optional).
    """
    tif_image = tiff.imread(tif_path)

    # PIL expects (width, height); numpy shape is (rows, cols, ...)
    target_size = (tif_image.shape[1], tif_image.shape[0])
    png_array = np.array(Image.open(png_path).resize(target_size, Image.LANCZOS))

    fig, (tif_axis, png_axis) = plt.subplots(1, 2, figsize=(12, 6))

    # (axis, pixels, title, extra imshow keyword arguments) for each panel
    panels = (
        (tif_axis, tif_image, 'TIFF Image (Viridis)', {'cmap': 'viridis'}),
        (png_axis, png_array, 'Resized PNG Image', {}),
    )
    for axis, pixels, title, imshow_kwargs in panels:
        axis.imshow(pixels, **imshow_kwargs)
        axis.set_title(title)
        axis.axis('off')

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=300)

    # The figure is intentionally not shown; release it to avoid accumulating open figures
    plt.close(fig)


def find_highest_object(frame, min_row_fraction=0.625):
    """
    Find the labelled object that reaches closest to the bottom edge of the frame.

    Only objects whose lowest pixel (largest row index) lies at or below the row
    `floor(min_row_fraction * frame.shape[0])` are eligible. With the default of
    0.625 this is the same cut-off as the original `2.5 * frame.shape[0] // 4`,
    i.e. the bottom 37.5% of the image (not the bottom quarter).

    Parameters:
    - frame: numpy array, the pixel matrix where entries are object assignments
      (0 is treated as background). The first axis is taken as the y/row axis.
    - min_row_fraction: float, fraction of the frame height (measured from the top)
      that an object's lowest pixel must reach to be considered.

    Returns:
    - highest_object_id: the ID of the eligible object with the largest row index,
      or 0 when no object qualifies. When several objects share that row, the
      smallest ID wins.
    - highest_y: the largest row index of that object, or 0 when no object qualifies.
    """
    # Floor to a whole row, matching the original floor-division threshold
    threshold = (min_row_fraction * frame.shape[0]) // 1

    highest_object_id = 0
    highest_y = 0

    # np.unique returns IDs in ascending order, so with the strict `>` below
    # the smallest ID is kept when two objects tie on their lowest row.
    for object_id in np.unique(frame):
        if object_id == 0:
            continue  # background

        # Row indices of every pixel belonging to this object
        object_rows = np.nonzero(frame == object_id)[0]
        max_y = object_rows.max()

        if max_y >= threshold and max_y > highest_y:
            highest_y = max_y
            highest_object_id = object_id

    return highest_object_id, highest_y

def numerical_sort(value):
    """
    Sort key that orders filenames by the first run of digits they contain.

    Returns that run as an int (e.g. '<number>_htert_Run' -> <number>).
    When the string contains no digits, the string itself is returned unchanged.
    """
    first_number = re.search(r'\d+', value)
    if first_number is None:
        return value
    return int(first_number.group())


def link_track_to_well(logfile_directory, tracked_tif_directory, linkfile_directory, output_directory=None):
    """
    Assign a track ID to every well (R, C) listed in the dispense log.

    For each well, the log entry with the largest frame number is taken as the
    "after dispense" frame. The link file (columns 'tifs' and 'imgs') is then used
    to pick a short window of tracked TIFFs at or just before that frame; they are
    examined from latest to earliest and the first one in which
    `find_highest_object` returns a non-zero ID provides the well's track ID.

    Parameters:
    - logfile_directory: str, path to the log CSV (columns include 'file_name', 'R', 'C').
    - tracked_tif_directory: str, directory holding the tracked TIFF frames.
    - linkfile_directory: str, path to the CSV mapping 'tifs' to 'imgs'.
    - output_directory: str, optional directory in which
      'track_to_well_unfiltered.csv' is written (created if missing).

    Returns:
    - pandas.DataFrame with columns track_ID, after_dispense_frame,
      last_tracked_frame, last_tracked_tif, row, col. track_ID is 0 when no
      track could be associated with the well.
    """
    linkfile = pd.read_csv(linkfile_directory)
    # NOTE(review): this is a plain sort on the 'tifs' values; if the names are not
    # zero-padded the positional order may differ from numeric order - confirm.
    linkfile = linkfile.sort_values(by='tifs').reset_index(drop=True)
    logfile = pd.read_csv(logfile_directory, dtype={"file_name": str, "R": int, "C": int, "Number_of_droplets": int, "Model_output0": float, "Model_output1": float, "Model_output2": float, "Prediction": int})
    gr_logfile = logfile.groupby(['R', 'C'])

    final_dict = {"track_ID": [],
                  "after_dispense_frame": [],
                  "last_tracked_frame": [],
                  "last_tracked_tif": [],
                  "row": [],
                  "col": []}

    for (r, c), group in gr_logfile:
        # Defaults used when no track can be associated with this well
        track_ID = 0
        # Last frame logged for this (r, c) well
        after_dispense_frame = max(group['file_name'].tolist(), key=numerical_sort)
        after_dispense_number = numerical_sort(after_dispense_frame)
        last_tracked_frame = after_dispense_frame
        last_tracked_tif = ""  # Empty because the image may not have a corresponding tif file

        # Candidate frames: up to 5 link-file rows at/just before the after-dispense frame
        tif_to_consider = []
        img_to_consider = []
        frames_before = [f for f in linkfile["imgs"] if numerical_sort(f) <= after_dispense_number]

        if len(frames_before) > 0:
            prev_closest_frame = max(frames_before, key=numerical_sort)
            tif_after_dispense = linkfile.loc[linkfile["imgs"] == prev_closest_frame]["tifs"]

            if len(tif_after_dispense) != 0:
                # The index was reset above, so this label is also the row position
                i = tif_after_dispense.index[0]
                if i >= 4 and after_dispense_number > numerical_sort(prev_closest_frame):
                    # Closest linked frame precedes the after-dispense frame: include it
                    window = linkfile.iloc[i-4:i+1]
                elif i >= 4:
                    # Closest linked frame IS the after-dispense frame: exclude it
                    window = linkfile.iloc[i-4:i]
                elif i != 0:
                    # Fewer than four earlier rows exist: take all of them
                    window = linkfile.iloc[:i]
                else:
                    window = linkfile.iloc[0:0]

                tif_to_consider = sorted(window['tifs'], key=numerical_sort, reverse=True)
                img_to_consider = sorted(window['imgs'], key=numerical_sort, reverse=True)

            # Walk from the latest candidate to the earliest
            for tif_name, img_name in zip(tif_to_consider, img_to_consider):
                tracked_tif = tiff.imread(os.path.join(tracked_tif_directory, tif_name))
                track_ID, _ = find_highest_object(tracked_tif)
                last_tracked_frame = img_name
                last_tracked_tif = tif_name
                if track_ID:
                    break  # Stop if we find one that fits into our threshold

        final_dict["track_ID"].append(int(track_ID))
        final_dict["after_dispense_frame"].append(after_dispense_frame)
        final_dict["last_tracked_frame"].append(last_tracked_frame)
        final_dict["last_tracked_tif"].append(last_tracked_tif)
        final_dict["row"].append(r)
        final_dict["col"].append(c)

    # Build the result once and reuse it for both saving and returning
    result = pd.DataFrame(final_dict)

    if output_directory:
        # to_csv does not create missing directories
        os.makedirs(output_directory, exist_ok=True)
        result.to_csv(os.path.join(output_directory, "track_to_well_unfiltered.csv"), index=False)

    return result





In [35]:
# Run configuration: dataset / print-run identifiers and the paths derived from them.
dataset = "A138856A"
printrun = "10dropRun4"
# CSV log read by link_track_to_well (grouped by its 'R' and 'C' columns)
logfile_directory = f'/projects/steiflab/archive/data/imaging/{dataset}/NozzleImages/{printrun}/LogFile.csv'
# Directory containing the tracked TIFF frames
tracked_tif_directory = f'/projects/steiflab/scratch/leli/trackastra/{dataset}/{printrun}/tracked_again_postprocessed_2.0'
# Not passed to link_track_to_well; read by the debugging cell further down
fluro_directory = f'/projects/steiflab/archive/data/imaging/{dataset}/MicroscopeImages/S0000/'
# CSV with the 'tifs' <-> 'imgs' mapping
linkfile_directory = f'/projects/steiflab/scratch/leli/trackastra/{dataset}/{printrun}/tracked_again_postprocessed_2.0/tif_to_img.csv'

# Writes track_to_well_unfiltered.csv into the output directory and keeps the result in df
df = link_track_to_well(logfile_directory, tracked_tif_directory, linkfile_directory, output_directory = f'/projects/steiflab/scratch/leli/{dataset}/{printrun}/track_to_well')

## Debugging

In [None]:
# Step-by-step replay of the link_track_to_well logic for a single well (r, c),
# with extra prints, plus a fluorescence-based label for that well.
r = 18
c = 36
linkfile = pd.read_csv(linkfile_directory)
linkfile = linkfile.sort_values(by='tifs').reset_index(drop=True)
# print(linkfile)
logfile = pd.read_csv(logfile_directory, dtype={"file_name": str, "R": int, "C": int, "Number_of_droplets": int, "Model_output0": float, "Model_output1": float, "Model_output2": float, "Prediction": int})
gr_logfile = logfile.groupby(['R', 'C'])
group = gr_logfile.get_group((r, c))

# Fluorescence image of this well, looked up under the column's sub-folder
fluro_img = tiff.imread(os.path.join(fluro_directory, f"C{c:02d}", f"R{r:02d}_C{c:02d}_0000_00_Cyan.tif"))


# Establish the values that would be put into the dataframe
track_ID = 0 # default, meaning no track associated
after_dispense_frame = sorted(group['file_name'].tolist(), key = numerical_sort, reverse=True)[0] # This is the last frame for this (r, c) well
last_tracked_frame = after_dispense_frame 
last_tracked_tif = "" # empty because the image may not have a corresponding tif file
prediction = np.max(group['Prediction'].tolist())
# More than one positive prediction for the same well is treated as an error
if group.loc[group['Prediction'] > 0].shape[0] > 1: raise ValueError(f"there are multiple predicitons that are greater than 0 {group['Prediction']}")
# Relative path of the fluorescence image; assigned here but not used later in this cell
fluro_image = os.path.join(f"C{c:02d}", f"R{r:02d}_C{c:02d}_0000_00_Cyan.tif")


# Label the well from the model prediction and the fluorescence image.
# NOTE(review): 2000 looks like an intensity cut-off on the fluorescence image - confirm its origin.
# NOTE(review): is_fluro stays undefined if prediction is not 0, 1 or 2.
if prediction == 0: 
    is_fluro = "NCC"
elif prediction == 1: 
    is_fluro = "singlecell"
    if np.max(np.array(fluro_img)) < 2000: 
        is_fluro = "deadcell"
elif prediction == 2: 
    if np.max(np.array(fluro_img)) >= 2000:
        is_fluro = "debris"
    else: 
        is_fluro = "multicell"

#is_fluro = "livecell" if np.max(np.array(fluro_img)) >= 2000 else "NCC_debris"

# Obtain the window of link-file rows (at most 5) at or just before the after-dispense frame
tif_to_consider = []
img_to_consider = []
frames_before = [f for f in linkfile["imgs"] if numerical_sort(f) <= numerical_sort(after_dispense_frame)]
if len(frames_before) == 0: 
    print(f"There are zero frames including the current after dispense frame one {after_dispense_frame}")
else: 
    prev_closest_frame = sorted(frames_before, key = numerical_sort, reverse = True)[0]
    print(f"prev_closest_frame is {prev_closest_frame}")
    tif_after_dispense = linkfile.loc[linkfile["imgs"] == prev_closest_frame]["tifs"]
    if len(tif_after_dispense) != 0:
        # Index was reset above, so the label doubles as the row position for iloc
        i = tif_after_dispense.index[0]
        #print(f"The index of the tif after dispense is {i}")
        if i >= 4 and numerical_sort(after_dispense_frame) > numerical_sort(prev_closest_frame): 
            # Closest linked frame is earlier than the after-dispense frame: include row i
            tif_to_consider = sorted(linkfile.iloc[i-4:i+1]['tifs'], key = numerical_sort, reverse = True)
            img_to_consider = sorted(linkfile.iloc[i-4:i+1]['imgs'], key = numerical_sort, reverse = True)
        elif i >= 4: 
            # Closest linked frame is the after-dispense frame itself: exclude row i
            tif_to_consider = sorted(linkfile.iloc[i-4:i]['tifs'], key = numerical_sort, reverse = True)
            img_to_consider = sorted(linkfile.iloc[i-4:i]['imgs'], key = numerical_sort, reverse = True)
        elif i != 0: 
            # Fewer than four earlier rows: take every row before i
            tif_to_consider = sorted(linkfile.iloc[:i]['tifs'], key = numerical_sort, reverse = True)
            img_to_consider = sorted(linkfile.iloc[:i]['imgs'], key = numerical_sort, reverse = True)
    print(f"tif_to_consider: {tif_to_consider}")
    for tif_name, img_name in zip(tif_to_consider, img_to_consider):

        tracked_tif = tiff.imread(os.path.join(tracked_tif_directory, tif_name))
        print(f"the tracked tif file is {tif_name}")

        track_ID, highest_y = find_highest_object(tracked_tif)
        last_tracked_frame = img_name
        last_tracked_tif = tif_name
        if track_ID: break # candidates are sorted from later frames to earlier, so stop at the first one that passes the threshold

print(f"The print ID ends up being : {track_ID}")

## Here we have the df ready where we can get some analysis done and see what is happening 

In [None]:
#df = pd.read_csv('/projects/steiflab/scratch/leli/trackastra/track_to_well/track_to_well.csv')
# NOTE(review): this cell needs a 'type' column in df; the frame returned by
# link_track_to_well has no such column, so df must come from a file that does.

# Rows where 'track_ID' is 0 and 'type' is 'livecell'
livecell_track0 = df[(df['track_ID'] == 0) & (df['type'] == 'livecell')]

# Get the number of such rows
num_livecell_track0 = livecell_track0.shape[0]

# Get the total number of 'livecell' types
total_livecell = df[df['type'] == 'livecell'].shape[0]

# Calculate the percentage; report NaN instead of raising ZeroDivisionError
# when the dataframe contains no 'livecell' rows at all
if total_livecell:
    percentage_livecell_track0 = (num_livecell_track0 / total_livecell) * 100
else:
    percentage_livecell_track0 = float('nan')

print(f"Number of 'livecell' types with a track_ID of 0: {num_livecell_track0}")
print(f"Total number of 'livecell' types: {total_livecell}")
print(f"Percentage of 'livecell' types with a track_ID of 0: {percentage_livecell_track0:.2f}%")

# Get the corresponding 'last_tracked_tif' values
last_tracked_tif_values = livecell_track0['last_tracked_tif']

# Print the 'last_tracked_tif' values
print("Corresponding 'last_tracked_tif' values for 'livecell' types with a track_ID of 0:")
for tif in last_tracked_tif_values:
    print(tif)

## Q: How many live cells do not have a track associated with them?

In [None]:
# Number of 'singlecell' rows with no track (track_ID == 0).
# NOTE: only the last expression of a cell is rendered, so this count is computed but not shown.
len(df.loc[(df['track_ID'] == 0) & (df['type'] == 'singlecell')]['fluro_image'])
# Displayed as the cell output: the column names of df
df.columns

In [None]:
# Calculate the counts of each type (requires a 'type' column in df)
type_counts = df['type'].value_counts()

# Calculate the percentages of each type (normalized counts scaled to 0-100)
type_percentages = df['type'].value_counts(normalize=True) * 100

# Print the counts and percentages, one line per type
print("Counts and Percentages of each type:")
for type_value, count in type_counts.items():
    percentage = type_percentages[type_value]
    print(f"{type_value}: {count} ({percentage:.2f}%)")

# Maximum number of rows to sample from each type (passed to print_random_samples below)
n_samples = 10

# Function to print the last_tracked_tif values for each type
def print_random_samples(df, n_samples):
    """
    Print a reproducible random sample of 'last_tracked_tif' values for every type.

    For each distinct value of df['type'] (in order of first appearance), up to
    `n_samples` rows are drawn with a fixed seed (random_state=42) and their
    'last_tracked_tif' values are printed as a list, followed by a blank line.
    """
    for type_value in df['type'].unique():
        rows_of_type = df[df['type'] == type_value]

        # Never ask for more rows than the type actually has
        sample_size = min(n_samples, len(rows_of_type))
        picked = rows_of_type.sample(n=sample_size, random_state=42)

        print(f"Type: {type_value}")
        print(picked['last_tracked_tif'].tolist())
        print()

# Print up to n_samples sampled 'last_tracked_tif' values for each type in df
print_random_samples(df, n_samples)


This cell rebuilds the track info file. The result is written to a separate "test" file so that the previous version is not overwritten.

In [3]:
def update_track_info_across_frame(old_track_info_df, track_info_df, frame, frame_number):
    """
    Fold the labels present in one frame into the per-track Start/End table.

    For every non-zero label in `frame`:
    - if the track already exists in `track_info_df`, its 'Start'/'End' are widened
      (in place) so that they include `frame_number`;
    - otherwise a new row is added with Start == End == frame_number and the
      'Parent' taken from `old_track_info_df` (0 when the track is not listed there).

    Parameters:
    - old_track_info_df: DataFrame with 'Track_ID' and 'Parent' columns, used only
      to look up parents of newly seen tracks.
    - track_info_df: DataFrame with 'Track_ID', 'Start', 'End', 'Parent' columns
      being accumulated. Existing rows are modified in place.
    - frame: numpy array of track labels (0 is background).
    - frame_number: int, index of this frame.

    Returns:
    - The updated table. This is `track_info_df` itself when no new track was
      seen, otherwise a new DataFrame with the new rows appended.
    """
    unique_labels = np.unique(frame)
    unique_labels = unique_labels[unique_labels != 0]  # Remove background (label 0)

    # New tracks are collected here and appended with a single concat at the end;
    # labels are unique within a frame, so a track added here cannot recur below.
    new_rows = []

    for track_id in unique_labels:
        is_track = track_info_df['Track_ID'] == track_id

        if is_track.any():
            existing = track_info_df.loc[is_track]
            start_frame = min(int(existing['Start'].values[0]), frame_number)
            end_frame = max(int(existing['End'].values[0]), frame_number)
            track_info_df.loc[is_track, 'Start'] = start_frame
            track_info_df.loc[is_track, 'End'] = end_frame
        else:
            parents = old_track_info_df.loc[old_track_info_df['Track_ID'] == track_id, 'Parent']
            parent_value = parents.values[0] if len(parents) else 0
            new_rows.append({
                'Track_ID': track_id,
                'Start': frame_number,
                'End': frame_number,
                'Parent': parent_value,
            })

    if not new_rows:
        return track_info_df

    return pd.concat([track_info_df, pd.DataFrame(new_rows)], ignore_index=True)
    
track_info_file = '/projects/steiflab/scratch/leli/trackastra/A138974A/PrintRun_Apr1223_1311/tracked_postprocessed_2.0/man_track.txt'
target_tif_dir = '/projects/steiflab/scratch/leli/trackastra/A138974A/PrintRun_Apr1223_1311/tracked_postprocessed_2.0'
# Raw string: '\s' in a normal string literal is an invalid escape sequence
track_info = pd.read_csv(track_info_file, sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])  # read in the existing version of the track info
new_track_info = pd.DataFrame(columns=track_info.columns)

# Rebuild Start/End for every track from the labels actually present in each frame.
# Sorting makes the processing order (and therefore the output row order) deterministic;
# os.listdir returns entries in arbitrary order.
frames_processed = 0
for filename in sorted(os.listdir(target_tif_dir)):
    # Skip "._" files and anything that is not a TIFF
    if filename.startswith("._") or not filename.endswith(".tif"):
        continue

    frame = tiff.imread(os.path.join(target_tif_dir, filename))
    # Frame number is whatever remains of the name after stripping 'man_track' and '.tif'
    frame_num = int(filename.replace('man_track', '').replace('.tif', ''))
    new_track_info = update_track_info_across_frame(track_info, new_track_info, frame, frame_num)
    frames_processed += 1

# Write once, after all frames are folded in, instead of rewriting the file per frame.
# Saved under a separate "test" name so the original man_track.txt is left untouched.
if frames_processed:
    new_track_info.to_csv(os.path.join(target_tif_dir, "man_track_test.txt"), sep=' ', index=False, header=False)

## Here we post process the df and make sure to eliminate duplicates

This is the actual function where we process the track to well df

In [43]:
import pandas as pd
import numpy as np
def get_group(type_value):
    """
    Map a well 'type' label to its numeric group.

    Returns 0 for 'error0'/'NCC', 1 for 'error1'/'singlecell',
    2 for 'debris'/'cluster', and -1 for anything else.
    """
    group_members = (
        (0, ['error0', 'NCC']),
        (1, ['error1', 'singlecell']),
        (2, ['debris', 'cluster']),
    )
    for group_code, members in group_members:
        if type_value in members:
            return group_code
    return -1
        
def process_track_ids(df, track_info, output_directory):
    """
    Resolve track IDs that were assigned to more than one well, using the well 'type'.

    For every positive track_ID that appears on several rows of `df`, the row whose
    'last_tracked_tif' number is closest to the track's 'End' (from `track_info`) keeps
    the ID; the other rows get a negative marker:
    - -1 when all rows are in group 1 (error1 / singlecell),
    - -2 when all rows are in group 2 (debris / cluster),
    - -3 when the rows span several groups (the kept row is chosen among the
      group 1/2 rows only).
    Tracks whose rows are all in group 0 are only reported, not changed.

    `df` is modified in place: 'track_ID' is rewritten and a helper column
    'distance_to_end' is added (and is left in the saved/returned frame).

    Parameters:
    - df: DataFrame with 'track_ID', 'type' and 'last_tracked_tif' columns.
    - track_info: DataFrame with 'Track_ID' and 'End' columns.
    - output_directory: directory in which 'track_to_well.csv' is written; nothing is
      written when it is falsy.

    Returns:
    - The same `df` object after modification.
    """

    # Process each unique track ID with multiple rows
    track_id_counts = df['track_ID'].value_counts()
    multiple_row_track_ids = track_id_counts[track_id_counts > 1].index

    for track_id in multiple_row_track_ids:
        # 0 means "no track" and negative values are markers; neither needs resolving
        if track_id <= 0: continue

        track_df = df[df['track_ID'] == track_id]
        
        # Determine the group of each row
        groups = track_df['type'].apply(get_group)
        
        if len(groups.unique()) == 1:
            # All rows of this track belong to the same group
            group = groups.iloc[0]
            
            if group == 0:
                print(f"Track ID {track_id}: All rows are from group 0 (error0 and NCC).")
            elif group in [1, 2]:
                track_info_row = track_info[track_info['Track_ID'] == track_id]
                if track_info_row.empty:
                    print(f"Track ID {track_id}: No matching track info found.")
                    continue

                # Distance between each row's tif number and the track's last frame
                end_value = track_info_row['End'].values[0]
                df.loc[track_df.index, 'distance_to_end'] = [abs(numerical_sort(i) - end_value) for i in track_df['last_tracked_tif']]
                closest_row_idx = df.loc[track_df.index, 'distance_to_end'].idxmin()
                
                if group == 1:
                    new_track_id = -1
                else:  # group == 2
                    new_track_id = -2
                
                # Every row except the closest one loses the track ID
                df.loc[track_df.index.difference([closest_row_idx]), 'track_ID'] = new_track_id
                #df.drop(columns=['distance_to_end'], inplace=True)
                
        else:
            # Rows span several groups: only group 1/2 rows compete for the track ID
            track_df_group1_2 = track_df[groups.isin([1, 2])]
            if not track_df_group1_2.empty:
                track_info_row = track_info[track_info['Track_ID'] == track_id]
                if track_info_row.empty:
                    print(f"Track ID {track_id}: No matching track info found.")
                    continue

                end_value = track_info_row['End'].values[0]
                # Debug output: tif numbers of the competing rows
                print([numerical_sort(i) for i in track_df_group1_2['last_tracked_tif']] )
                df.loc[track_df_group1_2.index, 'distance_to_end'] = [abs(numerical_sort(i) - end_value) for i in track_df_group1_2['last_tracked_tif']]
                closest_row_idx = df.loc[track_df_group1_2.index, 'distance_to_end'].idxmin()


                # All other rows of the track (of any group) are marked -3
                df.loc[track_df.index.difference([closest_row_idx]), 'track_ID'] = -3

    if output_directory: df.to_csv(os.path.join(output_directory, "track_to_well.csv"), index = False)

    return df

# Example usage:
dataset = "A138856A"
printrun = "10dropRun4"
# NOTE(review): this reads track_to_well.csv, which is also the file process_track_ids
# writes to in the same output directory, so re-running this cell processes its own output.
df = pd.read_csv(f"/projects/steiflab/scratch/leli/{dataset}/{printrun}/track_to_well/track_to_well.csv")
# Raw string for the regex separator: '\s' in a normal string literal is an invalid escape sequence
new_track_info = pd.read_csv(f"/projects/steiflab/scratch/leli/trackastra/{dataset}/{printrun}/tracked_again_postprocessed_2.0/man_track.txt", sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])
processed_df = process_track_ids(df, new_track_info, output_directory = f'/projects/steiflab/scratch/leli/{dataset}/{printrun}/track_to_well')


[579, 577, 579, 579]
[1655, 1660, 1660]
[990, 990, 990, 990, 990]
[323, 324, 324, 324, 324, 324]
[1057, 1057]
[74, 77, 83, 86, 117, 100, 99]
[1326, 1358, 1341, 1341]
[629, 629, 629]
[907, 907]
[1752, 1752, 1752]
[1432, 1432]
[670, 669]
[1265, 1265]
[1712, 1719, 1722]
[353, 353]
[518]
[234]
[267, 268]
[924, 924]
[1039, 1039, 1039]
[1816, 1816]
[771, 777]
[1212]
[1464, 1465]
[1180]
[1105]
[751]
[891]
[1249]
[1027]
[1554]
[1075]
[1495]
[1564]
[1262]
[1577]
[1597]
[1300]
[1642]
[1415]
[1384]
[1367]
[1537, 1537]
[1692]
[154]
[340]
[483]
[517]
[566]
[615]
[253]
[694]
[382]
[801]
[31, 31]


## CORRECTED !!!!!!! VERSION 2 Here we post process the df and make sure to eliminate duplicates

In [36]:
import pandas as pd
import os

def process_track_ids_by_distance(df, track_info, output_directory=None):
    """
    Make every positive track ID unique by keeping only the row closest to the track's end.

    For each positive track_ID that occurs on more than one row of `df`, the row whose
    'last_tracked_tif' number is nearest to the track's 'End' value in `track_info`
    keeps the ID and all other rows are set to -1. Tracks missing from `track_info`
    are reported and left untouched.

    `df` is modified in place. A helper column 'distance_to_end' is added while
    resolving duplicates; it is removed again only when `output_directory` is given,
    in which case the frame is also saved there as 'track_to_well_pp.csv'.

    Returns the same `df` object.
    """
    occurrences = df['track_ID'].value_counts()
    duplicated_ids = occurrences[occurrences > 1].index

    for track_id in duplicated_ids:
        # 0 (no track) and negative markers never need resolving
        if track_id <= 0:
            continue

        rows_for_track = df[df['track_ID'] == track_id]

        info_for_track = track_info[track_info['Track_ID'] == track_id]
        if info_for_track.empty:
            print(f"Track ID {track_id}: No matching track info found.")
            continue

        # How far each candidate's tif number is from the last frame of the track
        end_value = info_for_track['End'].values[0]
        distances = [
            abs(numerical_sort(tif_name) - end_value)
            for tif_name in rows_for_track['last_tracked_tif']
        ]
        df.loc[rows_for_track.index, 'distance_to_end'] = distances

        # The nearest row keeps the ID; every other row of this track is marked -1
        closest_row_idx = df.loc[rows_for_track.index, 'distance_to_end'].idxmin()
        losing_rows = rows_for_track.index.difference([closest_row_idx])
        df.loc[losing_rows, 'track_ID'] = -1

    if output_directory:
        # The helper column is stripped (in place) before saving
        if "distance_to_end" in list(df.columns):
            df.drop(columns=['distance_to_end'], inplace=True)
        df.to_csv(os.path.join(output_directory, "track_to_well_pp.csv"), index=False)

    return df

# Example usage:
# `dataset` and `printrun` are taken from the cells above; uncomment to override them here.
# dataset = "A138856A"
# printrun = "10dropRun1"
df = pd.read_csv(f"/projects/steiflab/scratch/leli/{dataset}/{printrun}/track_to_well/track_to_well_unfiltered.csv")
# Raw string for the regex separator: '\s' in a normal string literal is an invalid escape sequence
new_track_info = pd.read_csv(f"/projects/steiflab/scratch/leli/trackastra/{dataset}/{printrun}/tracked_again_postprocessed_2.0/man_track.txt", sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])
processed_df = process_track_ids_by_distance(df, new_track_info, output_directory=f'/projects/steiflab/scratch/leli/{dataset}/{printrun}/track_to_well')



Here we check that every track ID appears on exactly one row rather than on multiple rows.

In [20]:
import pandas as pd

def display_unique_track_ids(df):
    """
    Print how often each positive track ID occurs and whether any of them is duplicated.

    Rows with track_ID <= 0 are ignored. The per-ID counts are printed first,
    followed either by a confirmation that every ID occurs once or by the
    counts of the IDs that occur more than once.
    """
    is_positive = df['track_ID'] > 0
    counts_per_id = df.loc[is_positive, 'track_ID'].value_counts()

    print("Unique positive non-zero track IDs and their counts:")
    print(counts_per_id)

    # Any ID counted more than once is a duplicate
    repeated = counts_per_id[counts_per_id > 1]
    if not repeated.empty:
        print("Some track IDs have duplicates:")
        print(repeated)
        return

    print("All positive non-zero track IDs have only one row.")

# Report the per-track row counts of the post-processed frame
display_unique_track_ids(processed_df)
# Rendered as the cell output: the rows assigned to track 147 (a single track picked for inspection)
processed_df[processed_df['track_ID'] == 147]

Unique positive non-zero track IDs and their counts:
track_ID
1    1
2    1
3    1
4    1
5    1
Name: count, dtype: int64
All positive non-zero track IDs have only one row.


Unnamed: 0,track_ID,after_dispense_frame,last_tracked_frame,last_tracked_tif,distance_to_end
