# This document is an analysis of the cellenONE 

The goal is to find false predictions, which include: 
    1. Cells that are within the ranges but were discarded 
    2. Cells that are outside the ranges but were kept 


## Among all the recorded particles, how many are not allowed? 


In [None]:
import pandas as pd

# Read the cellenONE record file (CSV format)
df = pd.read_csv('/projects/steiflab/scratch/leli/A138856A/htert_20230822_131349_843.Run/record.csv')

# Parameters from the image
diameter_min = 5
diameter_max = 50
elongation_max = 4
ejection_boundary = 267
sedimentation_boundary = 200

# Build one boolean mask per rejection criterion. Keeping the masks (rather
# than only the filtered frames) lets us OR them together below, so that each
# recorded row appears at most once in the combined result and rows that merely
# happen to hold identical values are NOT collapsed into one (which is what
# concat(...).drop_duplicates() would do).
diameter_mask = (df['Diameter'] < diameter_min) | (df['Diameter'] > diameter_max)
elongation_mask = df['Elongation'] > elongation_max
ejection_mask = df['X'] > ejection_boundary
sedimentation_mask = df['Y'] < sedimentation_boundary

# Particles with diameter outside the specified range
outside_diameter = df[diameter_mask]

# Particles with elongation greater than the specified max
outside_elongation = df[elongation_mask]

# Particles outside the ejection boundary
outside_ejection_boundary = df[ejection_mask]

# Particles outside the sedimentation boundary
outside_sedimentation_boundary = df[sedimentation_mask]

# Particles violating at least one criterion, in their original record order
outside_all = df[diameter_mask | elongation_mask | ejection_mask | sedimentation_mask]

# Display the results
print("Particles outside diameter boundaries:")
print(outside_diameter)

print("\nParticles outside elongation boundaries:")
print(outside_elongation)

print("\nParticles outside ejection boundary:")
print(outside_ejection_boundary)

print("\nParticles outside sedimentation boundary:")
print(outside_sedimentation_boundary)

print("\nParticles outside any of the specified boundaries:")
print(outside_all)

print(df)

## Going back to the post-processing steps 

We need to look at the number of frames per track and compare it with that of a classic, normal run.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Read the TXT file into a pandas DataFrame.
# The separator is a raw-string regex (r'\s+') so that the backslash is not
# treated as an (invalid) string escape; column names are supplied explicitly.
file_path = '/projects/steiflab/scratch/leli/trackastra/A138856A/htert_20230822_131349_843.Run/tracked_postprocessed/man_track.txt'
df = pd.read_csv(file_path, sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])  # Adjust separator if necessary

# Step 2: Calculate the number of frames for each track (Start and End are both inclusive)
df['num_frames'] = df['End'] - df['Start'] + 1

# Step 3: Generate summary statistics
mean_frames = df['num_frames'].mean()
median_frames = df['num_frames'].median()
std_frames = df['num_frames'].std()
min_frames = df['num_frames'].min()
max_frames = df['num_frames'].max()

summary_stats = {
    'Mean': mean_frames,
    'Median': median_frames,
    'Standard Deviation': std_frames,
    'Minimum': min_frames,
    'Maximum': max_frames
}

# Print summary statistics
print("Summary Statistics:")
for stat, value in summary_stats.items():
    print(f"{stat}: {value}")

# Step 4: Plot the histogram
plt.figure(figsize=(10, 6))
plt.hist(df['num_frames'], bins=30, edgecolor='black', alpha=0.7)
plt.title('Histogram of Number of Frames')
plt.xlabel('Number of Frames')
plt.ylabel('Frequency')

# Add summary statistics to the plot
textstr = '\n'.join((
    f"Mean: {mean_frames:.2f}",
    f"Median: {median_frames:.2f}",
    f"Std Dev: {std_frames:.2f}",
    f"Min: {min_frames}",
    f"Max: {max_frames}"
))

# Place the summary statistics text box on the plot (figure coordinates)
props = dict(boxstyle='round', facecolor='white', alpha=0.5)
plt.gcf().text(0.95, 0.5, textstr, fontsize=12, bbox=props, transform=plt.gcf().transFigure)

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Read the TXT file into a pandas DataFrame.
# The separator is a raw-string regex (r'\s+') so that the backslash is not
# treated as an (invalid) string escape; column names are supplied explicitly.
file_path = '/projects/steiflab/scratch/leli/trackastra/A138974A/PrintRun_Apr1223_1311/tracked_postprocessed/man_track.txt'
df = pd.read_csv(file_path, sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])  # Adjust separator if necessary

# Step 2: Calculate the number of frames for each track (Start and End are both inclusive)
df['num_frames'] = df['End'] - df['Start'] + 1

# Step 3: Generate summary statistics
mean_frames = df['num_frames'].mean()
median_frames = df['num_frames'].median()
std_frames = df['num_frames'].std()
min_frames = df['num_frames'].min()
max_frames = df['num_frames'].max()

summary_stats = {
    'Mean': mean_frames,
    'Median': median_frames,
    'Standard Deviation': std_frames,
    'Minimum': min_frames,
    'Maximum': max_frames
}

# Print summary statistics
print("Summary Statistics:")
for stat, value in summary_stats.items():
    print(f"{stat}: {value}")

# Step 4: Plot the histogram
plt.figure(figsize=(10, 6))
plt.hist(df['num_frames'], bins=30, edgecolor='black', alpha=0.7)
plt.title('Histogram of Number of Frames')
plt.xlabel('Number of Frames')
plt.ylabel('Frequency')

# Add summary statistics to the plot
textstr = '\n'.join((
    f"Mean: {mean_frames:.2f}",
    f"Median: {median_frames:.2f}",
    f"Std Dev: {std_frames:.2f}",
    f"Min: {min_frames}",
    f"Max: {max_frames}"
))

# Place the summary statistics text box on the plot (figure coordinates)
props = dict(boxstyle='round', facecolor='white', alpha=0.5)
plt.gcf().text(0.95, 0.5, textstr, fontsize=12, bbox=props, transform=plt.gcf().transFigure)

plt.show()


## Fix tracking 

We now see that the parameters we set up will not work for the cellenONE, so we need to change all of them. To reduce over-correction, we set up boundaries on the frame numbers to increase accuracy. Previously, a track needed to have a certain number of frames before we could make a decision about it; now we need to lower that number or make another strategic move.

In [None]:
import os
import shutil
import pandas as pd
import numpy as np
from skimage import io
from skimage.measure import label, regionprops
from scipy.ndimage import label
import joblib
import tifffile as tiff
from skimage.morphology import binary_erosion
from scipy.ndimage import distance_transform_edt
from skimage.segmentation import watershed

def connect_objects_localized(mask1, mask2, kernel_size=5, iterations=1):
    """
    Combine two masks and mark the connected regions that both masks touch.

    The two masks are merged with an element-wise maximum. Each connected
    component of the merged foreground that contains foreground pixels from
    BOTH inputs is flagged, the flagged area is eroded with a square kernel,
    and the eroded area (value 255) is overlaid on the merged mask.

    Parameters:
    - mask1: numpy array, the first mask (any pixel > 0 is foreground).
    - mask2: numpy array, the second mask (any pixel > 0 is foreground).
    - kernel_size: int, side length of the square erosion kernel.
    - iterations: int, currently unused; kept for interface compatibility.

    Returns:
    - connected_objects: numpy array, the merged mask with the eroded shared
      components overwritten by 255.
    """
    combined_mask = np.maximum(mask1, mask2)
    binary_mask = (combined_mask > 0)

    # Explicit foreground tests. A plain `component_mask & mask1` is a bitwise
    # AND, which silently misses integer masks whose values are even.
    foreground1 = np.asarray(mask1) > 0
    foreground2 = np.asarray(mask2) > 0

    distance = distance_transform_edt(binary_mask)
    # NOTE: `label` must be the scipy.ndimage version here, which returns a
    # (labelled_array, count) tuple.
    markers, _ = label(binary_mask)
    labels = watershed(-distance, markers, mask=binary_mask)

    connection_mask = np.zeros_like(combined_mask)
    for label_val in np.unique(labels):
        if label_val == 0:
            continue  # background
        component_mask = (labels == label_val)
        if np.any(component_mask & foreground1) and np.any(component_mask & foreground2):
            connection_mask[component_mask] = 255

    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    eroded_connection = binary_erosion(connection_mask, kernel).astype(np.uint8) * 255
    connected_objects = np.where(eroded_connection > 0, eroded_connection, combined_mask)
    return connected_objects

def get_centroid_y(mask, label_value):
    """
    Look up the region labelled `label_value` in `mask` and return element 1
    of its centroid tuple, or None when no region carries that label.

    NOTE(review): scikit-image documents `centroid` as (row, col), so element 1
    would be the column coordinate, whereas other code in this notebook treats
    element 0 as "y". Confirm which axis is intended before relying on this.
    """
    matching_coords = (
        region.centroid[1]
        for region in regionprops(mask)
        if region.label == label_value
    )
    return next(matching_coords, None)

def update_track_info_across_frame(old_track_info_df, track_info_df, frame, frame_number):
    """
    Fold the labels present in one frame into a track table.

    For every non-zero label in `frame`:
    - if the label already has a row in `track_info_df`, its Start/End are
      widened so that they include `frame_number`;
    - otherwise a new single-frame row is appended, taking its Parent from
      `old_track_info_df` when the label is listed there and 0 otherwise.

    Parameters:
    - old_track_info_df: DataFrame with 'Track_ID' and 'Parent' columns, used
      only to look up parents.
    - track_info_df: DataFrame with 'Track_ID', 'Start', 'End', 'Parent'
      columns being built up (existing rows are updated in place).
    - frame: numpy array of integer track labels, 0 being background.
    - frame_number: int, index of `frame` in the sequence.

    Returns:
    - the updated track table (a new DataFrame whenever rows were appended).
    """
    labels_in_frame = np.unique(frame)
    known_old_ids = old_track_info_df['Track_ID'].values

    for track_id in labels_in_frame[labels_in_frame != 0]:
        is_this_track = track_info_df['Track_ID'] == track_id
        existing = track_info_df[is_this_track]

        if existing.empty:
            # First sighting of this label: inherit the parent if we know it.
            if track_id in known_old_ids:
                parent_value = old_track_info_df.loc[
                    old_track_info_df['Track_ID'] == track_id, 'Parent'
                ].values[0]
            else:
                parent_value = 0
            addition = pd.DataFrame({
                'Track_ID': [track_id],
                'Start': [frame_number],
                'End': [frame_number],
                'Parent': [parent_value]
            })
            track_info_df = pd.concat([track_info_df, addition], ignore_index=True)
            continue

        # Label already tracked: stretch its lifetime to cover this frame.
        first_seen = min(int(existing['Start'].values[0]), frame_number)
        last_seen = max(int(existing['End'].values[0]), frame_number)
        track_info_df.loc[is_this_track, 'Start'] = first_seen
        track_info_df.loc[is_this_track, 'End'] = last_seen

    return track_info_df

import os
import numpy as np
import tifffile as tiff
import matplotlib.pyplot as plt
import cv2

def color_labels(tif, labels, colors):
    """
    Paint selected labels of a label image with solid colors.

    The label image is cast to uint8 and expanded to three channels, then each
    requested label's pixels are overwritten with its paired color. Labels and
    colors are paired positionally; surplus entries on either side are ignored.

    Parameters:
    - tif: numpy array, the label image.
    - labels: list of int, the labels to color.
    - colors: list of tuple, one 3-component color per label.

    Returns:
    - canvas: numpy array, the three-channel image with the labels painted.
    """
    canvas = cv2.cvtColor(tif.astype(np.uint8), cv2.COLOR_GRAY2BGR)

    for label_value, paint in zip(labels, colors):
        canvas[tif == label_value] = paint

    return canvas

def display_colored_images(frame, labels_to_color, title):
    """
    Show `frame` in a matplotlib figure with the given labels highlighted.

    The labels are colored, in order, from a fixed seven-color palette (labels
    beyond the seventh are left uncolored), and the figure is titled `title`.
    """
    plt.figure(figsize=(8, 8))

    named_palette = {
        'red': (255, 0, 0),
        'green': (0, 255, 0),
        'blue': (0, 0, 255),
        'yellow': (255, 255, 0),
        'magenta': (255, 0, 255),
        'orange': (255, 165, 0),
        'purple': (128, 0, 128),
    }
    highlighted = color_labels(frame, labels_to_color, list(named_palette.values()))

    plt.imshow(highlighted)
    plt.title(title)
    plt.tight_layout()
    plt.show()

def postprocess_frame(frame, track_info, classification, min_size=100):
    """
    Clean up one labelled frame according to per-track classification actions.

    Tracks are grouped by their 'Root'. Within every group that has at least
    one member visible in `frame`, each non-root track with a non-zero Parent
    is handled as follows:
    - objects smaller than `min_size` pixels are erased;
    - action 1: the object is kept untouched;
    - action 2: the object is erased;
    - action 0: the object is queued for merging.
    Queued objects are joined with progressively larger morphological closings
    and the resulting region is painted with the root label.

    Parameters:
    - frame: numpy array of integer track labels (0 = background); modified in place.
    - track_info: DataFrame with 'Track_ID', 'Parent' and 'Root' columns.
    - classification: DataFrame indexed by track id with an 'action' column.
    - min_size: int, minimum object size in pixels.

    Returns:
    - frame: the same array, after editing.
    """
    groups = track_info.groupby('Root')['Track_ID'].apply(list).to_dict()
    unique_labels = np.unique(frame)
    unique_labels = unique_labels[unique_labels != 0]

    # Only lineages with at least one member present in this frame matter.
    groups = {root: tracks for root, tracks in groups.items() if any(track in unique_labels for track in tracks)}

    for root, group in groups.items():
        print(f"This group has root {root} and contains {group}")

        objs_to_be_merged = []

        for track_id in group:
            print(f"iteration: {track_id} where it is \n {track_info.loc[track_info['Track_ID'] == track_id]}")
            # The root itself and parentless tracks are never altered.
            if track_id == root or track_info.loc[track_info['Track_ID'] == track_id, 'Parent'].values[0] == 0:
                continue
            print(f"iteration: {track_id}")
            mask = (frame == track_id)
            print(f"This objects {track_id} has a size of {np.sum(mask)}")

            # Remove small objects
            if np.sum(mask) < min_size:
                frame[mask] = 0
                # f-prefix added: the original printed the literal text "{track_id}".
                print(f"Object with track id: {track_id} is too small so we remove all together")
                continue

            action = classification.loc[track_id, 'action']

            if action == 1:
                print(f"Object with track id: {track_id} has action 1")
                continue  # Keep as is

            elif action == 2:
                print(f"Object with track id: {track_id} has action 2")
                frame[mask] = 0  # Remove

            elif action == 0:
                print(f"Object with track id: {track_id} has action 0")
                centroid_y = get_centroid_y(frame, track_id)
                objs_to_be_merged.append((track_id, centroid_y))

        if len(objs_to_be_merged) != 0:
            # Merge neighbours in order of their centroid coordinate.
            objs_to_be_merged.sort(key=lambda x: x[1])
            print(f"These are the objects that will be merged {objs_to_be_merged}")

            combined_mask = (frame == objs_to_be_merged[0][0])
            for i in range(1, len(objs_to_be_merged)):
                next_mask = (frame == objs_to_be_merged[i][0])

                # morphologyEx alters other parts of the masks too (it does not
                # just add the connecting bridge), so keep copies of the
                # originals and OR them back in afterwards.
                old_mask1 = combined_mask.copy()
                old_mask2 = next_mask.copy()

                # Try closings with kernels of side 0, 5, 10, 15 and stop as
                # soon as the union forms a single connected component.
                # NOTE(review): the first pass uses a 0x0 kernel — confirm how
                # cv2.morphologyEx treats an empty structuring element.
                for kernel_size in range(4):
                    temp = np.logical_or(combined_mask, next_mask).astype(np.uint8)
                    combined_mask = cv2.morphologyEx(temp, cv2.MORPH_CLOSE, np.ones((kernel_size * 5, kernel_size * 5), np.uint8))
                    num_labels, _, _, _ = cv2.connectedComponentsWithStats(combined_mask, connectivity=8)
                    if num_labels - 1 == 1:
                        break

                combined_mask = np.logical_or(combined_mask, old_mask1).astype(np.uint8)
                combined_mask = np.logical_or(combined_mask, old_mask2).astype(np.uint8)

                # num_labels counts the background, hence the "- 1".
                num_labels, _, _, _ = cv2.connectedComponentsWithStats(combined_mask, connectivity=8)
                if num_labels - 1 > 1:
                    print("Failed to connect objects after multiple attempts.")
                else:
                    print(f"Succeeded to connect objects with kernel size {kernel_size}")

            # do the final painting of pixels
            frame[combined_mask > 0] = root

    return frame

import pandas as pd
from sklearn.linear_model import LinearRegression

def predict_next_centroids(centroids, centroids_frame_with_prediction, predict_this_frame):
    """
    Extrapolate a centroid position to a given frame with linear regression.

    Two independent straight-line fits are made against the frame number, one
    for each coordinate, and both are evaluated at `predict_this_frame`.

    Parameters:
    - centroids: list of (y, x) tuples, the observed centroid coordinates.
    - centroids_frame_with_prediction: list of frame numbers, one per centroid.
    - predict_this_frame: frame number at which to evaluate the fits.

    Returns:
    - (pred_y, pred_x): the predicted coordinates at `predict_this_frame`.
    """
    history = pd.DataFrame(centroids, columns=['y', 'x'])
    history['frame'] = centroids_frame_with_prediction

    # scikit-learn expects 2-D feature arrays: one column holding the frame index.
    observed_frames = history['frame'].values.reshape(-1, 1)
    query_frame = np.array([predict_this_frame]).reshape(-1, 1)

    def _extrapolate(coordinate):
        # Fit coordinate ~ frame and read the line off at the query frame.
        fitted = LinearRegression()
        fitted.fit(observed_frames, history[coordinate].values)
        return fitted.predict(query_frame)[0]

    pred_y = _extrapolate('y')
    pred_x = _extrapolate('x')
    return pred_y, pred_x

def remove_track(track_info_file, target_tif_dir, to_be_removed_id):
    """
    Delete a track from the frame images and from the track table on disk.

    Every existing frame file `man_track<NNNN>.tif` between the track's Start
    and End (inclusive) has the track's pixels set to 0, and the track's row
    is dropped from `track_info_file`, which is rewritten in place.

    If the id is not listed in the track table (for example because an earlier
    merge already removed it) nothing is modified and the table is returned
    unchanged.

    Parameters:
    - track_info_file: path of the whitespace-separated track table
      (columns: Track_ID, Start, End, Parent; no header).
    - target_tif_dir: directory containing the frame TIFFs.
    - to_be_removed_id: id of the track to delete.

    Returns:
    - DataFrame holding the track table after the removal.
    """
    track_info = pd.read_csv(track_info_file, sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])

    matching_rows = track_info.loc[track_info['Track_ID'] == to_be_removed_id]
    if matching_rows.empty:
        # Previously this case crashed with a bare IndexError.
        print(f"Track {to_be_removed_id} is not in the track info; nothing to remove")
        return track_info

    _, start_frame, end_frame, _ = matching_rows.values[0]

    for frame_number in range(int(start_frame), int(end_frame) + 1):
        frame_path = os.path.join(target_tif_dir, f'man_track{frame_number:04d}.tif')
        if os.path.exists(frame_path):
            frame = tiff.imread(frame_path)
            frame[frame == to_be_removed_id] = 0
            tiff.imwrite(frame_path, frame)
            print(f"Removed track {to_be_removed_id} from frame {frame_number}")

    new_track_info = track_info[track_info['Track_ID'] != to_be_removed_id]
    new_track_info.to_csv(track_info_file, sep=' ', index=False, header=False)

    return new_track_info

def diverge_track(track_info_file, target_tif_dir, to_be_split_id, new_id, diverging_start_frame):
    """
    Split a track in two at `diverging_start_frame`.

    From `diverging_start_frame` to the original End (inclusive), every
    existing frame file has the pixels of `to_be_split_id` relabelled to
    `new_id`. The track table gains a row for `new_id` (its Parent is the
    original track), the original track's End is shortened to
    `diverging_start_frame - 1`, and the table file is rewritten in place.

    Parameters:
    - track_info_file: path of the whitespace-separated track table
      (columns: Track_ID, Start, End, Parent; no header).
    - target_tif_dir: directory containing the `man_track<NNNN>.tif` frames.
    - to_be_split_id: id of the track to split.
    - new_id: id given to the second half.
    - diverging_start_frame: first frame that belongs to the new track; must be
      strictly after the original Start and no later than the original End.

    Returns:
    - DataFrame holding the updated track table.

    Raises:
    - ValueError: if `diverging_start_frame` is outside the allowed range.
    """
    track_info = pd.read_csv(track_info_file, sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])
    _, start_frame, end_frame, _ = track_info.loc[track_info['Track_ID'] == to_be_split_id].values[0]
    start_frame, end_frame = int(start_frame), int(end_frame)

    if diverging_start_frame <= start_frame or diverging_start_frame > end_frame:
        raise ValueError("Diverging start frame must be within the original track's range.")

    # Iterate through the frames and update the track
    for frame_number in range(diverging_start_frame, end_frame + 1):
        frame_path = os.path.join(target_tif_dir, f'man_track{frame_number:04d}.tif')
        if os.path.exists(frame_path):
            frame = tiff.imread(frame_path)
            frame[frame == to_be_split_id] = new_id
            tiff.imwrite(frame_path, frame)
            print(f"Diverge frame {frame_number}: track {to_be_split_id} -> {new_id}")

    # Create a new row for the new track
    new_track_row = pd.DataFrame({
        'Track_ID': [new_id],
        'Start': [diverging_start_frame],
        'End': [end_frame],
        'Parent': [to_be_split_id]
    })

    # Append with the public pd.concat API (DataFrame._append is private).
    new_track_info = pd.concat([track_info, new_track_row], ignore_index=True)
    temp = new_track_info['Track_ID'].values
    print(f"Added {new_id} in the track info so now {new_id} is in new_track_info {new_id in temp}")

    # Update the end frame of the original track
    new_track_info.loc[new_track_info['Track_ID'] == to_be_split_id, 'End'] = diverging_start_frame - 1

    new_track_info.to_csv(track_info_file, sep=' ', index=False, header=False)

    return new_track_info

def merge_track(track_info_file, target_tif_dir, to_be_merged_ids):
    """
    Merge two tracks into one, on disk.

    The track with the strictly earlier Start is "alpha" (the survivor); the
    other is "beta". If both start on the same frame, the second id given is
    treated as alpha.

    1. Frames after alpha's End up to beta's End: beta pixels are relabelled
       to alpha, and beta's pixel count per frame is recorded.
    2. Frames from beta's Start up to alpha's End (the overlap): beta objects
       smaller than `median - 2 * MAD` of the sizes recorded in step 1 are
       erased; all others are relabelled to alpha.
    3. Alpha's End becomes the later of the two Ends. Beta's row is removed
       from the table, unless overlap frames were missing on disk, in which
       case beta keeps a row restricted to the unprocessed range.
    The track table file is rewritten in place.

    Parameters:
    - track_info_file: path of the whitespace-separated track table
      (columns: Track_ID, Start, End, Parent; no header).
    - target_tif_dir: directory containing the `man_track<NNNN>.tif` frames.
    - to_be_merged_ids: pair of track ids, in any order.

    Returns:
    - DataFrame holding the updated track table.
    """
    track_info = pd.read_csv(track_info_file, sep=r'\s+', names=['Track_ID', 'Start', 'End', 'Parent'])
    new_track_info = track_info.copy()

    # Determine alpha and beta tracks based on start frames
    first_id, second_id = to_be_merged_ids
    first_start = track_info.loc[track_info['Track_ID'] == first_id, 'Start'].values[0]
    second_start = track_info.loc[track_info['Track_ID'] == second_id, 'Start'].values[0]
    alpha_id, beta_id = (first_id, second_id) if first_start < second_start else (second_id, first_id)

    # Plain ints avoid relying on numpy-scalar methods later on.
    alpha_id, _, alpha_end_frame, _ = (int(v) for v in track_info.loc[track_info['Track_ID'] == alpha_id].values[0])
    beta_id, beta_start_frame, beta_end_frame, _ = (int(v) for v in track_info.loc[track_info['Track_ID'] == beta_id].values[0])

    # For frames from alpha_end_frame + 1 to beta_end_frame, all beta track objects are relabeled to alpha.
    sizes_after_alpha_end = []
    for frame_number in range(alpha_end_frame + 1, beta_end_frame + 1):
        frame_path = os.path.join(target_tif_dir, f'man_track{frame_number:04d}.tif')
        if os.path.exists(frame_path):
            frame = tiff.imread(frame_path)
            beta_mask = (frame == beta_id).astype(np.uint8)
            sizes_after_alpha_end.append(np.sum(beta_mask))
            frame[beta_mask > 0] = alpha_id  # Relabel beta to alpha
            tiff.imwrite(frame_path, frame)
            print(f"Merge frame {frame_number}: track {beta_id} -> {alpha_id}")

    # Robust size floor for the overlap. With no reference sizes nothing can
    # be judged "too small", so every overlapping beta object is merged.
    if sizes_after_alpha_end:
        sizes = np.asarray(sizes_after_alpha_end)
        median = np.median(sizes)
        mad = np.median(np.abs(sizes - median))
        lower_bound = median - 2 * mad
    else:
        lower_bound = -np.inf

    # assume the beta starts at the same frame but ends at (include) alpha_end_frame
    new_beta_start_frame = beta_start_frame
    new_beta_end_frame = alpha_end_frame

    if beta_start_frame < alpha_end_frame + 1:

        for frame_number in range(beta_start_frame, alpha_end_frame + 1):
            frame_path = os.path.join(target_tif_dir, f'man_track{frame_number:04d}.tif')
            if os.path.exists(frame_path):
                frame = tiff.imread(frame_path)
                beta_mask = (frame == beta_id).astype(np.uint8)
                size_beta = np.sum(beta_mask)
                if size_beta < lower_bound:
                    frame[beta_mask > 0] = 0  # Remove small objects
                    print(f"Removed small object in frame {frame_number} with size {size_beta}")
                else:
                    frame[beta_mask > 0] = alpha_id  # Merge beta into alpha
                    print(f"Merged frame {frame_number}: track {beta_id} -> {alpha_id} (This is before alpha end frame)")
                # Beta no longer exists up to and including this frame.
                new_beta_start_frame = frame_number + 1
                tiff.imwrite(frame_path, frame)

    # Update track_info DataFrame
    new_track_info.loc[new_track_info['Track_ID'] == alpha_id, 'End'] = max(alpha_end_frame, beta_end_frame)
    if new_beta_start_frame > new_beta_end_frame:
        print(f"Remove track ID: {beta_id}")
        new_track_info = new_track_info[new_track_info['Track_ID'] != beta_id]

    else:
        print(f"Putting in new start and end frames {new_beta_start_frame}, {new_beta_end_frame}")
        new_track_info.loc[new_track_info['Track_ID'] == beta_id, 'Start'] = new_beta_start_frame
        new_track_info.loc[new_track_info['Track_ID'] == beta_id, 'End'] = new_beta_end_frame

    new_track_info.to_csv(track_info_file, sep=' ', index=False, header=False)

    temp = new_track_info["Track_ID"].values
    print(f"confirmation that the new track info file do not have beta id {beta_id} {beta_id not in temp}")

    return new_track_info

import numpy as np
from scipy.spatial.distance import cdist

def maj_object_within_radius(frame, point, radius):
    """
    Find the label that owns the most pixels inside a disc around a point.

    Parameters:
    - frame: numpy array, the pixel assignment matrix (0 = background).
    - point: tuple giving the disc centre in array-index order, i.e. the same
      (row, col) order as the pixel coordinates it is compared against.
    - radius: float, disc radius in pixels (boundary inclusive).

    Returns:
    - label of the object with the most pixels within the radius, or 0 if no
      object pixel lies inside the disc.
    """
    # Coordinates and labels of every foreground pixel, in matching order.
    foreground = frame != 0
    pixel_coords = np.column_stack(np.nonzero(foreground))
    pixel_labels = frame[foreground]

    # Euclidean distance from the query point to each foreground pixel.
    dist_to_point = cdist([point], pixel_coords, metric='euclidean')[0]
    inside_disc = dist_to_point <= radius

    if not np.any(inside_disc):
        return 0

    # Background was filtered out above, so every candidate is a real label.
    candidate_labels, pixel_counts = np.unique(pixel_labels[inside_disc], return_counts=True)
    return candidate_labels[np.argmax(pixel_counts)]



In [None]:
import warnings
import joblib
import shutil

# Define paths
out_folder = "A138856A/htert_20230822_131349_843.Run/tracked"
source_tif_dir = f"/projects/steiflab/scratch/leli/trackastra/{out_folder}_postprocessed"
target_tif_dir = f"/projects/steiflab/scratch/leli/trackastra/{out_folder}_postprocessed_2.0"
track_info_file = f'/projects/steiflab/scratch/leli/trackastra/{out_folder}_postprocessed_2.0/man_track.txt'

# Column layout of man_track.txt (the file has no header row)
TRACK_COLUMNS = ['Track_ID', 'Start', 'End', 'Parent']

# Start from a fresh copy of the first-pass output. copytree needs a
# destination that does not exist yet, so any earlier attempt is removed first.
if os.path.exists(target_tif_dir):
    shutil.rmtree(target_tif_dir)
shutil.copytree(source_tif_dir, target_tif_dir)

print("Initiate Main track Processing ... ")
track_info = pd.read_csv(track_info_file, sep=r'\s+', names=TRACK_COLUMNS)
track_ids = sorted(track_info['Track_ID'])

# This number is the original max track id; diverged tracks get new ids above it
new_track_label = np.max(track_info['Track_ID'].values)

# track_ids is a work queue: the head is processed and then dropped, and
# tracks created by a split are appended so that they get examined as well.
while track_ids:

    print(f"Check for track ID: {track_ids[0]}")

    # The helpers rewrite the file, so re-read it on every pass.
    track_info = pd.read_csv(track_info_file, sep=r'\s+', names=TRACK_COLUMNS)

    #### Check 1: tracks that span a single frame are removed
    # NOTE(review): this check was originally labelled "<=2 Frames", but the
    # condition only catches End - Start <= 0. Confirm the intended threshold.
    track_id, start_frame, end_frame, parent_id = track_info.loc[track_info['Track_ID'] == track_ids[0]].values[0]
    if end_frame - start_frame <= 0:
        remove_track(track_info_file, target_tif_dir, to_be_removed_id=track_ids[0])
        track_ids = track_ids[1:]
        continue

    #### Check 2: Moving up Tracks and paused tracks
    centroids = []
    centroids_frame = []
    skipped_frames = []
    for frame_number in range(start_frame, end_frame + 1):

        frame_path = os.path.join(target_tif_dir, f'man_track{frame_number:04d}.tif')
        frame = tiff.imread(frame_path)
        binary_mask = (frame == track_id).astype(np.uint8)

        # When this track does not exist in this frame we keep going
        if len(np.unique(binary_mask)) == 1:
            skipped_frames.append(frame_number)
            continue

        centroid = regionprops(binary_mask)[0].centroid

        if len(centroids) >= 3:  # if we are in the middle of the tracklet
            y_changes = np.diff([c[0] for c in centroids])
            median_change = np.median(y_changes)
            mad = np.median(np.abs(y_changes - median_change))
            threshold = 2.5 * mad  # 2.5 is the usual value but can be changed

            # The object should always travel down, so its y value should only
            # increase. A step below (median step - MAD threshold) means that
            # the object moved up, which a single real track should not do.
            moved_up = centroid[0] - centroids[-1][0] <= median_change - threshold

            # NOTE(review): skipped_frames accumulates over the whole track
            # and is never reset, so this triggers on two missing frames in
            # total, not two in a row. Confirm which is intended.
            paused = len(skipped_frames) >= 2

            if moved_up or paused:
                # Split here; the new track is queued so we come back to it.
                new_track_label = new_track_label + 1
                diverge_track(track_info_file, target_tif_dir, to_be_split_id=track_id, new_id=int(new_track_label), diverging_start_frame=frame_number)
                track_ids.append(int(new_track_label))
                break

        centroids.append(centroid)
        centroids_frame.append(frame_number)

    #### Check 3: label switching
    # These lists additionally receive the linear-regression predictions.
    centroids_with_prediction = centroids.copy()
    centroids_frame_with_prediction = centroids_frame.copy()

    if len(centroids_with_prediction) >= 3:
        covered_by = []

        # Look up to three frames past the last frame in which the track was seen.
        last_observed_frame = max(centroids_frame_with_prediction)
        for frame_number in range(last_observed_frame + 1, last_observed_frame + 4):
            frame_path = os.path.join(target_tif_dir, f'man_track{frame_number:04d}.tif')
            if os.path.exists(frame_path):
                frame = np.array(tiff.imread(frame_path))

                curr_c = predict_next_centroids(centroids_with_prediction, centroids_frame_with_prediction, predict_this_frame=frame_number)

                if 0 <= int(curr_c[0]) < frame.shape[0] and 0 <= int(curr_c[1]) < frame.shape[1]:
                    maj_label = maj_object_within_radius(frame, curr_c, radius=3.5)
                    covered_by.append(maj_label)
                    if maj_label != 0:
                        centroids_with_prediction.append(curr_c)
                        centroids_frame_with_prediction.append(frame_number)
                else:
                    # The original also recomputed a centroid from binary_mask
                    # here; the value was never used and the call raised
                    # IndexError whenever that mask was empty.
                    print("prediction went out of bound")
                    covered_by.append(0)

        # NOTE(review): up to three frames are examined above, yet merging is
        # only considered when exactly two of them exist. Confirm whether
        # `>= 2` was intended.
        if len(covered_by) == 2:
            non_zero_values = [x for x in covered_by if x > 0]
            # Not named `label`: that would overwrite the imported label()
            # function at notebook scope.
            for merge_label in set(non_zero_values):
                if non_zero_values.count(merge_label) >= 2:
                    # The order of the ids does not matter; merge_track decides which one survives.
                    new = merge_track(track_info_file, target_tif_dir, to_be_merged_ids=(merge_label, track_id))

                    temp = new["Track_ID"].values
                    print(f"The label {merge_label} is not in the track info {merge_label not in temp} and the track id {track_id}")
                    # .values is required: `in` on a pandas Series tests the index, not the values.
                    if merge_label not in new["Track_ID"].values and merge_label in track_ids:
                        track_ids.remove(merge_label)

    #### Check 4: Overall y movement to remove the ones that did not make a movement
    if 1 < len(centroids) < 5:
        overall_y_movement = centroids[-1][0] - centroids[0][0]
        if overall_y_movement <= 5:
            # A merge in Check 3 may already have removed this track.
            current_ids = pd.read_csv(track_info_file, sep=r'\s+', names=TRACK_COLUMNS)['Track_ID'].values
            if track_ids[0] in current_ids:
                remove_track(track_info_file, target_tif_dir, to_be_removed_id=track_ids[0])

    # Invariant: no track may be queued twice.
    assert len(track_ids) == len(np.unique(track_ids))

    track_ids = track_ids[1:]


print("Main track Processing completed.")

print("initiate updating tracking info csv at the very end to correct and ensure the track info final version")

# Rebuild the track table from what is actually in the frames, so that
# Start/End always agree with the images. Parents come from the edited table.
track_info = pd.read_csv(track_info_file, sep=r'\s+', names=TRACK_COLUMNS)
new_track_info = pd.DataFrame(columns=track_info.columns)
# Sorted so that rows appear in a reproducible order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(target_tif_dir)):
    if not filename.startswith("._") and filename.endswith(".tif"):
        frame_path = os.path.join(target_tif_dir, filename)
        frame = tiff.imread(frame_path)

        frame_num = int(filename.replace('man_track', '').replace('.tif', ''))
        new_track_info = update_track_info_across_frame(track_info, new_track_info, frame, frame_num)

# Write once after all frames have been folded in, not once per frame.
new_track_info.to_csv(os.path.join(target_tif_dir, "man_track.txt"), sep=' ', index=False, header=False)



In [None]:
import os
import cv2
import numpy as np
from skimage import measure
from skimage.segmentation import watershed
from scipy.ndimage import distance_transform_edt, center_of_mass
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from scipy.spatial.distance import cdist
from PIL import Image
import re
import pandas as pd

def numerical_sort(value):
    """
    Sort key that orders names by the first run of digits they contain.

    Returns that run converted to int, or `value` itself (unchanged) when it
    contains no digit at all. Intended for names such as '<number>_htert_Run'.
    """
    first_number = re.search(r'\d+', value)
    if first_number is None:
        return value
    return int(first_number.group())

def load_images_from_directory(directory):
    """
    Load the cellenONE run images of a directory as a stack of greyscale arrays.

    Only files ending in "_htert_Run.png" are used; names starting with "._"
    or containing "Printed" are ignored. Files are ordered with
    `numerical_sort`, converted to 8-bit greyscale and rotated with np.rot90.

    Returns:
    - numpy array holding one rotated image per file, in sorted order.
    - list of the file names with ".png" removed, in the same order.
    """
    def _is_run_image(name):
        return name.endswith("_htert_Run.png") and not name.startswith("._") and "Printed" not in name

    # For non-cellenONE runs the filter was: every ".png" not starting with "._".
    filenames = [name for name in os.listdir(directory) if _is_run_image(name)]
    filenames.sort(key=numerical_sort)
    print(f"filenames are {filenames}")

    images = []
    for filename in filenames:
        grey = Image.open(os.path.join(directory, filename)).convert('L')
        images.append(np.rot90(np.array(grey)))  # rotation is specific to cellenONE

    stems = [f.replace(".png", "") for f in filenames]
    return np.array(images), stems


def expand_image(img, mode, factor=3,):
    """
    Return a copy of `img` enlarged by `factor` in both dimensions.

    Parameters:
    - img: image object exposing `.size` as (width, height) and `.resize`
      (used with PIL images in this file).
    - mode: "images" resamples with LANCZOS, "masks" with NEAREST so that
      mask values are not blended.
    - factor: multiplier applied to width and height.

    Raises:
    - ValueError: if `mode` is neither "images" nor "masks" (previously this
      fell through and failed on an unbound local variable).
    """
    if mode not in ("images", "masks"):
        raise ValueError(f"mode must be 'images' or 'masks', got {mode!r}")

    original_width, original_height = img.size
    new_size = (original_width * factor, original_height * factor)

    if mode == "images":
        resample = Image.Resampling.LANCZOS
    else:
        resample = Image.Resampling.NEAREST

    return img.resize(new_size, resample)

def remove_empty_frame(imgs, masks):
    """
    Drop every frame whose mask is entirely zero from both stacks.

    Parameters:
    - imgs: numpy array of images, frames along axis 0.
    - masks: numpy array of masks, frames along axis 0.

    Returns:
    - (imgs_new, masks_new, ind_to_remove): the two filtered stacks and the
      list of removed frame indices (positions along axis 0 of the inputs).
    """
    ind_to_remove = [idx for idx in range(masks.shape[0]) if np.all(masks[idx] == 0)]

    masks_new = np.delete(masks, ind_to_remove, axis=0)
    imgs_new = np.delete(imgs, ind_to_remove, axis=0)

    # Both stacks must still line up frame for frame.
    assert imgs_new.shape == masks_new.shape

    return imgs_new, masks_new, ind_to_remove


def mask_to_bbox(mask):
    """
    Converts a binary mask to a bounding box.

    Foreground is every pixel equal to 255. Width and height are the
    differences between the extreme column/row indices (max - min), i.e. one
    less than the number of pixels spanned.

    :param numpy.ndarray mask: 2-D binary mask with foreground value 255.
    :return: Bounding box in the format (x, y, w, h), as plain Python ints.
    :rtype: list[int]
    :raises ValueError: if the mask has no pixel equal to 255.
    """
    rows, cols = np.nonzero(mask == 255)
    if rows.size == 0:
        # Fail with an explicit message instead of numpy's empty-reduction error.
        raise ValueError("mask_to_bbox: mask contains no foreground (255) pixels")

    x1, x2 = int(cols.min()), int(cols.max())
    y1, y2 = int(rows.min()), int(rows.max())
    return [x1, y1, x2 - x1, y2 - y1]


def load_masks_from_directory(directory, img_shape, fix_overlap=False, overlap_threshold = 0.6, top_threshold = 0.005, bottom_threshold = 0.995):
    """
    Build one labelled mask per frame from the per-object PNG masks on disk.

    Every sub-directory of `directory` whose name ends in "_htert_Run" is one
    frame (ordered with `numerical_sort`); every ".png" inside it (names
    starting with "._" excluded) is one object mask with foreground 255.
    Object labels start at 1 and keep counting across frames.

    Parameters:
    - directory: str, path to the directory containing the frame folders.
    - img_shape: tuple, shape of the label image created for each frame.
    - fix_overlap: bool, whether to merge overlapping masks of a frame.
    - overlap_threshold: float, two masks are linked when their intersection
      covers at least this fraction of the smaller mask (fix_overlap only).
    - top_threshold, bottom_threshold: floats, fractions of the mask height;
      masks whose bounding box starts above the first or ends below the second
      are ignored (fix_overlap only).

    Returns:
    - masks: numpy array, combined label masks, one per frame.

    Raises:
    - ValueError: if a mask that is used does not hold exactly two distinct values.
    """
    # For normal (non-cellenONE) runs the frame folders start with "Image_" instead.
    frame_names = [
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry)) and entry.endswith("_htert_Run")
    ]
    frame_names.sort(key=numerical_sort)
    print(f"The frames are {frame_names}")

    next_label = 1
    all_frame_masks = []

    for frame_name in frame_names:
        frame_dir = os.path.join(directory, frame_name)
        if not (os.path.isdir(frame_dir) and frame_name.endswith("_htert_Run")):
            continue

        frame_mask = np.zeros(img_shape, dtype=np.int32)
        png_names = [
            name for name in sorted(os.listdir(frame_dir))
            if name.endswith(".png") and not name.startswith("._")
        ]

        if not fix_overlap:
            # One label per mask file; later masks overwrite earlier ones where they overlap.
            for name in png_names:
                mask_array = np.array(Image.open(os.path.join(frame_dir, name)).convert('L'))
                values = np.unique(mask_array)
                if len(values) != 2:
                    raise ValueError("something is up", values)
                frame_mask[mask_array == 255] = next_label
                next_label += 1
        else:
            # Collect the masks that stay clear of the top and bottom margins.
            kept = []
            for name in png_names:
                mask_array = np.array(Image.open(os.path.join(frame_dir, name)).convert('L'))
                _, top, _, height = mask_to_bbox(mask_array)
                n_rows = mask_array.shape[0]
                if top < top_threshold * n_rows or (top + height) > bottom_threshold * n_rows:
                    continue
                values = np.unique(mask_array)
                if len(values) != 2:
                    raise ValueError("something is up", values)
                kept.append(mask_array)

            # Symmetric adjacency matrix: 1 where two masks overlap enough.
            n_masks = len(kept)
            foregrounds = [m == 255 for m in kept]
            areas = [np.sum(fg) for fg in foregrounds]
            adjacency = np.zeros((n_masks, n_masks), dtype=int)
            for first in range(n_masks):
                for second in range(first + 1, n_masks):
                    shared = np.sum(foregrounds[first] & foregrounds[second])
                    smaller_area = areas[first] if areas[first] < areas[second] else areas[second]
                    if shared / smaller_area >= overlap_threshold:
                        adjacency[first, second] = 1
                        adjacency[second, first] = 1

            # Masks linked directly or through a chain of overlaps form one object.
            n_components, component_of = connected_components(
                csgraph=csr_matrix(adjacency), directed=False, return_labels=True
            )
            for component in range(n_components):
                members = [idx for idx, comp in enumerate(component_of) if comp == component]
                merged = np.zeros_like(kept[members[0]], dtype=np.int32)
                for idx in members:
                    merged[kept[idx] > 0] = 1
                frame_mask[merged > 0] = next_label
                next_label += 1

        all_frame_masks.append(frame_mask)

    return np.array(all_frame_masks)


In [None]:
# Define the main directory
chip = "A138856A" # "A118880" #"A138974A" # "A138856A"   A138856A/htert_20230822_131349_843.Run/tracked
run = "htert_20230822_131349_843.Run" # "PrintRun_Jan2624_1252" # "PrintRun_Apr1223_1311" #'htert_20230822_131349_843.Run'
#main_img_directory = f"/projects/steiflab/archive/data/imaging/{chip}/NozzleImages/{run}"
main_img_directory = f"/projects/steiflab/archive/data/imaging/{chip}/CellenONEImages/{run}" # for cellenONE
main_mask_directory = f"/projects/steiflab/scratch/leli/{chip}/{run}/rcnn_output_masks"
# NOTE: relative path, resolved against the current working directory.
out_folder = f'{chip}/{run}/tracked'

# Load images
imgs, img_names = load_images_from_directory(main_img_directory)

# Load masks (label images get the shape of the first image)
masks = load_masks_from_directory(main_mask_directory, imgs[0].shape, fix_overlap = True, overlap_threshold = 0.5)

print("Images shape:", imgs.shape)
print("Masks shape:", masks.shape)

# Ensure the shape matches the required format: (time, y, x)
imgs = imgs.reshape(-1, imgs.shape[1], imgs.shape[2])
masks = masks.reshape(-1, masks.shape[1], masks.shape[2])

# Frames without any detection are dropped from both stacks; ind_to_remove
# keeps their positions so file names can be filtered the same way below.
imgs, masks, ind_to_remove = remove_empty_frame(imgs, masks)

print("Images shape:", imgs.shape)
print("Masks shape:", masks.shape)


# The tracking step itself is disabled here (kept as a string literal).
'''# Load a pretrained model
# or from a local folder
# model = Trackastra.from_folder('path/my_model_folder/', device=device)
model = Trackastra.from_pretrained("general_2d", device=device)

# Track the cells
track_graph = model.track(imgs, masks, mode="greedy")  # or mode="ilp", or "greedy_nodiv"

# Write to cell tracking challenge format
ctc_tracks, masks_tracked = graph_to_ctc(
      track_graph,
      masks,
      outdir=out_folder,
)'''


## create a file that connects tiffs with the images
# Skip "._" resource-fork files like every other listing in this file does;
# an extra entry would break the one-to-one pairing with the image names.
tifs = sorted([t for t in os.listdir(out_folder) if t.endswith(".tif") and not t.startswith("._")])
img_names_new = np.delete(img_names, ind_to_remove, axis = 0)
link_file = pd.DataFrame({"tifs": tifs, "imgs": img_names_new})
link_file.to_csv(os.path.join(out_folder, "tif_to_img.csv"), index=False)


In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import tifffile as tiff

# Function to create a video from the saved PNGs
def create_video(output_dir, output_video, num_frames, width, height, fps=5, n_img = 2, frames_to_process = None):
    """
    Assemble the PNGs 'man_track<frame:04d>.png' found in `output_dir` into an mp4 video.

    Parameters:
    - output_dir: str, directory holding the per-frame PNGs.
    - output_video: str, path of the video file to write.
    - num_frames: int, frame indices 0 .. num_frames-1 are considered.
    - width, height: int, size of a single panel; the video frame size is
      (width*n_img + 10*(n_img-1), height), i.e. n_img panels side by side
      with a 10 px gap between them.
    - fps: frames per second of the video.
    - n_img: int, number of panels in every PNG.
    - frames_to_process: optional container of frame indices; when given,
      every other index is skipped.

    Raises:
    - ValueError: if the PNG of a requested frame does not exist. The writer
      is released before the error propagates.
    """
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_video, fourcc, fps, (width*n_img+10*(n_img-1), height))

    try:
        for frame_idx in range(num_frames):
            # Check frame range
            if frames_to_process is not None and frame_idx not in frames_to_process:
                continue

            frame_path = os.path.join(output_dir, f'man_track{frame_idx:04d}.png')
            if not os.path.isfile(frame_path):
                raise ValueError(f"This file does not exists {frame_path}")

            video_writer.write(cv2.imread(frame_path))
    finally:
        # Always release the writer, also when a frame is missing.
        video_writer.release()

    print(f'Video saved as {output_video}')

import os
import cv2
import numpy as np
import tifffile as tiff

def process_frames(imgs, tracking_dir, output_dir, frames_to_process = None):
    """
    Render every tracked frame as a PNG: original image next to the annotated one.

    For each tif in `tracking_dir` (sorted by name, "._" files skipped) the
    image `imgs[i]` with the same position is normalised to 8 bit, the outline
    of every non-zero label is drawn on a copy together with the label number,
    and both versions are written side by side, separated by a 10 px white
    column, to `output_dir` under the tif's name with a ".png" extension.

    Parameters:
    - imgs: numpy array indexed by frame, or a path to a file readable by np.load.
    - tracking_dir: str, directory containing the label tifs.
    - output_dir: str, directory for the PNGs (created when missing).
    - frames_to_process: optional container of positions in the sorted tif
      list; when given, every other frame is skipped.
    """
    # Only look on disk when `imgs` really is a path; an ndarray is not a
    # valid argument for os.path.isfile.
    if isinstance(imgs, (str, bytes, os.PathLike)) and os.path.isfile(imgs):
        imgs = np.load(imgs)

    tif_files = sorted(
        file for file in os.listdir(tracking_dir)
        if not file.startswith("._") and file.endswith("tif")
    )

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    for i, file in enumerate(tif_files):
        # Check frame range
        if frames_to_process is not None and i not in frames_to_process:
            continue

        # Read the tiff file
        tif = tiff.imread(os.path.join(tracking_dir, file))
        if tif is None:
            print(f"Error reading label image: {os.path.join(tracking_dir, file)}")
            continue

        # Process the corresponding image
        img = imgs[i]
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX)  # Normalize the image to 8-bit range
        img = img.astype(np.uint8)  # Convert to 8-bit for visualization
        original_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        annotated_img = original_img.copy()

        for label in np.unique(tif):
            if label == 0:  # Skip the background
                continue

            # Create a mask for the current label
            mask = np.zeros(tif.shape, dtype=np.uint8)
            mask[tif == label] = 255

            # Find contours
            contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Draw contours and label
            for contour in contours:
                cv2.drawContours(annotated_img, [contour], -1, (57, 255, 20), 1)  # Neon green, thin trace
                # Get the bounding box for placing the label
                x, y, w, h = cv2.boundingRect(contour)
                # Label text 10 px above the box; (0, 0, 255) is red in BGR order.
                cv2.putText(annotated_img, str(label), (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

        # Create a white space (column) between the images
        white_space = np.ones((original_img.shape[0], 10, 3), dtype=np.uint8) * 255

        # Combine the original and annotated images with white space in between
        combined_img = cv2.hconcat([original_img, white_space, annotated_img])

        output_path = os.path.join(output_dir, file.replace(".tif", ".png"))
        # Save the output image
        if not cv2.imwrite(output_path, combined_img):
            print(f"Error saving image: {output_path}")
        else:
            print(f"Saved image for frame: {i} at {output_path}")


import os
import numpy as np
import cv2
import tifffile as tiff
import matplotlib.pyplot as plt

# Named RGB colors handed out to labels, in this order.
color_map = {
    'red':     (255, 0, 0),
    'green':   (0, 255, 0),
    'blue':    (0, 0, 255),
    'cyan':    (0, 255, 255),
    'magenta': (255, 0, 255),
    'yellow':  (255, 255, 0),
    'orange':  (255, 165, 0),
    'purple':  (128, 0, 128),
    'pink':    (255, 192, 203),
    'lime':    (0, 255, 0),
}

def get_color_name(index):
    """
    Look up the color assigned to position `index`, cycling through `color_map`.

    Parameters:
    - index: int, position of the color; taken modulo the number of colors.

    Returns:
    - tuple: (color_name, color_rgb), the name of the color and its RGB tuple.
    """
    names = tuple(color_map)
    chosen = names[index % len(names)]
    return chosen, color_map[chosen]

def display_colored_labels(path, labels):
    """
    Display an image in which the pixels of each requested label are colored.

    All TIFF files are painted onto one canvas, so for a directory the result
    is the overlay of every frame. Files are processed in sorted name order,
    which makes the overlay deterministic: where frames overlap, the later
    file wins. Colors come from `get_color_name`, by position in `labels`.

    Parameters:
    - path: str, path to the directory containing TIFF files or a single TIFF file.
    - labels: list of int, list of labels to be colored.

    Output:
    - Display the image with colored labels, or print a message when no TIFF
      file was processed.
    """
    if os.path.isdir(path):
        # Skip "._" resource-fork files, which are not readable TIFFs.
        tif_paths = [
            os.path.join(path, name)
            for name in sorted(os.listdir(path))
            if name.endswith(".tif") and not name.startswith("._")
        ]
    else:
        tif_paths = [path]

    final_image = None
    for file_path in tif_paths:
        tif = tiff.imread(file_path)

        # The canvas takes the size of the first file that is read.
        if final_image is None:
            final_image = np.zeros((tif.shape[0], tif.shape[1], 3), dtype=np.uint8)

        for i, label in enumerate(labels):
            color_name, color_rgb = get_color_name(i)
            print(f"Label {label} is colored with {color_name} (RGB: {color_rgb})")
            final_image[tif == label] = color_rgb

    # Display the final image
    if final_image is not None:
        plt.figure(figsize=(10, 10))
        plt.title("Colored Labels")
        plt.imshow(final_image)
        plt.axis('off')
        plt.show()
    else:
        print("No TIFF files found or processed.")






In [None]:
# Directory containing tracking results (TIFF files and text file)
tracking_dir = f"/projects/steiflab/scratch/leli/trackastra/{out_folder}_postprocessed_2.0"
# Sanity check: prints True when the tracking directory exists.
print(f"{os.path.isdir(tracking_dir)}")
# Create an output directory for PNGs
output_dir = f"/projects/steiflab/scratch/leli/trackastra/{out_folder}_postprocessed_2.0_imgs"
os.makedirs(output_dir, exist_ok=True)

# Process each frame
# (non-cellenONE variant of the count below)
#total_frame_num = len([file for file in os.listdir(main_img_directory) if file.startswith("Image_") and file.endswith(".png")])
# Count the run images with the same filter load_images_from_directory uses.
total_frame_num = len([filename for filename in os.listdir(main_img_directory) if filename.endswith("_htert_Run.png") and not filename.startswith("._") and "Printed" not in filename])
# 1-based number of every frame that survived remove_empty_frame; this is the
# frame index that matches the rcnn results.
act_rcnn_inds = [i+1 for i in range(total_frame_num) if i not in ind_to_remove] # here we are tying to find the corresponding frame index that matches with the rcnn results
# There must be exactly one tracked tif per surviving frame.
assert len([file for file in os.listdir(tracking_dir) if not file.startswith("._") and file.endswith("tif")]) == len(act_rcnn_inds)

# Range of rcnn frame numbers (inclusive) of interest.
start = 3274
end = 3757
# 0-based positions (used to index the tif files) of the frames whose rcnn
# frame number lies inside [start, end].
frames_to_process = [i for i, act in enumerate(act_rcnn_inds) if act >= start and act <= end] # here if teh act rcnn index is in range then we include the 0-starting index which will be used to index the ti files later
print(frames_to_process)

# NOTE: frames_to_process is computed above but not passed on; None is passed,
# so every frame is rendered.
process_frames(imgs, tracking_dir, output_dir, frames_to_process = None)
print(f"Process frames done!")

# Create a video from the saved PNGs
# (disabled variant restricted to frames_to_process, kept as a string literal)
'''print(f"These are the tiffs that should be in the ground truth")
output_video = f'/projects/steiflab/scratch/leli/trackastra/postprocessing/tracked_1.0_imgs_pp/tracked_video_val.mp4'
height, width = imgs.shape[1], imgs.shape[2]  # Get height and width from images
create_video(output_dir, output_video, imgs.shape[0], width, height, fps=3, frames_to_process = frames_to_process)'''

# Full-length video over all frames.
height, width = imgs.shape[1], imgs.shape[2]  # Get height and width from images
output_video = f'/projects/steiflab/scratch/leli/trackastra/{out_folder}_postprocessed_2.0_imgs/tracked_video_full.mp4'
create_video(output_dir, output_video, imgs.shape[0], width, height, fps=3, frames_to_process = None)


## Perform Track to Well 

Since the cellenONE format is different, we only need to change how we get the last frame; after that we move on.

In [None]:
import tifffile as tiff
import numpy as np
import tifffile as tiff
from skimage.measure import label, regionprops
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from PIL import Image

def load_resize_combine_display(tif_path, png_path, output_path=None):
    """
    Plot a TIFF image next to a PNG image that is resized to the TIFF's dimensions.

    The figure is closed without being shown; it is only kept when
    `output_path` is given.

    Parameters:
    - tif_path: str, path to the TIFF image file.
    - png_path: str, path to the PNG image file.
    - output_path: str, path to save the combined image (optional).
    """
    tif_image = tiff.imread(tif_path)

    # PIL expects (width, height), i.e. the reverse of the array shape.
    target_size = (tif_image.shape[1], tif_image.shape[0])
    png_array = np.array(Image.open(png_path).resize(target_size, Image.LANCZOS))

    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        (left_ax, tif_image, 'TIFF Image (Viridis)', {'cmap': 'viridis'}),
        (right_ax, png_array, 'Resized PNG Image', {}),
    )
    for axis, data, title, imshow_kwargs in panels:
        axis.imshow(data, **imshow_kwargs)
        axis.set_title(title)
        axis.axis('off')

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=300)

    # Not shown interactively; release the figure.
    plt.close(fig)


def find_highest_object(frame):
    """
    Find the object that reaches furthest down in the lower part of the image.

    An object qualifies when its largest row index is at or below the
    threshold row `2.5 * height // 4` (62.5 % of the image height, counted
    from the top). Among the qualifying objects the one with the largest row
    index wins; on a tie the smallest label is kept.

    Parameters:
    - frame: numpy array, the pixel matrix where entries are object assignments
      (0 is background).

    Returns:
    - highest_object_id: int, the ID of the winning object, 0 if none qualifies.
    - highest_y: int, the largest row index of that object, 0 if none qualifies.
    """
    threshold = 2.5 * frame.shape[0] // 4

    highest_object_id = 0
    highest_y = 0

    for object_id in np.unique(frame):
        if object_id == 0:
            continue

        # Largest row index covered by this object.
        max_y = np.nonzero(frame == object_id)[0].max()
        if max_y >= threshold and max_y > highest_y:
            highest_y = max_y
            highest_object_id = object_id

    return highest_object_id, highest_y

def numerical_sort(value):
    """
    Sort key that orders names by the first run of digits they contain.

    Returns that run converted to int, or `value` itself (unchanged) when it
    contains no digit at all. Intended for names such as '<number>_htert_Run'.
    """
    first_number = re.search(r'\d+', value)
    if first_number is None:
        return value
    return int(first_number.group())

import pandas as pd
import os
import re
import tifffile as tiff

def link_track_to_well(logfile_directory, tracked_tif_directory, fluro_directory, linkfile_directory, output_directory=None):
    """
    Match every dispensed well of the log to the track seen right before dispensing.

    Each distinct `ImageFile` of the log is one well (entries containing
    "samplename" are skipped). The well row/column are parsed from
    `_R<row>_C<col>` and the after-dispense frame number from
    `<number>_Printed`. The linked frame closest to (and not after) that
    number is located in the link file; it and up to two linked frames before
    it are candidates if they lie at most 2 frame numbers before the dispense
    frame. Candidates are examined from the latest to the earliest with
    `find_highest_object`, stopping at the first one that yields a track.

    Parameters:
    - logfile_directory: str, path of the log CSV (needs an `ImageFile` column).
    - tracked_tif_directory: str, directory containing the tracked label tifs.
    - fluro_directory: str, not used any more (the fluorescence image that was
      read per well was never looked at); kept so existing calls keep working.
    - linkfile_directory: str, path of the CSV with the columns `tifs` and `imgs`.
    - output_directory: str, optional; when given, it is created if needed and
      the result is written there as "track_to_well_unfiltered.csv".

    Returns:
    - pandas DataFrame with the columns track_ID (0 = no track found),
      after_dispense_frame, last_tracked_frame, last_tracked_tif, row, col.

    Raises:
    - ValueError: if an `ImageFile` entry does not match the expected patterns.
    """
    linkfile = pd.read_csv(linkfile_directory)
    linkfile = linkfile.sort_values(by='tifs').reset_index(drop=True)
    logfile = pd.read_csv(logfile_directory)
    # Group on the bare column name so that every group key is the ImageFile
    # string itself; a one-element list yields 1-tuples on some pandas versions
    # and plain strings on others.
    gr_logfile = logfile.groupby('ImageFile')

    number_of_filtered_groups = sum("samplename" not in name for name in gr_logfile.groups)
    print(f"There are {number_of_filtered_groups} groups where 'ImageFile' does not contain 'samplename'.")

    # Rows are sorted by tif name; keep each tif together with its own image
    # name and frame number so the pairing can never get out of step.
    link_tifs = list(linkfile["tifs"])
    link_imgs = list(linkfile["imgs"])
    link_frame_numbers = [numerical_sort(img) for img in link_imgs]

    final_dict = {
        "track_ID": [],
        "after_dispense_frame": [],
        "last_tracked_frame": [],
        "last_tracked_tif": [],
        "row": [],
        "col": []}

    for image_file, _group in gr_logfile:
        if "samplename" in image_file:
            continue
        print(f"ImageFile is {image_file}")

        # Use regex to find the numbers following 'R' and 'C'
        well_match = re.search(r'_R(\d+)_C(\d+)', image_file)
        if well_match is None:
            raise ValueError("Filename format is incorrect")
        r = int(well_match.group(1))
        c = int(well_match.group(2))

        # Use regex to find the frame number in front of '_Printed'
        frame_match = re.search(r'(\d+)_Printed', image_file)
        if frame_match is None:
            raise ValueError("Filename format is incorrect")
        after_dispense_frame = str(frame_match.group(1)) + "_htert_Run"
        dispense_number = int(frame_match.group(1))

        print(f"The current iteration has r: {r}, c: {c}, with the after dispense frame being {after_dispense_frame}")

        # Establish the values to be inputted into the dataframe
        track_ID = 0  # Default meaning no track associated
        last_tracked_frame = after_dispense_frame
        last_tracked_tif = ""  # Empty because the image may not have a corresponding tif file

        # Linked frames that are not later than the after-dispense frame.
        earlier = [pos for pos, number in enumerate(link_frame_numbers) if number <= dispense_number]
        if not earlier:
            print(f"There are zero frames including the current after dispense frame one {after_dispense_frame}")
        else:
            # First row holding the largest frame number among them.
            closest = max(earlier, key=link_frame_numbers.__getitem__)

            # That row and up to two rows before it, latest first, limited to
            # frames at most 2 frame numbers before the dispense frame.
            candidates = [
                (link_tifs[pos], link_imgs[pos])
                for pos in range(closest, max(closest - 3, -1), -1)
                if dispense_number - link_frame_numbers[pos] <= 2
            ]
            img_to_consider = [img for _, img in candidates]

            print(f"The img to consider is {img_to_consider}")

            for tif_name, img_name in candidates:
                tracked_tif = tiff.imread(os.path.join(tracked_tif_directory, tif_name))
                track_ID, _ = find_highest_object(tracked_tif)
                last_tracked_frame = img_name
                last_tracked_tif = tif_name
                if track_ID:
                    break  # Candidates run from later frames to earlier ones; stop at the first hit

        final_dict["track_ID"].append(int(track_ID))
        final_dict["after_dispense_frame"].append(after_dispense_frame)
        final_dict["last_tracked_frame"].append(last_tracked_frame)
        final_dict["last_tracked_tif"].append(last_tracked_tif)
        final_dict["row"].append(r)
        final_dict["col"].append(c)

    result = pd.DataFrame(final_dict)

    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
        result.to_csv(os.path.join(output_directory, "track_to_well_unfiltered.csv"), index=False)

    return result


# Inputs for this chip/run. Despite the "_directory" suffix, the log file and
# the link file are paths to CSV files; the other two are directories.
logfile_directory = '/projects/steiflab/scratch/leli/A138856A/htert_20230822_131349_843.Run/record.csv'
tracked_tif_directory = '/projects/steiflab/scratch/leli/trackastra/A138856A/htert_20230822_131349_843.Run/tracked_postprocessed_2.0'
fluro_directory = '/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000/'
linkfile_directory = '/projects/steiflab/scratch/leli/trackastra/A138856A/htert_20230822_131349_843.Run/tracked/tif_to_img.csv'

# One row per well; the table is also saved as track_to_well_unfiltered.csv
# inside the given output directory.
df = link_track_to_well(logfile_directory, tracked_tif_directory, fluro_directory, linkfile_directory, output_directory = '/projects/steiflab/scratch/leli/A138856A/htert_20230822_131349_843.Run/track_to_well')

Process the track to well file

In [None]:
import pandas as pd
import numpy as np

def process_track_ids(df, track_info, output_directory):
    """
    Keep a single well for every track ID that was assigned to several wells.

    For each positive track ID occurring in more than one row of `df`, the row
    whose `last_tracked_tif` frame number is closest to the track's `End`
    frame (from `track_info`) keeps the ID; the other rows get track_ID -1.
    Track IDs missing from `track_info` are reported and left untouched.

    `df` is modified in place and also returned. The distances are computed in
    a local Series, so no helper column is left behind in `df`.

    Parameters:
    - df: pandas DataFrame with the columns `track_ID` and `last_tracked_tif`.
    - track_info: pandas DataFrame with the columns `Track_ID` and `End`.
    - output_directory: str or None; when set, the result is written there as
      "track_to_well_pp.csv".

    Returns:
    - the processed DataFrame (the same object as `df`).
    """
    track_id_counts = df['track_ID'].value_counts()

    for track_id in track_id_counts[track_id_counts > 1].index:
        if track_id <= 0:
            continue

        rows = df.index[df['track_ID'] == track_id]

        # Get the end frame of the track
        end_frames = track_info.loc[track_info['Track_ID'] == track_id, 'End']
        if end_frames.empty:
            print(f"Track ID {track_id}: No matching track info found.")
            continue
        end_value = end_frames.iloc[0]

        # Distance between each row's last tracked frame and the end of the track
        distances = pd.Series(
            [abs(numerical_sort(tif_name) - end_value) for tif_name in df.loc[rows, 'last_tracked_tif']],
            index=rows,
        )

        # The closest row keeps the track; label all other rows with -1
        closest_row_idx = distances.idxmin()
        df.loc[rows.difference([closest_row_idx]), 'track_ID'] = -1

    # Save the processed DataFrame if an output directory is provided
    if output_directory:
        # Frames handled by an earlier version of this function may still
        # carry its scratch column; never write it out.
        df.drop(columns=['distance_to_end'], inplace=True, errors='ignore')
        df.to_csv(os.path.join(output_directory, "track_to_well_pp.csv"), index=False)

    return df

# Load the whitespace-separated track table (no header row) and resolve the
# wells that share a track ID. `df` is the table returned by link_track_to_well.
# The separator is a regex, hence the raw string.
track_info = pd.read_csv('/projects/steiflab/scratch/leli/trackastra/A138856A/htert_20230822_131349_843.Run/tracked_postprocessed_2.0/man_track.txt', sep=r'\s+', header=None, names=['Track_ID', 'Start', 'End', 'Mother'])  # Your track_info dataframe
processed_df = process_track_ids(df, track_info, output_directory='/projects/steiflab/scratch/leli/A138856A/htert_20230822_131349_843.Run/track_to_well')


In [None]:
import pandas as pd

def display_unique_track_ids(df):
    """
    Print how often every positive track ID occurs and whether any occurs more than once.

    Parameters:
    - df: pandas DataFrame with a `track_ID` column; IDs <= 0 are ignored.
    """
    track_ids = df['track_ID']
    track_id_counts = track_ids[track_ids > 0].value_counts()

    print("Unique positive non-zero track IDs and their counts:")
    print(track_id_counts)

    # Every track ID is expected to appear in exactly one row.
    repeated = track_id_counts[track_id_counts > 1]
    if not repeated.empty:
        print("Some track IDs have duplicates:")
        print(repeated)
        return

    print("All positive non-zero track IDs have only one row.")

# Check the table returned by process_track_ids: after that step every
# positive track ID should occur in a single row.
display_unique_track_ids(processed_df)


In [None]:
import tifffile as tiff
import matplotlib.pyplot as plt

# Load one microscope TIFF from disk
tif_file_path = '/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000/C60/R20_C60_0000_00_Cyan.tif'
image = tiff.imread(tif_file_path)

# Render it with the viridis colormap, an intensity colorbar and no axis ticks
fig, ax = plt.subplots(figsize=(10, 8))
rendered = ax.imshow(image, cmap='viridis')
fig.colorbar(rendered, ax=ax)
ax.set_title('TIFF Image with Viridis Colormap')
ax.axis('off')
plt.show()


## Let's analyze it

Here we can see that 40 out of 412 wells are not associated with any track.

In [None]:
# Count how many rows carry each track_ID value (every value is counted, including the -1 label
# that process_track_ids assigns to rows it rejects)
value_counts = processed_df['track_ID'].value_counts()
# Keep only the track_ID values that occur on more than one row
duplicate_values = value_counts[value_counts > 1]
print(duplicate_values)
# Number of distinct track_ID values in the table
print(len(np.unique(processed_df['track_ID'])))

There are 1313 missing tracks that are not passed.

Interpretation of this group:

    - tracking mistake  --> To filter these out as well as we can, we check the last frame in which the track appears; if the object is not in the bottom quarter of the image in that frame, we discard the track.
    
    - discarded --> This is what we wanted. 

In [None]:
# List every distinct track_ID value in processed_df (np.unique returns them sorted)
print(list(np.unique(processed_df['track_ID'])))

In [None]:
# Reload the tracker output; here the first column is named 'Index' (the track ID).
# The separator is a raw string so that "\s" is passed to the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
track_info = pd.read_csv('/projects/steiflab/scratch/leli/trackastra/A138856A/htert_20230822_131349_843.Run/tracked_postprocessed_2.0/man_track.txt', sep=r'\s+', header=None, names=['Index', 'Start', 'End', 'Mother'])
# Track IDs that exist in the tracker output but never appear in processed_df['track_ID']
missing_track_ids = list(set(track_info['Index']).difference(set(processed_df['track_ID'])))
print(missing_track_ids)
print(len(missing_track_ids))

There are 765 that are discarded and there are 603 that are tracking mistakes

In [None]:
import os
import tifffile as tiff
import numpy as np

def end_to_tif_filename(end_value, base_folder):
    """Return the path of the ``man_track`` TIFF for frame number ``end_value``.

    The frame number is zero-padded to at least four digits and the file is
    located directly inside ``base_folder``.
    """
    frame_file = f"man_track{end_value:04d}.tif"
    return os.path.join(base_folder, frame_file)

def is_in_bottom_quarter(coords, image_shape):
    """Tell whether every coordinate lies at or beyond the bottom-region cutoff.

    ``coords`` is a 2-D array whose first column is compared against the
    cutoff; ``image_shape[0]`` is the extent along that same axis.

    NOTE: despite the function name, the cutoff is ``5 * image_shape[0] // 8``,
    so the accepted region is the last three eighths of that axis rather than
    the last quarter. An empty ``coords`` array yields True.
    """
    cutoff_row = (5 * image_shape[0]) // 8
    first_axis_positions = coords[:, 0]
    return (first_axis_positions >= cutoff_row).all()

def check_missing_tracks(missing_track_ids, man_track, processed, base_folder):
    """Classify tracks by where their label sits in the last frame of the track.

    For every track ID, the ``End`` frame is looked up in ``man_track``, the
    matching ``man_track####.tif`` image is loaded from ``base_folder``, and
    the pixels equal to the track ID are located in it.

    Args:
        missing_track_ids: Iterable of track IDs to classify.
        man_track: DataFrame with at least the columns ``Index`` (track ID)
            and ``End`` (last frame number of the track).
        processed: Not used by this function; kept so existing callers that
            pass it keep working.
        base_folder: Directory containing the per-frame TIFF files.

    Returns:
        A tuple of three lists of track IDs:
        ``(not_found_tracks, bottom_quarter_tracks, not_bottom_quarter_tracks)``
        -- IDs absent from their end frame, IDs for which
        ``is_in_bottom_quarter`` is true, and the remaining IDs.

    Raises:
        FileNotFoundError: If the TIFF for a track's end frame does not exist.
        IndexError: If a track ID has no row in ``man_track``.
    """
    not_found_tracks = []
    bottom_quarter_tracks = []
    not_bottom_quarter_tracks = []

    for track_id in missing_track_ids:
        # Last frame in which this track is recorded
        end_value = man_track.loc[man_track['Index'] == track_id, 'End'].values[0]
        tif_filename = end_to_tif_filename(end_value, base_folder)

        if not os.path.exists(tif_filename):
            raise FileNotFoundError(f"TIFF file {tif_filename} does not exist.")

        tiff_image = tiff.imread(tif_filename)

        # A single comparison pass answers both "is the label present?" and
        # "where is it?"; this avoids sorting the whole frame with np.unique
        # and then scanning it a second time.
        track_mask = tiff_image == track_id
        if not track_mask.any():
            not_found_tracks.append(track_id)
            continue

        # One row per labelled pixel, one column per image axis
        coords = np.column_stack(np.where(track_mask))

        if is_in_bottom_quarter(coords, tiff_image.shape):
            bottom_quarter_tracks.append(track_id)
        else:
            not_bottom_quarter_tracks.append(track_id)

    return not_found_tracks, bottom_quarter_tracks, not_bottom_quarter_tracks

# Folder that holds the per-frame man_track TIFF files
base_folder = '/projects/steiflab/scratch/leli/trackastra/A138856A/htert_20230822_131349_843.Run/tracked_postprocessed_2.0'

# Classify every missing track by where it sits in its end frame
not_found_tracks, bottom_quarter_tracks, not_bottom_quarter_tracks = check_missing_tracks(missing_track_ids, track_info, processed_df, base_folder)

# Report the three groups (the "wiht" typo in the second message is corrected to "with")
print(f"Track IDs not found in their end frame: {not_found_tracks}")
print(f"Track IDs entirely in the bottom quarter of their end frame: {bottom_quarter_tracks} with length {len(bottom_quarter_tracks)}")
print(f"Track IDs NOT entirely in the bottom quarter of their end frame: {not_bottom_quarter_tracks} with length {len(not_bottom_quarter_tracks)}")



In [None]:
import random 
# Candidate pools: track IDs below 500 from each group
bottom_pool = [i for i in bottom_quarter_tracks if i < 500]
not_bottom_pool = [i for i in not_bottom_quarter_tracks if i < 500]
# random.sample raises ValueError when asked for more items than the pool holds,
# so the sample size is capped at the pool size (still ten whenever ten are available).
print(f"Here are ten picked out from the bottom quarter: {sorted(random.sample(bottom_pool, min(10, len(bottom_pool))))}")
print(f"Here are ten picked out from the NOT bottom quarter: {sorted(random.sample(not_bottom_pool, min(10, len(not_bottom_pool))))}")

Here we can conclude that ALL the sampled tracks in the bottom-quarter group are good tracks that travel down with no issue.

We can also conclude that 70% of the sampled tracks in the not-in-bottom-quarter group are mistakes such as label switching, and the remaining 30% are tracks that end abruptly: we do not know where they went, since that is the last frame in which they were ever observed.

The rest of the analysis is conducted in the feature_extraction.ipynb file. The goal there is to repeat the same analysis and find a way to distinguish between the two groups, or to train a tree that can distinguish them logically.

In [None]:
# List every distinct track_ID value in processed_df (np.unique returns them sorted)
print(list(np.unique(processed_df['track_ID'])))

## Confusion Matrix for isolated


In [None]:
import pandas as pd

# Load the isolated-cell metadata table (this rebinds the notebook-level name 'df')
df = pd.read_csv('/projects/steiflab/scratch/leli/A138856A/htert_20230822_131349_843.Run/isolated_metadata.csv')  # one CSV row per DataFrame row
print(df.shape)
# Count how many rows carry each value of the 'Type' column
prediction_counts = df['Type'].value_counts()

# Print the per-Type counts
print(prediction_counts)

In [None]:
# Maps each Type value to a list of up to ten sampled 'Fluro_image' entries
sampled_images = {}

# Loop through each unique value in the 'Type' column
for type_value in df['Type'].unique():
    # Rows belonging to the current Type value
    filtered_df = df[df['Type'] == type_value]

    # Series.sample raises ValueError when n exceeds the number of rows, so cap the
    # request at the group size; groups with ten or more rows are sampled exactly as before.
    sample_size = min(10, len(filtered_df))
    sampled_images[type_value] = filtered_df['Fluro_image'].sample(n=sample_size, random_state=42).tolist()

# Print the results
for type_value, images in sampled_images.items():
    print(f"Type: {type_value}")
    print(f"Sampled Fluro_image values: {images}")
    print("\n")  # Add a newline for better readability

In [None]:
import os
import matplotlib.pyplot as plt
from tifffile import imread
import numpy as np

# Show a single .tif file as a viridis-coloured image
def display_tif_image(file_path):
    """Read the TIFF at ``file_path`` and display it with a colorbar.

    The figure is titled with the file's base name and drawn without axis
    ticks. Nothing is returned.
    """
    pixels = imread(file_path)

    fig, ax = plt.subplots(figsize=(6, 6))
    shown = ax.imshow(pixels, cmap='viridis')
    fig.colorbar(shown, ax=ax)
    ax.set_title(f"Image: {os.path.basename(file_path)}")
    ax.axis('off')
    plt.show()

# Show every sampled image, grouped by its Type value; the sampled entries are
# paths relative to the microscope image folder below
microscope_dir = "/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000"
for type_value, images in sampled_images.items():
    print(f"Displaying images for Type: {type_value}")
    for relative_name in images:
        display_tif_image(os.path.join(microscope_dir, relative_name))

## Merging between sequence data and fluoro

In [None]:
import csv
# Isolated-cell metadata table
t_df = pd.read_csv('/projects/steiflab/scratch/leli/A138856A/htert_20230822_131349_843.Run/isolated_metadata.csv')
#seq_df = pd.read_csv('/projects/steiflab/scratch/glchang/other/leon/A138856.tsv', sep = '\t')
# NOTE(review): the reader below splits on single spaces even though the file has a
# .tsv extension -- confirm the actual delimiter of this file.

# Read every row of the sequencing table. newline='' is what the csv module requires
# for files handed to csv.reader, so that line endings inside fields are handled by
# the reader itself.
with open('/projects/steiflab/scratch/glchang/other/leon/A138856.tsv', mode='r', newline='') as file:
    seq_rows = list(csv.reader(file, delimiter=' '))
seq_df = pd.DataFrame(seq_rows)

# Promote the first row to the header, then drop it from the data and renumber the rows.
# All values are strings at this point.
seq_df.columns = seq_df.iloc[0]
seq_df = seq_df.iloc[1:].reset_index(drop=True)

In [None]:
# Inspect the column names of the isolated-metadata table
t_df.columns

In [None]:
# Inspect the 'row' column of seq_df, which is used as a merge key further down
seq_df['row']

In [None]:
# Cast the merge keys and the read counts of seq_df to integers
for numeric_column in ('row', 'col', 'total_reads'):
    seq_df[numeric_column] = seq_df[numeric_column].astype(int)

# Left join on well position: every row of t_df is kept, matched on (R, C) == (row, col)
merged_df = t_df.merge(seq_df, how='left', left_on=['R', 'C'], right_on=['row', 'col'])


In [None]:
# Inspect the merged read counts (rows of t_df without a match in seq_df hold NaN after the left merge)
merged_df['total_reads']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# merged_df must provide the 'total_reads' and 'Type' columns

# One box per Type value showing the distribution of total_reads
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=merged_df, x='Type', y='total_reads', ax=ax)

# Title and axis labels
ax.set_title('Boxplot of Total Reads by Type')
ax.set_xlabel('Type')
ax.set_ylabel('Total Reads')

plt.show()


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from tifffile import imread

# merged_df must provide the 'total_reads', 'Fluro_image', 'cell_condition' and
# 'experimental_condition' columns

# Step 1: 80th percentile of total_reads, kept for reference only -- the split below uses the
# fixed ncc_cutoff. (This was previously computed with quantile(0.20), contradicting its name.)
percentile_80 = merged_df['total_reads'].quantile(0.80)
# NOTE(review): the origin of the two addends is not shown in this notebook -- confirm
ncc_cutoff = 113014 + 72160
# Step 2: Split the data at ncc_cutoff. The below_80th / above_80th names are historical;
# rows whose total_reads is NaN satisfy neither comparison and fall into neither group.
below_80th = merged_df[merged_df['total_reads'] <= ncc_cutoff]
above_80th = merged_df[merged_df['total_reads'] > ncc_cutoff]

# Step 3: Randomly sample up to 10 rows from each group. DataFrame.sample raises ValueError
# when n exceeds the number of rows, so the request is capped at the group size.
below_80th_sample = below_80th.sample(n=min(10, len(below_80th)))
above_80th_sample = above_80th.sample(n=min(10, len(above_80th)))

# Show one .tif file together with its brightest pixel value and its condition labels
def display_tif_image(file_path, cell_condition, experimental_condition):
    """Display the TIFF at ``file_path`` with a colorbar and an annotation.

    The maximum pixel value is written in the top-left corner of the image,
    and the title lists the file's base name plus the two condition values
    passed in. Nothing is returned.
    """
    pixels = imread(file_path)
    brightest = pixels.max()

    fig, ax = plt.subplots(figsize=(6, 6))
    shown = ax.imshow(pixels, cmap='viridis')
    fig.colorbar(shown, ax=ax)

    # Axes-relative coordinates keep the label in the corner whatever the image size
    ax.text(0.05, 0.95, f"Max Intensity: {brightest}", color='white', fontsize=12,
            ha='left', va='top', transform=ax.transAxes,
            bbox=dict(facecolor='black', alpha=0.5))

    ax.set_title(f"Image: {os.path.basename(file_path)}\nCell Condition: {cell_condition}\n Exp condition: {experimental_condition}")
    ax.axis('off')
    plt.show()

# Show the sampled images of both groups: first the rows at or below ncc_cutoff,
# then the rows above it
image_root = "/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000"
for heading, group_sample in (
    ("Displaying images below the ncc_cutoff:", below_80th_sample),
    ("Displaying images above the ncc_cutoff:", above_80th_sample),
):
    print(heading)
    for _, row in group_sample.iterrows():
        display_tif_image(
            os.path.join(image_root, row['Fluro_image']),
            row['cell_condition'],
            row['experimental_condition'],
        )


In [None]:
import matplotlib.pyplot as plt

# merged_df must provide the 'total_reads' column

# Step 1: 20th and 80th percentiles of total_reads
percentile_20 = merged_df['total_reads'].quantile(0.20)
percentile_80 = merged_df['total_reads'].quantile(0.80)

# Step 2: Histogram of the read counts
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(merged_df['total_reads'], bins=30, color='blue', edgecolor='black')

# Step 3: Mark both percentiles with dashed vertical lines
for marker_value, marker_label in (
    (percentile_20, f'20th Percentile: {percentile_20:.2f}'),
    (percentile_80, f'80th Percentile: {percentile_80:.2f}'),
):
    ax.axvline(marker_value, color='red', linestyle='dashed', linewidth=2, label=marker_label)

# Step 4: Title, axis labels and legend
ax.set_title('Histogram of Total Reads with 20th Percentile')
ax.set_xlabel('Total Reads')
ax.set_ylabel('Frequency')
ax.legend()

plt.show()


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tifffile import imread
import numpy as np

# merged_df must provide the 'total_reads' and 'Fluro_image' columns

# Step 1: Fixed read-count cutoff (a constant, not a percentile of the data)
ncc_cutoff = 113014 + 72160

# Step 2: Partition the rows around the cutoff; the below_80th / above_80th names are
# historical. Rows with NaN total_reads match neither comparison.
below_80th = merged_df.loc[merged_df['total_reads'].le(ncc_cutoff)]
above_80th = merged_df.loc[merged_df['total_reads'].gt(ncc_cutoff)]

# Step 3: Z-score of the brightest pixel within a row's own image
def get_max_intensity_zscore(row):
    """Return how many standard deviations the maximum pixel lies above the image mean.

    The image is the file named by ``row['Fluro_image']`` inside the
    microscope image folder. When the pixel standard deviation is not
    positive, 0 is returned instead.
    """
    image_dir = "/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000"
    pixels = imread(os.path.join(image_dir, row['Fluro_image']))

    spread = np.std(pixels)
    if not spread > 0:
        # Constant image: the z-score is undefined, report 0
        return 0
    return (pixels.max() - np.mean(pixels)) / spread

# Score every image in each group; the groups are copied first so the new column
# is not written onto a slice of merged_df (avoids SettingWithCopyWarning)
below_80th = below_80th.copy()
below_80th['max_intensity_zscore'] = below_80th.apply(get_max_intensity_zscore, axis=1)
above_80th = above_80th.copy()
above_80th['max_intensity_zscore'] = above_80th.apply(get_max_intensity_zscore, axis=1)

# Stack both groups into one long table with a 'group' label for plotting
labelled_groups = [
    below_80th[['max_intensity_zscore']].assign(group='Below ncc_cutoff'),
    above_80th[['max_intensity_zscore']].assign(group='Above ncc_cutoff'),
]
plot_data = pd.concat(labelled_groups)

# Step 4: Violin plot of the z-score distribution of each group
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=plot_data, x='group', y='max_intensity_zscore', palette='viridis', ax=ax)

ax.set_title('Violin Plot of Max Pixel Intensity Z-Score by Group')
ax.set_xlabel('Group')
ax.set_ylabel('Max Pixel Intensity Z-Score')

plt.show()


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tifffile import imread

# merged_df must provide the 'total_reads' and 'Fluro_image' columns

# Step 1: Fixed read-count cutoff (a constant, not a percentile of the data)
ncc_cutoff = 113014 + 72160

# Step 2: Partition the rows around the cutoff; the below_80th / above_80th names are
# historical. Rows with NaN total_reads match neither comparison.
below_80th = merged_df.loc[merged_df['total_reads'].le(ncc_cutoff)]
above_80th = merged_df.loc[merged_df['total_reads'].gt(ncc_cutoff)]

# Step 3: Brightest pixel of a row's image
def get_max_intensity(row):
    """Return the maximum pixel value of the image named by ``row['Fluro_image']``.

    The file is looked up inside the microscope image folder.
    """
    image_dir = "/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000"
    pixels = imread(os.path.join(image_dir, row['Fluro_image']))
    return pixels.max()

# below_80th / above_80th are selections taken from merged_df; copy them before adding a
# column so the assignment does not target a slice (SettingWithCopyWarning), as the
# z-score cell of this notebook already does
below_80th = below_80th.copy()
above_80th = above_80th.copy()

# Apply the function to calculate max intensity for each group
below_80th['max_intensity'] = below_80th.apply(get_max_intensity, axis=1)
above_80th['max_intensity'] = above_80th.apply(get_max_intensity, axis=1)

# Combine the results into a single long DataFrame with a 'group' label for plotting
plot_data = pd.concat([
    below_80th[['max_intensity']].assign(group='Below ncc_cutoff'),
    above_80th[['max_intensity']].assign(group='Above ncc_cutoff')
])

# Step 4: Create a boxplot to visualize the distributions
plt.figure(figsize=(10, 6))
sns.boxplot(x='group', y='max_intensity', data=plot_data, palette='viridis')

# Add title and labels
plt.title('Boxplot of Max Pixel Intensity by Group')
plt.xlabel('Group')
plt.ylabel('Max Pixel Intensity')

# Display the plot
plt.show()
