# This document is meant to prepare the figures required for the paper

## Part a) 

### Isolatrix data preparation 

In [None]:
import pandas as pd
import os
import csv

# Build one classifier call per plate well (row, col) from the Isolatrix log
# files, attach it to the sequencing metadata, and keep the wells that have a
# prediction. Defines df_max_prediction_1 / df_max_prediction_0,
# df_combined_predictions, new_df, merged_df and isolatrix_df.

# Column names used for the per-well prediction tables
PREDICTION_COLUMNS = ['row', 'col', 'prediction']

# Your home directory path (change this to your actual path)
home_dir = "/projects/steiflab/archive/data/imaging/A138856A/NozzleImages"


def _stack_predictions(frames):
    """Concatenate per-file frames once and rename columns to row/col/prediction.

    Returns an empty frame that already has the expected columns when nothing
    was collected, so the later groupby/merge do not fail on missing columns.
    """
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=PREDICTION_COLUMNS)
    stacked = pd.concat(frames, ignore_index=True)
    stacked.columns = PREDICTION_COLUMNS
    return stacked


def _report_duplicates(frame, label):
    """Print and return the rows of *frame* whose (row, col) pair is repeated."""
    repeated = frame[frame.duplicated(subset=['row', 'col'], keep=False)]
    if not repeated.empty:
        print(f"Found duplicates in the {label} df  of 'row' and 'col':")
        print(repeated)
    else:
        print(f"No duplicates found in the {label} df  of 'row' and 'col'.")
    return repeated


# Per-file results are collected in lists and concatenated once after the walk
# (concatenating inside the loop re-copies everything gathered so far).
pred_1_frames = []
pred_0_frames = []

for root, _dirs, files in os.walk(home_dir):
    if "LogFile.csv" not in files:
        continue

    df = pd.read_csv(os.path.join(root, "LogFile.csv"))

    # For each (R, C) group keep the row with the maximum Prediction
    df_max_pred = df.loc[df.groupby(['R', 'C'])['Prediction'].idxmax()]
    df_max_pred = df_max_pred[['R', 'C', 'Prediction']]

    pred_1_frames.append(df_max_pred[df_max_pred['Prediction'] == 1])
    pred_0_frames.append(df_max_pred[df_max_pred['Prediction'] == 0])

df_max_prediction_1 = _stack_predictions(pred_1_frames)
df_max_prediction_0 = _stack_predictions(pred_0_frames)

# Combine both classes; empty tables are left out so they cannot affect dtypes
non_empty_tables = [
    table for table in (df_max_prediction_1, df_max_prediction_0) if not table.empty
]
if non_empty_tables:
    df_combined_predictions = pd.concat(non_empty_tables, ignore_index=True)
else:
    df_combined_predictions = pd.DataFrame(columns=PREDICTION_COLUMNS)

# A well seen in several log files keeps its maximum prediction
df_combined_predictions = df_combined_predictions.groupby(
    ['row', 'col'], as_index=False
).agg({'prediction': 'max'})

# Sanity check: after the groupby every (row, col) pair should be unique
duplicates = _report_duplicates(df_combined_predictions, "df_combined_predictions")

# Step 3: Read the sequencing metadata and prepare new_df
new_df = pd.read_csv('/projects/steiflab/archive/data/wgs/single_cell/internal/A138856/merge/metadata.tsv', delimiter = '\t')
print(f"The metadata has shape {new_df.shape}")

duplicates = _report_duplicates(new_df, "new")

# Left merge so that every metadata row is retained, with or without a prediction
merged_df = pd.merge(new_df, df_combined_predictions, how='left', on=['row', 'col'])

combination_counts = (
    new_df.groupby(['experimental_condition', 'cell_condition'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
print(combination_counts)

has_prediction = merged_df['prediction'].notna()

combination_counts_2 = (
    merged_df[has_prediction]
    .groupby(['experimental_condition', 'cell_condition'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
print("got rif of the prediction is NA")
print(combination_counts_2)

# Wells with a prediction, reduced to the columns used downstream
isolatrix_df = merged_df.loc[
    has_prediction,
    ['row', 'col', 'experimental_condition', 'cell_condition', 'prediction'],
]

duplicates = _report_duplicates(isolatrix_df, "isolatrix_df")


### cellenONE data preparation

In [None]:
# Load the sequencing metadata and keep the CellenONE wells, each marked with
# prediction = 1. Defines new_df and cellenone_df.
new_file_path = "/projects/steiflab/archive/data/wgs/single_cell/internal/A138856/merge/metadata.tsv"
new_df = pd.read_csv(new_file_path, delimiter='\t')
print(f"The metadata has shape {new_df.shape}")

# Filter new_df to keep only rows where 'experimental_condition' is 'CellenONE'
# (.copy() so the column added below does not write into a view of new_df)
cellenone_df = new_df[new_df['experimental_condition'] == 'CellenONE'].copy()

combination_counts_2 = cellenone_df.groupby(['experimental_condition', 'cell_condition']).size().reset_index(name='counts')
combination_counts_2.sort_values(by='counts', ascending=False, inplace=True)

print(combination_counts_2)

# Add a 'prediction' column with value 1 (as all predictions for CellenONE are considered as 1)
cellenone_df['prediction'] = 1

# Keep the well coordinates, both condition columns and the prediction
cellenone_df = cellenone_df[['row', 'col', 'experimental_condition', 'cell_condition', 'prediction']]

# Report the size of the resulting DataFrame
print(f"The cellenone_df has shape {cellenone_df.shape}")




In [None]:
# Stack the Isolatrix and CellenONE wells into a single table
combined_df = pd.concat([isolatrix_df, cellenone_df], ignore_index=True)

# Flag every well whose (row, col) pair occurs more than once
repeated_well_mask = combined_df.duplicated(subset=['row', 'col'], keep=False)
duplicates = combined_df[repeated_well_mask]

# Report the outcome of the duplicate check
if duplicates.empty:
    print("No duplicates found in the combination of 'row' and 'col'.")
else:
    print("Found duplicates in the combination of 'row' and 'col':")
    print(duplicates)

# Last expression of the cell: shows the column index in the notebook output
combined_df.columns

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Tag every well with its isolation technology, derive the ground truth from
# cell_condition, and build one confusion matrix per technology.
isolatrix_conditions = ['Iso', 'Iso10']
cellenone_condition = 'CellenONE'


def _technology_for(condition):
    """Return 'Isolatrix' or 'CellenONE' for an experimental_condition, else None."""
    if condition in isolatrix_conditions:
        return 'Isolatrix'
    if condition == cellenone_condition:
        return 'CellenONE'
    return None


combined_df['technology'] = combined_df['experimental_condition'].apply(_technology_for)

# Wells that belong to neither technology are removed
combined_df = combined_df.dropna(subset=['technology'])

# Ground truth: 1 for LiveCell, 0 for every other cell_condition
combined_df['ground_truth'] = (combined_df['cell_condition'] == 'LiveCell').astype(int)

# Per-technology subsets and their sizes
iso_df = combined_df[combined_df['technology'] == 'Isolatrix']
cellenone_df = combined_df[combined_df['technology'] == 'CellenONE']
isolatrix_count = len(iso_df)
cellenone_count = len(cellenone_df)

print(f"Total number of samples for Isolatrix: {isolatrix_count}")
print(f"Total number of samples for CellenONE: {cellenone_count}")

# Confusion matrices (sklearn layout: rows = ground truth, columns = prediction)
iso_cm = confusion_matrix(iso_df['ground_truth'], iso_df['prediction'])
cellenone_cm = confusion_matrix(cellenone_df['ground_truth'], cellenone_df['prediction'])

def add_percentage_labels_total(cm, total_count):
    """Build "count\\n(percent%)" annotation strings for a confusion matrix.

    Parameters:
        cm: array of counts (anything exposing ``.shape``, e.g. a numpy array).
        total_count: denominator for the percentages; a total of 0 yields 0%.

    Returns:
        A numpy array of strings with the same shape as ``cm``. Percentages are
        truncated to whole numbers.
    """
    # Local import: this cell does not import numpy before defining the helper.
    import numpy as np

    cm = np.asarray(cm)
    if total_count:
        # Multiply before dividing: 29 / 100 * 100 evaluates to 28.999... in
        # floating point and would be truncated to 28, while 29 * 100 / 100
        # is exactly 29.0.
        cm_perc = cm * 100 / total_count
    else:
        cm_perc = np.zeros(cm.shape)

    labels = [
        f"{count}\n({int(perc)}%)"
        for count, perc in zip(cm.ravel(), cm_perc.ravel())
    ]
    return np.array(labels).reshape(cm.shape)


# Annotation text ("count" plus percentage of the technology total) per matrix
iso_labels = add_percentage_labels_total(iso_cm, isolatrix_count)
cellenone_labels = add_percentage_labels_total(cellenone_cm, cellenone_count)

# One figure, two side-by-side heatmaps (Isolatrix left, CellenONE right)
plt.figure(figsize=(12, 6))

panels = [
    (iso_cm, iso_labels, 'Blues',
     ['Predicted Negative (class 0)', 'Predicted Positive (class 1)'],
     'Isolatrix Confusion Matrix'),
    (cellenone_cm, cellenone_labels, 'YlOrBr',
     ['Predicted Negative (discarded)', 'Predicted Positive (isolated)'],
     'CellenONE Confusion Matrix'),
]

for position, (matrix, annotations, colormap, column_names, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(matrix, annot=annotations, fmt='', cmap=colormap, cbar=False, square=True,
                annot_kws={"size": 16},  # annotation font size
                xticklabels=column_names,
                yticklabels=['True Negative', 'True Positive'])
    plt.title(heading, fontsize=14)

plt.tight_layout()
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/confusion_matrix.svg", format="svg")
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/confusion_matrix.png")
plt.show()


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Same labelling as the previous figure cell, except that wells belonging to
# neither technology are kept in combined_df (their 'technology' is None).
isolatrix_conditions = ['Iso', 'Iso10']
cellenone_condition = 'CellenONE'

# Lookup from experimental_condition to technology name
technology_by_condition = {condition: 'Isolatrix' for condition in isolatrix_conditions}
technology_by_condition[cellenone_condition] = 'CellenONE'

combined_df['technology'] = combined_df['experimental_condition'].apply(
    lambda condition: technology_by_condition.get(condition)
)

# Ground truth: 1 for LiveCell, 0 for every other cell_condition
combined_df['ground_truth'] = combined_df['cell_condition'].map(
    lambda condition: 1 if condition == 'LiveCell' else 0
)

# Per-technology subsets and their sizes
iso_df = combined_df[combined_df['technology'] == 'Isolatrix']
cellenone_df = combined_df[combined_df['technology'] == 'CellenONE']
isolatrix_count = len(iso_df)
cellenone_count = len(cellenone_df)

print(f"Total number of samples for Isolatrix: {isolatrix_count}")
print(f"Total number of samples for CellenONE: {cellenone_count}")

# Confusion matrices (sklearn layout: rows = ground truth, columns = prediction)
iso_cm = confusion_matrix(iso_df['ground_truth'], iso_df['prediction'])
cellenone_cm = confusion_matrix(cellenone_df['ground_truth'], cellenone_df['prediction'])

def add_percentage_labels_total(cm, total_count):
    """Build "count\\n(percent%)" annotation strings for a confusion matrix.

    Parameters:
        cm: array of counts (anything exposing ``.shape``, e.g. a numpy array).
        total_count: denominator for the percentages; a total of 0 yields 0%.

    Returns:
        A numpy array of strings with the same shape as ``cm``. Percentages are
        truncated to whole numbers.
    """
    cm = np.asarray(cm)
    if total_count:
        # Multiply before dividing: 29 / 100 * 100 evaluates to 28.999... in
        # floating point and would be truncated to 28, while 29 * 100 / 100
        # is exactly 29.0.
        cm_perc = cm * 100 / total_count
    else:
        cm_perc = np.zeros(cm.shape)

    labels = []
    for count, perc in zip(cm.ravel(), cm_perc.ravel()):
        labels.append(f"{count}\n({int(perc)}%)")
    return np.array(labels).reshape(cm.shape)


# Annotation text ("count" plus percentage of the technology total) per matrix
iso_labels = add_percentage_labels_total(iso_cm, isolatrix_count)
cellenone_labels = add_percentage_labels_total(cellenone_cm, cellenone_count)

plt.figure(figsize=(12, 6))

# Left panel: Isolatrix confusion matrix
left_ax = plt.subplot(1, 2, 1)
sns.heatmap(iso_cm, annot=iso_labels, fmt='', cmap='Blues', cbar=False, square=True,
            annot_kws={"size": 24},  # annotation font size
            xticklabels=['Predicted Negative (class 0)', 'Predicted Positive (class 1)'],
            yticklabels=['True Negative', 'True Positive'],
            ax=left_ax)
left_ax.set_title('Isolatrix Confusion Matrix', fontsize=14)

# Right panel: CellenONE confusion matrix with the first column hidden
right_ax = plt.subplot(1, 2, 2)

cellenone_xticks = ['Predicted Negative (discarded)', 'Predicted Positive (isolated)']
cellenone_yticks = ['True Negative', 'True Positive']

# True marks the cells to hide: the whole "Predicted Negative" column
mask = np.zeros(cellenone_cm.shape, dtype=bool)
mask[:, 0] = True

# First layer: real values, drawn only where the mask is False
sns.heatmap(cellenone_cm, annot=cellenone_labels, fmt='', cmap='YlOrBr', cbar=False, square=True,
            annot_kws={"size": 24},  # annotation font size
            xticklabels=cellenone_xticks,
            yticklabels=cellenone_yticks, mask=mask,
            ax=right_ax)

# Second layer: an all-zero matrix drawn only over the hidden column.
# NOTE(review): the data are constant zeros, so the 'Greys' colormap may render
# these cells at its light end rather than dark gray; confirm against the figure.
sns.heatmap(np.zeros_like(cellenone_cm), mask=~mask, cbar=False, square=True, cmap='Greys',
            xticklabels=cellenone_xticks,
            yticklabels=cellenone_yticks,
            ax=right_ax)

right_ax.set_title('CellenONE Confusion Matrix', fontsize=14)

plt.tight_layout()
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/confusion_matrix_wo_discarded.svg", format="svg")
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/confusion_matrix_wo_discarded.png")
plt.show()


### Isolatrix only: reweighting the confusion matrix to match the population class proportions

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Restrict combined_df to Isolatrix wells, build the raw confusion matrix, then
# rescale its columns with per-class weights and compare the metrics.
isolatrix_conditions = ['Iso', 'Iso10']

combined_df['technology'] = combined_df['experimental_condition'].map(
    lambda condition: 'Isolatrix' if condition in isolatrix_conditions else None
)

# Non-Isolatrix wells are removed from combined_df from here on
combined_df = combined_df.dropna(subset=['technology'])

# Ground truth: 1 for LiveCell, 0 for every other cell_condition
combined_df['ground_truth'] = combined_df['cell_condition'].map(
    lambda condition: 1 if condition == 'LiveCell' else 0
)

# Isolatrix subset, its size, and the number of wells per predicted class
iso_df = combined_df[combined_df['technology'] == 'Isolatrix']
isolatrix_count = len(iso_df)
class_0_sample_count = len(iso_df[iso_df['prediction'] == 0])
class_1_sample_count = len(iso_df[iso_df['prediction'] == 1])

print(f"Total number of samples for Isolatrix: {isolatrix_count}")
print(f"Class 0 sample count: {class_0_sample_count}")
print(f"Class 1 sample count: {class_1_sample_count}")

# Raw confusion matrix (sklearn layout: rows = ground truth, columns = prediction)
iso_cm = confusion_matrix(iso_df['ground_truth'], iso_df['prediction'])

# Population sizes used for the reweighting.
# NOTE(review): class_0_population + class_1_population is 3013, not 3311;
# confirm the remainder is intentionally left out of both classes.
total_population = 3311
class_0_population = 1824
class_1_population = 1189

# Share of each predicted class within the sample
class_0_sample_proportion = class_0_sample_count / isolatrix_count
class_1_sample_proportion = class_1_sample_count / isolatrix_count

# Weight = population share of the class / sample share of the class
class_weight_0 = (class_0_population / total_population) / class_0_sample_proportion
class_weight_1 = (class_1_population / total_population) / class_1_sample_proportion

# Same weight for both rows of a column, i.e. weights follow the predicted class
column_weights = [class_weight_0, class_weight_1]
weight_matrix = np.array([column_weights, column_weights])

# Every count is incremented by 1 before weighting so no weighted cell is zero
weighted_iso_cm = (iso_cm + 1) * weight_matrix

# Metrics of the unweighted predictions
y_true = iso_df['ground_truth']
y_pred = iso_df['prediction']
original_accuracy = accuracy_score(y_true, y_pred)
original_precision = precision_score(y_true, y_pred)
original_recall = recall_score(y_true, y_pred)
original_f1 = f1_score(y_true, y_pred)

# Metrics derived by hand from the weighted matrix cells
(weighted_tn, weighted_fp), (weighted_fn, weighted_tp) = weighted_iso_cm

weighted_accuracy = (weighted_tp + weighted_tn) / weighted_iso_cm.sum()
weighted_precision = weighted_tp / (weighted_tp + weighted_fp)
weighted_recall = weighted_tp / (weighted_tp + weighted_fn)
weighted_f1 = 2 * (weighted_precision * weighted_recall) / (weighted_precision + weighted_recall)

# Side-by-side report
print("Original Confusion Matrix Metrics:")
print(f"Accuracy: {original_accuracy:.4f}")
print(f"Precision: {original_precision:.4f}")
print(f"Recall: {original_recall:.4f}")
print(f"F1 Score: {original_f1:.4f}\n")

print("Weighted Confusion Matrix Metrics:")
print(f"Accuracy: {weighted_accuracy:.4f}")
print(f"Precision: {weighted_precision:.4f}")
print(f"Recall: {weighted_recall:.4f}")
print(f"F1 Score: {weighted_f1:.4f}")

def add_percentage_labels_total(cm, total_count):
    """Build "count\\n(percent%)" annotation strings for a (possibly weighted) matrix.

    Parameters:
        cm: array of counts or weighted counts (anything exposing ``.shape``).
        total_count: denominator for the percentages; a total of 0 yields 0%.

    Returns:
        A numpy array of strings with the same shape as ``cm``. Counts above 1
        are truncated to integers, other counts are rounded to two decimals,
        and percentages are truncated to whole numbers.
    """
    cm = np.asarray(cm)
    if total_count:
        # Multiply before dividing: 29 / 100 * 100 evaluates to 28.999... in
        # floating point and would be truncated to 28, while 29 * 100 / 100
        # is exactly 29.0.
        cm_perc = cm * 100 / total_count
    else:
        cm_perc = np.zeros(cm.shape)

    labels = []
    for count, perc in zip(cm.ravel(), cm_perc.ravel()):
        # Small weighted values keep two decimals so they do not collapse to 0
        shown = int(count) if count > 1 else round(count, 2)
        labels.append(f"{shown}\n({int(perc)}%)")
    return np.array(labels).reshape(cm.shape)

# Annotation text for both matrices; percentages use isolatrix_count as the base
original_iso_labels = add_percentage_labels_total(iso_cm, isolatrix_count)
weighted_iso_labels = add_percentage_labels_total(weighted_iso_cm, isolatrix_count)

# Raw matrix on the left, weighted matrix on the right
comparison_fig = plt.figure(figsize=(16, 6))

comparison_panels = (
    (iso_cm, original_iso_labels, 'Isolatrix Confusion Matrix'),
    (weighted_iso_cm, weighted_iso_labels, 'Weighted Isolatrix Confusion Matrix'),
)

for slot, (matrix, annotations, heading) in enumerate(comparison_panels, start=1):
    panel_ax = comparison_fig.add_subplot(1, 2, slot)
    sns.heatmap(matrix, annot=annotations, fmt='', cmap='Blues', cbar=False, square=True,
                annot_kws={"size": 16},  # annotation font size
                xticklabels=['Predicted Negative (class 0)', 'Predicted Positive (class 1)'],
                yticklabels=['True Negative', 'True Positive'],
                ax=panel_ax)
    panel_ax.set_title(heading, fontsize=14)

plt.tight_layout()
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/compare_confusion_matrices.svg", format="svg")
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/compare_confusion_matrices.png")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Estimate the expected TP / FP / TN / FN counts on a 384-well plate by
# repeatedly resampling wells (with replacement) from each technology.
combined_df = pd.read_csv("/projects/steiflab/scratch/leli/A138856A/final_combined_df.csv")

# Ensure reproducibility (DataFrame.sample without random_state draws from
# numpy's global random state)
np.random.seed(42)

# Number of resampling rounds and number of wells drawn per round
iterations = 1000
num_wells = 384

OUTCOME_NAMES = ['True Positives', 'False Positives', 'True Negatives', 'False Negatives']


def _count_outcomes(sample):
    """Return the TP / FP / TN / FN counts of one resampled set of wells."""
    truth = sample['ground_truth']
    call = sample['prediction']
    return {
        'True Positives': int(((truth == 1) & (call == 1)).sum()),
        'False Positives': int(((truth == 0) & (call == 1)).sum()),
        'True Negatives': int(((truth == 0) & (call == 0)).sum()),
        'False Negatives': int(((truth == 1) & (call == 0)).sum()),
    }


# The per-technology subsets never change, so they are selected once here
# instead of once per iteration.
isolatrix_wells = combined_df[combined_df['technology'] == 'Isolatrix']
cellenone_wells = combined_df[combined_df['technology'] == 'CellenONE']

# One list of per-iteration counts for every outcome
results_isolatrix = {name: [] for name in OUTCOME_NAMES}
results_cellenone = {name: [] for name in OUTCOME_NAMES}

for _ in range(iterations):
    # Isolatrix is drawn before CellenONE in every round so the seeded random
    # stream is consumed in a fixed order.
    for wells, results in ((isolatrix_wells, results_isolatrix),
                           (cellenone_wells, results_cellenone)):
        sample = wells.sample(n=num_wells, replace=True)
        for name, count in _count_outcomes(sample).items():
            results[name].append(count)

# Calculate the mean values across iterations
mean_results_isolatrix = {key: np.mean(value) for key, value in results_isolatrix.items()}
mean_results_cellenone = {key: np.mean(value) for key, value in results_cellenone.items()}

# Grouped bar chart of the mean resampled counts per outcome and technology
metrics = ['True Positives', 'False Positives', 'True Negatives', 'False Negatives']
isolatrix_counts = [mean_results_isolatrix[name] for name in metrics]
cellenone_counts = [mean_results_cellenone[name] for name in metrics]


def _lift_zero_bars(values):
    """Replace exact zeros by 2 so the bar stays visible; labels use the real value."""
    return [2 if value == 0 else value for value in values]


adjusted_isolatrix_counts = _lift_zero_bars(isolatrix_counts)
adjusted_cellenone_counts = _lift_zero_bars(cellenone_counts)

x = np.arange(len(metrics))  # one group position per metric
bar_width = 0.35  # width of a single bar
offset = 0.02  # small gap between the two bars of a group

fig, ax = plt.subplots(figsize=(10, 6))

# Isolatrix bars sit left of the group centre, CellenONE bars right of it
bars1 = ax.bar(x - bar_width/2 - offset, adjusted_isolatrix_counts, bar_width,
               label='Isolatrix', color='#5A79A5', edgecolor='#334F73', linewidth=2)
bars2 = ax.bar(x + bar_width/2 + offset, adjusted_cellenone_counts, bar_width,
               label='CellenONE', color='#F3E197', edgecolor='#BBA653', linewidth=2)


def _annotate_bars(bars, original_values):
    """Write the truncated count and its percentage of num_wells above each bar."""
    for bar, original_value in zip(bars, original_values):
        bar_top = bar.get_height()
        percentage = (original_value / num_wells) * 100
        ax.text(bar.get_x() + bar.get_width()/2, max(bar_top + 1, 1),
                f'{int(original_value):,}\n({int(percentage):}%)',
                ha='center', va='bottom', fontsize=12)


_annotate_bars(bars1, isolatrix_counts)
_annotate_bars(bars2, cellenone_counts)

# Axis labels, title, tick labels and legend
ax.set_xlabel('Metrics')
ax.set_ylabel('Count')
ax.set_title('Expected Performance (n = 384)')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

# Head-room above the tallest bar so the annotations are not cut off
tallest_bar = max(adjusted_isolatrix_counts + adjusted_cellenone_counts)
ax.set_ylim(0, tallest_bar + 30)

# Save and show the plot
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/exp_performance.svg", format="svg")
plt.savefig("/projects/steiflab/scratch/leli/A138856A/plots/exp_performance.png")
plt.show()




### Get value ready

In [None]:
import pandas as pd
import numpy as np

# Inputs expected from earlier cells (not defined in this cell):
# merged_df_pred_1 (Isolatrix with Prediction = 1)
# merged_df_pred_0 (Isolatrix with Prediction = 0)
# merged_isolated_metadata_df (CellenONE without Prediction column)

# The random draws below are not seeded; re-enable the next line for reproducibility
#np.random.seed(364599)
sampling_frac = 0.2  # kept for the optional per-group sub-sampling (currently disabled)

# Wells with more total reads than this are counted as actual single cells
READ_THRESHOLD = 113014 + 72160


def _split_by_reads(frame):
    """Return (positives, negatives): rows above / at-or-below READ_THRESHOLD."""
    above = frame['total_reads'] > READ_THRESHOLD
    return frame[above], frame[~above & frame['total_reads'].notna()]


def _prediction_counts(positives, negatives):
    """Return (tp, fn, tn, fp) from the Prediction column of the two groups."""
    tp = len(positives[positives['Prediction'] == 1])
    fn = len(positives[positives['Prediction'] == 0])
    tn = len(negatives[negatives['Prediction'] == 0])
    fp = len(negatives[negatives['Prediction'] == 1])
    return tp, fn, tn, fp


# Step 1: Add a 'Prediction' column with value 1 for merged_isolated_metadata_df
merged_isolated_metadata_df['Prediction'] = 1

# Step 2: Combine the Isolatrix dataframes into one
isolatrix_combined_df = pd.concat([merged_df_pred_1, merged_df_pred_0], ignore_index=True)

# Draw 400 wells per technology (raises if a table has fewer than 400 rows)
isolatrix_combined_df = isolatrix_combined_df.sample(n=400)
merged_isolated_metadata_df = merged_isolated_metadata_df.sample(n=400)
print(f"the original dataframe is of size isolatrix {isolatrix_combined_df.shape} and the cellenone is {merged_isolated_metadata_df.shape}")

# Step 3: Split each technology into actual positives / negatives by read count
isolatrix_positives, isolatrix_negatives = _split_by_reads(isolatrix_combined_df)
cellenone_positives, cellenone_negatives = _split_by_reads(merged_isolated_metadata_df)

sample_size = min(len(isolatrix_positives), len(isolatrix_negatives), len(cellenone_positives), len(cellenone_negatives))
print(f"The sample size is {sample_size}, {[len(isolatrix_positives), len(isolatrix_negatives), len(cellenone_positives), len(cellenone_negatives)]}")

# Step 4 (disabled): optionally sub-sample each of the four groups with
# .sample(frac=sampling_frac) before counting.

# Step 5: For each technology, count the predictions within positives and negatives
(isolatrix_tp_count, isolatrix_fn_count,
 isolatrix_tn_count, isolatrix_fp_count) = _prediction_counts(isolatrix_positives, isolatrix_negatives)

(cellenone_tp_count, cellenone_fn_count,
 cellenone_tn_count, cellenone_fp_count) = _prediction_counts(cellenone_positives, cellenone_negatives)

# Print out the results
print("\nIsolatrix Results:")
print(f"True Positives (TP): {isolatrix_tp_count}")
print(f"False Negatives (FN): {isolatrix_fn_count}")
print(f"True Negatives (TN): {isolatrix_tn_count}")
print(f"False Positives (FP): {isolatrix_fp_count}")

print("\nCellenONE Results:")
print(f"True Positives (TP): {cellenone_tp_count}")
print(f"False Negatives (FN): {cellenone_fn_count}")
print(f"True Negatives (TN): {cellenone_tn_count}")
print(f"False Positives (FP): {cellenone_fp_count}")

# Full rows behind the CellenONE TP / FP counts, for inspection
print(f"True Positives (TP): {cellenone_positives[cellenone_positives['Prediction'] == 1]}")
print(f"False Positives (FP): {cellenone_negatives[cellenone_negatives['Prediction'] == 1]}")


### PLOT NOT SELECTED

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart of TP / FP / FN per technology. CellenONE has no false-negative
# value, so its FN slot shows an explanatory note instead of a bar label.
cellenone_fn_count = None  # FN for CellenONE will be replaced with text

FN_LABEL = 'False Negatives (FN)'

# Data for the bar chart
metrics = ['True Positives (TP)', 'False Positives (FP)', FN_LABEL]
isolatrix_counts = [isolatrix_tp_count, isolatrix_fp_count, isolatrix_fn_count]
cellenone_counts = [cellenone_tp_count, cellenone_fp_count, 0]  # FN is set to 0, and we'll replace it with text

x = np.arange(len(metrics))  # the label locations
bar_width = 0.35  # the width of the bars
offset = 0.02  # small gap between the two bars of a group

# Create the bar chart
fig, ax = plt.subplots(figsize=(10, 6))

# Isolatrix: blue fill with a darker blue border
bars1 = ax.bar(x - bar_width/2 - offset, isolatrix_counts, bar_width,
               label='Isolatrix', color='#5A79A5', edgecolor='#334F73', linewidth=2)

# CellenONE: yellow fill with a darker yellow border
bars2 = ax.bar(x + bar_width/2 + offset, cellenone_counts, bar_width,
               label='CellenONE', color='#F3E197', edgecolor='#BBA653', linewidth=2)

# Replace the CellenONE FN bar with text
for position, (metric, count) in enumerate(zip(metrics, cellenone_counts)):
    if count == 0 and metric == FN_LABEL:
        ax.text(x[position] + bar_width/2 + offset, 0, '  * CellenONE: No ground truth for validation',
                ha='center', va='bottom', fontsize=12, color='black', rotation=90, style='italic')

# Add the count values on top of the bars for Isolatrix
for bar in bars1:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval, int(yval), ha='center', va='bottom', fontsize=12)

# Add the count values on top of the bars for CellenONE. The FN bar is skipped
# by its metric name, so the skip does not depend on it being the last bar.
for metric, bar in zip(metrics, bars2):
    if metric == FN_LABEL:
        continue
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval, int(yval), ha='center', va='bottom', fontsize=12)

# Add labels and title
ax.set_xlabel('Metrics')
ax.set_ylabel('Count')
ax.set_title('Evaluation of Isolatrix and CellenONE Performance', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

# Show plot
plt.show()




## Part b)

### Data Prep

In [None]:
# Load feature_metadata.csv from scratch storage into combined_df (this rebinds
# the combined_df name used by the Part a cells).
# NOTE(review): the next cell looks wells up in combined_df by 'row' and 'col'
# and reads this same file again as features_metadata_df; confirm this is the
# intended input table.
combined_df = pd.read_csv("/projects/steiflab/scratch/leli/A138856A/feature_metadata.csv")
# Bare expression: the notebook displays the frame as the cell output
combined_df

In [None]:
import os
import pandas as pd

# For every run folder that has a track_to_well table, join the tracks to the
# feature metadata and copy the track information onto the matching well
# (row, col) of combined_df.

# Feature columns copied from the merged track table onto combined_df
feature_columns = ['area', 'circularity', 'intensity', 'energy', 'intensity_not_norm']

# Initialize new columns in combined_df
for new_column in ['track_ID', 'subdirectory_name', *feature_columns]:
    combined_df[new_column] = pd.NA

# Your home directory path (change this to your actual directory)
home_dir = "/projects/steiflab/scratch/leli/A138856A"  # Replace with your actual directory

# Writing the result is opt-in; set to True to (over)write final_combined_df.csv
save_final_dataframe = False

# Read the features_metadata.csv file
features_metadata_df = pd.read_csv("/projects/steiflab/scratch/leli/A138856A/feature_metadata.csv")

# Folders are visited in sorted order: when several folders map to the same
# well the last one wins, and os.listdir alone returns entries in arbitrary order.
for subfolder in sorted(os.listdir(home_dir)):
    # Only folders that contain track_to_well/track_to_well_pp.csv are processed
    track_to_well_path = os.path.join(home_dir, subfolder, "track_to_well", "track_to_well_pp.csv")
    if not os.path.exists(track_to_well_path):
        continue

    track_to_well_df = pd.read_csv(track_to_well_path)

    # Filter to keep only positive, nonzero track IDs
    track_to_well_df = track_to_well_df[track_to_well_df['track_ID'] > 0]

    # Merge track_to_well_df with features_metadata_df on 'track_ID' (track_to_well) and 'track_id' (features_metadata)
    merged_df = pd.merge(track_to_well_df, features_metadata_df, left_on='track_ID', right_on='track_id', how='left')

    # Tracks without a matching feature row come out of the left merge with a NaN track_id
    missing_metadata = merged_df[merged_df['track_id'].isna()]
    if not missing_metadata.empty:
        print(f"Warning: {len(missing_metadata)} rows in 'track_to_well_pp.csv' with positive track IDs do not have associated entries in 'features_metadata.csv' for subdirectory: {subfolder}")
        print("Missing Track IDs:", missing_metadata['track_ID'].tolist())

    # Now link the merged_df with combined_df based on 'row' and 'col'
    for _, row in merged_df.iterrows():
        same_well = (combined_df['row'] == row['row']) & (combined_df['col'] == row['col'])
        if not same_well.any():
            continue
        combined_df.loc[same_well, 'track_ID'] = row['track_ID']
        combined_df.loc[same_well, 'subdirectory_name'] = subfolder
        for feature in feature_columns:
            combined_df.loc[same_well, feature] = row[feature]

# Assign a name to the big dataframe before saving it
big_dataframe = combined_df

# Save the big dataframe only when explicitly enabled, and report what happened
output_path = os.path.join(home_dir, "final_combined_df.csv")
if save_final_dataframe:
    big_dataframe.to_csv(output_path, index=False)
    print(f"Saved the combined dataframe to {output_path}.")
else:
    print("Saving is disabled (save_final_dataframe = False); final_combined_df.csv was not written.")


In [None]:
# Count the rows for every (technology, ground_truth) combination
total_combinations = combined_df.groupby(['technology', 'ground_truth']).size().reset_index(name='counts')

# Count the rows per combination that have a track_ID (i.e., not NaN)
combinations_with_track_ID = combined_df.dropna(subset=['track_ID']).groupby(['technology', 'ground_truth']).size().reset_index(name='counts')

# Count the rows with a track_ID, additionally split by prediction
combinations_with_track_ID_and_prediction = combined_df.dropna(subset=['track_ID']).groupby(['technology', 'ground_truth', 'prediction']).size().reset_index(name='counts')

# Count the rows WITHOUT a track_ID (NaN), split by prediction.
# The rows must be restricted to missing track_IDs first; grouping the whole table would count every row.
combinations_with_na_track_ID_and_prediction = combined_df[combined_df['track_ID'].isna()].groupby(['technology', 'ground_truth', 'prediction']).size().reset_index(name='counts')

# Print the total number of combinations
print("Total number of experimental_condition and cell_condition combinations:")
print(total_combinations)

# Print the number of combinations that have a track_ID
print("\nNumber of combinations with track_ID (non-NaN):")
print(combinations_with_track_ID)

# Print the number of combinations that do not have a track_ID grouped by prediction
print("\nNumber of combinations with NaN track_ID grouped by prediction:")
print(combinations_with_na_track_ID_and_prediction)

# Print the number of combinations with track_ID grouped by prediction
print("\nNumber of combinations with track_ID (non-NaN) grouped by prediction:")
print(combinations_with_track_ID_and_prediction)



In [None]:
# Isolatrix wells with cell_condition 'LiveCell' that were predicted as class 0,
# reduced to the track identifier and its subdirectory; wells without a track_ID are dropped.
# NOTE(review): the plotting code treats LiveCell rows with prediction == 0 as false
# negatives, so the `_tp` suffix of this variable looks misleading - confirm.
isolatrix_livecell_tp = (
    combined_df.loc[
        (combined_df['technology'] == 'Isolatrix')
        & (combined_df['cell_condition'] == 'LiveCell')
        & (combined_df['prediction'] == 0),
        ['track_ID', 'subdirectory_name'],
    ]
    .dropna(subset=['track_ID'])
)

# Show the selected wells
print(isolatrix_livecell_tp)


In [None]:
# Isolatrix wells with cell_condition 'NCC' that were predicted as class 0,
# reduced to the track identifier and its subdirectory; wells without a track_ID are dropped.
# NOTE(review): the variable name is reused from the LiveCell cell although the rows are NCC wells.
isolatrix_livecell_tp = (
    combined_df.loc[
        (combined_df['technology'] == 'Isolatrix')
        & (combined_df['cell_condition'] == 'NCC')
        & (combined_df['prediction'] == 0),
        ['track_ID', 'subdirectory_name'],
    ]
    .dropna(subset=['track_ID'])
)

# Show the selected wells
print(isolatrix_livecell_tp)

### Plot TP and FN

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mannwhitneyu
from cliffs_delta import cliffs_delta

def create_violinplot_with_boxplot(data, xticklabels, title, output_directory=None):
    """
    Create a violin plot with embedded boxplots for multiple lists of values,
    and display the results of a Mann-Whitney U test and Cliff's Delta for effect size.

    The first two non-empty groups are compared (two-sided test) and the result is written
    in the top-left corner of the axes. When fewer than two groups contain values, the
    statistics are skipped and the remaining group is still plotted.

    NOTE: a later cell ("Plot class 0 and class 1") defines another function with this same
    name; whichever cell was executed last is the definition that gets called.

    Parameters:
    - data: List of lists containing numerical values to plot.
    - xticklabels: Labels for the x-axis corresponding to each list in data.
    - title: Title of the plot; also used as the y-axis label and in the saved file names.
    - output_directory: Directory to save the plot as SVG and PNG (default is None: nothing is saved).

    Returns:
    - None. The figure is shown with plt.show().
    """
    # Nothing to draw when no group holds any value
    if not data or all(len(d) == 0 for d in data):
        print("No data available for plotting.")
        return

    # Remove empty lists and their corresponding labels; each label carries its group size
    filtered_data = [values for values in data if len(values) > 0]
    filtered_labels = [f"{label} (n={len(values)})"
                       for label, values in zip(xticklabels, data) if len(values) > 0]

    # Colors for the violins (shades of blue)
    colors = ['#5A79A5', '#334F73']  # Two different shades of blue

    # The statistical comparison needs two groups; indexing filtered_data[1] with a single
    # surviving group would raise IndexError, so the comparison is skipped in that case.
    has_two_groups = len(filtered_data) >= 2
    if has_two_groups:
        # Perform Mann-Whitney U Test
        _, p_value = mannwhitneyu(filtered_data[0], filtered_data[1], alternative='two-sided')

        # Calculate Cliff's Delta for effect size
        delta, magnitude = cliffs_delta(filtered_data[0], filtered_data[1])
    else:
        print("Fewer than two non-empty groups; skipping the statistical comparison.")

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 8))

    # Create violin plot
    parts = ax.violinplot(filtered_data, showmeans=False, showmedians=True, showextrema=True)

    # Customize the violin plot colors
    for i, pc in enumerate(parts['bodies']):
        pc.set_facecolor(colors[i % len(colors)])  # Assign colors cyclically if more violins than colors
        pc.set_edgecolor('black')
        pc.set_alpha(0.7)

    # Add boxplots at the same x positions as the violins (1..k)
    boxprops = dict(linestyle='-', linewidth=2, color='black')
    medianprops = dict(linestyle='-', linewidth=2.5, color='red')
    flierprops = dict(marker='o', color='black', markersize=3, markerfacecolor='black')
    ax.boxplot(filtered_data, positions=np.arange(1, len(filtered_data) + 1), widths=0.1,
               boxprops=boxprops, medianprops=medianprops, showmeans=False, flierprops=flierprops)

    # Set axis labels and title
    ax.set_xticks(np.arange(1, len(filtered_labels) + 1))
    ax.set_xticklabels(filtered_labels, rotation=0, ha='center')
    ax.set_ylabel(title.capitalize())
    ax.set_title(title, fontsize=16, fontweight='bold')

    if has_two_groups:
        # Annotate the x-axis to show which category each violin belongs to.
        # The fixed 0.25 / 0.75 positions only make sense with two violins.
        ax.text(0.25, -0.1, 'True Positive', ha='center', va='center', transform=ax.transAxes, fontsize=12, fontweight='bold')
        ax.text(0.75, -0.1, 'False Negative', ha='center', va='center', transform=ax.transAxes, fontsize=12, fontweight='bold')

    # Add grid for better readability
    ax.grid(True, linestyle='--', alpha=0.7)

    if has_two_groups:
        # Display the statistical test results on the plot in the top left corner
        ax.text(0.02, 0.99, f'Wilcoxon: p-value={p_value:.3f}  Effect Size: {delta:.2f} ({magnitude})',
                transform=ax.transAxes, fontsize=12, verticalalignment='top')

    # rect keeps the right 15% of the figure free (no legend is drawn by this function)
    plt.tight_layout(rect=[0, 0, 0.85, 1])

    # Save the plot if an output directory is provided
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
        plt.savefig(os.path.join(output_directory, f'{title}_violin.svg'), format = "svg")
        plt.savefig(os.path.join(output_directory, f'{title}_violin.png'))

    # Show the plot
    plt.show()


def plot_isolatrix_tp_fn(combined_df, feature_name, output_directory=None):
    """
    Compare one feature between Isolatrix True Positives (TP) and False Negatives (FN).

    Both groups are taken from rows with technology 'Isolatrix' and cell_condition
    'LiveCell'; rows with prediction 1 form the TP group and rows with prediction 0
    the FN group. Missing feature values are dropped before plotting.

    Parameters:
    - combined_df: The combined dataframe containing the data.
    - feature_name: The column name of the feature to be plotted.
    - output_directory: Directory to save the plot (default is None).

    Returns:
    - A two-element list [tp_values, fn_values] with the plotted values.
    """
    # Rows shared by both groups
    live_isolatrix = combined_df[(combined_df['technology'] == 'Isolatrix') &
                                 (combined_df['cell_condition'] == 'LiveCell')]

    def values_for(predicted_class):
        # Non-missing feature values of the wells predicted as the given class
        chosen = live_isolatrix[live_isolatrix['prediction'] == predicted_class]
        return chosen[feature_name].dropna().tolist()

    # Order matters: TP first, FN second, matching the tick labels below
    data = [values_for(1), values_for(0)]
    xticklabels = ['True Positive', 'False Negative']

    # Human-readable plot titles; unknown feature names are used as they are
    display_names = {
        "circularity": "Circularity",
        "intensity": "Normalized Intensity",
        "area": "Area (n Pixels)",
        "energy": "Textual Homogeneity",
    }
    plot_title = display_names.get(feature_name, feature_name)

    create_violinplot_with_boxplot(data, xticklabels, plot_title, output_directory)

    return data


In [None]:
# TP vs FN comparison for the 'circularity' column; `data` holds [tp_values, fn_values]
data = plot_isolatrix_tp_fn(combined_df, "circularity", output_directory="/projects/steiflab/scratch/leli/A138856A/plots/")

In [None]:
# TP vs FN comparison for the 'intensity' column (titled "Normalized Intensity")
data = plot_isolatrix_tp_fn(combined_df, "intensity", output_directory="/projects/steiflab/scratch/leli/A138856A/plots/")

In [None]:
# TP vs FN comparison for 'intensity_not_norm'; no display name is mapped, so the column name is the title
data = plot_isolatrix_tp_fn(combined_df, "intensity_not_norm", output_directory="/projects/steiflab/scratch/leli/A138856A/plots/")

In [None]:
# Show the combined table as the cell output
combined_df

In [None]:
# TP vs FN comparison for the 'area' column (titled "Area (n Pixels)")
data = plot_isolatrix_tp_fn(combined_df, "area", output_directory="/projects/steiflab/scratch/leli/A138856A/plots/")

In [None]:
# TP vs FN comparison for the 'energy' column (titled "Textual Homogeneity")
data = plot_isolatrix_tp_fn(combined_df, "energy", output_directory="/projects/steiflab/scratch/leli/A138856A/plots/")

### Plot class 0 and class 1

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mannwhitneyu
from cliffs_delta import cliffs_delta

def add_stat_annotation(ax, p_value, effect_size, magnitude, x1, x2, y, h, col):
    """
    Draw a bracket between two x positions and label it with the test result.

    Parameters:
    - ax: Matplotlib axis object.
    - p_value: The p-value to annotate; values below 0.0001 are shown as "p < 0.0001".
    - effect_size: The effect size to annotate (printed with two decimals).
    - magnitude: Label printed in parentheses after the effect size.
    - x1, x2: The positions on the x-axis for the two groups.
    - y: The y position where the bracket legs start.
    - h: The height of the bracket; the text sits on top of it.
    - col: The color of the bracket and the text.
    """
    # Very small p-values are reported as a bound instead of a rounded zero
    if p_value < 0.0001:
        p_part = "p < 0.0001"
    else:
        p_part = f"p={p_value:.4f}"
    label = f"{p_part}, effect size={effect_size:.2f} ({magnitude})"

    # Bracket: up from y at x1, across at y + h, back down at x2
    top = y + h
    ax.plot([x1, x1, x2, x2], [y, top, top, y], lw=1.5, c=col)

    # Centered text resting on the bracket
    ax.text((x1 + x2) * 0.5, top, label, ha='center', va='bottom', color=col, fontsize=14)

def create_violinplot_with_boxplot(data, xticklabels, title, output_directory=None):
    """
    Create a violin plot with embedded boxplots for multiple lists of values.

    Groups (0, 1) and (2, 3) are each compared with a two-sided Mann-Whitney U test and
    Cliff's delta, and a labelled bracket is drawn above every pair whose two groups both
    contain values. Empty groups are left out of the plot; brackets are placed at the
    x positions the remaining groups actually occupy.

    NOTE: this definition replaces the function of the same name from the
    "Plot TP and FN" cell once this cell has been executed.

    Parameters:
    - data: List of lists containing numerical values to plot.
    - xticklabels: Labels for the x-axis corresponding to each list in data.
    - title: Title of the plot; also used as the y-axis label and in the saved file name.
    - output_directory: Directory to save the plot (default is None).

    Returns:
    - None. The figure is shown with plt.show().
    """
    # Check that the data list is not empty
    if not data or all(len(d) == 0 for d in data):
        print("No data available for plotting.")
        return

    # Keep only the non-empty groups and remember where each one lands on the x-axis
    # (violins and boxes are drawn at positions 1..k)
    kept = [i for i, values in enumerate(data) if len(values) > 0]
    filtered_data = [data[i] for i in kept]
    filtered_labels = [f"{xticklabels[i]} (n={len(data[i])})" for i in kept]
    position_of = {original: position for position, original in enumerate(kept, start=1)}

    # Colors for the violins
    colors = ['#5A79A5', '#5A79A5', '#F3E197', '#F3E197']
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 8))

    # Create violin plot
    parts = ax.violinplot(filtered_data, showmeans=False, showmedians=True, showextrema=True)

    # Customize the violin plot colors
    for i, pc in enumerate(parts['bodies']):
        pc.set_facecolor(colors[i % len(colors)])  # Assign colors cyclically if more violins than colors
        pc.set_edgecolor('black')
        pc.set_alpha(0.7)

    # Add boxplots
    boxprops = dict(linestyle='-', linewidth=2, color='black')
    medianprops = dict(linestyle='-', linewidth=2.5, color='red')
    flierprops = dict(marker='o', color='black', markersize=3, markerfacecolor='black')
    ax.boxplot(filtered_data, positions=np.arange(1, len(filtered_data) + 1), widths=0.1,
               boxprops=boxprops, medianprops=medianprops, showmeans=False, flierprops=flierprops)

    # Set axis labels and title
    ax.set_xticks(np.arange(1, len(filtered_labels) + 1))
    ax.set_xticklabels(filtered_labels, rotation=0, ha='center')
    ax.set_ylabel(title.capitalize())
    ax.set_title(title, fontsize=16, fontweight='bold')

    # Annotate the x-axis to show which technology each pair belongs to
    ax.text(0.25, -0.1, 'Isolatrix', ha='center', va='center', transform=ax.transAxes, fontsize=12, fontweight='bold')
    ax.text(0.75, -0.1, 'CellenONE', ha='center', va='center', transform=ax.transAxes, fontsize=12, fontweight='bold')

    # Add grid for better readability
    ax.grid(True, linestyle='--', alpha=0.7)

    # Statistical analysis and annotations
    pairs = [(0, 1), (2, 3)]
    # Value range over the plotted groups only: max()/min() of an empty list raises ValueError
    y_max = max(max(values) for values in filtered_data)
    y_min = min(min(values) for values in filtered_data)
    h = (y_max - y_min) * 0.05  # Height of the bracket
    for (x1, x2) in pairs:
        # A pair is compared only when both of its groups exist and hold values
        if x1 not in position_of or x2 not in position_of:
            continue
        # Perform Mann-Whitney U test (Wilcoxon rank-sum test)
        _, p_value = mannwhitneyu(data[x1], data[x2], alternative='two-sided')
        # Calculate effect size (Cliff's delta)
        effect_size, magnitude = cliffs_delta(data[x1], data[x2])
        # Annotate the plot at the positions the two groups occupy after filtering
        add_stat_annotation(ax, p_value, effect_size, magnitude,
                            position_of[x1], position_of[x2], y_max + h, h, 'black')
        y_max += h * 2  # Update y_max for next annotation

    # rect keeps the right 15% of the figure free (no legend is drawn by this function)
    plt.tight_layout(rect=[0, 0, 0.85, 1])

    # Save the plot if an output directory is provided
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
        plt.savefig(os.path.join(output_directory, f'{title}_Iso_cell_comparison_violin.png'), dpi=300)

    # Show the plot
    plt.show()

def plot_feature_violin(home_directory, feature_name, output_directory=None):
    """
    Collect one feature from the feature CSV files below a directory and plot it as
    four violins: Isolatrix isolated, Isolatrix discarded, CellenONE isolated,
    CellenONE discarded.

    Files named features_p1.csv, features_p0.csv, features_p2.csv or features.csv are read.
    Inside folders whose path contains "dropRun", features_p1.csv feeds the Isolatrix
    isolated group and every other matching file the Isolatrix discarded group. Elsewhere
    only features.csv is used and its rows are split by the 'label' column
    ('isolated' / 'discarded').

    Parameters:
    - home_directory: The home directory containing the folders.
    - feature_name: The column name of the feature to be plotted.
    - output_directory: Directory to save the plot (default is None).

    Returns:
    - A list with the four value lists in the order given above.
    """
    isolatrix_isolated = []
    isolatrix_discarded = []
    cellenone_isolated = []
    cellenone_discarded = []

    # Number of files per category that contained no usable value
    empty_datasets = {"Isolatrix Isolated": 0, "Isolatrix Discarded": 0, "CellenONE Isolated": 0, "CellenONE Discarded": 0}

    feature_files = ("features_p1.csv", "features_p0.csv", "features_p2.csv", "features.csv")

    # Walk the whole tree below the home directory
    for folder, _subfolders, filenames in os.walk(home_directory):
        for filename in filenames:
            if filename not in feature_files:
                continue

            file_path = os.path.join(folder, filename)
            print(f"Processing file: {file_path}")
            table = pd.read_csv(file_path)
            if feature_name not in table.columns:
                print(f"Feature '{feature_name}' not found in file: {file_path}")
                continue

            if "dropRun" not in folder:
                # Outside dropRun folders only features.csv is used; rows are split by 'label'
                if filename == "features.csv":
                    labels = table['label']
                    cellenone_isolated.extend(table.loc[labels == 'isolated', feature_name].dropna().tolist())
                    cellenone_discarded.extend(table.loc[labels == 'discarded', feature_name].dropna().tolist())
                continue

            # dropRun folder: the file name decides the group
            values = table[feature_name].dropna()
            if filename == "features_p1.csv":
                if values.empty:
                    empty_datasets["Isolatrix Isolated"] += 1
                    print(f"No data in 'Isolatrix Isolated' for file: {file_path}")
                else:
                    isolatrix_isolated.extend(values.tolist())
                    print(f"isolatrix isolated: {len(isolatrix_isolated)}")
            else:
                if values.empty:
                    empty_datasets["Isolatrix Discarded"] += 1
                    print(f"No data in 'Isolatrix Discarded' for file: {file_path}")
                else:
                    isolatrix_discarded.extend(values.tolist())
                    print(f"isolatrix discarded: {len(isolatrix_discarded)}")

    # Report categories for which at least one file had no data
    for category, count in empty_datasets.items():
        if count > 0:
            print(f"Empty dataset found in {category}: {count} files with no data")

    # Group order must match the tick labels
    data = [isolatrix_isolated, isolatrix_discarded, cellenone_isolated, cellenone_discarded]
    xticklabels = ['Class 1', 'Class 0', 'Isolated', 'Discarded']

    # Human-readable plot titles; unknown feature names are used as they are
    display_names = {
        "energy": "Textual Homogeneity",
        "circularity": "Circularity",
        "area": "Area",
        "intensity": "Normalized Intensity",
    }
    plot_title = display_names.get(feature_name, feature_name)

    # Create the plot
    create_violinplot_with_boxplot(data, xticklabels, plot_title, output_directory)
    return data


# Example usage: four-group comparison for the 'area' column; `data` receives the four value lists
data = plot_feature_violin("/projects/steiflab/scratch/leli/A138856A", "area", output_directory = "/projects/steiflab/scratch/leli/A138856A/plots")


In [None]:
# Four-group comparison for the 'circularity' column
data = plot_feature_violin("/projects/steiflab/scratch/leli/A138856A", "circularity", "/projects/steiflab/scratch/leli/A138856A/plots")

In [None]:
# Four-group comparison for the 'intensity' column (titled "Normalized Intensity")
data = plot_feature_violin("/projects/steiflab/scratch/leli/A138856A", "intensity", "/projects/steiflab/scratch/leli/A138856A/plots")

In [None]:
# Four-group comparison for the 'energy' column (titled "Textual Homogeneity")
data = plot_feature_violin("/projects/steiflab/scratch/leli/A138856A", "energy", "/projects/steiflab/scratch/leli/A138856A/plots")

## Compile to 1 Fig

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Assemble four previously saved PNG panels into one 2x2 figure labelled A-D.
# NOTE(review): the plotting functions in this notebook save into ".../A138856A/plots/" under
# names of the form "<title>_violin.png" / "<title>_Iso_cell_comparison_violin.png"; the files
# loaded below live one directory up and use other titles - confirm they are the current panels.

# Define the desired size in centimeters
width_cm = 17.1  # Double column width as per the requirement
height_cm = 14  # Maximum allowed height

# Convert to inches
width_inch = width_cm / 2.54
height_inch = height_cm / 2.54

# Create the figure and the grid spec (2 rows x 2 columns, equal row heights)
fig = plt.figure(figsize=(width_inch, height_inch), dpi=600)
gs = gridspec.GridSpec(2, 2, figure=fig, height_ratios=[1, 1])

# Reduce the spacing between subplots
#gs.update(wspace=0.1, hspace=0.1)

# Load the images
metrics_img = plt.imread('/projects/steiflab/scratch/leli/A138856A/metrics_barplot.png')
circularity_img = plt.imread('/projects/steiflab/scratch/leli/A138856A/Circularity_violin.png')
energy_img = plt.imread('/projects/steiflab/scratch/leli/A138856A/GLCM Energy_violin.png')
intensity_img = plt.imread('/projects/steiflab/scratch/leli/A138856A/Intensity Z-score_violin.png')

# Plot the metrics bar plot in the top-left cell of the grid (panel A).
# extent is built by concatenating the x-limits and y-limits of the fresh axes,
# so each image is stretched over the whole cell.
ax4 = fig.add_subplot(gs[0, 0])
ax4.imshow(metrics_img, aspect='auto', extent=ax4.get_xlim() + ax4.get_ylim())
ax4.axis('off')
ax4.set_title('A', loc='left', fontsize=16, fontweight='bold')

# Plot the circularity violin plot on the top right (panel B)
ax1 = fig.add_subplot(gs[0, 1])
ax1.imshow(circularity_img, aspect='auto', extent=ax1.get_xlim() + ax1.get_ylim())
ax1.axis('off')
ax1.set_title('B', loc='left', fontsize=16, fontweight='bold')

# Plot the energy violin plot on the bottom left (panel C)
ax2 = fig.add_subplot(gs[1, 0])
ax2.imshow(energy_img, aspect='auto', extent=ax2.get_xlim() + ax2.get_ylim())
ax2.axis('off')
ax2.set_title('C', loc='left', fontsize=16, fontweight='bold')

# Plot the intensity violin plot on the bottom right (panel D)
ax3 = fig.add_subplot(gs[1, 1])
ax3.imshow(intensity_img, aspect='auto', extent=ax3.get_xlim() + ax3.get_ylim())
ax3.axis('off')
ax3.set_title('D', loc='left', fontsize=16, fontweight='bold')

# Adjust layout and save the final figure (the file is written at dpi=1200, the figure itself was created at dpi=600)
plt.tight_layout()
plt.savefig('/projects/steiflab/scratch/leli/A138856A/combined_figure.png', dpi=1200, bbox_inches='tight')
plt.show()


## Get ground truth

In [None]:
import pandas as pd
import os
from tifffile import imread
import csv
import numpy as np 
# Your home directory path (change this to your actual path)
home_dir = "/projects/steiflab/archive/data/imaging/A138856A/NozzleImages"

# Collect one table per LogFile.csv and concatenate once at the end
# (calling pd.concat inside the loop re-copies all accumulated rows on every file)
well_frames = []
# Traverse through each subdirectory in the home directory
for root, dirs, files in os.walk(home_dir):
    for file in files:
        if file == "LogFile.csv":
            # Construct the full file path
            file_path = os.path.join(root, file)
            
            # Read the logfile.csv file
            df = pd.read_csv(file_path)
            
            # Group by R and C, then keep the row with the maximum Prediction for each group
            df_max_pred = df.loc[df.groupby(['R', 'C'])['Prediction'].idxmax()][['R', 'C', 'Prediction']]
            well_frames.append(df_max_pred)

well_df = pd.concat(well_frames, ignore_index=True) if well_frames else pd.DataFrame()

# After gathering the data, read the sequencing table to merge.
# NOTE(review): the file is named .tsv but is parsed with a single space as delimiter - confirm.
new_file_path = "/projects/steiflab/scratch/glchang/other/leon/A138856.tsv"
new_df = []
with open(new_file_path, mode='r') as file:
    reader = csv.reader(file, delimiter=' ')
    
    for row in reader:
        new_df.append(row)

new_df = pd.DataFrame(new_df)
new_df.columns = new_df.iloc[0]  # Set the first row as the header
new_df = new_df[1:]  # Remove the first row from the data
new_df.reset_index(drop=True, inplace=True)  # Reset the index
# Convert the necessary columns to integers (everything was read as text)
new_df['row'] = new_df['row'].astype(int)
new_df['col'] = new_df['col'].astype(int)
new_df['total_mapped_reads'] = new_df['total_mapped_reads'].astype(int)
print(np.unique(new_df['experimental_condition']))
print(np.unique(new_df['cell_condition']))
print(new_df.shape)

combination_counts = new_df.groupby(['experimental_condition', 'cell_condition']).size().reset_index(name='counts')
combination_counts.sort_values(by='counts', ascending=False, inplace=True)

print(combination_counts)


# Keep only the 'row', 'col', and 'total_mapped_reads' columns
new_df = new_df[['row', 'col', 'total_mapped_reads']]

# Build the (row, col) -> total_mapped_reads lookup once instead of scanning new_df for every well.
# setdefault keeps the first entry of a duplicated well, like `.values[0]` on the filtered rows would.
reads_by_well = {}
for r, c, reads in zip(new_df['row'], new_df['col'], new_df['total_mapped_reads']):
    reads_by_well.setdefault((r, c), reads)

well_keys = list(zip(well_df['R'], well_df['C']))

# Step 1: Create the 'manual_inspection' column (1 when the well is listed in new_df, else 0)
well_df['manual_inspection'] = [1 if key in reads_by_well else 0 for key in well_keys]

# Step 2: Create the 'mapped_reads' column (0 for wells that are not listed in new_df)
well_df['mapped_reads'] = [reads_by_well.get(key, 0) for key in well_keys]

# Step 3: Create the 'fluro_intensity' column by reading the .tif file
def get_max_intensity(r, c):
    """Return the maximum pixel value of the well's Cyan image, or None when the file is missing."""
    file_path = f"/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000/C{str(c).zfill(2)}/R{str(r).zfill(2)}_C{str(c).zfill(2)}_0000_00_Cyan.tif"
    if os.path.exists(file_path):
        image = imread(file_path)
        return image.max()
    else:
        print("The fluorescent image does not exist")
        return None  # Or you can return 0 if you prefer

well_df['fluro_intensity'] = well_df.apply(
    lambda row: get_max_intensity(row['R'], row['C']),
    axis=1
)

# Save the updated dataframe

well_df.to_csv("/projects/steiflab/scratch/leli/A138856A/gt_inspection.csv", index  = False)



In [None]:
# Count how many wells have manual_inspection == 1 and how many have 0
print(f"The number of singlecells and not singlecells are: {well_df['manual_inspection'].value_counts()}")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming well_df is already prepared with the new columns
# (manual_inspection, mapped_reads and fluro_intensity are added by the ground-truth cell)

# Correlation matrix to see how the columns are related
correlation_matrix = well_df[['Prediction', 'manual_inspection', 'mapped_reads', 'fluro_intensity']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Ground Truth Methods')
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns

# Assuming well_df is your dataframe with the necessary columns

# Create a violin plot for Prediction vs. fluro_intensity, with a narrow white boxplot drawn on top
plt.figure(figsize=(12, 8))
sns.violinplot(x='Prediction', y='fluro_intensity', data=well_df, inner=None, palette="muted")
sns.boxplot(x='Prediction', y='fluro_intensity', data=well_df, width=0.1, palette="muted", showcaps=True, showfliers=False, whiskerprops={'linewidth':2}, boxprops={'facecolor':'white', 'edgecolor':'black'}, medianprops={'color':'red', 'linewidth':2})

plt.title("Violin and Boxplot of fluro_intensity by Prediction")
plt.xlabel("Prediction")
plt.ylabel("Max Fluorescent Intensity")
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming well_df is your dataframe with the necessary columns

# Create a violin plot for manual_inspection vs. fluro_intensity, with a narrow white boxplot drawn on top
plt.figure(figsize=(12, 8))
sns.violinplot(x='manual_inspection', y='fluro_intensity', data=well_df, inner=None, palette="muted")
sns.boxplot(x='manual_inspection', y='fluro_intensity', data=well_df, width=0.1, palette="muted", showcaps=True, showfliers=False, whiskerprops={'linewidth':2}, boxprops={'facecolor':'white', 'edgecolor':'black'}, medianprops={'color':'red', 'linewidth':2})

plt.title("Violin and Boxplot of Max fluro_intensity by Manual Inspection")
plt.xlabel("Manual Inspection (0 or 1)")
plt.ylabel("Max Fluorescent Intensity")
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Create a violin plot for Prediction vs. mapped_reads, with a narrow white boxplot drawn on top
# (plt and sns are expected to be imported by an earlier cell)
plt.figure(figsize=(12, 8))
sns.violinplot(x='Prediction', y='mapped_reads', data=well_df, inner=None, palette="muted")
sns.boxplot(x='Prediction', y='mapped_reads', data=well_df, width=0.1, palette="muted", showcaps=True, showfliers=False, whiskerprops={'linewidth':2}, boxprops={'facecolor':'white', 'edgecolor':'black'}, medianprops={'color':'red', 'linewidth':2})

plt.title("Violin and Boxplot of mapped_reads by Prediction")
plt.xlabel("Prediction")
plt.ylabel("mapped_reads")
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
import pandas as pd

# Assuming well_df is your DataFrame with 'Prediction' and 'manual_inspection' columns

# Create a count matrix between 'Prediction' and 'manual_inspection'
# (rows: Prediction values, columns: manual_inspection values, cells: number of wells)
count_matrix = pd.crosstab(well_df['Prediction'], well_df['manual_inspection'])

# Display the count matrix
print(count_matrix)


In [None]:
import umap
import seaborn as sns
import matplotlib.pyplot as plt

# Select the relevant columns for UMAP.
# The intensity column on well_df is named 'fluro_intensity'; no 'max_intensity' column is
# created by the ground-truth cell, so selecting that name raised a KeyError.
data = well_df[['Prediction', 'manual_inspection', 'fluro_intensity', 'mapped_reads']]

# Apply UMAP to reduce the dimensions to 2D (fixed random_state for reproducible layouts)
reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
embedding = reducer.fit_transform(data)

# Create a DataFrame with UMAP results
umap_df = pd.DataFrame(embedding, columns=['UMAP_1', 'UMAP_2'])

# Plot the UMAP results
plt.figure(figsize=(10, 8))
sns.scatterplot(x='UMAP_1', y='UMAP_2', data=umap_df)
plt.title('UMAP Projection of Four Features')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()



In [None]:
from sklearn.cluster import KMeans
import pandas as pd

# Assuming well_df is your DataFrame containing the relevant columns

# Select the four features.
# The intensity column on well_df is named 'fluro_intensity'; no 'max_intensity' column is
# created by the ground-truth cell, so selecting that name raised a KeyError.
features = well_df[['Prediction', 'manual_inspection', 'fluro_intensity', 'mapped_reads']]

# Perform KMeans clustering with 2 clusters (fixed random_state for reproducible assignments)
kmeans = KMeans(n_clusters=2, random_state=42)
well_df['Cluster_2'] = kmeans.fit_predict(features)

# Split the wells by assigned cluster
cluster_0 = well_df[well_df['Cluster_2'] == 0]
cluster_1 = well_df[well_df['Cluster_2'] == 1]

# Column -> label used in the printed report
report_labels = {
    'Prediction': "Prediction",
    'manual_inspection': "Manual Inspection",
    'fluro_intensity': "Max Intensity",
    'mapped_reads': "Mapped Reads",
}

# Print min / max / mean / median of every feature for each cluster
for header, cluster in (("Cluster 0 Analysis:", cluster_0), ("\nCluster 1 Analysis:", cluster_1)):
    print(header)
    for column, label in report_labels.items():
        values = cluster[column]
        print(f"{label}: min = {values.min()}, max = {values.max()}, mean = {values.mean()}, median = {values.median()}")



## Ground truth analysis

In [None]:
import csv  # used by csv.reader below; imported here so the cell does not depend on earlier cells
import os
import pandas as pd
from tifffile import imread

# Load the final_combined_df
final_combined_df_path = "/projects/steiflab/scratch/leli/A138856A/final_combined_df.csv"
final_combined_df = pd.read_csv(final_combined_df_path)

# Load the new_df data.
# NOTE(review): the file is named .tsv but is parsed with a single space as delimiter - confirm.
new_file_path = "/projects/steiflab/scratch/glchang/other/leon/A138856.tsv"
new_df = []
with open(new_file_path, mode='r') as file:
    reader = csv.reader(file, delimiter=' ')
    
    for row in reader:
        new_df.append(row)

new_df = pd.DataFrame(new_df)
new_df.columns = new_df.iloc[0]  # Set the first row as the header
new_df = new_df[1:]  # Remove the first row from the data
new_df.reset_index(drop=True, inplace=True)  # Reset the index

# Convert the necessary columns to integers (everything was read as text)
new_df['row'] = new_df['row'].astype(int)
new_df['col'] = new_df['col'].astype(int)
new_df['total_mapped_reads'] = new_df['total_mapped_reads'].astype(int)

# Build the (row, col) -> total_mapped_reads lookup once instead of scanning new_df for every well.
# setdefault keeps the first entry of a duplicated well, like `.values[0]` on the filtered rows would.
reads_by_well = {}
for r, c, reads in zip(new_df['row'], new_df['col'], new_df['total_mapped_reads']):
    reads_by_well.setdefault((r, c), reads)

# Create 'total_mapped_reads' column in final_combined_df (pd.NA for wells missing from new_df)
final_combined_df['total_mapped_reads'] = [
    reads_by_well.get(key, pd.NA)
    for key in zip(final_combined_df['row'], final_combined_df['col'])
]

# Create 'fluro_intensity' column in final_combined_df by reading the .tif file
def get_max_intensity(r, c):
    """Return the maximum pixel value of the well's Cyan image, or pd.NA when the file is missing."""
    file_path = f"/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000/C{str(c).zfill(2)}/R{str(r).zfill(2)}_C{str(c).zfill(2)}_0000_00_Cyan.tif"
    if os.path.exists(file_path):
        image = imread(file_path)
        return image.max()
    else:
        print(f"The fluorescent image for R{r}, C{c} does not exist")
        return pd.NA  # Or you can return 0 if you prefer

final_combined_df['fluro_intensity'] = final_combined_df.apply(
    lambda row: get_max_intensity(row['row'], row['col']),
    axis=1
)

# Identify rows without 'total_mapped_reads' or 'fluro_intensity'
missing_mapped_reads = final_combined_df[final_combined_df['total_mapped_reads'].isna()]
missing_fluro_intensity = final_combined_df[final_combined_df['fluro_intensity'].isna()]

# Print out the rows missing 'total_mapped_reads'
if not missing_mapped_reads.empty:
    print("Rows without 'total_mapped_reads':")
    print(missing_mapped_reads[['row', 'col']])

# Print out the rows missing 'fluro_intensity'
if not missing_fluro_intensity.empty:
    print("Rows without 'fluro_intensity':")
    print(missing_fluro_intensity[['row', 'col']])

# Save the updated final_combined_df
updated_final_combined_df_path = "/projects/steiflab/scratch/leli/A138856A/updated_final_combined_df.csv"
final_combined_df.to_csv(updated_final_combined_df_path, index=False)

print("The updated final_combined_df.csv has been saved successfully.")


In [None]:
final_combined_df_path = "/projects/steiflab/scratch/leli/A138856A/updated_final_combined_df.csv"
final_combined_df = pd.read_csv(final_combined_df_path)

import numpy as np
from skimage.io import imread

def get_95_percentile_intensity(r, c):
    """Return the 95th percentile of the pixel values of the `*_Cyan.tif` image for well (r, c).

    The path is built from `c` and `r` converted to strings and zero-padded to
    width 2. When no file exists at that path, a message is printed and pd.NA
    is returned instead of a percentile.
    """
    row_tag = str(r).zfill(2)
    col_tag = str(c).zfill(2)
    tif_path = f"/projects/steiflab/archive/data/imaging/A138856A/MicroscopeImages/S0000/C{col_tag}/R{row_tag}_C{col_tag}_0000_00_Cyan.tif"

    # Guard clause: there is nothing to read for this well
    if not os.path.exists(tif_path):
        print(f"The fluorescent image for R{r}, C{c} does not exist")
        return pd.NA  # Or you can return 0 if you prefer

    pixels = imread(tif_path)
    # np.percentile works on the flattened image, i.e. over all pixels at once
    return np.percentile(pixels, 95)
   
# Add the per-well 95th-percentile intensity as a new column (one lookup per table row)
final_combined_df['fluro_intensity_95'] = final_combined_df.apply(
    lambda well: get_95_percentile_intensity(well['row'], well['col']),
    axis="columns",
)
# Writing the new column back to disk is currently disabled:
#final_combined_df.to_csv(updated_final_combined_df_path, index=False)


In [None]:
# Mean of the per-well maximum intensity, then mean of the per-well 95th percentile
for intensity_column in ('fluro_intensity', 'fluro_intensity_95'):
    print(final_combined_df[intensity_column].mean())




In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataframe
final_combined_df_path = "/projects/steiflab/scratch/leli/A138856A/updated_final_combined_df.csv"
final_combined_df = pd.read_csv(final_combined_df_path)

# The CSV only contains 'fluro_intensity_95' if it was written after that column had
# been computed, and the save that follows the computation is commented out. Rebuild
# the column when it is absent instead of failing with a KeyError further down.
if 'fluro_intensity_95' not in final_combined_df.columns:
    print("'fluro_intensity_95' is not in the saved CSV; recomputing it from the images.")
    final_combined_df['fluro_intensity_95'] = final_combined_df.apply(
        lambda row: get_95_percentile_intensity(row['row'], row['col']),
        axis=1
    )

corr_columns = ['prediction', 'ground_truth', 'total_mapped_reads', 'fluro_intensity_95']

# Ensure the relevant columns are numeric (entries that cannot be parsed become NaN)
for column in corr_columns:
    final_combined_df[column] = pd.to_numeric(final_combined_df[column], errors='coerce')

# Rows containing NaN are kept: DataFrame.corr() excludes missing values pair by pair,
# so a well that lacks one measurement still contributes to the other correlations.
corr_df = final_combined_df[corr_columns]

# Calculate the correlation matrix
corr_matrix = corr_df.corr()

# Plot the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", vmin=-1, vmax=1, square=True, linewidths=0.5, cbar_kws={"shrink": .75})

# Set the title
plt.title('Correlation Matrix of Prediction, Ground Truth, Total Mapped Reads, and Fluro Intensity 95 percentile', fontsize=14, fontweight='bold')

# Adjust layout
plt.tight_layout()

# Save the plot
# plt.savefig("/projects/steiflab/scratch/leli/A138856A/correlation_matrix_heatmap.png", dpi=300)

# Show the plot
plt.show()


In [None]:
# Write the current in-memory final_combined_df to final_combined_df.csv, without the
# index column.
final_combined_df.to_csv("/projects/steiflab/scratch/leli/A138856A/final_combined_df.csv", index = False)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mannwhitneyu
from cliffs_delta import cliffs_delta

def create_violin_boxplot_with_stats(df, x, y, output_directory):
    """Draw a violin plot with an overlaid boxplot of `y` for the two groups in `x`.

    The figure is annotated with the Mann-Whitney U p-value and Cliff's delta
    comparing the two groups, and each tick label carries the number of
    non-missing `y` values in its group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the columns `x` and `y`.
    x : str
        Grouping column; it must hold exactly two distinct non-missing values.
    y : str
        Numeric column whose distribution is compared between the groups.
    output_directory : str
        Directory for the saved figure. Currently unused because saving is disabled.

    Returns
    -------
    None
        The figure is shown, nothing is returned. A message is printed and the call
        returns early when `x` does not have exactly two groups, or when one of the
        groups has no non-missing `y` values.
    """
    # Missing group labels are not a category of their own, so they are ignored here
    categories = list(df[x].dropna().unique())
    if len(categories) != 2:
        print(f"Skipping plot for {x} vs {y}: requires exactly 2 categories.")
        return

    # Use a deterministic order where the labels are comparable (e.g. 0 before 1)
    try:
        categories = sorted(categories)
    except TypeError:
        pass

    category1 = df.loc[df[x] == categories[0], y].dropna()
    category2 = df.loc[df[x] == categories[1], y].dropna()

    # The test and the effect size are undefined when one group has no values at all
    if category1.empty or category2.empty:
        print(f"Skipping plot for {x} vs {y}: a category has no non-missing values.")
        return

    # Perform Mann-Whitney U test
    stat, p_value = mannwhitneyu(category1, category2)

    # Calculate effect size (Cliff's Delta)
    cliffs_delta_value, magnitude = cliffs_delta(category1, category2)

    # Plotting. `order` is passed explicitly so that the position of each group on the
    # axis is guaranteed to match the tick labels assigned below.
    plt.figure(figsize=(10, 6))
    sns.violinplot(x=x, y=y, data=df, order=categories, inner=None, palette="muted")
    sns.boxplot(
        x=x, y=y, data=df, order=categories, width=0.1, palette="muted",
        showcaps=True, showfliers=False, whiskerprops={'linewidth':2},
        boxprops={'facecolor':'white', 'edgecolor':'black'},
        medianprops={'color':'red', 'linewidth':2},
    )

    # Add the number of samples to the x-axis labels; the counts are the non-missing
    # values, i.e. exactly the samples that are plotted and tested
    group_sizes = (len(category1), len(category2))
    xticklabels = [f"{cat} (n={size})" for cat, size in zip(categories, group_sizes)]
    plt.xticks(ticks=[0, 1], labels=xticklabels)

    # Add statistical results in a text box
    stats_text = f"Mann-Whitney U p-value: {p_value:.4f}\nCliff's Delta: {cliffs_delta_value:.2f} ({magnitude})"
    plt.gca().text(0.05, 0.95, stats_text, transform=plt.gca().transAxes, fontsize=12,
                   verticalalignment='top', bbox=dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='lightgrey'))

    # Set the title and labels
    plt.title(f"Violin and Boxplot of {y} by {x}", fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)

    # Save the plot
    # plt.savefig(f"{output_directory}/violin_boxplot_{x}_vs_{y}.png", dpi=300, bbox_inches='tight')

    # Show the plot
    plt.show()

# Directory handed to the plotting function as its output location
output_directory = "/projects/steiflab/scratch/leli/A138856A/violin_plots_with_stats"

# Grouping columns, and the measurements compared across their groups
categorical_vars = ['ground_truth', 'prediction']
numerical_vars = ['total_mapped_reads', 'fluro_intensity']

# One plot per (grouping column, measurement) pair; the grouping column varies slowest
plot_pairs = [
    (cat_var, num_var)
    for cat_var in categorical_vars
    for num_var in numerical_vars
]
for cat_var, num_var in plot_pairs:
    create_violin_boxplot_with_stats(final_combined_df, x=cat_var, y=num_var, output_directory=output_directory)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

# make_pipeline belongs to scikit-learn, which this cell already depends on
from sklearn.pipeline import make_pipeline

# Load the data
df = pd.read_csv(final_combined_df_path)

# Features and target
feature_columns = ['prediction', 'total_mapped_reads', 'fluro_intensity']
target_column = 'ground_truth'

# LogisticRegression cannot be fitted on NaN, so wells with a missing feature or a
# missing label are removed up front and the number of removed rows is reported.
n_rows_loaded = len(df)
df = df.dropna(subset=feature_columns + [target_column])
n_rows_dropped = n_rows_loaded - len(df)
if n_rows_dropped:
    print(f"Dropped {n_rows_dropped} rows with missing feature or ground-truth values.")

X = df[feature_columns]
y = df[target_column]

# Split the data into train and test sets, ensuring a balanced class distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize the features (statistics are taken from the training set only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models. The forest is seeded like the split so that reruns are reproducible.
logreg = LogisticRegression()
rf = RandomForestClassifier(random_state=42)
dummy = DummyClassifier(strategy="most_frequent")
models = {'Logistic Regression': logreg, 'Random Forest': rf, 'Dummy Classifier': dummy}

# Perform cross-validation on training data
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# The scaler is part of the cross-validated pipeline, so it is re-fitted on the training
# folds of every split. Cross-validating on X_train_scaled would let the mean/std of each
# validation fold leak into the scaling. cross_validate clones the estimator it is given,
# so the model objects themselves remain unfitted here.
cv_scores = {
    name: cross_validate(make_pipeline(StandardScaler(), model), X_train, y_train, cv=5, scoring=scoring)
    for name, model in models.items()
}
logreg_scores = cv_scores['Logistic Regression']
rf_scores = cv_scores['Random Forest']
dummy_scores = cv_scores['Dummy Classifier']

# Train the models on the training data
for model in models.values():
    model.fit(X_train_scaled, y_train)

# Evaluate on the test set
best_f1_score = 0
best_model = None
best_name = None

for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    f1 = f1_score(y_test, y_pred)
    print(f"\n{name} Performance on Test Set:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1]):.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    if f1 > best_f1_score:
        best_f1_score = f1
        best_model = model
        best_name = name

# Print the cross-validation results
print("\nCross-Validation Performance:")
for name, scores in cv_scores.items():
    print(f"\n{name}:")
    for metric in scoring:
        print(f"{metric.capitalize()}: {np.mean(scores['test_' + metric]):.4f}")

# Feature Importance for the best model
if best_name is None:
    # No model exceeded the initial best_f1_score of 0, so there is no "best" model
    print("\nNo model reached an F1 score above 0 on the test set, no feature importance to display.")
elif best_name != "Dummy Classifier":
    print(f"\nBest Model: {best_name} with F1 Score: {best_f1_score:.4f}")
    if best_name == "Random Forest":
        importance = best_model.feature_importances_
    elif best_name == "Logistic Regression":
        # Coefficient magnitudes; the model was fitted on the standardized features
        importance = np.abs(best_model.coef_[0])
    else:
        importance = None

    if importance is not None:
        feature_names = X.columns
        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
        importance_df.sort_values(by='Importance', ascending=False, inplace=True)
        print("\nFeature Importances:\n", importance_df)
else:
    print("\nBest Model is the Dummy Classifier, no feature importance to display.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv("/projects/steiflab/scratch/leli/A138856A/final_combined_df.csv")

# Filter to only include Isolatrix technology.
# na=False turns rows with a missing 'technology' into False; without it the mask
# contains NaN for those rows and cannot be used for boolean indexing.
isolatrix_df = df[df['technology'].str.contains('isolatrix', case=False, na=False)]

# Identify Isolatrix TP and FN
# TP: Ground Truth = 1, Prediction = 1
tp_df = isolatrix_df[(isolatrix_df['ground_truth'] == 1) &
                     (isolatrix_df['prediction'] == 1)]

# FN: Ground Truth = 1, Prediction = 0
fn_df = isolatrix_df[(isolatrix_df['ground_truth'] == 1) &
                     (isolatrix_df['prediction'] == 0)]

# Combine the TP and FN data for plotting: the labels are listed in the same order in
# which the two groups are concatenated (all TP rows first, then all FN rows)
tp_fn_data = pd.DataFrame({
    'Condition': ['True Positive (TP)'] * len(tp_df) + ['False Negative (FN)'] * len(fn_df),
    'Total Mapped Reads': pd.concat([tp_df['total_mapped_reads'], fn_df['total_mapped_reads']])
})

# Create the violin plot
plt.figure(figsize=(10, 6))
sns.violinplot(x='Condition', y='Total Mapped Reads', data=tp_fn_data, palette=['#5A79A5', '#9EB3D6'])

# Add title and labels
plt.title("Violin Plot of Total Mapped Reads: TP vs FN for Isolatrix")
plt.xlabel("Condition")
plt.ylabel("Total Mapped Reads")

# Display the plot
plt.show()


## Ground-truth (GT) validation

In [None]:
# Read the tab-separated metadata table; the bare name on the last line displays it
gt = pd.read_csv(
    '/projects/steiflab/archive/data/wgs/single_cell/internal/A138856/merge/metadata.tsv',
    sep='\t',
)
gt


In [None]:
import pandas as pd
import os

# Your home directory path
home_dir = "/projects/steiflab/archive/data/imaging/A138856A/NozzleImages"

# Collect the per-well maximum prediction of every LogFile.csv found under home_dir.
# The pieces are gathered in a list and concatenated once, instead of growing three
# DataFrames with pd.concat inside the loop (which re-copies everything read so far).
per_log_maxima = []
for root, dirs, files in os.walk(home_dir):
    for file in files:
        if file == "LogFile.csv":
            # Construct the full file path
            file_path = os.path.join(root, file)

            # Read the logfile.csv file
            df = pd.read_csv(file_path)

            # Group by R and C, then find the row with the maximum Prediction for each group
            df_max_pred = df.loc[df.groupby(['R', 'C'])['Prediction'].idxmax()]
            per_log_maxima.append(df_max_pred[['R', 'C', 'Prediction']])

# Fail with a clear message when no log file was found at all; previously this case
# surfaced as a column-length mismatch when the empty frames were renamed.
if not per_log_maxima:
    raise FileNotFoundError(f"No LogFile.csv found under {home_dir}")

all_maxima = pd.concat(per_log_maxima, ignore_index=True)
all_maxima.columns = ['row', 'col', 'prediction']

# One frame per prediction class (0, 1, 2); any other value is left out
df_max_prediction_0 = all_maxima[all_maxima['prediction'] == 0].reset_index(drop=True)
df_max_prediction_1 = all_maxima[all_maxima['prediction'] == 1].reset_index(drop=True)
df_max_prediction_2 = all_maxima[all_maxima['prediction'] == 2].reset_index(drop=True)

print(f"Count for prediction classes: {(df_max_prediction_0.shape[0], df_max_prediction_1.shape[0], df_max_prediction_2.shape[0])}")

# Concatenate the dataframes together
df_combined_predictions = pd.concat([df_max_prediction_1, df_max_prediction_0, df_max_prediction_2], ignore_index=True)

# Detect and resolve duplicates in df_combined_predictions by choosing the maximum prediction
# (the same well can occur in the log files of several runs)
df_combined_predictions = df_combined_predictions.groupby(['row', 'col'], as_index=False).agg({'prediction': 'max'})

print(f"Shape of combined predictions: {df_combined_predictions.shape}")

# Read the metadata table and prepare new_df
new_df = pd.read_csv('/projects/steiflab/archive/data/wgs/single_cell/internal/A138856/merge/metadata.tsv', delimiter='\t')

print(f"Shape of metadata: {new_df.shape}")
non_cellenone_shape = new_df[new_df['experimental_condition'] != "CellenONE"].shape
print(f"Shape of metadata with non-CellenONE conditions: {non_cellenone_shape}")

# Merge the combined dataframe with new_df; the left join keeps every metadata row and
# leaves 'prediction' as NaN for wells without a logged prediction
merged_df = pd.merge(new_df, df_combined_predictions, how='left', on=['row', 'col'])
print(f"Shape of merged_df: {merged_df.shape}")

# Group by 'experimental_condition', 'cell_condition', and 'prediction'
# (groupby leaves out rows whose prediction is NaN; those are counted separately below)
grouped_counts = merged_df.groupby(['experimental_condition', 'cell_condition', 'prediction']).size().reset_index(name='counts')
# Display the grouped counts
print(grouped_counts)

# Check counts in the merged_df where prediction is NaN
na_predictions = merged_df[merged_df['prediction'].isna()]
print(f"Count of rows with NaN predictions: {na_predictions.shape[0]}")

# Group and print counts of missing predictions by 'experimental_condition' and 'cell_condition'
missing_counts = na_predictions.groupby(['experimental_condition', 'cell_condition']).size().reset_index(name='counts')
print("Counts of missing predictions grouped by 'experimental_condition' and 'cell_condition':")
print(missing_counts)

# Print rows where 'experimental_condition' should be "CellenONE"
cellenone_in_merged = merged_df[merged_df['experimental_condition'] == "CellenONE"]
print(f"Number of rows with 'CellenONE' in merged_df: {cellenone_in_merged.shape[0]}")

# The triple-quoted string below holds disabled code: it is a plain string expression,
# so none of the statements inside it are executed.
'''# Print rows where 'prediction' is NaN
nan_predictions_df = merged_df[~merged_df['prediction'].isna()]
print("Rows with NaN predictions before handling:")
print(f"nan predicton is is {nan_predictions_df.shape}")

# Print counts grouped by 'experimental_condition' and 'cell_condition' for NaN predictions
nan_counts = nan_predictions_df.groupby(['experimental_condition', 'cell_condition']).size().reset_index(name='counts')
print("Counts for rows with NaN predictions grouped by 'experimental_condition' and 'cell_condition':")
print(nan_counts)

merged_df['experimental_condition'].fillna("Iso", inplace=True)  # Default to prediction = 0 if NaN
merged_df['cell_condition'].fillna("Cell NCC", inplace=True)  # Default to prediction = 0 if NaN

merged_df = merged_df[['row', 'col', 'prediction', 'experimental_condition', 'cell_condition']]
isolatrix_df = merged_df[merged_df['prediction'] != 2]
isolatrix_df'''

In [None]:
import pandas as pd

# Assuming merged_df is your dataframe
# Sort the dataframe based on 'row' and 'col'
merged_df_sorted = merged_df.sort_values(by=['row', 'col']).reset_index(drop=True)

# Extent of the well grid that is present in the data
min_row = merged_df_sorted['row'].min()
min_col = merged_df_sorted['col'].min()
max_row = merged_df_sorted['row'].max()
max_col = merged_df_sorted['col'].max()
# The first message used to be labelled "max" although it reports the minimum values
print(f"The min row and col are {(min_row, min_col)}")
print(f"The max row and col are {(max_row, max_col)}")

# Generate all possible (row, col) combinations.
# NOTE(review): the grid always starts at 1, independent of the minimum printed above;
# confirm that rows and columns are 1-based.
all_combinations = pd.MultiIndex.from_product([range(1, max_row + 1), range(1, max_col + 1)], names=['row', 'col'])

# Convert the sorted dataframe to a MultiIndex for easy comparison
sorted_index = pd.MultiIndex.from_frame(merged_df_sorted[['row', 'col']])

# Find missing entries: grid positions that have no row in merged_df
missing_entries = all_combinations.difference(sorted_index)

# Convert missing entries to a DataFrame for easier manipulation
missing_entries_df = missing_entries.to_frame(index=False)

# Group the missing entries by 'col' and count
missing_counts_by_col = missing_entries_df.groupby('col').size()

# Print the count of missing values for each column
print("Count of missing values for each column:")
print(missing_counts_by_col)


## TIF images for wells where Isolatrix performs correctly (prediction = 1 and ground truth = 1)

In [None]:
import pandas as pd

# Define the file paths
final_combined_df_path = "/projects/steiflab/scratch/leli/A138856A/final_combined_df.csv"
log_file_path = "/projects/steiflab/archive/data/imaging/A138856A/NozzleImages/10dropRun3/LogFile.csv"

# Read the final_combined_df.csv into a DataFrame
final_combined_df = pd.read_csv(final_combined_df_path)

# Filter the final_combined_df DataFrame based on the specified criteria:
# Isolatrix wells of run 10dropRun3 with prediction = 1 and ground truth = 1
filtered_df = final_combined_df[(final_combined_df['prediction'] == 1) &
                                (final_combined_df['ground_truth'] == 1) &
                                (final_combined_df['subdirectory_name'] == '10dropRun3') &
                                (final_combined_df['technology'] == 'Isolatrix')][['row', 'col']]

# Read the LogFile.csv into a DataFrame
log_file_df = pd.read_csv(log_file_path)

# Initialize an empty list to store the results
results = []

# Iterate over each row and col combination in the filtered DataFrame
for index, row in filtered_df.iterrows():
    r, c = row['row'], row['col']

    # Filter the log_file_df to find matching rows and columns
    matching_rows = log_file_df[(log_file_df['R'] == r) & (log_file_df['C'] == c)]

    # If there are matching rows, find the one with the largest file_name number
    if not matching_rows.empty:
        # The number is the first run of digits in file_name. The pattern is a raw string
        # ('\d' is an invalid escape in a normal string literal), and assign() returns a
        # new frame, so the filtered slice of log_file_df is never written to.
        matching_rows = matching_rows.assign(
            file_number=matching_rows['file_name'].str.extract(r'(\d+)', expand=False).astype(int)
        )
        max_file_row = matching_rows.loc[matching_rows['file_number'].idxmax()]

        # Store the result
        results.append({
            'row': r,
            'col': c,
            'file_name': max_file_row['file_name'],
            'file_number': max_file_row['file_number']
        })

# Convert the results into a DataFrame for easy viewing. The columns are named
# explicitly so that an empty result still has a 'file_number' column to sort by.
results_df = pd.DataFrame(results, columns=['row', 'col', 'file_name', 'file_number'])

# Sort the results by the 'file_number' column
sorted_results_df = results_df.sort_values(by='file_number')

# Print the sorted results
print(sorted_results_df)
