In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import os

In [6]:
# Define patient file paths
patients = {
    1: ['Data/organized_fcs_data1a.csv', 'Data/organized_fcs_data1b.csv', 'Data/organized_fcs_data1c.csv'],
    2: ['Data/organized_fcs_data2a.csv', 'Data/organized_fcs_data2b.csv', 'Data/organized_fcs_data2c.csv'],
    3: ['Data/organized_fcs_data3a.csv', 'Data/organized_fcs_data3b.csv', 'Data/organized_fcs_data3c.csv'],
    4: ['Data/organized_fcs_data4a.csv', 'Data/organized_fcs_data4b.csv', 'Data/organized_fcs_data4c.csv'],
    5: ['Data/organized_fcs_data5a.csv', 'Data/organized_fcs_data5b.csv', 'Data/organized_fcs_data5c.csv'],
    6: ['Data/organized_fcs_data6a.csv', 'Data/organized_fcs_data6b.csv', 'Data/organized_fcs_data6c.csv'],
    7: ['Data/organized_fcs_data7.csv'],
    8: ['Data/organized_fcs_data8a.csv', 'Data/organized_fcs_data8b.csv', 'Data/organized_fcs_data8c.csv'],
    9: ['Data/organized_fcs_data9a.csv', 'Data/organized_fcs_data9b.csv'],
    10: ['Data/organized_fcs_data10a.csv', 'Data/organized_fcs_data10b.csv', 'Data/organized_fcs_data10c.csv'],
    11: ['Data/organized_fcs_data11.csv'],
    12: ['Data/organized_fcs_data12a.csv', 'Data/organized_fcs_data12b.csv', 'Data/organized_fcs_data12c.csv']
}

In [7]:
# Function to load and combine data for given patient IDs
def load_patient_data(patient_ids):
    all_data = []
    for pid in patient_ids:
        for file in patients[pid]:
            if os.path.exists(file):
                df = pd.read_csv(file)
                all_data.append(df.values)
            else:
                print(f"‚ö†Ô∏è File not found: {file}")
    if all_data:
        return np.vstack(all_data)
    else:
        return np.empty((0, 14))  # Assuming 14 markers


In [8]:
# Load healthy and unhealthy data
healthy_cells = load_patient_data([1, 2, 3, 4, 5, 6])
unhealthy_cells = load_patient_data([7, 8, 9, 10, 11, 12])

In [9]:
# Subsample healthy data to speed up fitting (optional, but recommended)
print("üìä Subsampling healthy data for efficient fitting...")
healthy_sample_size = min(len(healthy_cells), 100000)
healthy_sample = healthy_cells[np.random.choice(len(healthy_cells), healthy_sample_size, replace=False)]

üìä Subsampling healthy data for efficient fitting...


In [10]:
# Scale data using StandardScaler
print("üß™ Scaling data...")
scaler = StandardScaler()
healthy_sample_scaled = scaler.fit_transform(healthy_sample)
healthy_cells_scaled = scaler.transform(healthy_cells)
unhealthy_cells_scaled = scaler.transform(unhealthy_cells)

üß™ Scaling data...


In [11]:
# Fit GMM to scaled healthy cells
print("üîç Fitting GMM to healthy data...")
gmm = GaussianMixture(n_components=6, covariance_type='full', random_state=42)
gmm.fit(healthy_sample_scaled)

üîç Fitting GMM to healthy data...


In [12]:
# Score both healthy & unhealthy cells
print("üìà Scoring cells...")
healthy_scores = gmm.score_samples(healthy_cells_scaled)
unhealthy_scores = gmm.score_samples(unhealthy_cells_scaled)

üìà Scoring cells...


In [13]:
threshold = np.percentile(healthy_scores, 1)

In [14]:
# Detect anomalies in unhealthy cells
anomalies = unhealthy_scores < threshold

In [15]:
# üìä Now per patient MRD percentages:
print("\nüìä MRD percentage for each unhealthy patient:")

# Track rows per patient to split results
unhealthy_patient_ids = [7, 8, 9, 10, 11, 12]
start_idx = 0

for pid in unhealthy_patient_ids:
    patient_data = load_patient_data([pid])
    patient_cells = len(patient_data)
    patient_anomalies = anomalies[start_idx:start_idx + patient_cells]
    patient_mrd_percentage = (np.sum(patient_anomalies) / patient_cells) * 100
    print(f"  Patient {pid}: {patient_mrd_percentage:.4f}% MRD")
    start_idx += patient_cells


üìä MRD percentage for each unhealthy patient:
  Patient 7: 7.9235% MRD
  Patient 8: 1.2532% MRD
  Patient 9: 9.3269% MRD
  Patient 10: 3.6535% MRD
  Patient 11: 9.6454% MRD
  Patient 12: 4.1659% MRD


In [None]:
# print("\n MRD Detection Per Patient:")
# for patient_id in range(7, 13):  # Patients 7 to 12
#     patient_data = load_patient_data([patient_id])
    
#     if patient_data.size == 0:
#         print(f" Patient {patient_id}: No data found.")
#         continue

#     scores = gmm.score_samples(patient_data)
#     anomalies = scores < threshold
    
#     total = len(patient_data)
#     mrd_cells = np.sum(anomalies)
#     mrd_percentage = (mrd_cells / total) * 100

#     print(f"üß¨ Patient {patient_id}: {mrd_cells} abnormal cells out of {total} ‚Üí MRD = {mrd_percentage:.4f}%")