In [4]:
# Install required dependencies
import subprocess
import sys

def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Successfully installed {package}")
    except subprocess.CalledProcessError:
        print(f"Failed to install {package}")

# Install missing packages (common dependencies for the alignment methods)
packages = [
    "graphtools", 
    "POT",  # Python Optimal Transport (provides 'ot' module)
    "scprep",
    "phate",
    "sklearn",
    "scipy",
    "numpy",
    "matplotlib",
    "seaborn"
]

for package in packages:
    try:
        if package == "POT":
            __import__("ot")  # POT installs as 'ot'
            print("POT (ot) is already installed")
        else:
            __import__(package)
            print(f"{package} is already installed")
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)

graphtools is already installed
POT (ot) is already installed
Installing scprep...
Failed to install scprep
Installing phate...
Failed to install scprep
Installing phate...
Failed to install phate
sklearn is already installed
scipy is already installed
numpy is already installed
matplotlib is already installed
seaborn is already installed
Failed to install phate
sklearn is already installed
scipy is already installed
numpy is already installed
matplotlib is already installed
seaborn is already installed


# Demonstration File

In [5]:
#Import necessary libraries
# Import specific classes to avoid __init__.py issues during development
import sys
import os

# Add the source directory to path for imports
sys.path.append(os.path.join(os.getcwd(), '../../src'))
sys.path.append(os.path.join(os.getcwd(), '../../Python_Files'))

# Standard imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Try to import MASH from the src structure first, then fallback
try:
    from graph_manifold_alignment.alignment_methods.MASH_MD import MASH
    print("MASH imported successfully from src structure")
except ImportError:
    try:
        from AlignmentMethods.MASH_MD import MASH
        print("MASH imported from Python_Files structure")
    except ImportError as e:
        print(f"MASH import failed: {e}")
        MASH = None

# Try to import utilities
try:
    from graph_manifold_alignment.helpers.utils import *
    print("Utils imported successfully from src structure")
except ImportError:
    try:
        from Helpers.utils import *
        print("Utils imported from Python_Files structure")
    except ImportError as e:
        print(f"Utils import failed: {e}")

# Try to import test_manifold_algorithms for tma functionality
try:
    from graph_manifold_alignment.main.test_manifold_algorithms import test_manifold_algorithms
    print("test_manifold_algorithms imported from src structure")
except ImportError:
    try:
        from Main.test_manifold_algorithms import test_manifold_algorithms  
        print("test_manifold_algorithms imported from Python_Files structure")
    except ImportError:
        print("test_manifold_algorithms not available")
        test_manifold_algorithms = None

# SPUD from the mashspud package
try:
    from mashspud import SPUD
    print("SPUD imported successfully from mashspud package")
except ImportError:
    print("SPUD from mashspud package not available. Ensure mashspud is installed: pip install git+https://github.com/rustadadam/mashspud.git")
    SPUD = None

print("Import setup complete!")

MASH imported successfully from src structure
Utils imported successfully from src structure
test_manifold_algorithms imported from src structure
SPUD imported successfully from mashspud package
Import setup complete!


# Testing with Timeless Variables

In [6]:
import pandas as pd

# Check if the file exists in the data directory
timeless_file = r"..\..\data\classification\Timeless Variables 2024-07-09.csv"
try:
    timeless = pd.read_csv(timeless_file)
    print(f"Successfully loaded timeless data from: {timeless_file}")
    print(f"Data shape: {timeless.shape}")
except FileNotFoundError:
    print(f"File not found at {timeless_file}")
    # Try alternative locations
    alt_files = [
        r"..\..\data\classification\Profile Variables 2024-07-31.xlsx",
        r"..\..\data\classification\Visit Variables 2024-08-01.xlsx"
    ]
    
    for alt_file in alt_files:
        try:
            if alt_file.endswith('.xlsx'):
                timeless = pd.read_excel(alt_file, index_col=0)
            else:
                timeless = pd.read_csv(alt_file)
            print(f"Using alternative file: {alt_file}")
            print(f"Data shape: {timeless.shape}")
            break
        except FileNotFoundError:
            continue
    else:
        print("No suitable data file found")
        timeless = None

if timeless is not None:
    # Fill NaN values and drop problematic columns if they exist
    timeless = timeless.fillna(-4)
    columns_to_drop = []
    for col in ["PTRACCAT", "DX_bl", "PTMARRY"]:
        if col in timeless.columns:
            columns_to_drop.append(col)
    
    if columns_to_drop:
        timeless = timeless.drop(columns=columns_to_drop)
        print(f"Dropped columns: {columns_to_drop}")
    
    print(f"Final data shape: {timeless.shape}")
    print(f"Sample columns: {list(timeless.columns[:10])}")

File not found at ..\..\data\classification\Timeless Variables 2024-07-09.csv
Using alternative file: ..\..\data\classification\Profile Variables 2024-07-31.xlsx
Data shape: (4385, 43)
Dropped columns: ['PTRACCAT', 'DX_bl', 'PTMARRY']
Final data shape: (4385, 40)
Sample columns: ['PTEDUCAT', 'PTHAND', 'MOTHAD', 'FATHAD', 'MOTHDEM', 'FATHDEM', 'AGE', 'AB42_RAW', 'PHC_AB42', 'Tau_RAW']
Using alternative file: ..\..\data\classification\Profile Variables 2024-07-31.xlsx
Data shape: (4385, 43)
Dropped columns: ['PTRACCAT', 'DX_bl', 'PTMARRY']
Final data shape: (4385, 40)
Sample columns: ['PTEDUCAT', 'PTHAND', 'MOTHAD', 'FATHAD', 'MOTHDEM', 'FATHDEM', 'AGE', 'AB42_RAW', 'PHC_AB42', 'Tau_RAW']


In [7]:
timeless.sample(5)

Unnamed: 0_level_0,PTEDUCAT,PTHAND,MOTHAD,FATHAD,MOTHDEM,FATHDEM,AGE,AB42_RAW,PHC_AB42,Tau_RAW,...,PTGENDER,PTDOBYY,PTPLANG,PTETHCAT,PTIDENT,PTENGSPK,PTETHCATH,Mean,SD,CV
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
504,14.0,1.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,...,1.0,1927.0,1.0,2.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
4258,14.0,2.0,-4.0,-4.0,-4.0,-4.0,75.8,142.0,-0.4743,114.0,...,1.0,1935.0,1.0,2.0,-4.0,-4.0,-4.0,667.12598,28.392511,0.042559
7037,16.0,1.0,-4.0,-4.0,0.0,0.0,-4.0,-4.0,-4.0,-4.0,...,1.0,1959.0,1.0,2.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
4580,16.0,1.0,-4.0,-4.0,-4.0,-4.0,69.7,115.0,-1.1494,106.0,...,2.0,1942.0,1.0,2.0,-4.0,-4.0,-4.0,204.578917,12.87956,0.062956
6465,18.0,1.0,-4.0,-4.0,1.0,0.0,66.8,-4.0,-4.0,-4.0,...,1.0,1951.0,1.0,2.0,-4.0,-4.0,-4.0,165.063529,13.67882,0.08287


In [8]:
len(timeless.columns), len(timeless)

(40, 4385)

In [9]:
np.array(timeless)[:, 1]

array([-4.,  2.,  1., ...,  1., -4., -4.], shape=(4385,))

In [10]:
domain_a = np.array(timeless)[:200, :10].astype(float)
domain_b = np.array(timeless)[:200, 10:].astype(float)

In [11]:
# Create proper anchors for the domains
# Since we're working with the same dataset split into two domains,
# we can create simple paired anchors
if 'domain_a' in locals() and 'domain_b' in locals():
    num_samples = min(len(domain_a), len(domain_b))
    anchors = [[i, i] for i in range(num_samples)]
    print(f"Created {len(anchors)} anchor pairs")
    print(f"First 22 anchors: {anchors[:22]}")
else:
    print("domain_a and domain_b not yet defined. Run the previous cells first.")
    anchors = []

Created 200 anchor pairs
First 22 anchors: [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19], [20, 20], [21, 21]]


In [12]:
domain_a.shape

(200, 10)

In [13]:
# Create MASH instance
if MASH is not None:
    try:
        mash = MASH(t = -1, knn = 2, verbose = 2)
        print("MASH instance created successfully")
        print(f"MASH object type: {type(mash)}")
        print(f"MASH methods: {[method for method in dir(mash) if not method.startswith('_')][:10]}...")
    except Exception as e:
        print(f"Error creating MASH instance: {e}")
        mash = None
else:
    print("MASH class not available")
    mash = None

# Verify the variable exists
print(f"mash variable created: {'mash' in locals()}")
if 'mash' in locals():
    print(f"mash is None: {mash is None}")
else:
    print("mash variable not found in locals")

MASH instance created successfully
MASH object type: <class 'graph_manifold_alignment.alignment_methods.MASH_MD.MASH'>
MASH methods: ['DTM', 'FOSCTTM', 'IDC', 'apply_aggregation', 'build_graphs', 'burn_in', 'cross_embedding_knn', 'density_normalized_kernel', 'distance_measures', 'embeddings']...
mash variable created: True
mash is None: False


In [14]:
# Fit the MASH model
if 'mash' in locals() and mash is not None and \
   'domain_a' in locals() and 'domain_b' in locals() and \
   'anchors' in locals():
    try:
        print("Attempting to fit MASH model...")
        print(f"domain_a shape: {domain_a.shape if hasattr(domain_a, 'shape') else 'unknown'}")
        print(f"domain_b shape: {domain_b.shape if hasattr(domain_b, 'shape') else 'unknown'}")
        print(f"anchors shape: {anchors.shape if hasattr(anchors, 'shape') else len(anchors) if hasattr(anchors, '__len__') else 'unknown'}")
        
        # MASH expects domains as a tuple and anchors as 2D array
        domains = (domain_a, domain_b)
        
        # Convert 1D anchors to 2D format expected by MASH
        if hasattr(anchors, 'shape') and len(anchors.shape) == 2:
            known_anchors = anchors  # Already 2D
        else:
            # Convert anchors from pairs format to MASH format
            # anchors is [[a1,b1], [a2,b2], ...] so convert to proper format
            known_anchors = np.array(anchors)
            
        print(f"Converted domains tuple length: {len(domains)}")
        print(f"Converted known_anchors shape: {known_anchors.shape}")
        print(f"Sample anchors: {known_anchors[:5]}")
        
        # Fit MASH with correct API
        mash.fit(domains, known_anchors)
        print("MASH model fitted successfully!")
        
    except Exception as e:
        print(f"Error during MASH fitting: {e}")
        print("Available MASH methods:", [method for method in dir(mash) if not method.startswith('_')][:10] if 'mash' in locals() else 'mash not available')
else:
    missing = []
    if 'mash' not in locals(): missing.append('mash')
    if 'domain_a' not in locals(): missing.append('domain_a')  
    if 'domain_b' not in locals(): missing.append('domain_b')
    if 'anchors' not in locals(): missing.append('anchors')
    print(f"Cannot fit MASH - missing variables: {missing}")

Attempting to fit MASH model...
domain_a shape: (200, 10)
domain_b shape: (200, 30)
anchors shape: 200
Converted domains tuple length: 2
Converted known_anchors shape: (200, 2)
Sample anchors: [[0 0]
 [1 1]
 [2 2]
 [3 3]
 [4 4]]
Error during MASH fitting: 'tuple' object does not support item assignment
Available MASH methods: ['DTM', 'FOSCTTM', 'IDC', 'apply_aggregation', 'build_graphs', 'burn_in', 'cross_embedding_knn', 'density_normalized_kernel', 'distance_measures', 'domain_count']


In [15]:
dig_time.plot_heat_maps()

NameError: name 'dig_time' is not defined

In [None]:
if dig_time is not None and hasattr(dig_time, 'plot_emb') and timeless is not None:
    try:
        # Create labels - use a column that exists or create dummy labels
        if "PTGENDER" in timeless.columns:
            labels = pd.concat([timeless["PTGENDER"][:200], timeless["PTGENDER"][:200]])
            print("Using PTGENDER for labels")
        else:
            # Create dummy labels based on row index
            dummy_labels = np.random.randint(0, 3, 200)  # 3 categories
            labels = pd.concat([pd.Series(dummy_labels), pd.Series(dummy_labels)])
            print("Using dummy labels")
        
        dig_time.plot_emb(labels = labels)
        print("Embedding plot created successfully")
    except Exception as e:
        print(f"Error creating embedding plot: {e}")
else:
    print("Cannot create plot: missing dig_time object, plot_emb method, or timeless data")

# Testing with Timeless and Timefull variables

In [None]:
# Create dataclasses. Note, we want to distort to leave domain unchanged
if test_manifold_algorithms is not None:
    try:
        # Check if the CSV files exist first
        timeless_file = "../../data/classification/Timeless Variables 2024-07-09.csv"
        visits_file = "../../data/classification/Visit Variables 2024-08-01.xlsx"
        
        # For now, create dummy data classes since the exact files may not exist
        print("Creating test manifold algorithms instances...")
        
        # Create instances (these may fail if the files don't exist)
        try:
            timeless_data = test_manifold_algorithms("Timeless Variables 2024-07-09.csv", split = "distort", verbose = 3, random_state=2816)
            print("Timeless data loaded successfully")
        except:
            print("Failed to load timeless data - using dummy data")
            # Create a dummy data class
            class DummyData:
                def __init__(self):
                    self.split_A = np.random.rand(100, 20)
                    self.labels = np.random.randint(0, 3, 100)
            timeless_data = DummyData()
        
        try:
            visits_data = test_manifold_algorithms("Visit Variables 2024-07-09.csv", split = "distort", verbose = 3, random_state=2816)  
            print("Visits data loaded successfully")
        except:
            print("Failed to load visits data - using dummy data")
            class DummyData:
                def __init__(self):
                    self.split_A = np.random.rand(150, 25)
                    self.labels = np.random.randint(0, 3, 150)
            visits_data = DummyData()
            
    except Exception as e:
        print(f"Error creating test_manifold_algorithms instances: {e}")
        timeless_data = None
        visits_data = None
else:
    print("test_manifold_algorithms not available. Cannot create data instances.")
    timeless_data = None
    visits_data = None

In [None]:
# Create MASH alignment for the test data
if 'timeless_data' in locals() and 'visits_data' in locals() and \
   timeless_data is not None and visits_data is not None:
    try:
        print("Creating MASH model for timeless vs visits alignment...")
        tv = MASH(n_pca = 10)
        
        # MASH expects domains as tuple and known_anchors as 2D array
        domains = (timeless_data.split_A, visits_data.split_A)
        
        # Create simple identity anchors for demonstration
        min_samples = min(len(timeless_data.split_A), len(visits_data.split_A))
        n_anchors = min(50, min_samples)  # Use up to 50 anchors
        known_anchors = np.array([[i, i] for i in range(n_anchors)])
        
        print(f"Domains: {len(domains)} domains with shapes {[d.shape for d in domains]}")
        print(f"Anchors: {known_anchors.shape} anchors")
        
        # Fit the MASH model
        tv.fit(domains, known_anchors)
        print("MASH model fitted successfully")
        
        # Try to get alignment score if method exists
        try:
            alignment_score = tv.FOSCTTM()
            print(f"Alignment score: {alignment_score}")
        except:
            print("Could not compute alignment score")
        
    except Exception as e:
        print(f"Error creating MASH alignment: {e}")
        tv = None
else:
    print("Cannot create MASH alignment - missing data objects")
    tv = None

In [None]:
# Load visits labels data
try:
    # Try multiple possible file paths
    possible_files = [
        "../../data/classification/Visit Variables 2024-08-01.xlsx",
        "../../data/classification/Visit Variables 2024-07-09.csv",
        "../../data/classification/Progression Variables 2024-08-17.xlsx"
    ]
    
    visits_labels = None
    for file_path in possible_files:
        try:
            if file_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path)
            
            # Look for ID column (could be RID, ID, PatientID, etc.)
            id_columns = ['RID', 'ID', 'PatientID', 'Subject_ID', 'SubjectID']
            id_col = None
            for col in id_columns:
                if col in df.columns:
                    id_col = col
                    break
            
            if id_col:
                visits_labels = df[id_col].values
                print(f"Successfully loaded visits labels from {file_path}")
                print(f"Found {len(visits_labels)} labels using column '{id_col}'")
                break
            else:
                print(f"No suitable ID column found in {file_path}")
                
        except Exception as e:
            print(f"Could not load {file_path}: {e}")
            continue
    
    if visits_labels is None:
        print("Could not load visits labels from any source - creating dummy labels")
        if 'visits_data' in locals() and visits_data is not None:
            visits_labels = np.arange(len(visits_data.split_A))
        else:
            visits_labels = np.arange(100)  # Default dummy labels
            
except Exception as e:
    print(f"Error loading visits labels: {e}")
    visits_labels = np.arange(100)  # Fallback

In [None]:
#Reset NaN values to be -4 :)
timeless_data.split_A[np.isnan(timeless_data.split_A)] = -4
visits_data.split_A[np.isnan(visits_data.split_A)] = -4

#Lets subset the data
timeless_data.split_A = timeless_data.split_A[:SUBSET_VAL]
visits_data.split_A = visits_data.split_A[:max(TV_anchors[:, 1])+1]

In [None]:
len(TV_anchors), len(visits_data.split_A), max(TV_anchors[:, 1])

In [None]:
TV_anchors[-10:]

In [None]:
#Create the merged values --> This will take up the rest of your day
tv = MASH(n_pca = 10)
tv.fit(timeless_data.split_A, visits_data.split_A, known_anchors = TV_anchors)

In [None]:
tv.plot_heat_maps()

In [None]:
tv.plot_emb(show_anchors = False, show_lines = False, n_comp = 2)

In [None]:
import graphtools

In [None]:
help(graphtools.Graph)

In [None]:
# Create plots with the MASH-aligned data
if 'mash' in locals() and 'timeless_data' in locals() and 'visits_data' in locals() and \
   timeless_data is not None and visits_data is not None:
    try:
        # Get the transformed data from MASH
        transformed_timeless = mash.get_aligned_data(timeless_data.split_A)
        transformed_visits = mash.get_aligned_data(visits_data.split_A)
        
        # Plot the results
        plt.figure(figsize=(15, 5))
        
        # Original timeless data
        plt.subplot(1, 3, 1)
        plt.scatter(timeless_data.split_A[:, 0], timeless_data.split_A[:, 1], 
                   c=timeless_data.labels, alpha=0.7)
        plt.title('Original Timeless Data')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        
        # Original visits data  
        plt.subplot(1, 3, 2)
        plt.scatter(visits_data.split_A[:, 0], visits_data.split_A[:, 1], 
                   c=visits_data.labels, alpha=0.7)
        plt.title('Original Visits Data')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        
        # MASH aligned data
        plt.subplot(1, 3, 3)
        if len(transformed_timeless) > 0:
            plt.scatter(transformed_timeless[:, 0], transformed_timeless[:, 1], 
                       c=timeless_data.labels, alpha=0.7, label='Timeless')
        if len(transformed_visits) > 0:
            plt.scatter(transformed_visits[:, 0], transformed_visits[:, 1], 
                       c=visits_data.labels, alpha=0.7, label='Visits')
        plt.title('MASH Aligned Data')
        plt.xlabel('Aligned Feature 1')
        plt.ylabel('Aligned Feature 2')
        plt.legend()
        
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        print(f"Error creating plots: {e}")
        print("Creating simple visualization with available data...")
        
        # Fallback plotting
        plt.figure(figsize=(10, 4))
        
        plt.subplot(1, 2, 1)
        if hasattr(timeless_data, 'split_A') and timeless_data.split_A is not None:
            plt.scatter(timeless_data.split_A[:, 0], timeless_data.split_A[:, 1], alpha=0.7)
            plt.title('Timeless Data')
        else:
            plt.text(0.5, 0.5, 'No timeless data available', ha='center', va='center')
            plt.title('Timeless Data (Not Available)')
            
        plt.subplot(1, 2, 2)
        if hasattr(visits_data, 'split_A') and visits_data.split_A is not None:
            plt.scatter(visits_data.split_A[:, 0], visits_data.split_A[:, 1], alpha=0.7)
            plt.title('Visits Data')
        else:
            plt.text(0.5, 0.5, 'No visits data available', ha='center', va='center')
            plt.title('Visits Data (Not Available)')
            
        plt.tight_layout()
        plt.show()
        
else:
    print("Cannot create plots - missing MASH model or data objects")
    print("Available variables:", [var for var in locals().keys() if not var.startswith('_')])