# Analysis of compact calmodulin (CaM)
## Comparison of all CaM states

In [1]:
import sys
import os
import pandas as pd
import numpy as np

# Make the local `lda/` folder importable (presumably where the helper modules
# imported below live — confirm).
# NOTE: the folder is resolved against the current working directory, so the
# notebook must be started from the directory that contains `lda/`.
lda_path = os.path.join(os.getcwd(), 'lda')
if sys.path.count(lda_path) == 0:
    # Register it only once so re-running the cell does not pile up duplicates.
    sys.path.append(lda_path)

# Import the interactive pipeline functions
from pipeline_helper import run_interactive_pipeline, create_interactive_pipeline_configs, summarize_and_evaluate

# Import your existing data access module
from data_access import create_dataframe_factory, list_available_constructs_subconstructs

# Reached only when every import above succeeded; a failing import raises before this line runs.
print("‚úÖ Imports completed successfully!")

‚úÖ Imports completed successfully!


In [2]:
# Root folder of the distance-map outputs (absolute, machine-specific path).
data_dir = '/work/hdd/bfri/jjeong7/analysis_output/dist_maps'

# Presumably an inventory of the constructs / sub-constructs found under
# data_dir — confirm against data_access.
constructs_dict, subconstructs_dict = list_available_constructs_subconstructs(
    base_dir=data_dir
)

# Options for the dataframe factory, restricted to the compact calmodulin construct.
factory_options = {
    'base_dir': data_dir,
    'constructs': ['calmodulin-compact'],
    'apply_boundary_filter': True,  # enable the boundary-filter transformation
    'n_edge': 3,     # NOTE(review): meaning is defined in data_access; presumably tied to the boundary filter — confirm
    'min_frame': 0,  # NOTE(review): presumably the first frame to keep — confirm
}
data_factory = create_dataframe_factory(**factory_options)

In [3]:
# Build the pipeline configurations that run_interactive_pipeline consumes.
# Called without arguments, so the defaults defined in pipeline_helper apply
# (original note: "all possible pipeline combinations" — not verifiable from this notebook).
configs = create_interactive_pipeline_configs()

In [None]:
# Announce the run; the three messages are emitted one per line.
print(
    "üöÄ Starting Interactive Pipeline Runner",
    "üí° Variance runs first, then feature selection, then dimensionality reduction",
    "‚öôÔ∏è  You'll set parameters for each phase",
    sep="\n",
)

# Launch the pipeline on the factory and configs built in the earlier cells.
# NOTE(review): the run prompts for keyboard input (the recorded output shows a
# cache-load question, a class-assignment choice and a hyperparameter
# confirmation), so the result depends on the answers typed in, not only on
# this code.
results, base_df = run_interactive_pipeline(data_factory, configs)

üöÄ Starting Interactive Pipeline Runner
üí° Variance runs first, then feature selection, then dimensionality reduction
‚öôÔ∏è  You'll set parameters for each phase

PHASE 1: VARIANCE


Found cached result for VARIANCE (pipeline_cache/variance.pkl). Load? (Y/n):  y


Loaded cached VARIANCE data.
Shape: (90000, 426)
Variance Output: (90000, 426)

CLASS ASSIGNMENT
Choose class assignment method:
1. Default: construct + subconstruct
2. GMM: Gaussian Mixture Model (shared states)
3. Spectral: Non-linear spectral clustering
4. TICA: Kinetic landscape state assignment


Enter choice (1-4):  3



[CLUSTERING : SPECTRAL]
📋 Parameter Controls:
  🎯 Method: Spectral Clustering (Nystroem + K-means)
⚙️  Hyperparameters:
  stride: 10 - Data sampling stride - higher = faster but less precise
  max_k: 15 - Maximum clusters to test - higher = more options
  n_components: 50 - Spectral embedding dimensions - higher = more complex
  S: 1.0 - Knee sensitivity - higher = more clusters, lower = fewer
  show_plots: True - Show Elbow diagnostics plot



Modify? (y/N):  n


Sampling dataset (stride=10) to learn Spectral Embedding...
Computing Nystroem Spectral Embedding (n_components=50)...
Evaluating K from 2 to 15 in spectral space...
✅ Optimal Spectral Clusters: 7
Fitting final K-Means model on spectral components...
Projecting FULL dataset into Spectral Space and assigning labels...


In [None]:
import pickle
from pathlib import Path

# Persist the pipeline outputs so they can be reloaded without re-running the
# interactive pipeline.
export_dir = Path("results/cmpcam_spectral")
export_dir.mkdir(parents=True, exist_ok=True)

try:
    # Output file name -> object to serialize. Built inside the `try` so that a
    # missing `results` / `base_df` is reported by the handler below, as before.
    export_targets = (
        ("pipeline_results.pkl", results),
        ("base_df_metadata.pkl", base_df),
    )

    for file_name, payload in export_targets:
        final_path = export_dir / file_name
        # Serialize to a sibling temp file first: opening the final path with
        # "wb" would truncate a previous good export before the dump is known
        # to succeed, leaving a corrupt file if pickling fails part-way.
        tmp_path = final_path.with_name(final_path.name + ".tmp")
        try:
            with open(tmp_path, "wb") as f:
                # HIGHEST_PROTOCOL selects the newest pickle protocol supported
                # by the running interpreter.
                pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)
            # Same-directory rename: swaps the finished file into place and
            # overwrites any earlier export only once the new one is complete.
            tmp_path.replace(final_path)
        finally:
            # After a successful rename the temp file no longer exists; after a
            # failure this removes the partial file.
            if tmp_path.exists():
                tmp_path.unlink()

    print(f"‚úÖ State preserved in {export_dir}/")
except Exception as e:
    # Best-effort export: report the failure instead of aborting the notebook.
    print(f"‚ùå Save failed: {e}")