# Analysis of CaM extended
## Comparison of all CaM states

In [1]:
import datetime
import os
import pickle
import sys

import numpy as np
import pandas as pd

# Make the `lda` folder (resolved against the current working directory)
# importable. The entry is only added when it is not already on sys.path, so
# re-running this cell does not accumulate duplicates.
lda_path = os.path.join(os.getcwd(), 'lda')
if sys.path.count(lda_path) == 0:
    sys.path.append(lda_path)

# Import the interactive pipeline functions
from pipeline_helper import run_interactive_pipeline, create_interactive_pipeline_configs

# Import your existing data access module
from data_access import create_dataframe_factory, list_available_constructs_subconstructs

print("‚úÖ Imports completed successfully!")

‚úÖ Imports completed successfully!


In [2]:
# Root directory of the input data (presumably distance maps, judging by the
# directory name — confirm). The original cluster path stays the default; set
# the CAM_DIST_MAPS_DIR environment variable to point the notebook at another
# location without editing this cell. An unset or empty variable falls back to
# the default.
data_dir = (
    os.environ.get('CAM_DIST_MAPS_DIR')
    or '/work/hdd/bfri/jjeong7/analysis_output/dist_maps'
)

# Look up what is available under data_dir. Neither returned value is used in
# the cells shown here; they are kept for interactive inspection.
constructs_dict, subconstructs_dict = list_available_constructs_subconstructs(base_dir=data_dir)

# Specific states of specific proteins: restrict the factory to the
# 'calmodulin' construct.
data_factory = create_dataframe_factory(
    base_dir=data_dir,
    constructs=['calmodulin']
)

In [3]:
# Build the pipeline configurations using the helper's defaults (no arguments
# are passed). `configs` is handed unchanged to run_interactive_pipeline in the
# next cell.
# NOTE(review): "all possible pipeline combinations" comes from the original
# comment; what `configs` actually contains is defined in pipeline_helper —
# confirm there.
configs = create_interactive_pipeline_configs()

In [None]:
# Announce the run and the order of the phases before handing over control.
for banner_line in (
    "üöÄ Starting Interactive Pipeline Runner",
    "üí° Variance runs first, then feature selection, then dimensionality reduction",
    "‚öôÔ∏è  You'll set parameters for each phase",
):
    print(banner_line)

# Run the pipelines described by `configs` on the data factory. The runner is
# interactive: per the recorded output it asks whether to load cached results,
# whether to modify parameters, and whether to accept each result, so this cell
# blocks until those prompts are answered.
results = run_interactive_pipeline(data_factory, configs)

üöÄ Starting Interactive Pipeline Runner
üí° Variance runs first, then feature selection, then dimensionality reduction
‚öôÔ∏è  You'll set parameters for each phase


Found cached result for VARIANCE (pipeline_cache/variance.pkl). Load? (Y/n):  y


Loaded cached VARIANCE data.
Shape: (90000, 509)


Found cached result for CHI_SQ_AMINO (pipeline_cache/chi_sq_amino.pkl). Load? (Y/n):  y


Loaded cached CHI_SQ_AMINO data.
Shape: (18000, 3)


Found cached result for FISHER_AMINO (pipeline_cache/fisher_amino.pkl). Load? (Y/n):  y


Loaded cached FISHER_AMINO data.
Shape: (90000, 3)

[FEATURE_SELECTION : MPSO]
  population_size: 20
  mpso_iters: 50
  alpha: 0.9
  threshold: 0.5
  stride: 5



Modify? (y/N):  n


Running MPSO...
Pass 1: Computing Fisher scores (stride=5)...
Pass 2: Loading top 250 features (stride=5)...
Running MPSO on 18000 strided samples...
Beginning Swarm Optimization...
‚úÖ MPSO Complete. Reduced 250 features to 5 dimensions.
MPSO Result Shape: (18000, 8)



Accept MPSO results? (y/N):  y


Results accepted and cached to pipeline_cache/mpso.pkl

[FEATURE_SELECTION : BPSO]
  population_size: 20
  max_iters: 50
  w: 0.729
  c1: 1.49445
  c2: 1.49445
  stride: 5



Modify? (y/N):  n


Running BPSO...
Pass 1: Filtering features via Streaming Fisher Score (stride=5)...
Pass 2: Loading top 150 features (stride=5)...
Beginning Swarm Optimization on 18000 samples...


In [None]:
# Headline count of the pipelines that produced a result.
print(f"üìä Successful pipelines: {len(results)}")

if not results:
    print("‚ö†Ô∏è  No pipelines completed successfully")
else:
    print("\nüìà Results Summary:")
    for pipeline_name, result in results.items():
        final_df = result['final_result']
        # Every column other than 'class' is counted as a feature.
        feature_cols = list(filter(lambda name: name != 'class', final_df.columns))

        print(f"   ‚úÖ {pipeline_name}:")
        print(f"      üìè Shape: {final_df.shape}")
        print(f"      üîß Features: {len(feature_cols)}")
        print(f"      üè∑Ô∏è  Classes: {final_df['class'].nunique()}")

        # Lists longer than five names are abbreviated to the first three and
        # the last two; shorter lists are printed in full.
        if len(feature_cols) > 5:
            print(f"      üìã Features: {feature_cols[:3]}...{feature_cols[-2:]}")
        else:
            print(f"      üìã Features: {feature_cols}")
        print()

In [None]:
# Persist the pipeline results: a pickle of the whole `results` mapping plus a
# CSV with one summary row per pipeline. Both files are written to the current
# working directory and share one timestamp so they can be matched up later.
# `pickle` and `datetime` are imported in the notebook's top import cell rather
# than inside this branch, so all dependencies are visible in one place.
if results:
    # Timestamp used in both output filenames.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"pipeline_results_{timestamp}.pkl"

    # Save the full results object.
    # NOTE: only unpickle this file from a trusted source — loading a pickle
    # can execute arbitrary code.
    with open(results_file, 'wb') as f:
        pickle.dump(results, f)

    print(f"üíæ Results saved to: {results_file}")

    # Build the summary as a list of records and create the DataFrame once.
    summary_data = []
    for pipeline_name, result in results.items():
        final_df = result['final_result']
        summary_data.append({
            'pipeline': pipeline_name,
            'samples': final_df.shape[0],
            # Every column other than 'class' is counted as a feature.
            'features': len([col for col in final_df.columns if col != 'class']),
            'classes': final_df['class'].nunique(),
            'feature_selection': result['config']['feature_selection'],
            'dimensionality_reduction': result['config']['dimensionality_reduction']
        })

    summary_df = pd.DataFrame(summary_data)
    summary_file = f"pipeline_summary_{timestamp}.csv"
    summary_df.to_csv(summary_file, index=False)
    print(f"üìä Summary saved to: {summary_file}")
else:
    print("‚ö†Ô∏è  No results to save")