# SPTnano HPC Parallel Processing

This notebook sets up parallel processing of SPTnano's `calculate_time_windowed_metrics` across multiple cores.

## Usage:
1. **Cluster processing** - HPC usage via SSH


In [None]:
# Notebook setup: imports, SPTnano configuration, and loading of instant_df.

# Standard-library imports
import os
import time

# Data processing imports
import numpy as np
import pandas as pd

# SPTnano package imports
import SPTnano as spt

# Configuration values are read from the SPTnano config module.
saved_data = spt.config.SAVED_DATA
time_between_frames = spt.config.TIME_BETWEEN_FRAMES

print("✓ SPTnano loaded successfully")
print(f"✓ Saved data directory: {saved_data}")
print(f"✓ Time between frames: {time_between_frames} s")

# Load instant_df for processing.
# os.path.join is used instead of string concatenation so the file is found
# whether or not saved_data ends with a path separator.
instant_df_path = os.path.join(saved_data, 'instant_df.csv')
instant_df = pd.read_csv(instant_df_path)
print(f"✓ Loaded instant_df: {len(instant_df):,} points, {instant_df['unique_id'].nunique():,} tracks")


In [None]:
# Count the distinct (non-missing) values in the 'unique_id' column
print(len(instant_df['unique_id'].dropna().unique()))

In [None]:
# CLUSTER PROCESSING CONFIGURATION
# Edit the values in this cell for your cluster, then either uncomment the
# Python call below or copy the printed command into a terminal.
# UPDATE PATHS

# File paths
cluster_input_file = '/path/to//full_instant_df.csv'  # Your full dataset on cluster
cluster_output_dir = '/path/to//output_directory'     # Where to save results

# Processing settings (ADJUST BASED ON CLUSTER RESOURCES!)
cluster_n_jobs = -1        # -1 = use ALL available cores
cluster_chunk_size = 4000  # Tracks per chunk

# SPTnano parameters (MODIFY IF NEEDED!)
cluster_time_between_frames = 0.01  # Time between frames
cluster_window_size = 60            # Window size in frames
cluster_overlap = 30               # Window overlap in frames
cluster_r2_threshold = 0.000001    # R² threshold for curve fitting

# Echo the configuration so it can be checked before launching a job.
print("Cluster processing configuration:")
print(f"  Input file: {cluster_input_file}")
print(f"  Output directory: {cluster_output_dir}")
print(f"  CPU cores: {'ALL' if cluster_n_jobs == -1 else cluster_n_jobs}")
print(f"  Chunk size: {cluster_chunk_size} tracks")
print(f"  Window size: {cluster_window_size} frames")

# UNCOMMENT AND RUN THIS ON CLUSTER:
# ============================================
#
# from SPTnano.HPC import parallel_time_windowed_metrics
# import time
#
# print("Starting cluster processing...")
# start_time = time.time()
#
# cluster_instant_output, cluster_windowed_output = parallel_time_windowed_metrics(
#     input_file=cluster_input_file,
#     output_dir=cluster_output_dir,
#     n_jobs=cluster_n_jobs,
#     chunk_size=cluster_chunk_size,
#     time_between_frames=cluster_time_between_frames,
#     window_size=cluster_window_size,
#     overlap=cluster_overlap,
#     r2_threshold=cluster_r2_threshold
# )
#
# total_time = time.time() - start_time
# print(f"\n🎉 CLUSTER PROCESSING COMPLETED in {total_time:.1f}s")
# print("Results saved to:")
# print(f"  - Instant DF: {cluster_instant_output}")
# print(f"  - Windowed DF: {cluster_windowed_output}")

# Build the equivalent shell command once. Lines are joined with a single
# trailing backslash (the shell line-continuation character) followed by a
# real newline, so the printed text can be pasted into a terminal as-is.
cluster_command = " \\\n".join([
    "python -m SPTnano.HPC",
    f"    --input_file {cluster_input_file}",
    f"    --output_dir {cluster_output_dir}",
    f"    --n_jobs {cluster_n_jobs}",
    f"    --chunk_size {cluster_chunk_size}",
])

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\n" + "="*60)
print("COMMAND LINE ALTERNATIVE:")
print("="*60)
print("# Run from terminal on cluster:")
print(cluster_command)

print("\n💡 Remember to update the file paths above for your cluster!")
