# Notebook 03: Data Archiving for C. elegans Motility Analysis

This notebook handles:
- Organizing and archiving processed data
- Creating comprehensive analysis reports
- Archiving raw data with metadata
- Generating summary documentation
- Preparing data packages for publication/sharing
- Timestamped backups of analysis outputs

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import shutil
from datetime import datetime
import json
import yaml
import zipfile
import warnings
# Suppress all warnings for the rest of the session: the 'ignore' action is
# installed with no message, category or module restriction.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print("Libraries imported successfully!")

## 2. Configuration and Path Setup

In [None]:
# Project root; every other location used by this notebook is derived from it.
BASE_DIR = Path(r'C:\Users\MBF\Motility_analysis')
RAW_DATA_DIR = BASE_DIR.joinpath('Data', 'Raw')
PROCESSED_DATA_DIR = BASE_DIR.joinpath('Data', 'Processed')
ARCHIVE_DIR = BASE_DIR.joinpath('Data', 'Archive')
RESULTS_DIR = BASE_DIR.joinpath('results')
FIGURES_DIR = RESULTS_DIR.joinpath('figures')

# A single timestamp identifies this archiving session: it names the session
# folder here and is reused wherever the archive ID is reported.
TIMESTAMP = f'{datetime.now():%Y%m%d_%H%M%S}'
ARCHIVE_SESSION_DIR = ARCHIVE_DIR.joinpath('archive_' + TIMESTAMP)

# Create the session folder (and any missing parents), then its fixed
# set of sub-folders.
ARCHIVE_SESSION_DIR.mkdir(parents=True, exist_ok=True)
for _subfolder in ('processed_data', 'results', 'figures', 'metadata'):
    ARCHIVE_SESSION_DIR.joinpath(_subfolder).mkdir(exist_ok=True)

print(f"Archive session directory: {ARCHIVE_SESSION_DIR}")
print(f"Timestamp: {TIMESTAMP}")

## 3. Archive Processed Data

In [None]:
# Every CSV sitting directly in the processed-data folder (non-recursive).
processed_files = [*PROCESSED_DATA_DIR.glob('*.csv')]
_processed_target = ARCHIVE_SESSION_DIR / 'processed_data'

print(f"Archiving {len(processed_files)} processed data file(s)...")

for csv_path in processed_files:
    # Files keep their original names inside the archive sub-folder.
    shutil.copy2(csv_path, _processed_target / csv_path.name)
    print(f"  ✓ Archived: {csv_path.name}")

print("\nProcessed data archived successfully!")

## 4. Archive Results and Figures

In [None]:
# Collect what will be archived. Only regular files are kept in every list so
# that each count printed below matches the number of files actually copied
# (previously the figure count also included sub-directories, which were then
# skipped during the copy).
result_files = [f for f in RESULTS_DIR.glob('*.csv') if f.is_file()]
json_files = [f for f in RESULTS_DIR.glob('*.json') if f.is_file()]

# Each batch is (header line, files to copy, archive sub-folder).
_batches = [
    (f"Archiving {len(result_files)} result file(s)...", result_files, 'results'),
    (f"\nArchiving {len(json_files)} JSON file(s)...", json_files, 'results'),
]

# The figures folder is optional; when it is absent nothing is reported for it.
if FIGURES_DIR.exists():
    figure_files = [f for f in FIGURES_DIR.glob('*') if f.is_file()]
    _batches.append(
        (f"\nArchiving {len(figure_files)} figure(s)...", figure_files, 'figures')
    )

for _header, _files, _subdir in _batches:
    print(_header)
    for file_path in _files:
        dest_path = ARCHIVE_SESSION_DIR / _subdir / file_path.name
        shutil.copy2(file_path, dest_path)
        print(f"  ✓ Archived: {file_path.name}")

print("\nResults and figures archived successfully!")

## 5. Collect and Archive Metadata

In [None]:
# Locate every metadata_*.yaml at any depth below the raw-data folder.
metadata_files = list(RAW_DATA_DIR.rglob('metadata_*.yaml'))

print(f"Found {len(metadata_files)} metadata file(s)")

# Both separator styles are mapped to '_' so that a nested relative path
# becomes a single flat file name inside the archive's metadata folder.
_separator_to_underscore = str.maketrans({'\\': '_', '/': '_'})
_metadata_target = ARCHIVE_SESSION_DIR / 'metadata'

for yaml_path in metadata_files:
    path_below_raw = yaml_path.relative_to(RAW_DATA_DIR)
    flat_name = str(path_below_raw).translate(_separator_to_underscore)

    shutil.copy2(yaml_path, _metadata_target / flat_name)
    print(f"  ✓ Archived: {flat_name}")

print("\nMetadata archived successfully!")

## 6. Generate Analysis Summary Report

In [None]:
# Build the plain-text summary report for this archive session.
#
# Inputs : track_metrics.csv and thrashing_data.csv in PROCESSED_DATA_DIR, plus
#          the contents of the archive sub-folders filled by the earlier cells.
# Outputs: ANALYSIS_SUMMARY_REPORT.txt in ARCHIVE_SESSION_DIR (also printed).
# Names left defined for later cells: df_metrics, df_thrashing, genotypes,
# report_lines, report_text, report_path and the per-metric *_summary tables.
df_metrics = pd.read_csv(PROCESSED_DATA_DIR / 'track_metrics.csv')
df_thrashing = pd.read_csv(PROCESSED_DATA_DIR / 'thrashing_data.csv')

_RULE = "=" * 80
_SUBRULE = "-" * 80


def _summarize_by_genotype(df, column):
    """Return per-genotype mean, std, count and SEM (std / sqrt(count)) of *column*."""
    summary = df.groupby('genotype')[column].agg(['mean', 'std', 'count'])
    summary['sem'] = summary['std'] / np.sqrt(summary['count'])
    return summary


def _append_metric_section(lines, title, summary, decimals, unit=""):
    """Append a titled block with one 'mean ± SEM (n=...)' line per genotype.

    *summary* is a table produced by ``_summarize_by_genotype``; *decimals* is
    the number of decimal places and *unit* is appended verbatim after the SEM.
    """
    lines.append("\n" + _RULE)
    lines.append(title)
    lines.append(_SUBRULE)
    # groupby returns the genotypes in sorted order, so rows come out sorted.
    for genotype, row in summary.iterrows():
        lines.append(
            f"  {genotype}: {row['mean']:.{decimals}f} ± {row['sem']:.{decimals}f}"
            f"{unit} (n={int(row['count'])})"
        )


# Header
report_lines = [
    _RULE,
    "C. ELEGANS MOTILITY ANALYSIS - ARCHIVE SUMMARY REPORT",
    _RULE,
    f"\nArchive Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Archive ID: {TIMESTAMP}",
    "\n" + _RULE,
]

# Data summary
report_lines.append("\nDATA SUMMARY")
report_lines.append(_SUBRULE)
report_lines.append(f"Total tracks analyzed: {len(df_metrics)}")
report_lines.append(f"Tracks with thrashing data: {len(df_thrashing)}")
report_lines.append("\nGenotypes analyzed:")

genotypes = sorted(df_metrics['genotype'].unique())
for genotype in genotypes:
    # Select this genotype's rows once and reuse them for both counts.
    genotype_rows = df_metrics[df_metrics['genotype'] == genotype]
    n_tracks = len(genotype_rows)
    n_videos = genotype_rows['video_id'].nunique()
    report_lines.append(f"  - {genotype}: {n_tracks} tracks from {n_videos} video(s)")

# Per-metric summaries (mean ± SEM by genotype)
speed_summary = _summarize_by_genotype(df_metrics, 'mean_speed')
_append_metric_section(
    report_lines, "MEAN SPEED SUMMARY (μm/s)", speed_summary, 2, " μm/s"
)

straightness_summary = _summarize_by_genotype(df_metrics, 'straightness')
_append_metric_section(
    report_lines, "STRAIGHTNESS INDEX SUMMARY", straightness_summary, 3
)

fatigue_summary = _summarize_by_genotype(df_metrics, 'fatigue_index')
_append_metric_section(
    report_lines, "FATIGUE INDEX SUMMARY", fatigue_summary, 3
)

# The thrashing section is omitted entirely when there is no thrashing data.
if len(df_thrashing) > 0:
    thrashing_summary = _summarize_by_genotype(df_thrashing, 'thrashing_frequency_hz')
    _append_metric_section(
        report_lines, "THRASHING FREQUENCY SUMMARY (Hz)", thrashing_summary, 3, " Hz"
    )

# Files archived
report_lines.append("\n" + _RULE)
report_lines.append("ARCHIVED FILES")
report_lines.append(_SUBRULE)
for _heading, _subdir in (
    ("\nProcessed Data:", 'processed_data'),
    ("\nResults:", 'results'),
    ("\nFigures:", 'figures'),
    ("\nMetadata Files:", 'metadata'),
):
    report_lines.append(_heading)
    # Sorted so the listing does not depend on directory iteration order.
    report_lines.extend(
        f"  - {entry.name}"
        for entry in sorted((ARCHIVE_SESSION_DIR / _subdir).glob('*'))
    )

report_lines.append("\n" + _RULE)
report_lines.append("END OF REPORT")
report_lines.append(_RULE)

# Save report. The text contains non-ASCII characters (μ, ±), so the encoding
# is fixed to UTF-8 rather than left to the platform's locale default, which on
# Windows may be unable to encode them.
report_text = "\n".join(report_lines)
report_path = ARCHIVE_SESSION_DIR / 'ANALYSIS_SUMMARY_REPORT.txt'
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report_text)

print(report_text)
print(f"\n\nReport saved to: {report_path}")

## 7. Create Archive Manifest

In [None]:
# Assemble the manifest section by section, then combine the pieces.

# Identification of this archive session.
_archive_info = {
    'timestamp': TIMESTAMP,
    'date': f'{datetime.now():%Y-%m-%d %H:%M:%S}',
    'archive_directory': str(ARCHIVE_SESSION_DIR),
}

# Headline numbers taken from the tables loaded for the summary report.
_data_summary = {
    'total_tracks': len(df_metrics),
    'total_thrashing_tracks': len(df_thrashing),
    'genotypes': genotypes,
    'tracks_per_genotype': df_metrics['genotype'].value_counts().to_dict(),
}

# Names of everything currently present in each archive sub-folder.
_archived_files = {
    section: [entry.name for entry in (ARCHIVE_SESSION_DIR / section).glob('*')]
    for section in ('processed_data', 'results', 'figures', 'metadata')
}

# The three notebooks that make up the pipeline.
_notebooks = {
    'data_wrangling': '01_Data_Wrangling.ipynb',
    'analysis_plotting': '02_Analysis_and_Plotting.ipynb',
    'archiving': '03_Archiving.ipynb',
}

manifest = {
    'archive_info': _archive_info,
    'data_summary': _data_summary,
    'archived_files': _archived_files,
    'notebooks': _notebooks,
}

# Serialize once; the same text is written to disk and echoed below.
manifest_json = json.dumps(manifest, indent=2)
manifest_path = ARCHIVE_SESSION_DIR / 'archive_manifest.json'
with open(manifest_path, 'w') as f:
    f.write(manifest_json)

print("Archive manifest created:")
print(manifest_json)
print(f"\nManifest saved to: {manifest_path}")

## 8. Create README for Archive

In [None]:
# Create the README that accompanies the archive.
#
# The text is a fixed Markdown template; only the archive ID, the current date
# and the three data-summary values (track counts, genotype list) are filled in
# at run time. It relies on TIMESTAMP, df_metrics, df_thrashing and genotypes
# having been defined by the earlier cells.
readme_content = f"""# C. elegans Motility Analysis Archive

**Archive ID:** {TIMESTAMP}  
**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Overview

This archive contains the complete analysis pipeline and results for C. elegans motility analysis.

## Directory Structure

```
archive_{TIMESTAMP}/
├── processed_data/       # Processed track data, metrics, and thrashing data
├── results/              # Statistical summaries and test results
├── figures/              # Publication-quality figures (PNG and PDF)
├── metadata/             # Experimental metadata from YAML files
├── ANALYSIS_SUMMARY_REPORT.txt  # Comprehensive summary report
├── archive_manifest.json        # Detailed file manifest
└── README.md                     # This file
```

## Analysis Pipeline

The analysis was performed using three Jupyter notebooks:

1. **01_Data_Wrangling.ipynb** - Data loading, processing, and metric calculation
2. **02_Analysis_and_Plotting.ipynb** - Statistical analysis and visualization
3. **03_Archiving.ipynb** - Data archiving and documentation

## Data Summary

- **Total tracks analyzed:** {len(df_metrics)}
- **Tracks with thrashing data:** {len(df_thrashing)}
- **Genotypes:** {', '.join(genotypes)}

## Metrics Calculated

1. **Mean Speed** - Average movement speed (μm/s)
2. **Straightness Index** - Ratio of displacement to path length (0-1)
3. **Fatigue Index** - Ratio of late-phase to early-phase speed
4. **Thrashing Frequency** - Body oscillation frequency (Hz)

## Statistical Tests

- One-way ANOVA for overall group differences
- Pairwise t-tests for genotype comparisons
- Mean ± SEM reported for all metrics

## Files Included

### Processed Data
- `track_metrics.csv` - Per-track metrics (speed, straightness, fatigue, etc.)
- `thrashing_data.csv` - Thrashing frequency data
- `normalized_tracks.csv` - XY coordinates normalized to origin

### Results
- `*_group_summary.csv` - Summary statistics by genotype
- `*_pairwise_tests.csv` - Statistical test results
- `stats_summary.json` - Comprehensive statistical summary

### Figures
- `mean_speed_comparison.*` - Speed comparison by genotype
- `straightness_comparison.*` - Straightness index comparison
- `fatigue_comparison.*` - Fatigue index comparison
- `thrashing_frequency_comparison.*` - Thrashing frequency comparison
- `track_trajectories_*.* ` - XY trajectory plots
- `summary_figure.*` - Multi-panel summary figure

## Citation

If using this data, please cite:
- Analysis pipeline: Custom C. elegans motility analysis (Notebooks 01-03)
- WormLab: MBF Bioscience (https://www.mbfbioscience.com/wormlab)

## Contact

For questions about this analysis, please refer to the experimental metadata files.

---

*Archive generated automatically by 03_Archiving.ipynb*
"""

# The template contains box-drawing characters (├──, └──) as well as μ and ±.
# Writing with the platform's locale default can fail on Windows with
# UnicodeEncodeError, so the encoding is fixed to UTF-8.
readme_path = ARCHIVE_SESSION_DIR / 'README.md'
with open(readme_path, 'w', encoding='utf-8') as f:
    f.write(readme_content)

print("README created successfully!")
print(f"\nSaved to: {readme_path}")

## 9. Optional: Create ZIP Archive

In [None]:
# The ZIP is written next to the session folder and carries the same base name.
zip_path = ARCHIVE_DIR / f'archive_{TIMESTAMP}.zip'

print(f"Creating ZIP archive: {zip_path.name}")
print("This may take a moment...\n")

# Entry names are taken relative to the session folder's parent, so each entry
# keeps the session folder name as its leading path component.
_zip_root = ARCHIVE_SESSION_DIR.parent
with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as bundle:
    for member in ARCHIVE_SESSION_DIR.rglob('*'):
        if not member.is_file():
            continue  # directories are not written as separate entries
        entry_name = member.relative_to(_zip_root)
        bundle.write(member, entry_name)
        print(f"  Added: {entry_name}")

_BYTES_PER_MB = 1024 * 1024
zip_size_mb = zip_path.stat().st_size / _BYTES_PER_MB
print("\nZIP archive created successfully!")
print(f"Size: {zip_size_mb:.2f} MB")
print(f"Location: {zip_path}")

## 10. Clean Up Old Archives (Optional)

In [None]:
# Session folders are the direct children of ARCHIVE_DIR that are directories
# and whose names start with 'archive_'; sorting orders them by path.
archive_dirs = sorted(
    entry
    for entry in ARCHIVE_DIR.iterdir()
    if entry.is_dir() and entry.name.startswith('archive_')
)

print(f"Found {len(archive_dirs)} archive directory/directories:")
for session_dir in archive_dirs:
    # NOTE: st_ctime is the creation time on Windows; on Unix it is the time of
    # the last metadata change.
    created_at = datetime.fromtimestamp(session_dir.stat().st_ctime)
    print(f"  - {session_dir.name} (created: {created_at:%Y-%m-%d %H:%M:%S})")

# Optional: Delete archives older than X days (UNCOMMENT TO ENABLE)
# The archive created in this session is never deleted.
# DAYS_TO_KEEP = 30
# cutoff_date = datetime.now() - pd.Timedelta(days=DAYS_TO_KEEP)
#
# for archive_dir in archive_dirs:
#     creation_time = datetime.fromtimestamp(archive_dir.stat().st_ctime)
#     if creation_time < cutoff_date and archive_dir.name != f'archive_{TIMESTAMP}':
#         print(f"Deleting old archive: {archive_dir.name}")
#         shutil.rmtree(archive_dir)

print("\nNote: Automatic cleanup is disabled by default. Edit this cell to enable.")

## 11. Archive Summary and Completion

In [None]:
# Final summary of what this archiving session produced.
print("=" * 80)
print("ARCHIVING COMPLETE!")
print("=" * 80)
print(f"\nArchive ID: {TIMESTAMP}")
print(f"Archive Location: {ARCHIVE_SESSION_DIR}")

# Count the entries currently present in each archive sub-folder.
print("\nArchived Contents:")
for _label, _subdir in (
    ("Processed data files", 'processed_data'),
    ("Result files", 'results'),
    ("Figure files", 'figures'),
    ("Metadata files", 'metadata'),
):
    _n_entries = sum(1 for _ in (ARCHIVE_SESSION_DIR / _subdir).glob('*'))
    print(f"  - {_label}: {_n_entries}")

print("\nDocumentation:")
print("  ✓ README.md")
print("  ✓ ANALYSIS_SUMMARY_REPORT.txt")
print("  ✓ archive_manifest.json")

# The ZIP step is optional. Derive its path and size here instead of relying on
# variables that exist only if that cell was run; otherwise skipping the ZIP
# step would make this cell fail with NameError.
zip_path = ARCHIVE_DIR / f'archive_{TIMESTAMP}.zip'
if zip_path.exists():
    zip_size_mb = zip_path.stat().st_size / (1024 * 1024)
    print("\nCompressed Archive:")
    print(f"  ✓ {zip_path.name} ({zip_size_mb:.2f} MB)")

print("\n" + "=" * 80)
print("All analysis results have been archived and documented.")
print("The archive is ready for long-term storage or sharing.")
print("=" * 80)