# Notebook 03: Data Archiving for C. elegans Motility Analysis

This notebook handles:
- Organizing and archiving processed data
- Creating comprehensive analysis reports
- Archiving raw data with metadata
- Generating summary documentation
- Preparing data packages for publication/sharing
- Timestamped backups of analysis outputs

## 1. Import Libraries

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
import shutil
from datetime import datetime
import json
import yaml
import zipfile
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Configuration and Path Setup

In [11]:
# Base paths
BASE_DIR = Path(r'C:\Users\MBF\Motility_analysis')
RAW_DATA_DIR = BASE_DIR / 'Data' / 'Raw'
PROCESSED_DATA_DIR = BASE_DIR / 'Data' / 'Processed'
ARCHIVE_DIR = BASE_DIR / 'Data' / 'Archive'
RESULTS_DIR = BASE_DIR / 'results'
FIGURES_DIR = RESULTS_DIR / 'figures'

# Create timestamp for this archiving session
TIMESTAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
ARCHIVE_SESSION_DIR = ARCHIVE_DIR / f'archive_{TIMESTAMP}'

# Create archive directories
ARCHIVE_SESSION_DIR.mkdir(parents=True, exist_ok=True)
(ARCHIVE_SESSION_DIR / 'processed_data').mkdir(exist_ok=True)
(ARCHIVE_SESSION_DIR / 'results').mkdir(exist_ok=True)
(ARCHIVE_SESSION_DIR / 'figures').mkdir(exist_ok=True)
(ARCHIVE_SESSION_DIR / 'metadata').mkdir(exist_ok=True)
(ARCHIVE_SESSION_DIR / 'raw_data').mkdir(exist_ok=True)
(ARCHIVE_SESSION_DIR / 'notebooks').mkdir(exist_ok=True)

print(f"Archive session directory: {ARCHIVE_SESSION_DIR}")
print(f"Timestamp: {TIMESTAMP}")

Archive session directory: C:\Users\MBF\Motility_analysis\Data\Archive\archive_20260224_164750
Timestamp: 20260224_164750


## 3. Archive Processed Data

In [12]:
# Copy processed data files
processed_files = list(PROCESSED_DATA_DIR.glob('*.csv'))

print(f"Archiving {len(processed_files)} processed data file(s)...")

for file_path in processed_files:
    dest_path = ARCHIVE_SESSION_DIR / 'processed_data' / file_path.name
    shutil.copy2(file_path, dest_path)
    print(f"  ✓ Archived: {file_path.name}")

print("\nProcessed data archived successfully!")

Archiving 3 processed data file(s)...
  ✓ Archived: normalized_tracks.csv
  ✓ Archived: thrashing_data.csv
  ✓ Archived: track_metrics.csv

Processed data archived successfully!


## 4. Archive Results and Figures

In [13]:
# Copy result CSV files
result_files = [f for f in RESULTS_DIR.glob('*.csv') if f.is_file()]
json_files = [f for f in RESULTS_DIR.glob('*.json') if f.is_file()]

print(f"Archiving {len(result_files)} result file(s)...")
for file_path in result_files:
    dest_path = ARCHIVE_SESSION_DIR / 'results' / file_path.name
    shutil.copy2(file_path, dest_path)
    print(f"  ✓ Archived: {file_path.name}")

print(f"\nArchiving {len(json_files)} JSON file(s)...")
for file_path in json_files:
    dest_path = ARCHIVE_SESSION_DIR / 'results' / file_path.name
    shutil.copy2(file_path, dest_path)
    print(f"  ✓ Archived: {file_path.name}")

# Copy figures
if FIGURES_DIR.exists():
    figure_files = list(FIGURES_DIR.glob('*'))
    print(f"\nArchiving {len(figure_files)} figure(s)...")
    for file_path in figure_files:
        if file_path.is_file():
            dest_path = ARCHIVE_SESSION_DIR / 'figures' / file_path.name
            shutil.copy2(file_path, dest_path)
            print(f"  ✓ Archived: {file_path.name}")

print("\nResults and figures archived successfully!")

Archiving 12 result file(s)...
  ✓ Archived: fatigue_group_summary.csv
  ✓ Archived: fatigue_pairwise_tests.csv
  ✓ Archived: group_summary.csv
  ✓ Archived: motility_metrics.csv
  ✓ Archived: per_track_mean_speeds.csv
  ✓ Archived: speed_group_summary.csv
  ✓ Archived: speed_pairwise_tests.csv
  ✓ Archived: straightness_group_summary.csv
  ✓ Archived: straightness_pairwise_tests.csv
  ✓ Archived: thrashing_frequency.csv
  ✓ Archived: thrashing_group_summary.csv
  ✓ Archived: thrashing_pairwise_tests.csv

Archiving 1 JSON file(s)...
  ✓ Archived: stats_summary.json

Archiving 14 figure(s)...
  ✓ Archived: fatigue_comparison.pdf
  ✓ Archived: fatigue_comparison.png
  ✓ Archived: mean_speed_comparison.pdf
  ✓ Archived: mean_speed_comparison.png
  ✓ Archived: straightness_comparison.pdf
  ✓ Archived: straightness_comparison.png
  ✓ Archived: summary_figure.pdf
  ✓ Archived: summary_figure.png
  ✓ Archived: thrashing_frequency_comparison.pdf
  ✓ Archived: thrashing_frequency_comparison.png

## 5. Collect and Archive Metadata

In [14]:
# Find all metadata YAML files from raw data
metadata_files = list(RAW_DATA_DIR.rglob('metadata_*.yaml'))

print(f"Found {len(metadata_files)} metadata file(s)")

# Copy metadata files to archive with organized structure
for metadata_path in metadata_files:
    # Create a readable name based on path
    relative_path = metadata_path.relative_to(RAW_DATA_DIR)
    # Replace path separators with underscores for flat structure
    archive_name = str(relative_path).replace('\\', '_').replace('/', '_')
    
    dest_path = ARCHIVE_SESSION_DIR / 'metadata' / archive_name
    shutil.copy2(metadata_path, dest_path)
    print(f"  ✓ Archived: {archive_name}")

print("\nMetadata archived successfully!")

Found 5 metadata file(s)
  ✓ Archived: 260221_Wormlab_processed_N2_metadata_N2.yaml
  ✓ Archived: 260221_Wormlab_processed_RK202_metadata_RK202.yaml
  ✓ Archived: 260221_Wormlab_processed_RK203_metadata_RK203.yaml
  ✓ Archived: 260221_Wormlab_processed_RK204_metadata_RK204.yaml
  ✓ Archived: 260221_Wormlab_processed_RK205_metadata_RK205.yaml

Metadata archived successfully!


## 6. Archive Raw Data and Notebooks

In [15]:
# Archive raw data (preserve folder structure)
raw_files = [p for p in RAW_DATA_DIR.rglob('*') if p.is_file()]
print(f"Archiving {len(raw_files)} raw data file(s)...")
for file_path in raw_files:
    dest_path = ARCHIVE_SESSION_DIR / 'raw_data' / file_path.relative_to(RAW_DATA_DIR)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(file_path, dest_path)
print("\nRaw data archived successfully!")

# Archive current notebooks
notebook_files = sorted(BASE_DIR.glob('*.ipynb'))
print(f"\nArchiving {len(notebook_files)} notebook file(s)...")
for file_path in notebook_files:
    dest_path = ARCHIVE_SESSION_DIR / 'notebooks' / file_path.name
    shutil.copy2(file_path, dest_path)
    print(f"  ✓ Archived: {file_path.name}")
print("\nNotebooks archived successfully!")

Archiving 154 raw data file(s)...

Raw data archived successfully!

Archiving 4 notebook file(s)...
  ✓ Archived: 01_Data_Wrangling.ipynb
  ✓ Archived: 02_Analysis_and_Plotting.ipynb
  ✓ Archived: 03_Archiving.ipynb
  ✓ Archived: Motility_analysis.ipynb

Notebooks archived successfully!


## 7. Generate Analysis Summary Report

In [None]:
# Generate summary report
strains = []
if 'df_metrics' in globals() and isinstance(df_metrics, pd.DataFrame) and 'strain_genotype' in df_metrics.columns:
    strains = sorted(df_metrics['strain_genotype'].dropna().unique().tolist())
elif 'df_thrashing' in globals() and isinstance(df_thrashing, pd.DataFrame) and 'strain_genotype' in df_thrashing.columns:
    strains = sorted(df_thrashing['strain_genotype'].dropna().unique().tolist())
elif 'metadata_files' in globals():
    strains = sorted({p.stem.replace('metadata_', '') for p in metadata_files})

if not strains:
    strains = ["(unknown)"]

report_content = [
    "# Motility Analysis Archive Summary",
    f"Archive Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Archive Location: {ARCHIVE_SESSION_DIR}",
    "",
    "## Dataset Summary",
    f"- Processed data files: {len(list(PROCESSED_DATA_DIR.glob('*.csv')))}",
    f"- Results files: {len(list(RESULTS_DIR.glob('*.csv')))}",
    f"- Figures: {len(list(FIGURES_DIR.glob('*.png')))}",
    f"- Metadata files: {len(list((ARCHIVE_SESSION_DIR / 'metadata').glob('*.yaml')))}",
    f"- Raw data files: {len([p for p in (ARCHIVE_SESSION_DIR / 'raw_data').rglob('*') if p.is_file()])}",
    f"- Notebooks: {len(list((ARCHIVE_SESSION_DIR / 'notebooks').glob('*.ipynb')))}",
    "",
    "## Strains Analyzed",
    f"- Strains: {', '.join(strains)}",
    "",
    "## Processing Pipeline",
    "1. Data Wrangling (01_Data_Wrangling.ipynb)",
    "2. Analysis and Plotting (02_Analysis_and_Plotting.ipynb)",
    "3. Archiving (03_Archiving.ipynb)",
    ""
 ]

# Save report to file
report_path = ARCHIVE_SESSION_DIR / 'analysis_summary.md'
with open(report_path, 'w') as f:
    f.write('\n'.join(report_content))

print(f"Summary report created: {report_path}")

NameError: name 'strains' is not defined

## 8. Create Archive Manifest

In [None]:
# Create manifest of all files in archive
manifest = {
    'archive_date': datetime.now().isoformat(),
    'archive_dir': str(ARCHIVE_SESSION_DIR),
    'files': {
        'processed_data': [],
        'results': [],
        'figures': [],
        'metadata': [],
        'raw_data': [],
        'notebooks': []
    }
}

# Add files to manifest
for file in (ARCHIVE_SESSION_DIR / 'processed_data').glob('*'):
    manifest['files']['processed_data'].append(file.name)

for file in (ARCHIVE_SESSION_DIR / 'results').glob('*'):
    manifest['files']['results'].append(file.name)

for file in (ARCHIVE_SESSION_DIR / 'figures').glob('*'):
    manifest['files']['figures'].append(file.name)

for file in (ARCHIVE_SESSION_DIR / 'metadata').glob('*'):
    manifest['files']['metadata'].append(file.name)

for file in (ARCHIVE_SESSION_DIR / 'raw_data').rglob('*'):
    if file.is_file():
        manifest['files']['raw_data'].append(str(file.relative_to(ARCHIVE_SESSION_DIR / 'raw_data')))

for file in (ARCHIVE_SESSION_DIR / 'notebooks').glob('*.ipynb'):
    manifest['files']['notebooks'].append(file.name)

# Save manifest to JSON file
manifest_path = ARCHIVE_SESSION_DIR / 'archive_manifest.json'
with open(manifest_path, 'w') as f:
    json.dump(manifest, f, indent=4)

print(f"Archive manifest created: {manifest_path}")

Archive manifest created:
{
  "archive_info": {
    "timestamp": "20260224_164017",
    "date": "2026-02-24 16:41:26",
    "archive_directory": "C:\\Users\\MBF\\Motility_analysis\\Data\\Archive\\archive_20260224_164017"
  },
  "data_summary": {
    "total_tracks": 107,
    "total_thrashing_tracks": 107,
    "genotypes": [
      "N2",
      "RK202",
      "RK203",
      "RK204",
      "RK205"
    ],
    "tracks_per_genotype": {
      "N2": 29,
      "RK202": 20,
      "RK204": 20,
      "RK205": 20,
      "RK203": 18
    }
  },
  "archived_files": {
    "processed_data": [
      "normalized_tracks.csv",
      "thrashing_data.csv",
      "track_metrics.csv"
    ],
    "results": [
      "fatigue_group_summary.csv",
      "fatigue_pairwise_tests.csv",
      "group_summary.csv",
      "motility_metrics.csv",
      "per_track_mean_speeds.csv",
      "speed_group_summary.csv",
      "speed_pairwise_tests.csv",
      "stats_summary.json",
      "straightness_group_summary.csv",
      "straigh

## 9. Create README for Archive

In [None]:
# Create README file for the archive
readme_content = [
    "# Motility Analysis Archive",
    f"Archive Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "",
    "## Directory Structure",
    "",
    "archive_{timestamp}/",
    "├── processed_data/   # Processed CSV data files",
    "├── results/          # Statistical results and summary tables",
    "├── figures/          # Generated plots and figures",
    "├── metadata/         # YAML metadata from raw data",
    "├── raw_data/         # Raw data copied from Data/Raw",
    "├── notebooks/        # Analysis notebooks",
    "├── analysis_summary.md",
    "├── archive_manifest.json",
    "└── README.md",
    "",
    "## Files Included",
    "",
    "- Processed data files",
    "- Statistical results",
    "- Figures and plots",
    "- Metadata files",
    "- Raw data",
    "- Analysis notebooks",
    "- Archive summary report",
    "- Archive manifest",
    "",
    "## Notes",
    "",
    "This archive contains all data and results from the motility analysis pipeline.",
    "The raw data and notebooks are included to support full reproducibility."

]

# Save README to file
readme_path = ARCHIVE_SESSION_DIR / 'README.md'
with open(readme_path, 'w') as f:
    f.write('\n'.join(readme_content))

print(f"README created: {readme_path}")

README created successfully!

Saved to: C:\Users\MBF\Motility_analysis\Data\Archive\archive_20260224_164017\README.md


## 10. Optional: Create ZIP Archive

In [None]:
# Create a compressed ZIP file of the entire archive
zip_path = ARCHIVE_DIR / f'archive_{TIMESTAMP}.zip'

print(f"Creating ZIP archive: {zip_path.name}")
print("This may take a moment...\n")

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for file_path in ARCHIVE_SESSION_DIR.rglob('*'):
        if file_path.is_file():
            arc_name = file_path.relative_to(ARCHIVE_SESSION_DIR.parent)
            zipf.write(file_path, arc_name)
            print(f"  Added: {arc_name}")

zip_size_mb = zip_path.stat().st_size / (1024 * 1024)
print(f"\nZIP archive created successfully!")
print(f"Size: {zip_size_mb:.2f} MB")
print(f"Location: {zip_path}")

## 11. Clean Up Old Archives (Optional)

In [None]:
# List all archive directories
archive_dirs = sorted([d for d in ARCHIVE_DIR.iterdir() if d.is_dir() and d.name.startswith('archive_')])

print(f"Found {len(archive_dirs)} archive directory/directories:")
for archive_dir in archive_dirs:
    creation_time = datetime.fromtimestamp(archive_dir.stat().st_ctime)
    print(f"  - {archive_dir.name} (created: {creation_time.strftime('%Y-%m-%d %H:%M:%S')})")

# Optional: Delete archives older than X days (UNCOMMENT TO ENABLE)
# DAYS_TO_KEEP = 30
# cutoff_date = datetime.now() - pd.Timedelta(days=DAYS_TO_KEEP)
# 
# for archive_dir in archive_dirs:
#     creation_time = datetime.fromtimestamp(archive_dir.stat().st_ctime)
#     if creation_time < cutoff_date and archive_dir.name != f'archive_{TIMESTAMP}':
#         print(f"Deleting old archive: {archive_dir.name}")
#         shutil.rmtree(archive_dir)

print("\nNote: Automatic cleanup is disabled by default. Edit this cell to enable.")

## 12. Archive Summary and Completion

In [9]:
print("="*80)
print("ARCHIVING COMPLETE!")
print("="*80)
print(f"\nArchive ID: {TIMESTAMP}")
print(f"Archive Location: {ARCHIVE_SESSION_DIR}")

print("\nArchived Contents:")
print(f"  - Processed data files: {len(list((ARCHIVE_SESSION_DIR / 'processed_data').glob('*')))}")
print(f"  - Result files: {len(list((ARCHIVE_SESSION_DIR / 'results').glob('*')))}")
print(f"  - Figure files: {len(list((ARCHIVE_SESSION_DIR / 'figures').glob('*')))}")
print(f"  - Metadata files: {len(list((ARCHIVE_SESSION_DIR / 'metadata').glob('*')))}")

print("\nDocumentation:")
print(f"  ✓ README.md")
print(f"  ✓ ANALYSIS_SUMMARY_REPORT.txt")
print(f"  ✓ archive_manifest.json")

if zip_path.exists():
    print(f"\nCompressed Archive:")
    print(f"  ✓ {zip_path.name} ({zip_size_mb:.2f} MB)")

print("\n" + "="*80)
print("All analysis results have been archived and documented.")
print("The archive is ready for long-term storage or sharing.")
print("="*80)

ARCHIVING COMPLETE!

Archive ID: 20260224_164017
Archive Location: C:\Users\MBF\Motility_analysis\Data\Archive\archive_20260224_164017

Archived Contents:
  - Processed data files: 3
  - Result files: 13
  - Figure files: 14
  - Metadata files: 5

Documentation:
  ✓ README.md
  ✓ ANALYSIS_SUMMARY_REPORT.txt
  ✓ archive_manifest.json


NameError: name 'zip_path' is not defined