# ACIE Training on Google Colab

This notebook trains the ACIE model using the project folder from your Google Drive.

## Quick Start
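1. **Drive Setup**: Upload your project folder to Google Drive, inside a top-level folder named `ACIE` (i.e. `My Drive/ACIE`). The first cell searches that folder and its subfolders for `setup.py`.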
2. **Runtime**: Ensure you are using a GPU runtime (`Runtime` > `Change runtime type` > `T4 GPU`).

In [None]:
# 1. Mount Google Drive
from google.colab import drive
import os
import sys
import glob

drive.mount('/content/drive')

# 2. Find Project Root (Robust Search)
# The project root is taken to be the directory of the setup.py with the
# shortest path below SEARCH_ROOT.
SEARCH_ROOT = "/content/drive/My Drive/ACIE"
PROJECT_ROOT = None

print(f"Searching for setup.py in {SEARCH_ROOT}...")

# Recursive lookup, so nested or duplicated upload folders such as
# 'Project-ACIE-v1-main (1)' are searched as well.
candidates = glob.glob(os.path.join(SEARCH_ROOT, "**", "setup.py"), recursive=True)

if not candidates:
    # Nothing found: print every file under SEARCH_ROOT to help diagnose
    # where the project was actually uploaded.
    print(f"‚ùå NOT FOUND: Could not find setup.py in {SEARCH_ROOT} (or subfolders).")
    print("File listing:")
    for root, dirs, files in os.walk(SEARCH_ROOT):
        for name in files:
            print(os.path.join(root, name))
else:
    # Shortest path first; the stable sort keeps glob order among ties.
    candidates = sorted(candidates, key=len)
    setup_path = candidates[0]
    PROJECT_ROOT = os.path.dirname(setup_path)

    print(f"‚úÖ Found setup.py at: {setup_path}")
    print(f"üìÇ Setting working directory to: {PROJECT_ROOT}")

    # Later cells use relative paths (setup.py, data/, outputs/), so run
    # from the project root and make it importable.
    os.chdir(PROJECT_ROOT)
    sys.path.append(PROJECT_ROOT)

In [None]:
# 3. Install Dependencies
import os
# Double check we are in the right place
if os.path.exists("setup.py"):
    print(f"Installing dependencies from: {os.getcwd()}")
    !pip install -q pytorch-lightning torchmetrics python-dotenv
    !pip install -q "numpy<2.0" pandas scipy networkx
    !pip install -q "bcrypt<4.0.0" passlib python-jose[cryptography]
    # Install project in editable mode
    !pip install -e .
else:
    print("‚ùå Setup.py still not found. Please verify the previous cell output.")

In [None]:
# 4. Resolve Data Path
# Sets DATA_DIR for the training cell. Resolution order:
#   1. CSV files in the current directory (project root) -> "."
#   2. a non-empty data/ directory                       -> "data"
#   3. CSV files directly in SEARCH_ROOT                 -> linked (or copied) into data/
# Uses `glob`, `os` and SEARCH_ROOT defined by the first cell.
import shutil

DATA_DIR = "data"

# 1. Check for CSVs in CURRENT directory (Project Root)
root_csvs = glob.glob("*.csv")
if root_csvs:
    print(f"‚úÖ Found {len(root_csvs)} CSV files in project root.")
    print("   Setting DATA_DIR to current directory (.).")
    DATA_DIR = "."

# 2. Check for 'data' subdirectory
# isdir (not exists) so that a stray regular file named "data" does not
# make os.listdir raise.
elif os.path.isdir("data") and os.listdir("data"):
    print(f"‚úÖ Found 'data' folder with content.")
    DATA_DIR = "data"

# 3. Fallback: Check parent directory (My Drive/ACIE)
else:
    print("Checking parent folder for CSVs...")
    parent_data_csvs = glob.glob(f"{SEARCH_ROOT}/*.csv")
    if parent_data_csvs:
        print(f"Found CSVs in parent folder ({SEARCH_ROOT}). Linking them...")
        os.makedirs("data", exist_ok=True)
        for csv in parent_data_csvs:
            filename = os.path.basename(csv)
            target = os.path.join("data", filename)
            if not os.path.exists(target):
                try:
                    os.symlink(csv, target)
                except OSError as err:
                    # Symlink creation can fail on filesystems that do not
                    # support it (possibly the mounted Drive -- not verified
                    # here). Copy the file instead of aborting the cell.
                    print(f"   Could not symlink {filename} ({err}); copying instead.")
                    shutil.copy2(csv, target)
        DATA_DIR = "data"
        print(f"‚úÖ Linked {len(parent_data_csvs)} CSV files to data/ folder.")
    else:
        print("‚ö†Ô∏è Warning: No CSV files found in root, data/, or parent folder.")
        print("   Training might fail if not found.")

print(f"Using DATA_DIR: {os.path.abspath(DATA_DIR)}")

In [None]:
# 5. Run Training
# Configuration
DATASET_SIZE = "10k"
MAX_EPOCHS = 20
BATCH_SIZE = 64
OUTPUT_DIR = "outputs/colab_run1"

cmd = f"python acie/training/train.py --data_dir {DATA_DIR} --output_dir {OUTPUT_DIR} --dataset_size {DATASET_SIZE} --max_epochs {MAX_EPOCHS} --batch_size {BATCH_SIZE} --gpus 1"

print(f"Starting training command: {cmd}")
!{cmd}

In [None]:
# 6. Copy Outputs (Optional)
# Copy results back to the parent ACIE folder for easy access
dest_output = os.path.join(SEARCH_ROOT, "outputs")

if os.path.exists(OUTPUT_DIR):
    print(f"Copying results to {dest_output}...")
    if not os.path.exists(dest_output):
        os.makedirs(dest_output)
    !cp -r {OUTPUT_DIR} "{dest_output}"