# ACIE Training on Google Colab

This notebook trains the ACIE model using the `ACIE_Project.zip` and CSV datasets from your **ACIE_Training** folder on Google Drive.

## Quick Start
1. **Drive Setup**: Ensure you have a folder `ACIE_Training` in My Drive containing:
    - `ACIE_Project.zip`
    - Your CSV dataset files (placed directly in the folder, not in subfolders)
2. **Runtime**: Ensure you are using a GPU runtime (`Runtime` > `Change runtime type` > `T4 GPU`).

In [None]:
# 1. Mount Google Drive
from google.colab import drive
import os
import glob
import shutil

# Attach Google Drive so the project archive and datasets are reachable under /content/drive.
drive.mount('/content/drive')

# 2. Setup Configuration
# Locations shared by the cells that follow.
ZIP_NAME = "ACIE_Project.zip"
WORK_DIR = "/content/ACIE_Work"
DRIVE_FOLDER = "/content/drive/My Drive/ACIE_Training"

# Report whether the expected Drive folder is present.
# This check only prints a message; it does not stop the notebook.
if os.path.exists(DRIVE_FOLDER):
    print(f"✅ Found Drive folder: {DRIVE_FOLDER}")
else:
    print(f"❌ Error: Folder '{DRIVE_FOLDER}' not found.")
    print("Please create 'ACIE_Training' in your Drive and upload your files there.")

In [None]:
# 3. Unzip Project
zip_path = os.path.join(DRIVE_FOLDER, ZIP_NAME)

if os.path.exists(zip_path):
    print(f"Found zip: {zip_path}")
    if os.path.exists(WORK_DIR):
        shutil.rmtree(WORK_DIR)
    
    print(f"Unzipping to {WORK_DIR}...")
    !unzip -q "{zip_path}" -d "{WORK_DIR}"
    print("✅ Project unzipped successfully.")
else:
    print(f"❌ Error: {ZIP_NAME} not found in {DRIVE_FOLDER}")

In [None]:
# 4. Link Datasets
# Expose every CSV stored in the Drive folder inside the project's data/
# directory through symlinks.
data_dir = os.path.join(WORK_DIR, "data")
os.makedirs(data_dir, exist_ok=True)

# Only CSVs sitting directly in the Drive folder are matched (no recursion).
drive_csvs = glob.glob(os.path.join(DRIVE_FOLDER, "*.csv"))

if not drive_csvs:
    print("⚠️ Warning: No CSV files found in ACIE_Training folder.")
else:
    print(f"Found {len(drive_csvs)} CSV files in Drive. Linking to workspace...")
    for source_csv in drive_csvs:
        link_path = os.path.join(data_dir, os.path.basename(source_csv))
        # Anything already present under that name is left untouched.
        if os.path.exists(link_path):
            continue
        os.symlink(source_csv, link_path)
    print(f"✅ Datasets ready in {data_dir}")

In [None]:
# 5. Install Dependencies
os.chdir(WORK_DIR)
print(f"Current PWD: {os.getcwd()}")

print("Installing dependencies...")
!pip install -q pytorch-lightning torchmetrics python-dotenv
!pip install -q "numpy<2.0" pandas scipy networkx
!pip install -q "bcrypt<4.0.0" passlib python-jose[cryptography]
!pip install -e .

In [None]:
# 6. Run Training
# Configuration
OUTPUT_DIR = "outputs/colab_run1"
DATASET_SIZE = "10k"  # Change this if using 20k or other sizes
MAX_EPOCHS = 20
BATCH_SIZE = 64

cmd = f"python acie/training/train.py --data_dir data/ --output_dir {OUTPUT_DIR} --dataset_size {DATASET_SIZE} --max_epochs {MAX_EPOCHS} --batch_size {BATCH_SIZE} --gpus 1"

print(f"Starting training command: {cmd}")
!{cmd}

In [None]:
# 7. Save Results
# Copy outputs back to Drive so you don't lose them
dest_output = os.path.join(DRIVE_FOLDER, "outputs")

if os.path.exists(OUTPUT_DIR):
    print(f"Copying results to {dest_output}...")
    if not os.path.exists(dest_output):
        os.makedirs(dest_output)
    !cp -r {OUTPUT_DIR}/* "{dest_output}/"
    print("✅ Results saved to Drive.")