<a href="https://colab.research.google.com/github/mythogenesys/Andrew-NG-DL-practises/blob/main/setup_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
#      STPC-EEG Project: One-Time Environment and Dataset Setup (Final Version)
# ==============================================================================
#
# This notebook is idempotent: it can be run multiple times without causing errors.
# It will only download and set up files that are not already present.
#
# ==============================================================================

import os
from google.colab import drive
import requests
from tqdm import tqdm
import zipfile
import shutil

# Step 1: mount Google Drive at /content/drive; the project root defined below lives on it.
print("--- Step 1: Mounting Google Drive ---")
drive.mount('/content/drive')
print("✅ Drive mounted successfully.")

# --- Define Project Structure ---
# One project root on Drive, a shared `data` folder beneath it, and one
# sub-folder per dataset (named after the corresponding PhysioNet release).
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/stpc-eeg"
DATA_PATH = os.path.join(DRIVE_PROJECT_PATH, "data")
EEG_DATA_PATH = os.path.join(DATA_PATH, "chb-mit-scalp-eeg-database-1.0.0")
ECG_DATA_PATH = os.path.join(DATA_PATH, "mit-bih-arrhythmia-database-1.0.0")
NOISE_DATA_PATH = os.path.join(DATA_PATH, "mit-bih-noise-stress-test-database-1.0.0")

# Step 2: create every dataset folder (and, implicitly, all of its parents).
# exist_ok=True makes this a no-op when the folders are already there.
print("\n--- Step 2: Creating Project Directory Structure on Google Drive ---")
for dataset_dir in (EEG_DATA_PATH, ECG_DATA_PATH, NOISE_DATA_PATH):
    os.makedirs(dataset_dir, exist_ok=True)
print(f"Project root created at: {DRIVE_PROJECT_PATH}")
print("✅ Directory structure confirmed.")



In [None]:
# --- Helper function for downloading with progress bar ---
def download_file(url, destination):
    """Download `url` to `destination`, showing a tqdm progress bar.

    The download is skipped when `destination` already exists. Data is first
    streamed into `<destination>.part` and only renamed to `destination` once
    the transfer has finished, so an interrupted run never leaves a truncated
    file that a later run would mistake for a finished download.

    Args:
        url: HTTP(S) address of the file to fetch.
        destination: Local path the file should end up at.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        OSError: If the local file cannot be written or renamed.
    """
    filename = os.path.basename(destination)
    if os.path.exists(destination):
        print(f"File already exists: {filename}. Skipping download.")
        return

    print(f"Downloading {filename}...")
    partial_path = destination + ".part"
    try:
        # The timeout applies to connecting and to each wait for data, not to
        # the whole transfer, so large files are fine but a dead connection
        # no longer hangs the notebook forever.
        with requests.get(url, stream=True, timeout=60) as response:
            # Without this, an error page would be written to disk as if it
            # were the dataset file.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            with open(partial_path, 'wb') as f, tqdm(
                desc=filename,
                # An absent Content-Length means "size unknown", not "0 bytes".
                total=total_size or None,
                unit='iB', unit_scale=True, unit_divisor=1024,
            ) as bar:
                # 1 MiB chunks: far fewer Python-level iterations and writes
                # than 1 KiB chunks for multi-megabyte recordings.
                for data in response.iter_content(chunk_size=1024 * 1024):
                    bar.update(f.write(data))
    except BaseException:
        # Also covers a manual interrupt: drop the incomplete file, then
        # let the original error propagate.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    # Publish the finished file under its real name in a single step.
    os.replace(partial_path, destination)
    print("Download complete.")

# --- Step 3: Download Datasets ---

# --- 3a. CHB-MIT Scalp EEG Database (Subject chb01 only) ---
print("\n--- Downloading CHB-MIT EEG Dataset (Subject chb01) ---")
EEG_SUBJ01_PATH = os.path.join(EEG_DATA_PATH, 'chb01')
os.makedirs(EEG_SUBJ01_PATH, exist_ok=True)

base_url = "https://physionet.org/files/chbmit/1.0.0/chb01/"

# Recordings 01-07 of subject chb01, followed by the `.seizures` companion
# files that are listed for recordings 03 and 04.
chb01_files = [
    'chb01_01.edf',
    'chb01_02.edf',
    'chb01_03.edf',
    'chb01_04.edf',
    'chb01_05.edf',
    'chb01_06.edf',
    'chb01_07.edf',
    'chb01_03.edf.seizures',
    'chb01_04.edf.seizures',
]

# Each remote file keeps its name and lands directly in the chb01 folder.
for fname in chb01_files:
    source_url = base_url + fname
    target_path = os.path.join(EEG_SUBJ01_PATH, fname)
    download_file(source_url, target_path)



In [None]:
# --- Helper function for unzipping an archive and merging its contents into the dataset folder ---
def _merge_into(source, destination):
    """Move `source` to `destination`, merging directories and replacing files."""
    if os.path.isdir(source) and os.path.isdir(destination):
        # Both sides are directories: merge entry by entry. A plain
        # shutil.move would instead nest `source` inside `destination`.
        for name in os.listdir(source):
            _merge_into(os.path.join(source, name), os.path.join(destination, name))
        return

    # Anything else already sitting at `destination` is replaced by the
    # freshly extracted item.
    if os.path.isdir(destination) and not os.path.islink(destination):
        shutil.rmtree(destination)
    elif os.path.lexists(destination):
        os.remove(destination)
    shutil.move(source, destination)


def unzip_and_merge(zip_path, temp_unzip_dir, final_destination_dir):
    """Extract a zip archive and merge its top-level folder into a destination.

    The archive is expected to contain a single top-level folder named like
    the last path component of `final_destination_dir`. Its contents are
    merged into `final_destination_dir`: existing files are overwritten and
    existing sub-directories are merged rather than nested. On success the
    zip file is deleted; the temporary directory is always deleted.

    Args:
        zip_path: Path of the archive. If it does not exist, nothing happens.
        temp_unzip_dir: Scratch directory used for extraction. It is wiped
            before use and removed afterwards.
        final_destination_dir: Directory that receives the extracted content.
            It is created if missing.

    Returns:
        None.

    Raises:
        zipfile.BadZipFile: If `zip_path` is not a valid zip archive.
        OSError: If files cannot be extracted, moved or removed.
    """
    if not os.path.exists(zip_path):
        return
    print(f"Processing {os.path.basename(zip_path)}...")

    # Leftovers from an interrupted earlier run must not be merged in.
    shutil.rmtree(temp_unzip_dir, ignore_errors=True)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_unzip_dir)

        # normpath drops a trailing separator, which would otherwise make
        # basename() return an empty string.
        folder_name = os.path.basename(os.path.normpath(final_destination_dir))
        source_folder = os.path.join(temp_unzip_dir, folder_name)
        if not os.path.isdir(source_folder):
            # The zip is kept on purpose so it can be inspected.
            print(f"Warning: Expected unzipped folder not found at {source_folder}")
            return

        os.makedirs(final_destination_dir, exist_ok=True)
        for item in os.listdir(source_folder):
            _merge_into(
                os.path.join(source_folder, item),
                os.path.join(final_destination_dir, item),
            )
    finally:
        # Runs on success, on the warning path and on errors alike.
        shutil.rmtree(temp_unzip_dir, ignore_errors=True)

    # Only reached after a successful merge, so a failed run keeps the
    # archive and can be retried without downloading again.
    os.remove(zip_path)
    print("Setup complete.")

# --- Helper: decide whether a zipped dataset still needs to be fetched ---
def _already_extracted(dataset_dir, zip_path):
    """Return True if `dataset_dir` has content and no archive awaits processing.

    unzip_and_merge deletes the zip as its last step, so a zip that is still
    on disk means the previous run did not finish and must be repeated.
    """
    return (
        not os.path.exists(zip_path)
        and os.path.isdir(dataset_dir)
        and bool(os.listdir(dataset_dir))
    )


# --- 3b. MIT-BIH Arrhythmia Database (ECG) ---
print("\n--- Downloading MIT-BIH Arrhythmia ECG Dataset ---")
ecg_url = "https://physionet.org/static/published-projects/mitdb/mit-bih-arrhythmia-database-1.0.0.zip"
ecg_zip_path = os.path.join(DATA_PATH, 'mitdb.zip')
temp_ecg_unzip = os.path.join(DATA_PATH, 'temp_ecg')
# Without this check every re-run would download and unpack the archive
# again, because the zip is removed once it has been extracted.
if _already_extracted(ECG_DATA_PATH, ecg_zip_path):
    print(f"Dataset already present at {ECG_DATA_PATH}. Skipping download.")
else:
    download_file(ecg_url, ecg_zip_path)
    unzip_and_merge(ecg_zip_path, temp_ecg_unzip, ECG_DATA_PATH)

# --- 3c. MIT-BIH Noise Stress Test Database ---
print("\n--- Downloading MIT-BIH Noise Stress Test Dataset ---")
noise_url = "https://physionet.org/static/published-projects/nstdb/mit-bih-noise-stress-test-database-1.0.0.zip"
noise_zip_path = os.path.join(DATA_PATH, 'nstdb.zip')
temp_noise_unzip = os.path.join(DATA_PATH, 'temp_noise')
if _already_extracted(NOISE_DATA_PATH, noise_zip_path):
    print(f"Dataset already present at {NOISE_DATA_PATH}. Skipping download.")
else:
    download_file(noise_url, noise_zip_path)
    unzip_and_merge(noise_zip_path, temp_noise_unzip, NOISE_DATA_PATH)


# --- Step 4: Verify Installation ---
print("\n\n--- Step 4: Verifying Final Directory Structure ---")
def verify_path(path, file_count):
    """Check that `path` is a directory holding at least `file_count` entries.

    Prints a success or failure line and reports the outcome to the caller.
    Sub-directories count as entries, just like files.

    Args:
        path: Directory to inspect.
        file_count: Minimum number of directory entries required.

    Returns:
        True if the check passed, False otherwise.
    """
    # isdir (rather than exists) keeps listdir from failing on a plain file.
    if not os.path.isdir(path):
        print(f"❌ FAILED: Path not found: {path}")
        return False
    found = len(os.listdir(path))
    if found < file_count:
        print(f"❌ FAILED: Expected at least {file_count} files but found {found} in {path}")
        return False
    print(f"✅ SUCCESS: Found {found} files in {path}")
    return True

# Run every check (no short-circuiting) so all problems are reported at once.
verification_results = [
    verify_path(EEG_SUBJ01_PATH, 9),
    verify_path(ECG_DATA_PATH, 100),  # Check for a reasonable number of files
    verify_path(NOISE_DATA_PATH, 10),  # Check for a reasonable number of files
]

# Only announce success when every dataset actually passed verification.
if all(verification_results):
    print("\n\n🎉🎉🎉 Environment setup is complete! You can now proceed to the main research notebook. 🎉🎉🎉")
else:
    print("\n\n⚠️ Environment setup is incomplete. Fix the failed checks above and re-run this notebook.")